# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
#
# License: BSD 3 clause
"""
Multi-class / multi-label utility function
==========================================
"""
from collections.abc import Sequence
from itertools import chain

import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import dok_matrix, issparse, lil_matrix, spmatrix

def _assert_all_finite(X, allow_nan=False):
    """Like assert_all_finite, but only for ndarray."""
    X = np.asanyarray(X)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method. The sum is also calculated
    # safely to reduce dtype induced overflows.
    is_float = X.dtype.kind in "fc"
    if is_float and (np.isfinite(np.sum(X))):
        pass
    elif is_float:
        msg_err = "Input contains {} or a value too large for {!r}."
        if (
            allow_nan
            and np.isinf(X).any()
            or not allow_nan
            and not np.isfinite(X).all()
        ):
            type_err = "infinity" if allow_nan else "NaN, infinity"
            raise ValueError(msg_err.format(type_err, X.dtype))
    # for object dtype data, we only check for NaNs (GH-13254)
    elif X.dtype == np.dtype("object") and not allow_nan:
        if np.isnan(X).any():
            raise ValueError("Input contains NaN")

def assert_all_finite(X, allow_nan=False):
    """Throw a ValueError if X contains NaN or infinity.

    Parameters
    ----------
    X : array or sparse matrix
    allow_nan : bool
    """
    _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)

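# Illustrative usage (a hedged sketch, not part of the original module; the
# example values are assumed): the check passes silently on finite data and
# raises ValueError when inf (or NaN, unless allow_nan=True) appears in a
# dense or sparse input.
#
# >>> assert_all_finite(np.array([0.0, 1.0, 2.0]))                 # passes
# >>> assert_all_finite(np.array([0.0, np.inf]))                   # raises ValueError
# >>> assert_all_finite(np.array([0.0, np.nan]), allow_nan=True)   # passes
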
def _unique_multiclass(y):
    if hasattr(y, "__array__"):
        return np.unique(np.asarray(y))
    else:
        return set(y)

def _unique_indicator(y):
    """
    Not implemented
    """
    raise IndexError(
        f"""Given labels are of size {y.shape} while they should be (n_samples,) \n"""
        + """If attempting multilabel classification, try using TabNetMultiTaskClassification """
        + """or TabNetRegressor"""
    )

_FN_UNIQUE_LABELS = {
    "binary": _unique_multiclass,
    "multiclass": _unique_multiclass,
    "multilabel-indicator": _unique_indicator,
}
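# The keys above mirror the label-type strings returned by type_of_target
# below; 2-D label-indicator targets are dispatched to _unique_indicator,
# which raises on purpose because this module handles a single label column
# per task.
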
def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
        - mix of multilabel and multiclass (single label) targets
        - mix of label indicator matrix and anything else
          (because there are no explicit labels)
        - mix of label indicator matrices of different sizes
        - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes

    Returns
    -------
    out : numpy array of shape [n_unique_labels]
        An ordered array of unique labels.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    if not ys:
        raise ValueError("No argument has been passed.")

    # Check that we don't mix label format
    ys_types = set(type_of_target(x) for x in ys)
    if ys_types == {"binary", "multiclass"}:
        ys_types = {"multiclass"}
    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)
    label_type = ys_types.pop()

    # Get the unique set of labels
    _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if not _unique_labels:
        raise ValueError("Unknown label type: %s" % repr(ys))
    ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))

    # Check that we don't mix string type with number type
    if len(set(isinstance(label, str) for label in ys_labels)) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return np.array(sorted(ys_labels))

def _is_integral_float(y):
    return y.dtype.kind == "f" and np.all(y.astype(int) == y)

def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : bool
        Return ``True`` if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, "__array__"):
        y = np.asarray(y)
    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        return (
            len(y.data) == 0
            or np.unique(y.data).size == 1
            and (
                y.dtype.kind in "biu"
                or _is_integral_float(np.unique(y.data))  # bool, int, uint
            )
        )
    else:
        labels = np.unique(y)
        return len(labels) < 3 and (
            y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
        )

def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like
    """
    y_type = type_of_target(y)
    if y_type not in [
        "binary",
        "multiclass",
        "multiclass-multioutput",
        "multilabel-indicator",
        "multilabel-sequences",
    ]:
        raise ValueError("Unknown label type: %r" % y_type)

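# Minimal sketch of the intended use (example values are assumed, not from
# the original source): discrete targets pass, continuous targets raise.
#
# >>> check_classification_targets([0, 1, 1, 0])     # 'binary', accepted
# >>> check_classification_targets([0.1, 0.6, 0.3])  # 'continuous', raises ValueError
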
def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Examples
    --------
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multiclass-multioutput'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    valid = (
        isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__")
    ) and not isinstance(y, str)

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparseseries = y.__class__.__name__ == "SparseSeries"
    if sparseseries:
        raise ValueError("y cannot be class 'SparseSeries'.")

    if is_multilabel(y):
        return "multilabel-indicator"

    try:
        y = np.asarray(y)
    except ValueError:
        # Known to fail in numpy 1.3 for array of arrays
        return "unknown"

    # The old sequence of sequences format
    try:
        if (
            not hasattr(y[0], "__array__")
            and isinstance(y[0], Sequence)
            and not isinstance(y[0], str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)):
        return "unknown"  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return "unknown"  # [[]]

    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == "f" and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        _assert_all_finite(y)
        return "continuous" + suffix

    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        return "multiclass" + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]

def check_unique_type(y):
    target_types = pd.Series(y).map(type).unique()
    if len(target_types) != 1:
        raise TypeError(
            f"Values on the target must have the same type. Target has types {target_types}"
        )

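# Hedged usage sketch (example values are assumed): the target must hold a
# single Python type, otherwise a TypeError is raised.
#
# >>> check_unique_type(np.array([0, 1, 1]))                # one type, passes
# >>> check_unique_type(np.array([0, "a"], dtype=object))   # int and str mixed, raises TypeError
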
def infer_output_dim(y_train):
    """
    Infer output_dim from targets

    Parameters
    ----------
    y_train : np.array
        Training targets

    Returns
    -------
    output_dim : int
        Number of classes for output
    train_labels : list
        Sorted list of initial classes
    """
    check_unique_type(y_train)
    train_labels = unique_labels(y_train)
    output_dim = len(train_labels)

    return output_dim, train_labels

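# Hedged usage sketch (example values are assumed): for a 1-D target with
# three distinct classes, infer_output_dim returns the class count and the
# sorted class values.
#
# >>> infer_output_dim(np.array([3, 5, 5, 7]))
# (3, array([3, 5, 7]))
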
def check_output_dim(labels, y):
    if y is not None:
        check_unique_type(y)
        valid_labels = unique_labels(y)
        if not set(valid_labels).issubset(set(labels)):
            raise ValueError(
                f"""Valid set -- {set(valid_labels)} --
                contains unknown targets from training --
                {set(labels)}"""
            )
    return

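# Hedged usage sketch (example values are assumed): validation labels must be
# a subset of the labels seen during training.
#
# >>> check_output_dim([0, 1, 2], np.array([0, 2, 2]))   # subset, passes
# >>> check_output_dim([0, 1], np.array([0, 3]))         # 3 unseen in training, raises ValueError
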
def infer_multitask_output(y_train):
    """
    Infer output_dim from targets
    This is for multiple tasks.

    Parameters
    ----------
    y_train : np.ndarray
        Training targets

    Returns
    -------
    tasks_dims : list
        Number of classes for output
    tasks_labels : list
        List of sorted list of initial classes
    """
    if len(y_train.shape) < 2:
        raise ValueError(
            "y_train should be of shape (n_examples, n_tasks) "
            + f"but got {y_train.shape}"
        )
    nb_tasks = y_train.shape[1]
    tasks_dims = []
    tasks_labels = []
    for task_idx in range(nb_tasks):
        try:
            output_dim, train_labels = infer_output_dim(y_train[:, task_idx])
            tasks_dims.append(output_dim)
            tasks_labels.append(train_labels)
        except ValueError as err:
            raise ValueError(f"Error for task {task_idx} : {err}")
    return tasks_dims, tasks_labels
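# Hedged usage sketch for the multi-task case (shapes and values are assumed):
# each column of y_train is treated as an independent classification task.
#
# >>> y = np.array([[0, 1], [1, 2], [0, 1]])   # 3 samples, 2 tasks
# >>> infer_multitask_output(y)
# ([2, 2], [array([0, 1]), array([1, 2])])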