from typing import Dict, Union, List, Tuple, Optional
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
import numpy as np
def _unique_firstaxis(X, return_inverse: bool = False):
    """ Find sorted, unique rows of a 2d array.

    Has a (slow) fallback, if the numpy array is mixed typed (object dtype) and
    cannot be used with numpy's unique method.

    >>> _unique_firstaxis([[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.4, 'low']]).tolist()
    [['0.1', 'high'], ['0.3', 'low'], ['0.4', 'low']]
    >>> u, i = _unique_firstaxis([[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.4, 'low']], return_inverse=True)
    >>> u.tolist(), i.tolist()
    ([['0.1', 'high'], ['0.3', 'low'], ['0.4', 'low']], [0, 1, 0, 2])

    Args:
        X: 2d array-like; every row is treated as one element.
        return_inverse: If true, additionally return for every row of X
                        the index of the matching row in the unique rows.
    Returns:
        Array with the sorted unique rows.
        If return_inverse=True, a tuple of the unique rows and the index array.
    """
    X = check_array(X, dtype=None, ensure_2d=True)
    if X.dtype != object:
        return np.unique(X, axis=0, return_inverse=return_inverse)

    # Fallback for object arrays: rows are compared and sorted as python tuples.
    rows = tuple(map(tuple, X))
    sorted_rows = sorted(set(rows))
    # Always hand back an array, so that both dtype paths and both
    # return_inverse settings yield the same container type.
    uniques = np.array(sorted_rows, dtype=X.dtype)
    if not return_inverse:
        return uniques

    row_to_index = {row: ix for ix, row in enumerate(sorted_rows)}
    inverse = np.array([row_to_index[row] for row in rows])
    return uniques, inverse
[docs]
class MultiColumnLabelEncoder(LabelEncoder):
    """ Label encoder whose labels are combinations of values in multiple columns.

    Builds on scikit-learn's label encoder, learning the classes from the rows of a 2d array.
    See :class:`sklearn.preprocessing.LabelEncoder` for more information.

    >>> encoder = MultiColumnLabelEncoder()
    >>> label_data = [[0.1, 'high'], [0.3, 'low'], [0.1, 'high'], [0.1, 'low']]
    >>> encoder.fit(label_data).transform(label_data).tolist()
    [0, 2, 0, 1]
    >>> encoder.fit_transform(label_data).tolist()
    [0, 2, 0, 1]
    >>> encoder.inverse_transform([2, 1, 0]).tolist()
    [['0.3', 'low'], ['0.1', 'low'], ['0.1', 'high']]
    """
    # NOTE(review): only `fit` is overridden here; the other methods used in the
    # examples above are inherited from LabelEncoder -- confirm they accept 2d input.

    def fit(self, X, y=None):
        """ Store the unique rows of X as the encoder's classes.

        Args:
            X: 2d array-like, one label combination per row.
            y: Unused.
        Returns:
            The encoder itself.
        """
        unique_rows = _unique_firstaxis(X)
        self.classes_ = unique_rows
        return self
[docs]
class SharedColumnEncoder(TransformerMixin, BaseEstimator):
    """ Wrapper to share an encoder across all columns.

    The first two axes of the input are flattened before the wrapped encoder is
    called, such that every column entry is encoded with the same classes.

    >>> encoder = SharedColumnEncoder(LabelEncoder())
    >>> label_data = [[0.1, 0.3, 0.4], [0.4, 0.1, 0.3], [0.5, 0.3, 0.3]]
    >>> encoder.fit(label_data).transform(label_data).tolist()
    [[0, 1, 2], [2, 0, 1], [3, 1, 1]]
    >>> encoder.fit_transform(label_data).tolist()
    [[0, 1, 2], [2, 0, 1], [3, 1, 1]]
    >>> encoder.inverse_transform([[2, 2], [1, 0], [0, 1]]).tolist()
    [[0.4, 0.4], [0.3, 0.1], [0.1, 0.3]]

    Args:
        encoder: Encoder object providing fit, transform, and inverse_transform.
    """

    def __init__(self, encoder):
        # BaseEstimator.get_params (and thereby repr and sklearn.base.clone) reads
        # every constructor argument from the attribute of the same name.
        self.encoder = encoder
        # Kept for backward compatibility with code that accesses `encoder_`.
        self.encoder_ = encoder

    def fit(self, X, y=None):
        """ Fit the wrapped encoder on the entries of all columns at once. """
        X = check_array(X, allow_nd=True, dtype=None)
        # Re-bind, so that an encoder replaced via set_params is used from here on.
        self.encoder_ = self.encoder
        self.encoder_.fit(X.reshape(-1, *X.shape[2:]))
        return self

    def transform(self, X, y=None):
        """ Encode all entries; the result has the shape of the first two axes of X. """
        X = check_array(X, allow_nd=True, dtype=None)
        flat_X = X.reshape(-1, *X.shape[2:])
        return self.encoder_.transform(flat_X).reshape(X.shape[:2])

    def inverse_transform(self, X, y=None):
        """ Decode all entries; the result is reshaped to (X.shape[0], -1). """
        X = check_array(X, allow_nd=True, dtype=None)
        flat_X = X.reshape(-1, *X.shape[2:])
        return self.encoder_.inverse_transform(flat_X).reshape(X.shape[0], -1)

    def _more_tags(self):
        return {}
[docs]
def query_from_columns(data: Union[np.ndarray, "pandas.DataFrame"],  # noqa: F821 ignore pandas, not a library dep
                       query_columns: Union[List[str], List[int]],
                       response_columns: Optional[Union[List[str], List[int], str, int]] = None,
                       response_map: Optional[Dict[str, Union[bool, int]]] = None,
                       return_transformer: bool = False) \
        -> Union[Tuple[np.ndarray, np.ndarray],
                 Tuple[Tuple[np.ndarray, np.ndarray], Tuple[TransformerMixin, TransformerMixin]]]:
    """ Extract queries with indices from feature columns in a DataFrame.

    Comparison-based data in this library is typically represented by a collection
    of unique object indices. For example, [[1, 0, 2], [0, 2, 3]] could encode two triplet
    comparisons between objects 0, 1, 2, and 3.
    Experimental data, however, often stores the objects as featurized columns in a dataframe, describing the
    presented stimuli.
    There the same comparisons could be represented by two rows with columns
    `alpha1`, `tau1` `alpha2`, `tau2`, `alpha3`, `tau3` and `Response`.

    The `query_from_columns` function allows to extract the comparison queries
    from such a dataframe by identifying the unique objects (e.g. unique combination of `alpha` and `tau` here).

    .. note::
        If the dataframe already contains unique indices for the objects per query,
        consider accessing the indices directly, e.g. `df[['anchor_ix', 'pos_ix', 'neg_ix']].values.astype(int), df['response'].values.astype(bool)`.

    >>> import pandas as pd
    >>> frame = pd.DataFrame({'alpha1': [0.1, 0.7, 0.1], 'tau1': [0, 0, 1],
    ...                       'alpha2': [0.3, 0.3, 0.7], 'tau2': [1, 0, 0],
    ...                       'alpha3': [0.7, 0.3, 0.7], 'tau3': [0, 1, 0], 'Response': [1, 0, 0]})
    >>> q, r = query_from_columns(frame, ['alpha1', 'alpha2', 'alpha3'], 'Response', response_map={1: True, 0: False})
    >>> q.tolist(), r.tolist()
    ([[0, 1, 2], [2, 1, 1], [0, 2, 2]], [True, False, False])
    >>> q, r = query_from_columns(np.array(frame), [0, 2, 4], response_columns=-1, response_map={1: True, 0: False})
    >>> q.tolist(), r.tolist()
    ([[0, 1, 2], [2, 1, 1], [0, 2, 2]], [True, False, False])
    >>> q, r = query_from_columns(frame, [('alpha1', 'tau1'), ('alpha2', 'tau2'), ('alpha3', 'tau3')],
    ...                           response_columns='Response', response_map={1: True, 0: False})
    >>> q.tolist(), r.tolist()
    ([[0, 3, 4], [4, 2, 3], [1, 4, 4]], [True, False, False])
    >>> q, r = query_from_columns(frame, [('alpha1', 'tau1'), ('alpha2', 'tau2'), ('alpha3', 'tau3')],
    ...                           response_columns='Response')
    >>> q.tolist(), r.tolist()
    ([[0, 3, 4], [4, 2, 3], [1, 4, 4]], [1, 0, 0])

    The indices can be used to get the object attributes from the dataframe.
    This might be helpful in visualizations and for debugging.
    In the following example, the object-feature to object-index transformer object
    is accessed to get the object attributes from the object index.

    >>> (q,r), (q_transform, r_transform) = query_from_columns(
    ...     np.array(frame), [0, 2, 4], -1, {1: True, 0: False}, return_transformer=True)
    >>> q_transform.inverse_transform(q).tolist()
    [[0.1, 0.3, 0.7], [0.7, 0.3, 0.3], [0.1, 0.7, 0.7]]

    Args:
        data: Tabular query representation (n_queries, n_columns)
        query_columns: Indices or column-labels in data per query entry.
                       Columns can be grouped as tuples, if multiple columns define an object.
        response_columns: Indices or column-labels in data per response entry.
                          If None (or an empty list), no response is extracted.
        response_map: Dictionary mapping the response entries in data to {-1, 1} or {False, True}. If none, use the original response.
        return_transformer: If true, transformer objects for the query and response are returned.
    Returns:
        Tuple with arrays for the queries and responses.
        If return_transformer=True, an additional tuple with transformer objects is returned.
        Without response columns, only the queries are returned
        (with return_transformer=True, a tuple of the queries and the query transformer).
    """
    is_frame = hasattr(data, 'columns')  # duck-typing, pandas is not a library dependency
    if not is_frame:
        # Transposed, such that indexing the first axis selects columns like in a DataFrame.
        data = check_array(data, dtype=None).T

    def _object_values(columns):
        values = np.asarray(data[np.array(columns)])
        # Grouped columns of a transposed array come as (n_group, n_queries),
        # whereas a DataFrame yields (n_queries, n_group); put the queries first.
        return values if is_frame else values.T

    # (n_queries, n_objects) or, for grouped columns, (n_queries, n_objects, n_group)
    query_data = np.stack([_object_values(c) for c in query_columns], axis=1)
    if query_data.ndim == 3:
        query_enc = SharedColumnEncoder(MultiColumnLabelEncoder())
    else:
        query_enc = SharedColumnEncoder(LabelEncoder())
    query = query_enc.fit_transform(query_data)

    # Explicit None check: the column index 0 is falsy, but a valid response column.
    no_response = response_columns is None or (
        isinstance(response_columns, (list, tuple)) and len(response_columns) == 0)
    if no_response:
        if return_transformer:
            return query, query_enc
        return query

    if response_map is None:
        response_enc = FunctionTransformer(
            func=lambda x: x,
            inverse_func=lambda x: x,
            check_inverse=False)
    else:
        inverse_map = {v: k for k, v in response_map.items()}
        response_enc = FunctionTransformer(
            func=np.vectorize(response_map.get),
            inverse_func=np.vectorize(inverse_map.get),
            check_inverse=False)
    response = response_enc.fit_transform(data[response_columns])

    if return_transformer:
        return (query, response), (query_enc, response_enc)
    return query, response