Source code for cblearn.embedding._mlds

from typing import Union

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.linear_model import LogisticRegression
from scipy.stats import norm
from scipy.optimize import minimize

from cblearn import utils
from cblearn.embedding._base import TripletEmbeddingMixin


class MLDS(BaseEstimator, TripletEmbeddingMixin):
    """ A maximum-likelihood difference scaling (MLDS) estimator.

        MLDS [1]_ is limited to monotonic, one-dimensional embeddings.

        .. note::
            This method assumes that the objects can be embedded in a one-dimensional space
            and that the object indices are consistent with their order in this space.

        Attributes:
            embedding_: The final embedding, shape (n_objects, 1),
                shifted such that its smallest entry is 0.
            log_likelihood_: The final log-likelihood of the embedding.
                With ``method='glm'`` this is the mean log-probability per triplet,
                with ``method='optim'`` it is the log-likelihood summed over all triplets.
            n_iter_: Optimization iterations

        >>> from cblearn import datasets
        >>> true_embedding = sorted(np.random.uniform(1, 2, (15, 1)))
        >>> triplets = datasets.make_random_triplets(true_embedding, size=400, monotonic=True, result_format='list-order')
        >>> triplets.shape, np.unique(triplets).shape
        ((400, 3), (15,))
        >>> estimator = MLDS(random_state=42).fit(triplets)
        >>> estimator.embedding_.shape
        (15, 1)
        >>> estimator.score(triplets) > 0.9
        True
        >>> estimator = MLDS(method='optim', random_state=42).fit(triplets)
        >>> estimator.score(triplets) > 0.9
        True

        References
        ----------
        .. [1] Knoblauch, K., & Maloney, L. T. (2012). Modeling Psychophysical Data in R.
               Springer New York. https://doi.org/10.1007/978-1-4614-4475-6
        """

    def __init__(self, n_components: int = 1, random_state: Union[None, int, np.random.RandomState] = None,
                 method: str = 'glm', verbose: int = 0, max_iter: int = 1000):
        """
        Args:
            n_components: Embedding dimension for api compatibility. Only 1 is supported for MLDS.
            random_state: The seed of the pseudo random number generator used to initialize the optimization.
            method: Optimizer method, either 'glm' or 'optim'.
            verbose: Enable verbose output.
            max_iter: Maximum number of optimization iterations.

        Raises:
            ValueError: If ``n_components`` is not 1.
        """
        if n_components != 1:
            raise ValueError(f"MLDS expects n_components=1, got {n_components}")
        self.n_components = n_components
        self.random_state = random_state
        self.method = method
        self.verbose = verbose
        self.max_iter = max_iter

    def _log_likelihood(self, x, quadruplet, answer, float_min=np.finfo(float).tiny):
        """Summed log-likelihood of the answers under a probit difference model.

        Args:
            x: Scale value per object, indexed by the entries of ``quadruplet``.
            quadruplet: Integer array with four object indices per row.
            answer: Per-row response; true-like entries select ``prob``,
                false-like entries select ``1 - prob``.
            float_min: Lower bound applied to the probabilities before taking
                the logarithm, so that ``log(0)`` never occurs.

        Returns:
            The sum of the per-row log-likelihoods.
        """
        first_interval = x[quadruplet[:, 0]] - x[quadruplet[:, 1]]
        second_interval = x[quadruplet[:, 2]] - x[quadruplet[:, 3]]
        prob = norm.cdf(first_interval - second_interval)
        log_likelihood = (np.log(np.maximum(prob ** answer, float_min))
                          + np.log(np.maximum((1 - prob) ** (1 - answer), float_min)))
        return log_likelihood.sum()

    def fit(self, X: utils.Query, y: np.ndarray = None) -> 'MLDS':
        """Computes the embedding.

        Args:
            X: The training input samples, shape (n_samples, 3)
            y: Optional responses to the queries in X. They are validated together
               with X by ``utils.check_query_response``.

        Returns:
            This estimator

        Raises:
            ValueError: If ``method`` is neither 'glm' nor 'optim'.
        """
        # Reject an unknown method up front, so that a failing call does not
        # leave fitted attributes on the estimator.
        method = self.method.lower()
        if method not in ('glm', 'optim'):
            raise ValueError(f"Expects optimizer method in {{glm, optim}}, got {self.method}")

        self.fit_X_ = utils.check_query(X, result_format='list-order')  # for data validation in .transform
        random_state = check_random_state(self.random_state)
        triplets, answer = utils.check_query_response(X, y, result_format='list-boolean')
        self.n_features_in_ = 3
        n_objects = triplets.max() + 1

        # Each triplet (i, j, k) becomes the quadruplet (j, i, i, k).
        quads = triplets[:, [1, 0, 0, 2]]
        flip = quads[:, [0, 1]].max(axis=1) > quads[:, [2, 3]].min(axis=1)
        # make sure that we "standardize" the order of quadruplets to ensure
        # that both True/False answers occur, which is required by the Logistic Regression
        quads = np.where(flip[:, np.newaxis], quads[:, [2, 3, 0, 1]], quads)
        # Build a new array instead of writing in place: the validated answers
        # might share memory with the caller's ``y``.
        answer = np.where(flip, ~answer, answer)

        if method == 'glm':
            rows = np.arange(len(quads))
            labels = answer.astype(int)
            # Design matrix with one row per quadruplet, such that
            # X01 @ x == (x[q0] - x[q1]) - (x[q2] - x[q3]).
            X01 = np.zeros((len(quads), n_objects))
            X01[rows, quads[:, 0]] += 1
            X01[rows, quads[:, 3]] += 1
            X01[rows, quads[:, 1]] -= 1
            X01[rows, quads[:, 2]] -= 1
            glm = LogisticRegression(verbose=self.verbose, max_iter=self.max_iter,
                                     fit_intercept=False, random_state=random_state)
            glm.fit(X01, labels)
            self.embedding_ = glm.coef_.reshape(-1, 1)
            # Mean log-probability of the observed label per quadruplet.
            self.log_likelihood_ = glm.predict_log_proba(X01)[rows, labels].mean()
            self.n_iter_ = glm.n_iter_
        else:  # method == 'optim'
            def objective(*args):
                return -self._log_likelihood(*args)

            init = np.linspace(0, 1, n_objects)
            result = minimize(objective, init, args=(quads, answer), method='L-BFGS-B',
                              options=dict(maxiter=self.max_iter, disp=self.verbose))
            if self.verbose and not result.success:
                print(f"MLDS's optimization failed with reason: {result.message}.")
            self.embedding_ = result.x.reshape(-1, 1)
            self.log_likelihood_ = -result.fun
            self.n_iter_ = result.nit

        # Shift the scale so that the smallest value is 0.
        self.embedding_ -= self.embedding_.min()
        return self