# Source code for cblearn.embedding._mlds

from typing import Union

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.linear_model import LogisticRegression
from scipy.stats import norm
from scipy.optimize import minimize

from cblearn import utils
from cblearn.embedding._base import TripletEmbeddingMixin



class MLDS(BaseEstimator, TripletEmbeddingMixin):
    """ A maximum-likelihood difference scaling (MLDS) estimator.

    MLDS [1]_ is limited to monotonic, one-dimensional embeddings.

    .. note::
        This method assumes that the objects can be embedded in a one-dimensional space
        and that the object indices are consistent with their order in this space.

    Attributes:
        embedding_: The final embedding, shape (n_objects, 1), shifted such that its minimum is 0.
        log_likelihood_: The final log-likelihood of the embedding.
            With ``method='glm'`` this is the mean log-probability per sample
            under the logistic model, with ``method='optim'`` it is the summed
            log-likelihood under the probit model.
        n_iter_: Optimization iterations, as reported by the underlying solver.


    >>> from cblearn import datasets
    >>> true_embedding = sorted(np.random.uniform(1, 2, (15, 1)))
    >>> triplets = datasets.make_random_triplets(true_embedding, size=400, monotonic=True, result_format='list-order')
    >>> triplets.shape, np.unique(triplets).shape
    ((400, 3), (15,))
    >>> estimator = MLDS(random_state=42).fit(triplets)
    >>> estimator.embedding_.shape
    (15, 1)
    >>> estimator.score(triplets) > 0.9
    True
    >>> estimator = MLDS(method='optim', random_state=42).fit(triplets)
    >>> estimator.score(triplets) > 0.9
    True


    References
    ----------
    .. [1] Knoblauch, K., & Maloney, L. T. (2012). Modeling Psychophysical Data in R.
           Springer New York. https://doi.org/10.1007/978-1-4614-4475-6
    """

    def __init__(self, n_components: int = 1, random_state: Union[None, int, np.random.RandomState] = None,
                 method='glm', verbose: int = 0, max_iter: int = 1000):
        """
        Args:
            n_components: Embedding dimension for api compatibility. Only 1 is supported for MLDS.
            random_state: The seed of the pseudo random number generator used to initialize the optimization.
            method: Optimizer method, either 'glm' or 'optim'.
            verbose: Enable verbose output.
            max_iter: Maximum number of optimization iterations.
        Raises:
            ValueError: If n_components is not 1.
        """
        if n_components != 1:
            raise ValueError(f"MLDS expects n_components=1, got {n_components}")
        self.n_components = n_components
        self.random_state = random_state
        self.method = method
        self.verbose = verbose
        self.max_iter = max_iter

    def _log_likelihood(self, x, quadruplet, answer, float_min=np.finfo(float).tiny):
        """Return the summed log-likelihood of the answers under the probit difference model.

        The probability of a positive answer for the quadruplet (a, b, c, d) is
        ``norm.cdf((x[a] - x[b]) - (x[c] - x[d]))``.

        Args:
            x: One-dimensional array of scale values, indexed by object.
            quadruplet: Integer index array with four columns (a, b, c, d).
            answer: Per-sample answers, boolean or in {0, 1}.
            float_min: Smallest probability taken into account; every per-sample
                log-probability is floored at ``log(float_min)``.
        Returns:
            The sum of the per-sample log-likelihoods.
        """
        answer = np.asarray(answer, dtype=float)
        delta = ((x[quadruplet[:, 0]] - x[quadruplet[:, 1]])
                 - (x[quadruplet[:, 2]] - x[quadruplet[:, 3]]))
        floor = np.log(float_min)
        # Work in log-space: log(1 - cdf(delta)) == logcdf(-delta) by symmetry.
        # Computing 1 - cdf(delta) directly rounds to 0 once delta exceeds ~8,
        # which would collapse the term to the floor and flatten the objective.
        log_prob_true = np.maximum(norm.logcdf(delta), floor)
        log_prob_false = np.maximum(norm.logcdf(-delta), floor)
        log_likelihood = answer * log_prob_true + (1 - answer) * log_prob_false
        return log_likelihood.sum()

    def fit(self, X: utils.Query, y: np.ndarray = None) -> 'MLDS':
        """Computes the embedding.

        Args:
            X: The training input samples, shape (n_samples, 3)
            y: Optional responses to the queries in X;
               passed together with X to ``utils.check_query_response``.
        Returns:
            This estimator
        Raises:
            ValueError: If ``method`` is neither 'glm' nor 'optim'.
        """
        self.fit_X_ = utils.check_query(X, result_format='list-order')  # for data validation in .transform
        random_state = check_random_state(self.random_state)

        triplets, answer = utils.check_query_response(X, y, result_format='list-boolean')
        self.n_features_in_ = 3
        n_objects = triplets.max() + 1
        # A triplet (i, j, k) is expressed as the quadruplet (j, i, i, k).
        quads = triplets[:, [1, 0, 0, 2]]
        flip = quads[:, [0, 1]].max(axis=1) > quads[:, [2, 3]].min(axis=1)
        # make sure that we "standardize" the order of quadruplets to ensure
        # that both True/False answers occur, which is required by the Logistic Regression
        quads = np.where(flip[:, np.newaxis], quads[:, [2, 3, 0, 1]], quads)
        # Build a new array instead of negating in place: the validated answers
        # may share memory with the caller's `y`, which fit must not modify.
        answer = np.where(flip, ~answer, answer)

        method = self.method.lower()
        if method == 'glm':
            # Design matrix of the difference model: +1 for the outer objects
            # (columns 0 and 3) and -1 for the inner objects (columns 1 and 2).
            rows = np.arange(len(quads))
            X01 = np.zeros((len(quads), n_objects))
            X01[rows, quads[:, 0]] += 1
            X01[rows, quads[:, 3]] += 1
            X01[rows, quads[:, 1]] -= 1
            X01[rows, quads[:, 2]] -= 1
            target = answer.astype(int)
            glm = LogisticRegression(verbose=self.verbose, max_iter=self.max_iter,
                                     fit_intercept=False, random_state=random_state)
            glm.fit(X01, target)
            self.embedding_ = glm.coef_.reshape(-1, 1)
            # NOTE: mean log-probability per sample, whereas the 'optim' branch
            # stores the summed log-likelihood.
            self.log_likelihood_ = glm.predict_log_proba(X01)[rows, target].mean()
            self.n_iter_ = glm.n_iter_
        elif method == 'optim':
            def objective(*args):
                return -self._log_likelihood(*args)

            # Start from equally spaced scale values in object-index order.
            init = np.linspace(0, 1, n_objects)
            result = minimize(objective, init, args=(quads, answer),
                              method='L-BFGS-B', options=dict(maxiter=self.max_iter, disp=self.verbose))
            if self.verbose and not result.success:
                print(f"MLDS's optimization failed with reason: {result.message}.")
            self.embedding_ = result.x.reshape(-1, 1)
            self.log_likelihood_ = -result.fun
            self.n_iter_ = result.nit
        else:
            raise ValueError(f"Expects optimizer method in {{glm, optim}}, got {self.method}")

        # Anchor the scale at zero.
        self.embedding_ -= self.embedding_.min()
        return self