Source code for cblearn.embedding._dims

""" This file contains dimension estimation utilities. """
from dataclasses import dataclass
from sklearn.model_selection import validation_curve
from sklearn.model_selection import RepeatedKFold
from scipy import stats
import numpy as np

from cblearn import utils



[docs]
@dataclass
class DimensionEstimationResult:
    """ Result object for dimensionality estimation of embeddings.

    Attributes:
        estimated_dimension: The estimated dimensionality
        dimensions: The tested dimensions
        train_scores: The training scores for each dimension
        test_scores: The test scores for each dimension
        stats_result: The result of the hypothesis test
    """
    estimated_dimension: int  # The estimated dimensionality
    dimensions: np.ndarray
    train_scores: np.ndarray
    test_scores: np.ndarray
    stats_result: dict


[docs]
    def plot_scores(self, train_kwargs={}, test_kwargs={}):
        """ Plot the train and test scores per dimesionality of the embedding.

        Args:
            train_kwargs: Keyword arguments for the training scores plot.
            test_kwargs: Keyword arguments for the test scores plot.
        """
        import matplotlib.pyplot as plt

        plot_validation_curve(self.dimensions, self.train_scores, self.test_scores,
                              train_kwargs, test_kwargs)
        plt.axvline(self.estimated_dimension, color='k', linestyle='--', label="Estimated Dimension")
        plt.legend(loc="best")




def plot_validation_curve(x, train_scores, test_scores, train_kwargs={}, test_kwargs={}):
    import matplotlib.pyplot as plt

    _train_kwargs = {"lw": 2, "color": "C0"}
    _train_kwargs.update(train_kwargs)
    _test_kwargs = {"lw": 2, "color": "C1"}
    _test_kwargs.update(test_kwargs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.plot(
        x, train_scores_mean, label="Training", **_train_kwargs
    )
    plt.fill_between(
        x,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.2,
        **_train_kwargs
    )
    plt.plot(
        x, test_scores_mean, label="Validation", **_test_kwargs
    )
    plt.fill_between(
        x,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.2,
        **_test_kwargs
    )


def _holm_correction(pvals, alpha=0.05, is_sorted=False, returnsorted=False):
    """ Holm-Bonferroni method for multiple hypothesis testing correction.
        This code was adapted from the statsmodels library, which is licensed under BSD-3.
    """
    pvals = np.asarray(pvals)
    alphaf = alpha  # Notation ?

    if not is_sorted:
        sortind = np.argsort(pvals)
        pvals = np.take(pvals, sortind)

    ntests = len(pvals)
    alphacSidak = 1 - np.power((1. - alphaf), 1./ntests)
    alphacBonf = alphaf / float(ntests)
    notreject = pvals > alphaf / np.arange(ntests, 0, -1)
    nr_index = np.nonzero(notreject)[0]
    if nr_index.size == 0: # nonreject is empty, all rejected
        notrejectmin = len(pvals)
    else:
        notrejectmin = np.min(nr_index)
    notreject[notrejectmin:] = True
    reject = ~notreject
    pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
    pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
    del pvals_corrected_raw

    if pvals_corrected is not None:  #not necessary anymore
        pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf


def _sequential_crossval_ttest(test_scores_cv, n_splits, alpha):
    """
    sample_sequence: Array of shape (n_samples, n_steps)
    """
    differences = np.diff(test_scores_cv)
    n_samples, n_steps = differences.shape

    test_train_ratio = 1 / (n_splits - 1)
    effect = differences.mean(axis=0) / differences.std(axis=0, ddof=1)
    # Nadeau and Bengio correction of dependent Student's t-test due to Cross Validation
    t_stats = effect / np.sqrt(1 / n_samples + test_train_ratio)

    p_values = stats.t.sf(t_stats, n_samples - 1)  # right-tailed t-test
    assert t_stats.shape == (n_steps,)

    # holm-bonferroni correction
    result = _holm_correction(p_values, alpha=alpha)
    return {'reject': result[0], 'pvals': p_values, 'pvals_corrected': result[1],
            'tstats': t_stats, 'effectsize': effect,
            'alpha': alpha, 'alpha_corrected': result[3], 'step': np.arange(n_steps)}



[docs]
def estimate_dimensionality_cv(estimator, queries, responses=None,
                               test_dimensions: list = [1, 2, 3], n_splits=10, n_repeats=1,
                               refit=True, alpha=0.05, param_name="n_components", n_jobs=-1, random_state=None):
    """ Estimates the dimensionality of the embedding space.

        The procedure estimates embeddings for the provided *test_dimensions*
        and evaluates the fit (triplet accuracy) through cross-validation [1]_.
        The estimated dimension is the lowest,
        that has the best fit for the provided data. The test compares the increase in accuracy;
        if the increase is not significant, the dimension is considered to be sufficient.
        Testing a larger range of dimensions can reduce the test sensitivity due to multiple testing correction.

        Attributes:
          estimator: The embedding estimator to use.
          queries: The triplet queries to embed.
          responses: Optional responses, if not encoded in triplets.
          test_dimensions: The dimensions to test as a monotonic increasing list.
          n_splits: The number of splits to use for cross-validation.
          n_repeats: The number of repeatitions of each cross-validation split.
                     Use 1 for fast results, but 10 or more for more reliable results.
          refit: if true, then fit the estimator on the entire dataset using the best dimensionality.
          alpha: The significance level for the hypothesis test.
          param_name: The name of the estimator parameter that describes the embedding dimensionality.
          n_jobs: The number of parallel jobs to use for cross-validation.
          random_state: The random state or seed to use for CV splits.

        Returns:
          result: A result object with the estimated dimension and other information.

        Examples:

        >>> from cblearn.embedding import estimate_dimensionality_cv
        >>> from cblearn.embedding import SOE
        >>> from cblearn.datasets import make_random_triplets
        >>> rs = np.random.RandomState(42)
        >>> true_embedding = rs.rand(15, 2)  # 15 points in 2D
        >>> triplets = make_random_triplets(true_embedding, result_format='list-order', size=1000, random_state=rs)
        >>> estimator = SOE(n_components=1)
        >>> dim_result = estimate_dimensionality_cv(estimator, triplets, test_dimensions=[1, 2, 3], n_splits=5, refit=True)
        >>> dim_result.estimated_dimension
        2
        >>> true_embedding.shape == estimator.embedding_.shape
        True
        >>> dim_result.plot_scores()


        References
        ----------
        .. [1]  Künstle, D.-E., von Luxburg, U., & Wichmann, F. A. (2022).
                Estimating the perceived dimension of psychophysical stimuli
                using triplet accuracy and hypothesis testing.
                Journal of Vision, 22(13), 5. https://doi.org/10.1167/jov.22.13.5
    """
    if np.diff(test_dimensions).min() < 1:
        raise ValueError("test_dimensions must be monotonically increasing")

    queries = utils.check_query_response(queries, responses, result_format='list-order')
    cv_folds = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=random_state)
    train_scores, test_scores = validation_curve(
        estimator, queries, y=None, cv=cv_folds, n_jobs=n_jobs,
        param_name=param_name, param_range=test_dimensions)

    stat_results = _sequential_crossval_ttest(test_scores.T, n_splits, alpha=alpha)
    for ix, reject in enumerate(stat_results['reject']):
        if not reject:
            estimated_dimension = test_dimensions[ix]
            break
    else:
        estimated_dimension = test_dimensions[-1]

    if refit:
        estimator.set_params(**{param_name: estimated_dimension})
        estimator.fit(queries, y=None)
    return DimensionEstimationResult(estimated_dimension, test_dimensions, train_scores, test_scores, stat_results)