Source code for cblearn.embedding._dims

""" This file contains dimension estimation utilities. """
from dataclasses import dataclass
from sklearn.model_selection import validation_curve
from sklearn.model_selection import RepeatedKFold
from scipy import stats
import numpy as np

from cblearn import utils


[docs] @dataclass class DimensionEstimationResult: """ Result object for dimensionality estimation of embeddings. Attributes: estimated_dimension: The estimated dimensionality dimensions: The tested dimensions train_scores: The training scores for each dimension test_scores: The test scores for each dimension stats_result: The result of the hypothesis test """ estimated_dimension: int # The estimated dimensionality dimensions: np.ndarray train_scores: np.ndarray test_scores: np.ndarray stats_result: dict
[docs] def plot_scores(self, train_kwargs={}, test_kwargs={}): """ Plot the train and test scores per dimesionality of the embedding. Args: train_kwargs: Keyword arguments for the training scores plot. test_kwargs: Keyword arguments for the test scores plot. """ import matplotlib.pyplot as plt plot_validation_curve(self.dimensions, self.train_scores, self.test_scores, train_kwargs, test_kwargs) plt.axvline(self.estimated_dimension, color='k', linestyle='--', label="Estimated Dimension") plt.legend(loc="best")
def plot_validation_curve(x, train_scores, test_scores, train_kwargs={}, test_kwargs={}): import matplotlib.pyplot as plt _train_kwargs = {"lw": 2, "color": "C0"} _train_kwargs.update(train_kwargs) _test_kwargs = {"lw": 2, "color": "C1"} _test_kwargs.update(test_kwargs) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.plot( x, train_scores_mean, label="Training", **_train_kwargs ) plt.fill_between( x, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, **_train_kwargs ) plt.plot( x, test_scores_mean, label="Validation", **_test_kwargs ) plt.fill_between( x, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, **_test_kwargs ) def _holm_correction(pvals, alpha=0.05, is_sorted=False, returnsorted=False): """ Holm-Bonferroni method for multiple hypothesis testing correction. This code was adapted from the statsmodels library, which is licensed under BSD-3. """ pvals = np.asarray(pvals) alphaf = alpha # Notation ? if not is_sorted: sortind = np.argsort(pvals) pvals = np.take(pvals, sortind) ntests = len(pvals) alphacSidak = 1 - np.power((1. - alphaf), 1./ntests) alphacBonf = alphaf / float(ntests) notreject = pvals > alphaf / np.arange(ntests, 0, -1) nr_index = np.nonzero(notreject)[0] if nr_index.size == 0: # nonreject is empty, all rejected notrejectmin = len(pvals) else: notrejectmin = np.min(nr_index) notreject[notrejectmin:] = True reject = ~notreject pvals_corrected_raw = pvals * np.arange(ntests, 0, -1) pvals_corrected = np.maximum.accumulate(pvals_corrected_raw) del pvals_corrected_raw if pvals_corrected is not None: #not necessary anymore pvals_corrected[pvals_corrected > 1] = 1 if is_sorted or returnsorted: return reject, pvals_corrected, alphacSidak, alphacBonf else: pvals_corrected_ = np.empty_like(pvals_corrected) pvals_corrected_[sortind] = pvals_corrected del pvals_corrected reject_ = np.empty_like(reject) reject_[sortind] = reject return reject_, pvals_corrected_, alphacSidak, alphacBonf def _sequential_crossval_ttest(test_scores_cv, n_splits, alpha): """ sample_sequence: Array of shape (n_samples, n_steps) """ differences = np.diff(test_scores_cv) n_samples, n_steps = differences.shape test_train_ratio = 1 / (n_splits - 1) effect = differences.mean(axis=0) / differences.std(axis=0, ddof=1) # Nadeau and Bengio correction of dependent Student's t-test due to Cross Validation t_stats = effect / np.sqrt(1 / n_samples + test_train_ratio) p_values = stats.t.sf(t_stats, n_samples - 1) # right-tailed t-test assert t_stats.shape == (n_steps,) # holm-bonferroni correction result = _holm_correction(p_values, alpha=alpha) return {'reject': result[0], 'pvals': p_values, 'pvals_corrected': result[1], 'tstats': t_stats, 'effectsize': effect, 'alpha': alpha, 'alpha_corrected': result[3], 'step': np.arange(n_steps)}
[docs] def estimate_dimensionality_cv(estimator, queries, responses=None, test_dimensions: list = [1, 2, 3], n_splits=10, n_repeats=1, refit=True, alpha=0.05, param_name="n_components", n_jobs=-1, random_state=None): """ Estimates the dimensionality of the embedding space. The procedure estimates embeddings for the provided *test_dimensions* and evaluates the fit (triplet accuracy) through cross-validation [1]_. The estimated dimension is the lowest, that has the best fit for the provided data. The test compares the increase in accuracy; if the increase is not significant, the dimension is considered to be sufficient. Testing a larger range of dimensions can reduce the test sensitivity due to multiple testing correction. Attributes: estimator: The embedding estimator to use. queries: The triplet queries to embed. responses: Optional responses, if not encoded in triplets. test_dimensions: The dimensions to test as a monotonic increasing list. n_splits: The number of splits to use for cross-validation. n_repeats: The number of repeatitions of each cross-validation split. Use 1 for fast results, but 10 or more for more reliable results. refit: if true, then fit the estimator on the entire dataset using the best dimensionality. alpha: The significance level for the hypothesis test. param_name: The name of the estimator parameter that describes the embedding dimensionality. n_jobs: The number of parallel jobs to use for cross-validation. random_state: The random state or seed to use for CV splits. Returns: result: A result object with the estimated dimension and other information. Examples: >>> from cblearn.embedding import estimate_dimensionality_cv >>> from cblearn.embedding import SOE >>> from cblearn.datasets import make_random_triplets >>> rs = np.random.RandomState(42) >>> true_embedding = rs.rand(15, 2) # 15 points in 2D >>> triplets = make_random_triplets(true_embedding, result_format='list-order', size=1000, random_state=rs) >>> estimator = SOE(n_components=1) >>> dim_result = estimate_dimensionality_cv(estimator, triplets, test_dimensions=[1, 2, 3], n_splits=5, refit=True) >>> dim_result.estimated_dimension 2 >>> true_embedding.shape == estimator.embedding_.shape True >>> dim_result.plot_scores() References ---------- .. [1] Künstle, D.-E., von Luxburg, U., & Wichmann, F. A. (2022). Estimating the perceived dimension of psychophysical stimuli using triplet accuracy and hypothesis testing. Journal of Vision, 22(13), 5. https://doi.org/10.1167/jov.22.13.5 """ if np.diff(test_dimensions).min() < 1: raise ValueError("test_dimensions must be monotonically increasing") queries = utils.check_query_response(queries, responses, result_format='list-order') cv_folds = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=random_state) train_scores, test_scores = validation_curve( estimator, queries, y=None, cv=cv_folds, n_jobs=n_jobs, param_name=param_name, param_range=test_dimensions) stat_results = _sequential_crossval_ttest(test_scores.T, n_splits, alpha=alpha) for ix, reject in enumerate(stat_results['reject']): if not reject: estimated_dimension = test_dimensions[ix] break else: estimated_dimension = test_dimensions[-1] if refit: estimator.set_params(**{param_name: estimated_dimension}) estimator.fit(queries, y=None) return DimensionEstimationResult(estimated_dimension, test_dimensions, train_scores, test_scores, stat_results)