Source code for cblearn.datasets._triplet_response

""" Function in this file judge triplets, based on ground-truth embedding and possible noise patterns. """
from typing import Dict, Callable, Optional, Union

from sklearn.utils import check_random_state, check_array
from sklearn.metrics import pairwise
import numpy as np

from cblearn import utils
from cblearn.datasets._datatypes import NoiseTarget, Distance


def _count_unique_items(query):
    """ Count unique items per row in a 2D array.

    Efficient approach even for large number of rows
    and integer items:
    https://stackoverflow.com/a/48473125
    """
    sorted_query = np.sort(query, axis=1)
    return (sorted_query[:, 1:] != sorted_query[:, :-1]).sum(axis=1) + 1


def _count_unique_items(query):
    """ Count unique items per row in a 2D array.

    Efficient approach even for large number of rows
    and integer items:
    https://stackoverflow.com/a/48473125
    """
    sorted_query = np.sort(query, axis=1)
    return (sorted_query[:, 1:] != sorted_query[:, :-1]).sum(axis=1) + 1


[docs] def noisy_triplet_response(triplets: utils.Query, embedding: np.ndarray, result_format: Optional[str] = None, noise: Union[None, str, Callable] = None, noise_options: Dict = {}, noise_target: Union[str, NoiseTarget] = 'differences', random_state: Union[None, int, np.random.RandomState] = None, distance: Union[str, Distance] = 'euclidean' ) -> utils.Response: """ Triplet response for an embedding with noise. Args: triplets: Numpy array or sparse matrix of triplet indices embedding: Numpy array of object coordinates, (n_objects, n_components) or distance matrix (n_objects, n_objects) result_format: Format of the result. If none, keeps input format. noise: Noise distribution. Can be the name of a distribution function from :class:`numpy.random.RandomState` or a function accepting the same arguments. If None, no noise will be applied. noise_options: Additional arguments passed to the noise function as keyword arguments. noise_target: 'points' if noise should be added to triplet coordinates or 'differences' if noise should be added to distance difference. random_state: State or seed for noise sampling. distance: {'euclidean', 'precomputed'}. Specifies distance metrix between embedding points or if distances are passed directly as distance matrix. Returns: Response in format as defined by response_format, either numpy array (n_triplets,) or sparse matrix If return_indices is True, a tuple of indices and responses can be returned >>> from cblearn.datasets import noisy_triplet_response >>> triplets = [[0, 1, 2], [1, 2, 3]] >>> embedding = [[0.1], [0.5], [0.9], [1.]] >>> noisy_triplet_response(triplets, embedding, result_format='list-order') array([[0, 1, 2], [1, 2, 3]], dtype=uint32) >>> noisy_triplet_response(triplets, embedding, result_format='list-order', ... noise='normal', noise_options={'scale': 1}, random_state=42) array([[0, 2, 1], [1, 2, 3]], dtype=uint32) >>> from sklearn.metrics.pairwise import euclidean_distances >>> distances = euclidean_distances(embedding) >>> print(distances.shape) (4, 4) >>> noisy_triplet_response(triplets, distances, result_format='list-order', distance='precomputed') array([[0, 1, 2], [1, 2, 3]], dtype=uint32) """ noise_target = NoiseTarget(noise_target) distance = Distance(distance) result_format = utils.check_format(result_format, triplets, None) triplets: np.ndarray = utils.check_query(triplets, result_format=utils.QueryFormat.LIST) embedding = check_array(embedding) if triplets.shape[1] != 3: raise ValueError("Triplets require 3 columns.") if (triplets < 0).any() or (triplets >= embedding.shape[0]).any(): raise ValueError("Triplet indices must be within the range of the embedding.") non_unique_rows = _count_unique_items(triplets) != 3 if (non_unique_rows).any(): raise ValueError(f"Triplets must contain unique indices, got {triplets[non_unique_rows]}.") if isinstance(noise, str): random_state = check_random_state(random_state) noise_fun: Callable = getattr(random_state, noise) elif callable(noise): noise_fun = noise if distance is Distance.EUCLIDEAN: y_triplets = embedding[triplets.ravel()].reshape(-1, 3, embedding.shape[1]) if noise is not None and noise_target is NoiseTarget.POINTS: y_triplets += noise_fun(size=y_triplets.shape, **noise_options) near_distance = pairwise.paired_euclidean_distances(y_triplets[:, 0], y_triplets[:, 1]) far_distance = pairwise.paired_euclidean_distances(y_triplets[:, 0], y_triplets[:, 2]) elif distance is Distance.PRECOMPUTED: if noise is not None and noise_target is NoiseTarget.POINTS: raise ValueError("Applying noise on points is not possible for precomputed distances.") near_distance = embedding[triplets[:, 0], triplets[:, 1]] far_distance = embedding[triplets[:, 0], triplets[:, 2]] differences = near_distance - far_distance if noise is not None and noise_target is NoiseTarget.DIFFERENCES: differences += noise_fun(size=differences.shape, **noise_options) return utils.check_query_response(triplets, response=(differences < 0), result_format=result_format, standard=False)
[docs] def triplet_response(triplets: utils.Query, embedding: np.ndarray, result_format: Optional[str] = None, distance: Union[str, Distance] = 'euclidean') -> utils.Response: """ Triplet responses for an embedding. The default assumes Euclidean distances between embedding points. >>> triplets = [[1, 0, 2], [1, 2, 0]] >>> points = [[0], [4], [5]] >>> triplets, response = triplet_response(triplets, points, result_format='list-boolean') >>> triplets, response (array([[1, 0, 2], [1, 0, 2]], dtype=uint32), array([False, False])) To use alternative distance metrics, you can pass precomputed distances instead of an embedding. >>> from sklearn.metrics import pairwise >>> distances = pairwise.manhattan_distances(points) >>> triplets, response = triplet_response(triplets, distances, result_format='list-boolean', distance='precomputed') >>> response array([False, False]) Args: triplets: Numpy array or sparse matrix of triplet indices embedding: Numpy array of object coordinates, (n_objects, n_components) result_format: Format of the result. If none, keeps input format. distance: {'euclidean', 'precomputed'}. Specifies distance metrix between embedding points or if distances are passed directly as distance matrix. Returns: Responses in format as defined by response_format either numpy array (n_triplets,) or sparse matrix If return_indices is True, a tuple of indices and responses can be returned """ return noisy_triplet_response(triplets, embedding, noise=None, result_format=result_format, distance=distance)