Source code for cblearn.datasets._nature_vogue_similarity

from pathlib import Path
import logging
import joblib
import os
from typing import Optional, Union
import zipfile

import numpy as np
from sklearn.datasets import _base
from sklearn.utils import check_random_state, Bunch
from cblearn.preprocessing import query_from_columns

# Remote archive that contains both triplet files read by this module
# ('nature/nature_triplets.txt' and 'vogue/vogue_triplets.txt').
# The checksum is handed to scikit-learn's download helper together with the URL.
ARCHIVE = _base.RemoteFileMetadata(
    filename='nature_and_vogue_triplets.zip',
    url='http://anttiukkonen.com/nature_and_vogue_triplets.zip',
    checksum=('db4718bb742a2492dee87e94daead3070b2afa3a9174aa717be336940e47e3cc'))

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def fetch_nature_scene_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True,
                                  shuffle: bool = True, random_state: Optional[np.random.RandomState] = None,
                                  return_triplets: bool = False) -> Union[Bunch, np.ndarray]:
    """ Load the nature scene similarity dataset (odd-one-out).

    ===================   =====================
    Triplets                               3355
    Objects (Scenes)                        120
    ===================   =====================

    See :ref:`nature_vogue_dataset` for a detailed description.

    >>> dataset = fetch_nature_scene_similarity(shuffle=True)  # doctest: +REMOTE_DATA
    >>> dataset.image_label[[0, -1]].tolist()  # doctest: +REMOTE_DATA
    ['art114.jpg', 'n344019.jpg']
    >>> dataset.triplet.shape  # doctest: +REMOTE_DATA
    (3355, 3)

    Args:
        data_home : optional, default: None
            Specify another download and cache folder for the datasets. By default
            all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
        download_if_missing : optional, default=True
            If False, raise an IOError when the data is not cached locally
            instead of downloading it.
        shuffle: default = True
            Shuffle the order of triplet constraints.
        random_state: optional, default = None
            Initialization for shuffle random generator
        return_triplets : boolean, default=False.
            If True, returns numpy array instead of a Bunch object.

    Returns:
        dataset : :class:`~sklearn.utils.Bunch`
            Dictionary-like object, with the following attributes.

            triplet : ndarray, shape (n_triplets, 3)
                Each row corresponding odd-one-out query.
                The columns represent the odd image and two others.
            image_label : ndarray, shape (120, )
                Names of the scene images.
            DESCR : string
                Description of the dataset.

        triplets : numpy arrays (n_triplets, 3)
            Returned instead of the Bunch when `return_triplets=True`.

    Raises:
        IOError: If the data is not locally available, but download_if_missing=False
    """
    # Both public fetchers share one cached archive; only the dataset key differs.
    return _fetch_nature_vogue('nature', data_home=data_home, download_if_missing=download_if_missing,
                               shuffle=shuffle, random_state=random_state, return_triplets=return_triplets)
def fetch_vogue_cover_similarity(data_home: Optional[os.PathLike] = None, download_if_missing: bool = True,
                                 shuffle: bool = True, random_state: Optional[np.random.RandomState] = None,
                                 return_triplets: bool = False) -> Union[Bunch, np.ndarray]:
    """ Load the vogue cover similarity dataset (odd-one-out).

    ===================   =====================
    Triplets                               1107
    Objects (Covers)                         60
    ===================   =====================

    See :ref:`nature_vogue_dataset` for a detailed description.

    >>> dataset = fetch_vogue_cover_similarity(shuffle=True)  # doctest: +REMOTE_DATA
    >>> dataset.image_label[[0, -1]].tolist()  # doctest: +REMOTE_DATA
    ['Cover_uk_VOgue_MAY10_V_29mar10_bt_268x353.jpg', 'voguecoverapr11_bt_268x353.jpg']
    >>> dataset.triplet.shape  # doctest: +REMOTE_DATA
    (1107, 3)

    Args:
        data_home : optional, default: None
            Specify another download and cache folder for the datasets. By default
            all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
        download_if_missing : optional, default=True
            If False, raise an IOError when the data is not cached locally
            instead of downloading it.
        shuffle: default = True
            Shuffle the order of triplet constraints.
        random_state: optional, default = None
            Initialization for shuffle random generator
        return_triplets : boolean, default=False.
            If True, returns numpy array instead of a Bunch object.

    Returns:
        dataset : :class:`~sklearn.utils.Bunch`
            Dictionary-like object, with the following attributes.

            triplet : ndarray, shape (n_triplets, 3)
                Each row corresponding odd-one-out query.
                The columns represent the odd image and two others.
            image_label : ndarray, shape (60, )
                Names of the cover images.
            DESCR : string
                Description of the dataset.

        triplets : numpy arrays (n_triplets, 3)
            Returned instead of the Bunch when `return_triplets=True`.

    Raises:
        IOError: If the data is not locally available, but download_if_missing=False
    """
    # Both public fetchers share one cached archive; only the dataset key differs.
    return _fetch_nature_vogue('vogue', data_home=data_home, download_if_missing=download_if_missing,
                               shuffle=shuffle, random_state=random_state, return_triplets=return_triplets)
def _fetch_nature_vogue(dataset: str, data_home: Optional[os.PathLike] = None, download_if_missing: bool = True,
                        shuffle: bool = True, random_state: Optional[np.random.RandomState] = None,
                        return_triplets: bool = False) -> Union[Bunch, np.ndarray]:
    """ Load (and download if necessary) one of the nature / vogue odd-one-out datasets.

    Both datasets are shipped in a single remote archive. On the first call, the
    two triplet files are parsed and cached together in one compressed pickle
    inside the data home folder; later calls read from that cache.

    Args:
        dataset: Which dataset to return, either 'nature' or 'vogue'.
        data_home: Download and cache folder, forwarded to scikit-learn's
            ``get_data_home``.
        download_if_missing: If False, raise an IOError instead of downloading
            when the cache file does not exist.
        shuffle: If True, permute the rows of the triplet array.
        random_state: Seed or random state used for the shuffle.
        return_triplets: If True, return only the triplet array.

    Returns:
        The triplet array if `return_triplets` is True, otherwise a Bunch with
        the attributes `triplet`, `image_label` and `DESCR`.

    Raises:
        ValueError: If `dataset` is neither 'nature' nor 'vogue'.
        IOError: If the data is not locally available, but download_if_missing=False
    """
    # Fail early and explicitly; without this check an unknown key would only
    # surface later as an unbound local variable.
    if dataset not in ('nature', 'vogue'):
        raise ValueError(f"Unknown dataset {dataset!r}; expected 'nature' or 'vogue'.")

    data_home = Path(_base.get_data_home(data_home=data_home))
    # exist_ok avoids a race between an existence check and the creation.
    data_home.mkdir(parents=True, exist_ok=True)

    filepath = Path(_base._pkl_filepath(data_home, 'nature_vogue_similarity.pkz'))
    if not filepath.exists():
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        logger.info('Downloading nature and vogue similarity data from %s to %s', ARCHIVE.url, data_home)
        archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home)
        try:
            with zipfile.ZipFile(archive_path) as zf:
                with zf.open('nature/nature_triplets.txt', 'r') as f:
                    nature_data = np.loadtxt(f, dtype=str)
                with zf.open('vogue/vogue_triplets.txt', 'r') as f:
                    vogue_data = np.loadtxt(f, dtype=str)
        finally:
            # The archive is only an intermediate artifact; remove it even if
            # parsing fails so no stale download is left behind.
            os.remove(archive_path)

        # Cache both datasets together, so that one download serves both fetchers.
        joblib.dump((nature_data, vogue_data), filepath, compress=6)
    else:
        (nature_data, vogue_data) = joblib.load(filepath)

    raw_data = nature_data if dataset == 'nature' else vogue_data
    # Move the third file column to the front; the public docstrings describe
    # the resulting columns as "the odd image and two others".
    data = raw_data[:, [2, 0, 1]]

    # Encode the image file names as object indices; the fitted encoder keeps
    # the original names as its classes.
    triplets, encoder = query_from_columns(data, [0, 1, 2], return_transformer=True)
    classes = encoder.encoder_.classes_

    if shuffle:
        random_state = check_random_state(random_state)
        triplets = random_state.permutation(triplets)

    if return_triplets:
        return triplets

    # The dataset description is stored next to this module.
    module_path = Path(__file__).parent
    with module_path.joinpath('descr', 'nature_vogue_similarity.rst').open() as rst_file:
        fdescr = rst_file.read()

    return Bunch(triplet=triplets,
                 image_label=classes,
                 DESCR=fdescr)