# Source code for dpemu.dataset_utils

# MIT License
#
# Copyright (c) 2019 Tuomas Halvari, Juha Harviainen, Juha Mylläri, Antti Röyskö, Juuso Silvennoinen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import warnings
from subprocess import Popen

import cv2
from numpy.random import RandomState
from pycocotools.coco import COCO
from sklearn.datasets import fetch_20newsgroups, fetch_openml, load_digits
from sklearn.model_selection import train_test_split

from dpemu.utils import get_project_root

# Shared random generator, seeded with 42, that the loaders in this module pass
# to scikit-learn and use for shuffling. It is a single stateful instance, so
# its state advances every time one of the loaders draws from it.
random_state = RandomState(42)


# All 20 newsgroups. ``load_newsgroups`` takes a prefix of this sequence, so
# the order decides which groups are used when fewer than 20 are requested.
NEWSGROUP_CATEGORIES = (
    "alt.atheism",
    "comp.graphics",
    "sci.space",
    "rec.autos",
    "rec.sport.hockey",
    "rec.sport.baseball",
    "sci.electronics",
    "misc.forsale",
    "sci.crypt",
    "talk.politics.mideast",
    "sci.med",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "rec.motorcycles",
    "soc.religion.christian",
    "talk.politics.misc",
    "talk.religion.misc",
    "talk.politics.guns",
    "comp.sys.ibm.pc.hardware",
    # Appended last so that every selection of fewer than 20 categories is
    # the same as before this group was added.
    "comp.os.ms-windows.misc",
)


def load_newsgroups(subset="all", n_categories=20):
    """Fetches the 20 newsgroups dataset and returns its desired subset.

    Args:
        subset (str, optional): Forwarded to ``fetch_20newsgroups`` to choose
            which part of the dataset is loaded. Defaults to "all".
        n_categories (int, optional): The number of categories to be included,
            taken from the start of ``NEWSGROUP_CATEGORIES``. Values outside
            1..20 fall back to 20. Defaults to 20.

    Returns:
        tuple: The dataset, categories as integers, category names and the name of the dataset.
    """
    max_categories = len(NEWSGROUP_CATEGORIES)
    if not 0 < n_categories <= max_categories:
        n_categories = max_categories
    selected = list(NEWSGROUP_CATEGORIES[:n_categories])
    # Headers, footers and quotes are stripped so only the message body remains.
    newsgroups = fetch_20newsgroups(subset=subset, categories=selected,
                                    remove=("headers", "footers", "quotes"), random_state=random_state)
    return newsgroups["data"], newsgroups["target"].astype(int), newsgroups["target_names"], "20newsgroups"


def __split_data(data, labels, n_data):
    """Draws a random subset of the given size from the data and its labels.

    Args:
        data: Original data; must expose ``shape``, whose first entry is
            compared against ``n_data``.
        labels: Original labels, aligned with ``data``.
        n_data (int): Size of the subset.

    Returns:
        tuple: The subset of data and the matching subset of labels. Both
        inputs are returned unchanged unless ``0 < n_data < data.shape[0]``.
    """
    subset_requested = 0 < n_data < data.shape[0]
    if not subset_requested:
        return data, labels

    # train_test_split yields (train data, rest data, train labels, rest labels);
    # only the "train" halves, which hold n_data samples, are kept.
    parts = train_test_split(data, labels, train_size=n_data, random_state=random_state)
    kept_data, _, kept_labels, _ = parts
    return kept_data, kept_labels


def load_digits_(n_data=1797):
    """Loads the scikit-learn digits dataset, optionally reduced to a subset.

    Args:
        n_data (int, optional): The size of the wanted subset. Defaults to 1797.

    Returns:
        tuple: The data, the labels of the data points, ``None`` in place of
        category names, and the dataset name "Digits".
    """
    bunch = load_digits()
    samples, targets = __split_data(bunch["data"], bunch["target"], n_data)
    dataset_name = "Digits"
    return samples, targets, None, dataset_name


def load_mnist_unsplit(n_data=70000):
    """Fetches MNIST from OpenML ("mnist_784"), optionally reduced to a subset.

    Args:
        n_data (int, optional): The size of the wanted subset. Defaults to 70000.

    Returns:
        tuple: The data, the labels of the data points cast to int, ``None``
        in place of category names, and the dataset name "MNIST".
    """
    bunch = fetch_openml("mnist_784")
    int_targets = bunch["target"].astype(int)
    samples, targets = __split_data(bunch["data"], int_targets, n_data)
    return samples, targets, None, "MNIST"


def load_mnist(reshape_to_28x28=False, integer_values=False):
    """Loads the MNIST train/test split through Keras.

    Args:
        reshape_to_28x28 (bool, optional): If true, the pixel data is left as
            returned by Keras; otherwise each image is flattened to a vector
            of 28 * 28 values. Defaults to False.
        integer_values (bool, optional): If true, the arrays keep the dtype
            returned by Keras; otherwise pixel data and labels of both splits
            are cast to float. Defaults to False.

    Returns:
        tuple: Training pixel data, training labels, test pixel data, test labels.
    """
    from contextlib import redirect_stderr

    warnings.simplefilter(action='ignore', category=FutureWarning)
    # stderr is silenced only while keras is imported (presumably to hide its
    # import-time output). The devnull handle is opened in the same ``with``
    # so that it is closed again instead of being leaked.
    with open(os.devnull, 'w') as devnull, redirect_stderr(devnull):
        from keras.datasets.mnist import load_data as load_mnist_data

    (x_train, y_train), (x_test, y_test) = load_mnist_data()
    if not reshape_to_28x28:
        x_train = x_train.reshape((-1, 28 * 28))
        x_test = x_test.reshape((-1, 28 * 28))
    if not integer_values:
        x_train = x_train.astype('float')
        y_train = y_train.astype('float')
        x_test = x_test.astype('float')
        # Previously y_train was cast a second time here and y_test kept its
        # original integer dtype.
        y_test = y_test.astype('float')
    return x_train, y_train, x_test, y_test


def load_fashion(n_data=70000):
    """Fetches Fashion-MNIST from OpenML, optionally reduced to a subset.

    Args:
        n_data (int, optional): The size of the wanted subset. Defaults to 70000.

    Returns:
        tuple: The data, the labels of the elements cast to int, the names of
        the ten categories and the dataset name "Fashion MNIST".
    """
    label_names = [
        "T-shirt", "Trouser", "Pullover", "Dress", "Coat",
        "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
    ]
    bunch = fetch_openml("Fashion-MNIST")
    int_targets = bunch["target"].astype(int)
    samples, targets = __split_data(bunch["data"], int_targets, n_data)
    return samples, targets, label_names, "Fashion MNIST"


def load_coco_val_2017(n=5000, is_shuffled=False):
    """Loads images of the COCO val2017 set together with their metadata.

    If the image folder ``data/val2017`` under the project root does not
    exist, ``scripts/get_coco_dataset.sh`` is run from the project root first.

    Args:
        n (int, optional): The number of images wanted. Any value outside
            1..4999 is replaced by 5000. Defaults to 5000.
        is_shuffled (bool, optional): If true, the image ids are shuffled with
            the module's random generator before the first ``n`` are taken.
            Defaults to False.

    Returns:
        tuple: The images converted from BGR to RGB, the COCO image ids, the
        class names read from ``data/coco.names`` and the image file names.
    """
    if n not in range(1, 5000):
        n = 5000

    root = get_project_root()
    img_folder = f"{root}/data/val2017"
    if not os.path.isdir(img_folder):
        # Block until the download script has finished.
        downloader = Popen(["./scripts/get_coco_dataset.sh"], cwd=root)
        downloader.wait()

    coco = COCO(f"{root}/data/annotations/instances_val2017.json")
    img_ids = coco.getImgIds()
    if is_shuffled:
        random_state.shuffle(img_ids)
    img_ids = img_ids[:n]

    img_filenames = [entry["file_name"] for entry in coco.loadImgs(img_ids)]

    # cv2.imread yields BGR data; every image is converted to RGB.
    imgs = []
    for filename in img_filenames:
        bgr = cv2.imread(os.path.join(img_folder, filename))
        imgs.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

    with open(f"{root}/data/coco.names", "r") as names_file:
        class_names = [name.strip() for name in names_file]

    return imgs, img_ids, class_names, img_filenames