# Source code for dpemu.pg_utils

# MIT License
#
# Copyright (c) 2019 Tuomas Halvari, Juha Harviainen, Juha Mylläri, Antti Röyskö, Juuso Silvennoinen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
from random import randint

import numpy as np


def load_ocr_error_params(path_to_error_params):
    """Loads error parameters from a JSON-file.

    The file is opened in a ``with`` block so that the handle is closed as
    soon as parsing finishes or fails, instead of being left for the garbage
    collector. It is decoded as UTF-8, the standard JSON encoding, rather
    than the platform's default encoding.

    Args:
        path_to_error_params (str): A string containing the relative or absolute path to the file.

    Returns:
        dict: The parsed content of the file (a dict when the top-level
        JSON value is an object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path_to_error_params, encoding="utf-8") as params_file:
        return json.load(params_file)


def normalize_ocr_error_params(params):
    """Normalises the OCR-error weights stored for every character.

    Each value in ``params`` is indexed at positions 0 and 1. The item at
    position 0 is kept as it is (presumably the candidate replacement
    characters; verify against the parameter file), while the item at
    position 1 is a list of numerical weights which is passed through
    ``normalize_weights`` so that the weights sum to 1 and can be used as
    probabilities.

    Args:
        params (dict): A dict mapping each character to an indexable value
            whose second item is a list of numerical weights.

    Returns:
        dict: A dict mapping every character to a tuple of the unchanged
        first item and the normalised probabilities.
    """
    normalized = {}
    for character, entry in params.items():
        replacements = entry[0]
        probabilities = normalize_weights(entry[1])
        normalized[character] = (replacements, probabilities)
    return normalized


def normalize_weights(weights):
    """Normalises numerical values (weights) into probabilities.

    Every weight is assigned a probability equal to its value divided by
    the sum of all values.

    Args:
        weights (iterable): Numerical values. Any iterable is accepted,
            including one-shot iterators such as generators.

    Returns:
        list: A list containing values which sum to 1. An empty input
        yields an empty list.

    Raises:
        ZeroDivisionError: If the input is non-empty and its plain Python
            numbers sum to zero.
    """
    # The values are traversed twice (once to sum, once to divide), so a
    # one-shot iterator must be materialised first; otherwise the second
    # pass would see nothing and silently return an empty list.
    weights = list(weights)
    total = sum(weights)
    return [weight / total for weight in weights]


def to_time_series_x_y(data, x_length):
    """Splits time series data into sliding windows and their successors.

    For every index ``i`` from ``x_length`` up to (but excluding) the length
    of the first dimension of ``data``, the ``x_length`` observations
    preceding ``i`` form one row of x and the observation at ``i`` is the
    matching entry of y.

    Args:
        data: The time series. It must expose ``shape`` and support
            slicing and integer indexing (e.g. a numpy.ndarray).
        x_length (int): Length of the x vector.

    Returns:
        tuple: The pair ``(x, y)`` of numpy arrays.
    """
    n_observations = data.shape[0]
    windows = []
    targets = []
    for end in range(x_length, n_observations):
        windows.append(data[end - x_length:end])
        targets.append(data[end])
    return np.array(windows), np.array(targets)


def first_dimension_length(array):
    """Returns the length of the first dimension of the provided array or list.

    Lists (including subclasses of ``list``) and any other object without a
    ``shape`` attribute, such as a tuple, are measured with ``len``. Objects
    that expose ``shape`` are measured with ``shape[0]``.

    Args:
        array (list or numpy.ndarray): An array.

    Returns:
        int: The length of the first dimension of the array.
    """
    # isinstance instead of an exact type comparison so that list
    # subclasses are treated as lists too.
    if isinstance(array, list):
        return len(array)
    shape = getattr(array, "shape", None)
    if shape is None:
        # No shape information available: fall back to the generic
        # sequence protocol instead of failing with AttributeError.
        return len(array)
    return shape[0]


def generate_random_dict_key(dct, prefix):
    """Generates a key, starting with the prefix, that is not already in the dict.

    Starting from the prefix itself, random uppercase ASCII letters are
    appended one at a time until the candidate is not a key of the dict.
    If the prefix is not in the dict, it is returned unchanged.

    Args:
        dct (dict): A Python dictionary.
        prefix (str): A prefix for the random key.

    Returns:
        str: A key that is not present in the dict.
    """
    candidate = prefix
    while True:
        if candidate not in dct:
            return candidate
        # One draw per appended letter: offsets 0..25 map to "A".."Z".
        candidate = candidate + chr(ord("A") + randint(0, 25))