Source code for dpemu.filters.text

# MIT License
#
# Copyright (c) 2019 Tuomas Halvari, Juha Harviainen, Juha Mylläri, Antti Röyskö, Juuso Silvennoinen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from dpemu.filters import Filter


class MissingArea(Filter):
    """Emulates stains in text causing characters to not be readable.

    The text is treated as a grid of rows (separated by newlines) and columns.
    At every grid position, with the given probability, a radius is drawn from
    the radius generator and every character within that (square) radius is
    replaced with the missing value. Newline characters are never replaced.

    Inherits Filter class.
    """

    def __init__(self, probability_id, radius_generator_id, missing_value_id):
        """
        Args:
            probability_id (str): The key mapping to the probability of
                creating a stain at every position.
            radius_generator_id (str): The key mapping to the
                radius_generator function.
            missing_value_id (str): The key mapping to the missing value to
                replace characters with.
        """
        self.probability_id = probability_id
        self.radius_generator_id = radius_generator_id
        self.missing_value_id = missing_value_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Stain every string in ``node_data`` in place.

        Args:
            node_data: Array-like of strings; it is walked with
                ``np.ndenumerate`` and each element is overwritten with its
                stained version.
            random_state: Random source providing ``geometric``; it is also
                handed to ``self.radius_generator.generate``.
            named_dims: Unused by this filter.

        Returns:
            None. ``node_data`` is modified in place.
        """
        # NOTE(review): ``self.probability``, ``self.radius_generator`` and
        # ``self.missing_value`` are presumably populated by the Filter base
        # class from the corresponding ``*_id`` keys -- confirm against Filter.
        if self.probability == 0:
            # geometric(0) is undefined, and no stains would be produced anyway.
            return

        for index, _ in np.ndenumerate(node_data):
            string = node_data[index]

            # 1. Split the string into rows. Newlines are excluded from the
            #    rows so that they are never replaced.
            newline_positions = [i for i, c in enumerate(string) if c == '\n']
            row_starts = [0] + [i + 1 for i in newline_positions]
            row_ends = list(newline_positions)
            if row_starts[-1] == len(string):
                # Empty string or trailing newline: there is no partial last row.
                row_starts.pop()
            else:
                # The last row has no terminating newline, so it runs to the
                # very end of the string (its last character is stainable).
                row_ends.append(len(string))

            height = len(row_starts)
            widths = [end - start for start, end in zip(row_starts, row_ends)]
            width = max(widths, default=0)

            # 2. Generate stains. Stain centres are found by jumping ahead
            #    with geometrically distributed gaps over the flattened grid.
            #    Each stain is recorded in a 2-D difference array, so a
            #    rectangle costs four updates regardless of its size.
            errs = np.zeros(shape=(height + 1, width + 1))
            cell_count = width * height
            ind = -1
            while True:
                ind += random_state.geometric(self.probability)
                if ind >= cell_count:
                    break
                y, x = divmod(ind, width)
                r = self.radius_generator.generate(random_state)
                x0 = max(x - r, 0)
                x1 = min(x + r + 1, width)
                y0 = max(y - r, 0)
                y1 = min(y + r + 1, height)
                errs[y0, x0] += 1
                errs[y0, x1] -= 1
                errs[y1, x0] -= 1
                errs[y1, x1] += 1

            # 3. Prefix sums over both axes turn the difference array into a
            #    per-cell count of covering stains.
            stained = np.cumsum(np.cumsum(errs, axis=0), axis=1) > 0

            # 4. Replace the stained characters with the configured missing
            #    value; only the real extent of each row is touched.
            chars = list(string)
            for y, (start, row_width) in enumerate(zip(row_starts, widths)):
                for offset in np.flatnonzero(stained[y, :row_width]):
                    chars[start + offset] = self.missing_value
            node_data[index] = "".join(chars)
# TODO: why p_id? Why not just the distribution?
class OCRError(Filter):
    """Emulates optical character recognition (OCR) errors.

    Given a probability distribution describing how likely a character is to
    be misread as each other character, randomly substitutes characters
    according to that distribution.

    Example weights for the distribution can be found in the data directory,
    in the file example_ocr_error_config.json. These weights can be loaded and
    normalized into a probability distribution using functions from
    dpemu/pg_utils.py.

    Inherits Filter class.
    """

    def __init__(self, normalized_params_id, p_id):
        """
        Args:
            normalized_params_id (str): The key mapping to the character
                replacement probability distribution.
            p_id (str): The key mapping to the probability of a character
                replacement being applied.
        """
        self.normalized_params_id = normalized_params_id
        self.p_id = p_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Overwrite each element of ``node_data`` with its corrupted version.

        Args:
            node_data: Array-like of strings, walked with ``np.ndenumerate``.
            random_state: Random source passed on to ``generate_ocr_errors``.
            named_dims: Unused by this filter.
        """
        for position, text in np.ndenumerate(node_data):
            corrupted = self.generate_ocr_errors(text, random_state)
            node_data[position] = corrupted

    def generate_ocr_errors(self, string_, random_state):
        """Return ``string_`` with every character passed through ``replace_char``.

        Characters are processed left to right, so the random draws happen in
        string order.
        """
        return "".join(
            self.replace_char(char, random_state) for char in string_
        )

    def replace_char(self, c, random_state):
        """Return either ``c`` or a randomly chosen replacement for it.

        A replacement is only considered when ``c`` has an entry in
        ``self.normalized_params``; in that case one uniform sample is drawn
        and the replacement happens when the sample is below ``self.p``.
        """
        # No distribution for this character: nothing to draw, keep it.
        if c not in self.normalized_params:
            return c
        # One uniform draw decides whether the replacement is applied.
        if not (random_state.random_sample() < self.p):
            return c
        candidates, weights = self.normalized_params[c]
        picked = random_state.choice(candidates, 1, p=weights)
        return picked[0]
class Uppercase(Filter):
    """Randomly convert characters to uppercase.

    For each character in the input text, with the given probability converts
    it to uppercase.

    Inherits Filter class.
    """

    def __init__(self, probability_id):
        """
        Args:
            probability_id (str): The key mapping to the probability of
                characters being converted to uppercase.
        """
        # Stored as ``prob_id``; ``apply`` reads the matching ``self.prob``.
        self.prob_id = probability_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Uppercase characters of every string in ``node_data`` at random.

        Args:
            node_data: Array-like of strings, walked with ``np.ndenumerate``;
                each element is overwritten with its modified version.
            random_state: Random source providing ``rand``; one sample is
                drawn per character, in string order.
            named_dims: Unused by this filter.

        Returns:
            None. ``node_data`` is modified in place.
        """
        for index, text in np.ndenumerate(node_data):
            # ``rand()`` samples from [0, 1), so a strict ``<`` makes the
            # conversion happen with exactly the configured probability and
            # guarantees that a probability of 0 never uppercases anything.
            node_data[index] = "".join(
                char.upper() if random_state.rand() < self.prob else char
                for char in text
            )