# MIT License
#
# Copyright (c) 2019 Tuomas Halvari, Juha Harviainen, Juha Mylläri, Antti Röyskö, Juuso Silvennoinen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import numpy as np
from dpemu.filters import Filter
class MissingArea(Filter):
    """Emulates stains in text causing characters to not be readable.

    The text is treated as a grid of rows separated by newline characters.
    Every grid position becomes the centre of a stain with the given
    probability. A stain is a square of side ``2 * radius + 1`` (clipped to
    the grid) whose radius is drawn from the radius generator, and every
    character covered by at least one stain is replaced with the missing
    value. Newline characters are never replaced.

    Inherits Filter class.
    """

    def __init__(self, probability_id, radius_generator_id, missing_value_id):
        """
        Args:
            probability_id (str): The key mapping to the probability of creating a stain at every position.
            radius_generator_id (str): The key mapping to the radius_generator function.
            missing_value_id (str): The key mapping to the missing value to replace characters with.
        """
        self.probability_id = probability_id
        self.radius_generator_id = radius_generator_id
        self.missing_value_id = missing_value_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Stain every string stored in ``node_data`` in place.

        Args:
            node_data: Array of strings; each element is overwritten with its
                stained version.
            random_state: NumPy ``RandomState``-like object providing
                ``geometric`` and passed on to the radius generator.
            named_dims: Not used by this filter.
        """
        # Nothing can be stained, and geometric sampling needs a positive
        # success probability.
        if self.probability == 0:
            return

        missing_value = self.missing_value

        for index, _ in np.ndenumerate(node_data):
            text = node_data[index]

            # 1. Split the text into rows. Row y occupies
            #    text[row_starts[y]:row_starts[y + 1]], including its
            #    terminating newline when it has one.
            row_starts = [0]
            row_starts.extend(i + 1 for i, c in enumerate(text) if c == '\n')
            last_row_unterminated = row_starts[-1] != len(text)
            if last_row_unterminated:
                row_starts.append(len(text))
            height = len(row_starts) - 1

            # Number of stainable characters per row: drop the trailing
            # newline, which only the last row may lack.
            widths = np.diff(row_starts) - 1
            if last_row_unterminated:
                widths[-1] += 1
            width = widths.max() if height > 0 else 0

            # 2. Pick stain centres and record every stain in a 2D
            #    difference array (corner updates only).
            errs = np.zeros(shape=(height + 1, width + 1))
            cell_count = width * height
            pos = -1
            while True:
                # Jumping ahead by a geometric number of cells selects each
                # cell independently with the configured probability.
                pos += random_state.geometric(self.probability)
                if pos >= cell_count:
                    break
                y = pos // width
                x = pos - y * width
                r = self.radius_generator.generate(random_state)
                x0 = max(x - r, 0)
                x1 = min(x + r + 1, width)
                y0 = max(y - r, 0)
                y1 = min(y + r + 1, height)
                errs[y0, x0] += 1
                errs[y0, x1] -= 1
                errs[y1, x0] -= 1
                errs[y1, x1] += 1

            # 3. Prefix sums over both axes turn the corner updates into a
            #    per-cell count of covering stains.
            errs = np.cumsum(errs, axis=0)
            errs = np.cumsum(errs, axis=1)
            errs = (errs > 0)

            # Map the grid back onto string positions; newline positions are
            # outside every row slice and therefore stay unmasked.
            mask = np.zeros(len(text), dtype=bool)
            for y in range(height):
                start = row_starts[y]
                mask[start:start + widths[y]] = errs[y, 0:widths[y]]

            # 4. Replace every stained character with the missing value.
            node_data[index] = "".join(
                missing_value if stained else char
                for char, stained in zip(text, mask)
            )
# TODO: why p_id? Why not just the distribution?
class OCRError(Filter):
    """Emulates optical character recognition (OCR) errors.

    Each character that has an entry in the replacement distribution is, with
    probability ``p``, swapped for a character drawn from that entry.
    Characters without an entry are always left unchanged.

    Example weights for the distribution can be found in the data directory.
    These files are:
        example_ocr_error_config.json

    These weights can be loaded and normalized into a probability
    distribution using functions from dpemu/pg_utils.py.

    Inherits Filter class.
    """

    def __init__(self, normalized_params_id, p_id):
        """
        Args:
            normalized_params_id (str): The key mapping to the character replacement probability distribution.
            p_id (str): The key mapping to the probability of a character replacement being applied.
        """
        self.normalized_params_id = normalized_params_id
        self.p_id = p_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Overwrite every string in ``node_data`` with its OCR-corrupted version.

        Args:
            node_data: Array of strings, modified in place.
            random_state: Random number source handed to ``generate_ocr_errors``.
            named_dims: Not used by this filter.
        """
        for position, text in np.ndenumerate(node_data):
            node_data[position] = self.generate_ocr_errors(text, random_state)

    def generate_ocr_errors(self, string_, random_state):
        """Return ``string_`` with ``replace_char`` applied to each character in order."""
        pieces = []
        for original_char in string_:
            pieces.append(self.replace_char(original_char, random_state))
        return "".join(pieces)

    def replace_char(self, c, random_state):
        """Return a replacement for ``c``, or ``c`` itself when no replacement is made."""
        # The membership test comes first so that a random number is drawn
        # only for characters that have a replacement entry.
        should_replace = (
            c in self.normalized_params
            and random_state.random_sample() < self.p
        )
        if not should_replace:
            return c
        candidates, weights = self.normalized_params[c]
        drawn = random_state.choice(candidates, 1, p=weights)
        return drawn[0]
class Uppercase(Filter):
    """Randomly convert characters to uppercase.

    For each character in the input text, with the given probability converts it to uppercase.

    Inherits Filter class.
    """

    def __init__(self, probability_id):
        """
        Args:
            probability_id (str): The key mapping to the probability of characters being converted to uppercase.
        """
        self.prob_id = probability_id
        super().__init__()

    def apply(self, node_data, random_state, named_dims):
        """Uppercase characters of every string in ``node_data`` in place.

        Args:
            node_data: Array of strings, modified in place.
            random_state: Random number source; one ``rand()`` draw is made
                per character.
            named_dims: Not used by this filter.
        """
        def stochastic_upper(char, probability):
            # rand() draws from the half-open interval [0, 1), so a strict
            # comparison succeeds with exactly the requested probability and
            # never when the probability is 0.
            if random_state.rand() < probability:
                return char.upper()
            return char

        for index, original_string in np.ndenumerate(node_data):
            node_data[index] = "".join(
                stochastic_upper(char, self.prob) for char in original_string
            )