nmtwizard/preprocess/operators/noise.py (250 changes: 177 additions & 73 deletions)
@@ -1,98 +1,99 @@
import random
import copy
import os
import logging

import pyonmttok

from nmtwizard.logger import get_logger
from nmtwizard.preprocess import prepoperator
from nmtwizard.preprocess.tu import TokReplace
import fasttext

logger = get_logger(__name__)

@prepoperator.register_operator("noise")
class Noise(prepoperator.TUOperator):
@classmethod
def _config_schema(cls):
schema = super(Noise, cls)._config_schema()

noise_block = {
"lang": {"type": "string"},
"drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
"drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
"insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
"drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"add_marker": {"type": "boolean"},
}
schema["properties"].update(
{
"source": {
"type": "object",
"properties": noise_block,
"additionalProperties": False,
}
}
)
schema["properties"].update(noise_block)

return schema

@staticmethod
def is_applied_for(process_type):
return process_type == prepoperator.ProcessType.TRAINING

def __init__(self, config, *args):
source_config = config.get("source")
if source_config:
config = source_config
class Noiser:
def __init__(self, config):
self._drop_word_prob = config.get("drop_word_prob", 0)
self._duplicate_word_prob = config.get("duplicate_word_prob", 0)
self._swap_word_prob = config.get("swap_word_prob", 0)
substitute_word_config = config.get("substitute_word", None)
self._substitute_word_prob = 0
if substitute_word_config:
self._substitute_word_prob = substitute_word_config.get("prob", 0)
if self._substitute_word_prob:
word_embedding_file = substitute_word_config.get("word_embedding_file")
self._word_embedding_model = None
if word_embedding_file is not None:
if not os.path.isfile(word_embedding_file):
raise ValueError(
"Word embedding file doesn't exist: %s"
% (word_embedding_file)
)
self._word_embedding_model = fasttext.load_model(
word_embedding_file
)
self._nn = substitute_word_config.get("nearest_neighbors_num")
self._drop_space_prob = config.get("drop_space_prob", 0)
self._insert_space_prob = config.get("insert_space_prob", 0)
self._drop_char_prob = config.get("drop_char_prob", 0)
self._duplicate_char_prob = config.get("duplicate_char_prob", 0)
self._swap_char_prob = config.get("swap_char_prob", 0)
self._substitute_char_prob = config.get("substitute_char_prob", 0)
self._add_marker = config.get("add_marker", 0)

def _preprocess_tu(self, tu, *args):
original_tokens = copy.deepcopy(tu.src_tok.token_objects)
tu = self._apply_space_insertion_noise(tu)
src_tok = tu.src_tok
tokens = src_tok.token_objects
new_tokens = [self._apply_word_noise(tokens[0])]
tu.src_tok = (src_tok.tokenizer, new_tokens)
if self._add_marker and new_tokens != original_tokens:
tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
return [tu]

def _apply_space_insertion_noise(self, tu):
src_tok = tu.src_tok
tokens = src_tok.token_objects[0]
added_spaces = 0
for pos, token in enumerate(tokens):
def apply_noise(self, tokens):
new_tokens = []
for token in tokens:
if not token.is_placeholder():
if (
if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob:
continue
# TODO : joiners
elif (
self._duplicate_word_prob > 0
and random.random() <= self._duplicate_word_prob
):
new_tokens.extend([token, token])
continue
elif (
len(new_tokens) > 0
and self._swap_word_prob > 0
and random.random() <= self._swap_word_prob
):
new_tokens.insert(-1, token)
continue
elif (
self._substitute_word_prob > 0
and self._word_embedding_model is not None
and random.random() <= self._substitute_word_prob
and all(c.isalpha() for c in token.surface)
):
nearest_neighbors = (
self._word_embedding_model.get_nearest_neighbors(
token.surface, k=self._nn
)
)
nearest_neighbors = [
nn[1]
for nn in nearest_neighbors
if all(c.isalpha() for c in nn[1])
]
if nearest_neighbors:
token.surface = random.choice(nearest_neighbors)
new_tokens.append(token)
continue
elif (
self._insert_space_prob > 0
and random.random() <= self._insert_space_prob
and len(token) > 1
):
new_space_index = random.randint(1, len(token) - 1)
first_part_surface = token.surface[0:new_space_index]
second_part_surface = token.surface[new_space_index:]
tu.replace_tokens_side(
"source",
(
pos + added_spaces,
1,
[first_part_surface, second_part_surface],
),
)
added_spaces += 1
return tu

def _apply_word_noise(self, tokens):
new_tokens = []
for token in tokens:
if not token.is_placeholder():
if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob:
token.surface = first_part_surface
second_part_token = pyonmttok.Token(token)
second_part_token.surface = second_part_surface
new_tokens.extend([token, second_part_token])
continue
elif (
self._drop_space_prob > 0
@@ -106,11 +107,17 @@ def _apply_word_noise(self, tokens):
or self._swap_char_prob > 0
or self._substitute_char_prob > 0
):
token.surface = self._apply_character_noise(token.surface)
token.surface = self.apply_character_noise(token.surface)
if len(token.surface) != 0: # Delete token if empty.
new_tokens.append(token)
return new_tokens

def apply_noise_batch(self, tokens_batch):
new_tokens_batch = []
for tokens in tokens_batch:
new_tokens_batch.append(self.apply_noise(tokens))
return new_tokens_batch

@staticmethod
def get_neighbor_keys_on_qwerty(key):
lines = "qwertyuiop", "asdfghjkl", "zxcvbnm"
@@ -125,7 +132,7 @@ def get_neighbor_keys_on_qwerty(key):
if len(line) > index + i and line[index + i] != key and index + i >= 0
]

def _apply_character_noise(self, cur_surface):
def apply_character_noise(self, cur_surface):
new_surface = ""
i = 0
while i < len(cur_surface):
@@ -155,3 +162,100 @@ def _apply_character_noise(self, cur_surface):
new_surface += cur_surface[i]
i += 1
return new_surface


@prepoperator.register_operator("noise")
class Noise(prepoperator.Operator):
@classmethod
def _config_schema(cls):
schema = super(Noise, cls)._config_schema()

noise_block = {
"lang": {"type": "string"},
"drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
"duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
"swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
"substitute_word": {
"properties": {
"prob": {"type": "number", "minimum": 0, "maximum": 1},
"word_embedding_file": {"type": "string"},
"nearest_neighbors_num": {"type": "integer"},
},
"type": "object",
"additionalProperties": False,
},
"drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
"insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
"drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
"add_marker": {"type": "boolean"},
}
schema["properties"].update(
{
"source": {
"type": "object",
"properties": noise_block,
"additionalProperties": False,
}
}
)
schema["properties"].update(noise_block)

return schema

@staticmethod
def is_applied_for(process_type):
return process_type == prepoperator.ProcessType.TRAINING

@staticmethod
def get_shared_classes():
return [Noiser]

@staticmethod
def get_shared_builders(config, process_type):
# Only build the noiser as a shared object when word substitution needs an embedding model
word_emb = config.get("substitute_word", {}).get("word_embedding_file")
if word_emb:
return {"noiser": (Noiser, (config,))}
else:
return None

def __init__(self, config, process_type, build_state, shared_state=None):
source_config = config.get("source")
if source_config:
config = source_config
self._noiser = shared_state.get("noiser") if shared_state else None
if not self._noiser:
self._noiser = Noiser(config)
self._add_marker = config.get("add_marker", 0)

def _preprocess(self, tu_batch):
tu_list, meta_batch = tu_batch

src_tokens = []
src_detok = []
for tu in tu_list:
src_tok = tu.src_tok
src_tokens.append(src_tok.token_objects[0])
src_detok.append(tu.src_detok)

src_tokens_noisy = self._noiser.apply_noise_batch(src_tokens)

for detok, tok_noisy, tu in zip(src_detok, src_tokens_noisy, tu_list):
src_tok = tu.src_tok
tu.src_tok = (src_tok.tokenizer, [tok_noisy])
new_detok = tu.src_detok
if detok != new_detok:
if self._add_marker:
tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
log_level = logging.INFO if self._verbose else logging.DEBUG
if logger.isEnabledFor(log_level):
logger.log(
log_level,
"'%s' operator modifies source in preprocess.\nSRC BEFORE : %s\nSRC AFTER : %s",
self.name,
detok,
new_detok,
)

return tu_list, meta_batch
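
The refactor above extracts the noising logic into a standalone Noiser class so that an expensive resource (the fastText model used for word substitution) can be built once and shared across workers, while the Noise operator now processes whole batches. The following is a minimal usage sketch, not part of the PR; it assumes the nmtwizard package is importable and that pyonmttok's as_token_objects tokenization mode is available:

import pyonmttok

from nmtwizard.preprocess.operators.noise import Noiser

# Hypothetical config: only cheap word/character noise, no embedding model.
config = {
    "drop_word_prob": 0.1,
    "duplicate_word_prob": 0.05,
    "swap_word_prob": 0.05,
    "substitute_char_prob": 0.02,
}
noiser = Noiser(config)

tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
tokens_batch = [
    tokenizer.tokenize("Hello world.", as_token_objects=True),
    tokenizer.tokenize("Noise makes models robust.", as_token_objects=True),
]

# apply_noise_batch simply maps apply_noise over the batch, so one shared
# Noiser instance can serve every translation unit in a preprocess batch.
noisy_batch = noiser.apply_noise_batch(tokens_batch)
for tokens in noisy_batch:
    print(tokenizer.detokenize(tokens))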
nmtwizard/preprocess/sampler.py (2 changes: 1 addition & 1 deletion)
@@ -256,7 +256,7 @@ def _select_lines(f):
# 1 if full sample (lines_kept == lines_count or no gsample)
# >1 if oversampling (lines_kept > lines_count)
# 0 if undersampling (lines_kept < lines_count)
min_occurrence = not gsample or int(f.lines_kept / f.lines_count)
min_occurrence = int(f.lines_kept / f.lines_count) or int(not gsample)

if min_occurrence:
random_sample = {i: min_occurrence for i in range(f.lines_count)}
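The one-line sampler fix reorders the operands so that the oversampling ratio takes precedence over the no-gsample fallback, and so that min_occurrence is always an int rather than sometimes a bool. A standalone illustration of the difference, with hypothetical values standing in for f.lines_kept, f.lines_count and gsample:

def old_min_occurrence(lines_kept, lines_count, gsample):
    return not gsample or int(lines_kept / lines_count)

def new_min_occurrence(lines_kept, lines_count, gsample):
    return int(lines_kept / lines_count) or int(not gsample)

# Oversampling 3x with no gsample configured: the old expression
# short-circuits to True and loses the ratio; the new one keeps it.
print(old_min_occurrence(300, 100, None))  # True (i.e. 1)
print(new_min_occurrence(300, 100, None))  # 3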
test/test_operators.py (15 changes: 14 additions & 1 deletion)
@@ -153,7 +153,6 @@ def test_tokenization_with_non_iso_639_lang():
"hello.",
["h ello.", "he llo.", "hel lo.", "hell o."],
),
(dict(insert_space_prob=1, drop_space_prob=1), True, "hello.", ["hello."]),
(dict(substitute_char_prob=1), True, "pp", ["oo", "ol", "lo", "ll"]),
(
dict(drop_space_prob=1, add_marker=True),
@@ -172,6 +171,20 @@ def test_tokenization_with_non_iso_639_lang():
"⦅mrk_noisy⦆ hell o.",
],
),
(dict(duplicate_word_prob=1), True, "hello.", ["hello hello.."]),
(dict(swap_word_prob=1), True, "hello.", [". hello"]),
(
dict(
substitute_word={
"prob": 1,
"word_embedding_file": "/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz",
"nearest_neighbors_num": 5,
}
),
True,
"hello.",
["translator.", "dichotomy.", "violin.", "clarinetist.", "luce."],
),
],
)
def test_noise(config, training, text, expected):
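
The expected output of the new swap case (". hello" for input "hello.") follows from the list.insert(-1, token) call in apply_noise: inserting at index -1 places the current token just before the last one already emitted, which swaps the final two tokens. A standalone illustration:

new_tokens = ["hello"]
# insert(-1, x) puts x before the last element, not at the end:
new_tokens.insert(-1, ".")
print(new_tokens)  # ['.', 'hello'] -> detokenizes to ". hello"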