From a238cde060ec8b4bbc48884c22ceeeeda3042086 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Fri, 26 Nov 2021 14:45:00 +0100 Subject: [PATCH 1/8] First draft for word noise --- nmtwizard/preprocess/operators/noise.py | 32 +++++++++++++++++++++++++ test/test_operators.py | 3 +++ 2 files changed, 35 insertions(+) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index 34052028..e3575b1f 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -1,8 +1,10 @@ import random import copy +import os from nmtwizard.preprocess import prepoperator from nmtwizard.preprocess.tu import TokReplace +import fasttext @prepoperator.register_operator("noise") @@ -14,6 +16,10 @@ def _config_schema(cls): noise_block = { "lang": {"type": "string"}, "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "substitute_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "word_embedding_file": {"type": "string"}, "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, @@ -44,6 +50,19 @@ def __init__(self, config, *args): if source_config: config = source_config self._drop_word_prob = config.get("drop_word_prob", 0) + self._duplicate_word_prob = config.get("duplicate_word_prob", 0) + self._swap_word_prob = config.get("swap_word_prob", 0) + self._substitute_word_prob = config.get("substitute_word_prob", 0) + # TODO: SharedState builder ? + self._word_embedding_file = config.get("word_embedding_file") + self._word_embedding_model = None + if self._word_embedding_file is not None: + if not os.path.isfile(self._word_embedding_file): + raise ValueError( + "Word embedding file doesn't exist: %s" + % (self._word_embedding_file) + ) + self._word_embedding_model = fasttext.load_model(self._word_embedding_file) self._drop_space_prob = config.get("drop_space_prob", 0) self._insert_space_prob = config.get("insert_space_prob", 0) self._drop_char_prob = config.get("drop_char_prob", 0) @@ -94,6 +113,19 @@ def _apply_word_noise(self, tokens): if not token.is_placeholder(): if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob: continue + # TODO : joiners + elif self._duplicate_word_prob > 0 and random.random() <= self._duplicate_word_prob: + new_tokens.extend([token, token]) + continue + elif len(new_tokens) > 0 and self._swap_word_prob > 0 and random.random() <= self._swap_word_prob: + new_tokens.insert(-1, token) + continue + elif self._word_embedding_model is not None and self._substitute_word_prob > 0 and random.random() <= self._substitute_word_prob and all(c.isalpha() for c in token.surface): + nearest_neighbors = self._word_embedding_model.get_nearest_neighbors(token.surface, k=5) # TODO : define k as an option + nearest_neighbors = [nn[1] for nn in nearest_neighbors if all(c.isalpha() for c in nn[1])] + token.surface = random.choice(nearest_neighbors) + new_tokens.append(token) + continue elif ( self._drop_space_prob > 0 and random.random() <= self._drop_space_prob diff --git a/test/test_operators.py b/test/test_operators.py index 65366e00..43ae6846 100644 --- a/test/test_operators.py +++ b/test/test_operators.py @@ -172,6 +172,9 @@ def test_tokenization_with_non_iso_639_lang(): "⦅mrk_noisy⦆ hell o.", ], ), + 
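The nearest-neighbor substitution introduced above boils down to the following standalone sketch. This is illustrative only: "embeddings.bin" is a placeholder path, and it assumes the real fastText Python binding, whose get_nearest_neighbors() returns (score, word) pairs. The empty-list fallback anticipates the guard added in the follow-up commit. The test cases added just below exercise the same path end to end.

    import random
    import fasttext

    model = fasttext.load_model("embeddings.bin")  # placeholder path

    def substitute(word, k=5):
        # Keep only purely alphabetic neighbors, mirroring the operator's isalpha() filter.
        neighbors = [nn for _, nn in model.get_nearest_neighbors(word, k=k) if nn.isalpha()]
        return random.choice(neighbors) if neighbors else word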
(dict(duplicate_word_prob=1), True, "hello.", ["hello hello.."]), + (dict(swap_word_prob=1), True, "hello.", [". hello"]), + (dict(substitute_word_prob=1, word_embedding_file="/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz"), True, "hello.", ['translator.', 'dichotomy.', 'violin.', 'clarinetist.', 'luce.']), ], ) def test_noise(config, training, text, expected): From 507fe0c36a82aa84f58925f7cb6c6851cdecbad2 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Mon, 29 Nov 2021 10:47:49 +0100 Subject: [PATCH 2/8] Improve noise configuration --- nmtwizard/preprocess/operators/noise.py | 45 ++++++++++++++++--------- test/test_operators.py | 2 +- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index e3575b1f..966109d7 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -18,8 +18,15 @@ def _config_schema(cls): "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "substitute_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "word_embedding_file": {"type": "string"}, + "substitute_word": { + "properties" : { + "prob": {"type": "number", "minimum": 0, "maximum": 1}, + "word_embedding_file": {"type": "string"}, + "nearest_neighbors_num": {"type": "integer"} + }, + "type": "object", + "additionalProperties": False + }, "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, @@ -52,17 +59,22 @@ def __init__(self, config, *args): self._drop_word_prob = config.get("drop_word_prob", 0) self._duplicate_word_prob = config.get("duplicate_word_prob", 0) self._swap_word_prob = config.get("swap_word_prob", 0) - self._substitute_word_prob = config.get("substitute_word_prob", 0) - # TODO: SharedState builder ? - self._word_embedding_file = config.get("word_embedding_file") - self._word_embedding_model = None - if self._word_embedding_file is not None: - if not os.path.isfile(self._word_embedding_file): - raise ValueError( - "Word embedding file doesn't exist: %s" - % (self._word_embedding_file) - ) - self._word_embedding_model = fasttext.load_model(self._word_embedding_file) + substitute_word_config = config.get("substitute_word", None) + self._substitute_word_prob = 0 + if substitute_word_config: + self._substitute_word_prob = substitute_word_config.get("prob", 0) + if self._substitute_word_prob: + # TODO: SharedState builder ? 
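With this change the substitution options are grouped under a single substitute_word block instead of the flat substitute_word_prob and word_embedding_file keys. A configuration block for this operator would now look roughly as follows; all values and the embedding path are illustrative:

    noise_config = {
        "drop_word_prob": 0.05,
        "substitute_word": {
            "prob": 0.1,
            "word_embedding_file": "resources/embeddings/model.ftz",  # placeholder
            "nearest_neighbors_num": 5,
        },
    }

Nesting the options this way also gives later commits a single key to test when deciding whether a shared embedding model needs to be built at all.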
+ word_embedding_file = substitute_word_config.get("word_embedding_file") + self._word_embedding_model = None + if word_embedding_file is not None: + if not os.path.isfile(word_embedding_file): + raise ValueError( + "Word embedding file doesn't exist: %s" + % (word_embedding_file) + ) + self._word_embedding_model = fasttext.load_model(word_embedding_file) + self._nn = substitute_word_config.get("nearest_neighbors_num") self._drop_space_prob = config.get("drop_space_prob", 0) self._insert_space_prob = config.get("insert_space_prob", 0) self._drop_char_prob = config.get("drop_char_prob", 0) @@ -120,10 +132,11 @@ def _apply_word_noise(self, tokens): elif len(new_tokens) > 0 and self._swap_word_prob > 0 and random.random() <= self._swap_word_prob: new_tokens.insert(-1, token) continue - elif self._word_embedding_model is not None and self._substitute_word_prob > 0 and random.random() <= self._substitute_word_prob and all(c.isalpha() for c in token.surface): - nearest_neighbors = self._word_embedding_model.get_nearest_neighbors(token.surface, k=5) # TODO : define k as an option + elif self._substitute_word_prob > 0 and self._word_embedding_model is not None and random.random() <= self._substitute_word_prob and all(c.isalpha() for c in token.surface): + nearest_neighbors = self._word_embedding_model.get_nearest_neighbors(token.surface, k=self._nn) nearest_neighbors = [nn[1] for nn in nearest_neighbors if all(c.isalpha() for c in nn[1])] - token.surface = random.choice(nearest_neighbors) + if nearest_neighbors: + token.surface = random.choice(nearest_neighbors) new_tokens.append(token) continue elif ( diff --git a/test/test_operators.py b/test/test_operators.py index 43ae6846..846820d1 100644 --- a/test/test_operators.py +++ b/test/test_operators.py @@ -174,7 +174,7 @@ def test_tokenization_with_non_iso_639_lang(): ), (dict(duplicate_word_prob=1), True, "hello.", ["hello hello.."]), (dict(swap_word_prob=1), True, "hello.", [". 
hello"]), - (dict(substitute_word_prob=1, word_embedding_file="/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz"), True, "hello.", ['translator.', 'dichotomy.', 'violin.', 'clarinetist.', 'luce.']), + (dict(substitute_word={"prob" : 1, "word_embedding_file" : "/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz", "nearest_neighbors_num":5}), True, "hello.", ['translator.', 'dichotomy.', 'violin.', 'clarinetist.', 'luce.']), ], ) def test_noise(config, training, text, expected): From 0098ab2b7470c65f8f829f8c5df2cd27670791c3 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Mon, 29 Nov 2021 10:57:33 +0100 Subject: [PATCH 3/8] Apply black --- nmtwizard/preprocess/operators/noise.py | 42 +++++++++++++++++++------ test/test_operators.py | 13 +++++++- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index 966109d7..5bc45ea2 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -19,13 +19,13 @@ def _config_schema(cls): "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, "substitute_word": { - "properties" : { + "properties": { "prob": {"type": "number", "minimum": 0, "maximum": 1}, "word_embedding_file": {"type": "string"}, - "nearest_neighbors_num": {"type": "integer"} + "nearest_neighbors_num": {"type": "integer"}, }, "type": "object", - "additionalProperties": False + "additionalProperties": False, }, "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, @@ -73,8 +73,10 @@ def __init__(self, config, *args): "Word embedding file doesn't exist: %s" % (word_embedding_file) ) - self._word_embedding_model = fasttext.load_model(word_embedding_file) - self._nn = substitute_word_config.get("nearest_neighbors_num") + self._word_embedding_model = fasttext.load_model( + word_embedding_file + ) + self._nn = substitute_word_config.get("nearest_neighbors_num") self._drop_space_prob = config.get("drop_space_prob", 0) self._insert_space_prob = config.get("insert_space_prob", 0) self._drop_char_prob = config.get("drop_char_prob", 0) @@ -126,15 +128,35 @@ def _apply_word_noise(self, tokens): if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob: continue # TODO : joiners - elif self._duplicate_word_prob > 0 and random.random() <= self._duplicate_word_prob: + elif ( + self._duplicate_word_prob > 0 + and random.random() <= self._duplicate_word_prob + ): new_tokens.extend([token, token]) continue - elif len(new_tokens) > 0 and self._swap_word_prob > 0 and random.random() <= self._swap_word_prob: + elif ( + len(new_tokens) > 0 + and self._swap_word_prob > 0 + and random.random() <= self._swap_word_prob + ): new_tokens.insert(-1, token) continue - elif self._substitute_word_prob > 0 and self._word_embedding_model is not None and random.random() <= self._substitute_word_prob and all(c.isalpha() for c in token.surface): - nearest_neighbors = self._word_embedding_model.get_nearest_neighbors(token.surface, k=self._nn) - nearest_neighbors = [nn[1] for nn in nearest_neighbors if all(c.isalpha() for c in nn[1])] + elif ( + self._substitute_word_prob > 0 + and self._word_embedding_model is not None + and random.random() <= self._substitute_word_prob + and all(c.isalpha() for c in 
token.surface) + ): + nearest_neighbors = ( + self._word_embedding_model.get_nearest_neighbors( + token.surface, k=self._nn + ) + ) + nearest_neighbors = [ + nn[1] + for nn in nearest_neighbors + if all(c.isalpha() for c in nn[1]) + ] if nearest_neighbors: token.surface = random.choice(nearest_neighbors) new_tokens.append(token) diff --git a/test/test_operators.py b/test/test_operators.py index 846820d1..21adacd1 100644 --- a/test/test_operators.py +++ b/test/test_operators.py @@ -174,7 +174,18 @@ def test_tokenization_with_non_iso_639_lang(): ), (dict(duplicate_word_prob=1), True, "hello.", ["hello hello.."]), (dict(swap_word_prob=1), True, "hello.", [". hello"]), - (dict(substitute_word={"prob" : 1, "word_embedding_file" : "/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz", "nearest_neighbors_num":5}), True, "hello.", ['translator.', 'dichotomy.', 'violin.', 'clarinetist.', 'luce.']), + ( + dict( + substitute_word={ + "prob": 1, + "word_embedding_file": "/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz", + "nearest_neighbors_num": 5, + } + ), + True, + "hello.", + ["translator.", "dichotomy.", "violin.", "clarinetist.", "luce."], + ), ], ) def test_noise(config, training, text, expected): From 922941074f651a8f8a8e3d163ebe42ff3c569e34 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Mon, 29 Nov 2021 15:49:20 +0100 Subject: [PATCH 4/8] Fix oversampling with gsample 0 --- nmtwizard/preprocess/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmtwizard/preprocess/sampler.py b/nmtwizard/preprocess/sampler.py index 5228b23b..ef0dac11 100644 --- a/nmtwizard/preprocess/sampler.py +++ b/nmtwizard/preprocess/sampler.py @@ -256,7 +256,7 @@ def _select_lines(f): # 1 if full sample (lines_kept == lines_count or no gsample) # >1 if oversampling (lines_kept > lines_count) # 0 if undersampling (lines_kept < lines_count) - min_occurrence = not gsample or int(f.lines_kept / f.lines_count) + min_occurrence = int(f.lines_kept / f.lines_count) or int(not gsample) if min_occurrence: random_sample = {i: min_occurrence for i in range(f.lines_count)} From 947856b2c3f6379fe70787239541bcedef840f66 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Tue, 30 Nov 2021 11:12:35 +0100 Subject: [PATCH 5/8] Add shared state --- nmtwizard/preprocess/operators/noise.py | 49 ++++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index 5bc45ea2..085b9ecb 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -52,7 +52,28 @@ def _config_schema(cls): def is_applied_for(process_type): return process_type == prepoperator.ProcessType.TRAINING - def __init__(self, config, *args): + def get_shared_classes(): + return [fasttext.load_model] + + @staticmethod + def get_shared_builders(config, process_type): + word_embedding_file = config.get("substitute_word", {}).get("word_embedding_file") + if word_embedding_file is None: + return None + if not os.path.isfile(word_embedding_file): + raise ValueError( + "Word embedding file doesn't exist: %s" + % (word_embedding_file) + ) + return { + "word_embedding_model": ( + fasttext.load_model, + (word_embedding_file,) + ) + } + + + def __init__(self, config, process_type, build_state, shared_state=None): source_config = config.get("source") if source_config: config = source_config @@ -64,19 
+85,21 @@ def __init__(self, config, *args): if substitute_word_config: self._substitute_word_prob = substitute_word_config.get("prob", 0) if self._substitute_word_prob: - # TODO: SharedState builder ? - word_embedding_file = substitute_word_config.get("word_embedding_file") - self._word_embedding_model = None - if word_embedding_file is not None: - if not os.path.isfile(word_embedding_file): - raise ValueError( - "Word embedding file doesn't exist: %s" - % (word_embedding_file) + self._word_embedding_model = shared_state.get("word_embedding_model") if shared_state else None + if not shared_state: + # TODO: batched processing + word_embedding_file = substitute_word_config.get("word_embedding_file") + self._word_embedding_model = None + if word_embedding_file is not None: + if not os.path.isfile(word_embedding_file): + raise ValueError( + "Word embedding file doesn't exist: %s" + % (word_embedding_file) + ) + self._word_embedding_model = fasttext.load_model( + word_embedding_file ) - self._word_embedding_model = fasttext.load_model( - word_embedding_file - ) - self._nn = substitute_word_config.get("nearest_neighbors_num") + self._nn = substitute_word_config.get("nearest_neighbors_num") self._drop_space_prob = config.get("drop_space_prob", 0) self._insert_space_prob = config.get("insert_space_prob", 0) self._drop_char_prob = config.get("drop_char_prob", 0) From e55c11d12024e871fef6c4e1501dbb74cf8d7063 Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Tue, 30 Nov 2021 14:18:30 +0100 Subject: [PATCH 6/8] Create Noiser object --- nmtwizard/preprocess/operators/noise.py | 232 +++++++++++------------- test/test_operators.py | 1 - 2 files changed, 109 insertions(+), 124 deletions(-) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index 085b9ecb..1ef2a9ef 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -2,81 +2,16 @@ import copy import os +import pyonmttok + from nmtwizard.preprocess import prepoperator from nmtwizard.preprocess.tu import TokReplace import fasttext -@prepoperator.register_operator("noise") -class Noise(prepoperator.TUOperator): - @classmethod - def _config_schema(cls): - schema = super(Noise, cls)._config_schema() - - noise_block = { - "lang": {"type": "string"}, - "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "substitute_word": { - "properties": { - "prob": {"type": "number", "minimum": 0, "maximum": 1}, - "word_embedding_file": {"type": "string"}, - "nearest_neighbors_num": {"type": "integer"}, - }, - "type": "object", - "additionalProperties": False, - }, - "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, - "add_marker": {"type": "boolean"}, - } - schema["properties"].update( - { - "source": { - "type": "object", - "properties": noise_block, - "additionalProperties": False, - } - } - ) - schema["properties"].update(noise_block) - - return schema +class Noiser: - @staticmethod - def is_applied_for(process_type): - return process_type == 
prepoperator.ProcessType.TRAINING - - def get_shared_classes(): - return [fasttext.load_model] - - @staticmethod - def get_shared_builders(config, process_type): - word_embedding_file = config.get("substitute_word", {}).get("word_embedding_file") - if word_embedding_file is None: - return None - if not os.path.isfile(word_embedding_file): - raise ValueError( - "Word embedding file doesn't exist: %s" - % (word_embedding_file) - ) - return { - "word_embedding_model": ( - fasttext.load_model, - (word_embedding_file,) - ) - } - - - def __init__(self, config, process_type, build_state, shared_state=None): - source_config = config.get("source") - if source_config: - config = source_config + def __init__(self, config): self._drop_word_prob = config.get("drop_word_prob", 0) self._duplicate_word_prob = config.get("duplicate_word_prob", 0) self._swap_word_prob = config.get("swap_word_prob", 0) @@ -85,66 +20,27 @@ def __init__(self, config, process_type, build_state, shared_state=None): if substitute_word_config: self._substitute_word_prob = substitute_word_config.get("prob", 0) if self._substitute_word_prob: - self._word_embedding_model = shared_state.get("word_embedding_model") if shared_state else None - if not shared_state: - # TODO: batched processing - word_embedding_file = substitute_word_config.get("word_embedding_file") - self._word_embedding_model = None - if word_embedding_file is not None: - if not os.path.isfile(word_embedding_file): - raise ValueError( - "Word embedding file doesn't exist: %s" - % (word_embedding_file) - ) - self._word_embedding_model = fasttext.load_model( - word_embedding_file + # TODO: batched processing + word_embedding_file = substitute_word_config.get("word_embedding_file") + self._word_embedding_model = None + if word_embedding_file is not None: + if not os.path.isfile(word_embedding_file): + raise ValueError( + "Word embedding file doesn't exist: %s" + % (word_embedding_file) ) - self._nn = substitute_word_config.get("nearest_neighbors_num") + self._word_embedding_model = fasttext.load_model( + word_embedding_file + ) + self._nn = substitute_word_config.get("nearest_neighbors_num") self._drop_space_prob = config.get("drop_space_prob", 0) self._insert_space_prob = config.get("insert_space_prob", 0) self._drop_char_prob = config.get("drop_char_prob", 0) self._duplicate_char_prob = config.get("duplicate_char_prob", 0) self._swap_char_prob = config.get("swap_char_prob", 0) self._substitute_char_prob = config.get("substitute_char_prob", 0) - self._add_marker = config.get("add_marker", 0) - - def _preprocess_tu(self, tu, *args): - original_tokens = copy.deepcopy(tu.src_tok.token_objects) - tu = self._apply_space_insertion_noise(tu) - src_tok = tu.src_tok - tokens = src_tok.token_objects - new_tokens = [self._apply_word_noise(tokens[0])] - tu.src_tok = (src_tok.tokenizer, new_tokens) - if self._add_marker and new_tokens != original_tokens: - tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"])) - return [tu] - - def _apply_space_insertion_noise(self, tu): - src_tok = tu.src_tok - tokens = src_tok.token_objects[0] - added_spaces = 0 - for pos, token in enumerate(tokens): - if not token.is_placeholder(): - if ( - self._insert_space_prob > 0 - and random.random() <= self._insert_space_prob - and len(token) > 1 - ): - new_space_index = random.randint(1, len(token) - 1) - first_part_surface = token.surface[0:new_space_index] - second_part_surface = token.surface[new_space_index:] - tu.replace_tokens_side( - "source", - ( - pos + added_spaces, - 1, - [first_part_surface, 
second_part_surface], - ), - ) - added_spaces += 1 - return tu - def _apply_word_noise(self, tokens): + def apply_noise(self, tokens): new_tokens = [] for token in tokens: if not token.is_placeholder(): @@ -184,6 +80,19 @@ def _apply_word_noise(self, tokens): token.surface = random.choice(nearest_neighbors) new_tokens.append(token) continue + elif ( + self._insert_space_prob > 0 + and random.random() <= self._insert_space_prob + and len(token) > 1 + ): + new_space_index = random.randint(1, len(token) - 1) + first_part_surface = token.surface[0:new_space_index] + second_part_surface = token.surface[new_space_index:] + token.surface = first_part_surface + second_part_token = pyonmttok.Token(token) + second_part_token.surface = second_part_surface + new_tokens.extend([token, second_part_token]) + continue elif ( self._drop_space_prob > 0 and random.random() <= self._drop_space_prob @@ -196,7 +105,7 @@ def _apply_word_noise(self, tokens): or self._swap_char_prob > 0 or self._substitute_char_prob > 0 ): - token.surface = self._apply_character_noise(token.surface) + token.surface = self.apply_character_noise(token.surface) if len(token.surface) != 0: # Delete token if empty. new_tokens.append(token) return new_tokens @@ -215,7 +124,7 @@ def get_neighbor_keys_on_qwerty(key): if len(line) > index + i and line[index + i] != key and index + i >= 0 ] - def _apply_character_noise(self, cur_surface): + def apply_character_noise(self, cur_surface): new_surface = "" i = 0 while i < len(cur_surface): @@ -245,3 +154,80 @@ def _apply_character_noise(self, cur_surface): new_surface += cur_surface[i] i += 1 return new_surface + + +@prepoperator.register_operator("noise") +class Noise(prepoperator.TUOperator): + @classmethod + def _config_schema(cls): + schema = super(Noise, cls)._config_schema() + + noise_block = { + "lang": {"type": "string"}, + "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "substitute_word": { + "properties": { + "prob": {"type": "number", "minimum": 0, "maximum": 1}, + "word_embedding_file": {"type": "string"}, + "nearest_neighbors_num": {"type": "integer"}, + }, + "type": "object", + "additionalProperties": False, + }, + "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1}, + "add_marker": {"type": "boolean"}, + } + schema["properties"].update( + { + "source": { + "type": "object", + "properties": noise_block, + "additionalProperties": False, + } + } + ) + schema["properties"].update(noise_block) + + return schema + + @staticmethod + def is_applied_for(process_type): + return process_type == prepoperator.ProcessType.TRAINING + + def get_shared_classes(): + return [Noiser] + + @staticmethod + def get_shared_builders(config, process_type): + return { + "noiser": ( + Noiser, + (config,) + ) + } + + + def __init__(self, config, process_type, build_state, shared_state=None): + source_config = config.get("source") + if source_config: + config = source_config + self._noiser = shared_state.get("noiser") if shared_state else Noiser(config) + self._add_marker = 
config.get("add_marker", 0) + + def _preprocess_tu(self, tu, *args): + original_tokens = copy.deepcopy(tu.src_tok.token_objects) + src_tok = tu.src_tok + tokens = src_tok.token_objects[0] + new_tokens = [self._noiser.apply_noise(tokens)] + tu.src_tok = (src_tok.tokenizer, new_tokens) + if self._add_marker and new_tokens != original_tokens: + tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"])) + return [tu] + diff --git a/test/test_operators.py b/test/test_operators.py index 21adacd1..676f07b9 100644 --- a/test/test_operators.py +++ b/test/test_operators.py @@ -153,7 +153,6 @@ def test_tokenization_with_non_iso_639_lang(): "hello.", ["h ello.", "he llo.", "hel lo.", "hell o."], ), - (dict(insert_space_prob=1, drop_space_prob=1), True, "hello.", ["hello."]), (dict(substitute_char_prob=1), True, "pp", ["oo", "ol", "lo", "ll"]), ( dict(drop_space_prob=1, add_marker=True), From 846fd060e552e8afb5d5234ed6f5911defc5216b Mon Sep 17 00:00:00 2001 From: Natalia Segal Date: Wed, 1 Dec 2021 11:57:43 +0100 Subject: [PATCH 7/8] Batch implementation --- nmtwizard/preprocess/operators/noise.py | 71 ++++++++++++++++++------- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py index 1ef2a9ef..6ad0ced1 100644 --- a/nmtwizard/preprocess/operators/noise.py +++ b/nmtwizard/preprocess/operators/noise.py @@ -1,13 +1,16 @@ import random import copy import os +import logging import pyonmttok +from nmtwizard.logger import get_logger from nmtwizard.preprocess import prepoperator from nmtwizard.preprocess.tu import TokReplace import fasttext +logger = get_logger(__name__) class Noiser: @@ -20,7 +23,6 @@ def __init__(self, config): if substitute_word_config: self._substitute_word_prob = substitute_word_config.get("prob", 0) if self._substitute_word_prob: - # TODO: batched processing word_embedding_file = substitute_word_config.get("word_embedding_file") self._word_embedding_model = None if word_embedding_file is not None: @@ -110,6 +112,12 @@ def apply_noise(self, tokens): new_tokens.append(token) return new_tokens + def apply_noise_batch(self, tokens_batch): + new_tokens_batch = [] + for tokens in tokens_batch: + new_tokens_batch.append(self.apply_noise(tokens)) + return new_tokens_batch + @staticmethod def get_neighbor_keys_on_qwerty(key): lines = "qwertyuiop", "asdfghjkl", "zxcvbnm" @@ -157,7 +165,7 @@ def apply_character_noise(self, cur_surface): @prepoperator.register_operator("noise") -class Noise(prepoperator.TUOperator): +class Noise(prepoperator.Operator): @classmethod def _config_schema(cls): schema = super(Noise, cls)._config_schema() @@ -206,28 +214,55 @@ def get_shared_classes(): @staticmethod def get_shared_builders(config, process_type): - return { - "noiser": ( - Noiser, - (config,) - ) - } + # Only build noiser as shared object for word substitution with embeddings + word_emb = config.get("substitute_word", {}).get("word_embedding_file") + if word_emb: + return { + "noiser": ( + Noiser, + (config,) + ) + } + else: + return None def __init__(self, config, process_type, build_state, shared_state=None): source_config = config.get("source") if source_config: config = source_config - self._noiser = shared_state.get("noiser") if shared_state else Noiser(config) + self._noiser = shared_state.get("noiser") if shared_state else None + if not self._noiser: + self._noiser = Noiser(config) self._add_marker = config.get("add_marker", 0) - def _preprocess_tu(self, tu, *args): - original_tokens = 
copy.deepcopy(tu.src_tok.token_objects)
-        src_tok = tu.src_tok
-        tokens = src_tok.token_objects[0]
-        new_tokens = [self._noiser.apply_noise(tokens)]
-        tu.src_tok = (src_tok.tokenizer, new_tokens)
-        if self._add_marker and new_tokens != original_tokens:
-            tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
-        return [tu]
+    def _preprocess(self, tu_batch):
+        tu_list, meta_batch = tu_batch
+
+        src_tokens = []
+        src_detok = []
+        for tu in tu_list:
+            src_tok = tu.src_tok
+            src_tokens.append(src_tok.token_objects[0])
+            src_detok.append(tu.src_detok)
+
+        src_tokens_noisy = self._noiser.apply_noise_batch(src_tokens)
+
+        for detok, tok_noisy, tu in zip(src_detok, src_tokens_noisy, tu_list):
+            src_tok = tu.src_tok
+            tu.src_tok = (src_tok.tokenizer, [tok_noisy])
+            new_detok = tu.src_detok
+            if detok != new_detok:
+                if self._add_marker:
+                    tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
+                log_level = logging.INFO if self._verbose else logging.DEBUG
+                if logger.isEnabledFor(log_level):
+                    logger.log(
+                        log_level,
+                        "'%s' operator modifies source in preprocess.\nSRC BEFORE : %s\nSRC AFTER : %s",
+                        self.name,
+                        detok,
+                        new_detok,
+                    )
+
+        return tu_list, meta_batch

From 97b22f746c1e132c3137c74121888fefec2bd36f Mon Sep 17 00:00:00 2001
From: Natalia Segal
Date: Wed, 1 Dec 2021 12:40:23 +0100
Subject: [PATCH 8/8] Apply black

---
 nmtwizard/preprocess/operators/noise.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py
index 6ad0ced1..615d4b2d 100644
--- a/nmtwizard/preprocess/operators/noise.py
+++ b/nmtwizard/preprocess/operators/noise.py
@@ -12,8 +12,8 @@
 logger = get_logger(__name__)
 
-class Noiser:
 
+class Noiser:
     def __init__(self, config):
         self._drop_word_prob = config.get("drop_word_prob", 0)
         self._duplicate_word_prob = config.get("duplicate_word_prob", 0)
         self._swap_word_prob = config.get("swap_word_prob", 0)
@@ -217,16 +217,10 @@ def get_shared_builders(config, process_type):
         # Only build noiser as shared object for word substitution with embeddings
         word_emb = config.get("substitute_word", {}).get("word_embedding_file")
         if word_emb:
-            return {
-                "noiser": (
-                    Noiser,
-                    (config,)
-                )
-            }
+            return {"noiser": (Noiser, (config,))}
         else:
             return None
-
     def __init__(self, config, process_type, build_state, shared_state=None):
@@ -236,7 +230,6 @@ def __init__(self, config, process_type, build_state, shared_state=None):
             self._noiser = Noiser(config)
         self._add_marker = config.get("add_marker", 0)
-
     def _preprocess(self, tu_batch):
         tu_list, meta_batch = tu_batch
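After this series, all noising logic lives in the standalone Noiser class, so it can be driven outside the operator pipeline. A minimal sketch, assuming pyonmttok Token objects as input (the interface the operator itself obtains via as_token_objects=True); the sentences and probabilities are arbitrary:

    import pyonmttok

    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    noiser = Noiser({"drop_word_prob": 0.1, "duplicate_word_prob": 0.05})

    batch = [
        tokenizer.tokenize(line, as_token_objects=True)
        for line in ("Hello world.", "Good morning.")
    ]
    for tokens in noiser.apply_noise_batch(batch):
        print(tokenizer.detokenize(tokens))

Since apply_noise_batch() is a plain loop over apply_noise(), the same Noiser instance, including the shared embedding-backed one produced by get_shared_builders(), serves both batched and per-sentence callers.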