diff --git a/nmtwizard/preprocess/operators/noise.py b/nmtwizard/preprocess/operators/noise.py
index 34052028..615d4b2d 100644
--- a/nmtwizard/preprocess/operators/noise.py
+++ b/nmtwizard/preprocess/operators/noise.py
@@ -1,75 +1,88 @@
 import random
 import copy
+import os
+import logging
+import pyonmttok
+
+from nmtwizard.logger import get_logger
 from nmtwizard.preprocess import prepoperator
 from nmtwizard.preprocess.tu import TokReplace
+import fasttext
 
+logger = get_logger(__name__)
 
-@prepoperator.register_operator("noise")
-class Noise(prepoperator.TUOperator):
-    @classmethod
-    def _config_schema(cls):
-        schema = super(Noise, cls)._config_schema()
-
-        noise_block = {
-            "lang": {"type": "string"},
-            "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
-            "add_marker": {"type": "boolean"},
-        }
-        schema["properties"].update(
-            {
-                "source": {
-                    "type": "object",
-                    "properties": noise_block,
-                    "additionalProperties": False,
-                }
-            }
-        )
-        schema["properties"].update(noise_block)
-        return schema
-
-    @staticmethod
-    def is_applied_for(process_type):
-        return process_type == prepoperator.ProcessType.TRAINING
-
-    def __init__(self, config, *args):
-        source_config = config.get("source")
-        if source_config:
-            config = source_config
+class Noiser:
+    def __init__(self, config):
         self._drop_word_prob = config.get("drop_word_prob", 0)
+        self._duplicate_word_prob = config.get("duplicate_word_prob", 0)
+        self._swap_word_prob = config.get("swap_word_prob", 0)
+        substitute_word_config = config.get("substitute_word", None)
+        self._substitute_word_prob = 0
+        if substitute_word_config:
+            self._substitute_word_prob = substitute_word_config.get("prob", 0)
+        if self._substitute_word_prob:
+            word_embedding_file = substitute_word_config.get("word_embedding_file")
+            self._word_embedding_model = None
+            if word_embedding_file is not None:
+                if not os.path.isfile(word_embedding_file):
+                    raise ValueError(
+                        "Word embedding file doesn't exist: %s"
+                        % (word_embedding_file)
+                    )
+                self._word_embedding_model = fasttext.load_model(
+                    word_embedding_file
+                )
+            self._nn = substitute_word_config.get("nearest_neighbors_num")
         self._drop_space_prob = config.get("drop_space_prob", 0)
         self._insert_space_prob = config.get("insert_space_prob", 0)
         self._drop_char_prob = config.get("drop_char_prob", 0)
         self._duplicate_char_prob = config.get("duplicate_char_prob", 0)
         self._swap_char_prob = config.get("swap_char_prob", 0)
         self._substitute_char_prob = config.get("substitute_char_prob", 0)
-        self._add_marker = config.get("add_marker", 0)
 
-    def _preprocess_tu(self, tu, *args):
-        original_tokens = copy.deepcopy(tu.src_tok.token_objects)
-        tu = self._apply_space_insertion_noise(tu)
-        src_tok = tu.src_tok
-        tokens = src_tok.token_objects
-        new_tokens = [self._apply_word_noise(tokens[0])]
-        tu.src_tok = (src_tok.tokenizer, new_tokens)
-        if self._add_marker and new_tokens != original_tokens:
-            tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
-        return [tu]
-
-    def _apply_space_insertion_noise(self, tu):
-        src_tok = tu.src_tok
-        tokens = src_tok.token_objects[0]
-        added_spaces = 0
-        for pos, token in enumerate(tokens):
+    def apply_noise(self, tokens):
+        new_tokens = []
+        for token in tokens:
             if not token.is_placeholder():
-                if (
+                if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob:
+                    continue
+                # TODO : joiners
+                elif (
+                    self._duplicate_word_prob > 0
+                    and random.random() <= self._duplicate_word_prob
+                ):
+                    new_tokens.extend([token, token])
+                    continue
+                elif (
+                    len(new_tokens) > 0
+                    and self._swap_word_prob > 0
+                    and random.random() <= self._swap_word_prob
+                ):
+                    new_tokens.insert(-1, token)
+                    continue
+                elif (
+                    self._substitute_word_prob > 0
+                    and self._word_embedding_model is not None
+                    and random.random() <= self._substitute_word_prob
+                    and all(c.isalpha() for c in token.surface)
+                ):
+                    nearest_neighbors = (
+                        self._word_embedding_model.get_nearest_neighbors(
+                            token.surface, k=self._nn
+                        )
+                    )
+                    nearest_neighbors = [
+                        nn[1]
+                        for nn in nearest_neighbors
+                        if all(c.isalpha() for c in nn[1])
+                    ]
+                    if nearest_neighbors:
+                        token.surface = random.choice(nearest_neighbors)
+                    new_tokens.append(token)
+                    continue
+                elif (
                     self._insert_space_prob > 0
                     and random.random() <= self._insert_space_prob
                     and len(token) > 1
@@ -77,22 +90,10 @@ def _apply_space_insertion_noise(self, tu):
                     new_space_index = random.randint(1, len(token) - 1)
                     first_part_surface = token.surface[0:new_space_index]
                     second_part_surface = token.surface[new_space_index:]
-                    tu.replace_tokens_side(
-                        "source",
-                        (
-                            pos + added_spaces,
-                            1,
-                            [first_part_surface, second_part_surface],
-                        ),
-                    )
-                    added_spaces += 1
-        return tu
-
-    def _apply_word_noise(self, tokens):
-        new_tokens = []
-        for token in tokens:
-            if not token.is_placeholder():
-                if self._drop_word_prob > 0 and random.random() <= self._drop_word_prob:
+                    token.surface = first_part_surface
+                    second_part_token = pyonmttok.Token(token)
+                    second_part_token.surface = second_part_surface
+                    new_tokens.extend([token, second_part_token])
                     continue
                 elif (
                     self._drop_space_prob > 0
@@ -106,11 +107,17 @@ def _apply_word_noise(self, tokens):
                     or self._swap_char_prob > 0
                     or self._substitute_char_prob > 0
                 ):
-                    token.surface = self._apply_character_noise(token.surface)
+                    token.surface = self.apply_character_noise(token.surface)
                 if len(token.surface) != 0:  # Delete token if empty.
                     new_tokens.append(token)
         return new_tokens
 
+    def apply_noise_batch(self, tokens_batch):
+        new_tokens_batch = []
+        for tokens in tokens_batch:
+            new_tokens_batch.append(self.apply_noise(tokens))
+        return new_tokens_batch
+
     @staticmethod
     def get_neighbor_keys_on_qwerty(key):
         lines = "qwertyuiop", "asdfghjkl", "zxcvbnm"
@@ -125,7 +132,7 @@ def get_neighbor_keys_on_qwerty(key):
             if len(line) > index + i and line[index + i] != key and index + i >= 0
         ]
 
-    def _apply_character_noise(self, cur_surface):
+    def apply_character_noise(self, cur_surface):
         new_surface = ""
         i = 0
         while i < len(cur_surface):
@@ -155,3 +162,102 @@ def _apply_character_noise(self, cur_surface):
             new_surface += cur_surface[i]
             i += 1
         return new_surface
+
+
+@prepoperator.register_operator("noise")
+class Noise(prepoperator.Operator):
+    @classmethod
+    def _config_schema(cls):
+        schema = super(Noise, cls)._config_schema()
+
+        noise_block = {
+            "lang": {"type": "string"},
+            "drop_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "duplicate_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "swap_word_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "substitute_word": {
+                "properties": {
+                    "prob": {"type": "number", "minimum": 0, "maximum": 1},
+                    "word_embedding_file": {"type": "string"},
+                    "nearest_neighbors_num": {"type": "integer"},
+                },
+                "type": "object",
+                "additionalProperties": False,
+            },
+            "drop_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "insert_space_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "drop_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "duplicate_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "swap_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "substitute_char_prob": {"type": "number", "minimum": 0, "maximum": 1},
+            "add_marker": {"type": "boolean"},
+        }
+        schema["properties"].update(
+            {
+                "source": {
+                    "type": "object",
+                    "properties": noise_block,
+                    "additionalProperties": False,
+                }
+            }
+        )
+        schema["properties"].update(noise_block)
+
+        return schema
+
+    @staticmethod
+    def is_applied_for(process_type):
+        return process_type == prepoperator.ProcessType.TRAINING
+
+    @staticmethod
+    def get_shared_classes():
+        return [Noiser]
+
+    @staticmethod
+    def get_shared_builders(config, process_type):
+        # Only build noiser as shared object for word substitution with embeddings
+        word_emb = config.get("substitute_word", {}).get("word_embedding_file")
+        if word_emb:
+            return {"noiser": (Noiser, (config,))}
+        else:
+            return None
+
+    def __init__(self, config, process_type, build_state, shared_state=None):
+        source_config = config.get("source")
+        if source_config:
+            config = source_config
+        self._noiser = shared_state.get("noiser") if shared_state else None
+        if not self._noiser:
+            self._noiser = Noiser(config)
+        self._add_marker = config.get("add_marker", 0)
+
+    def _preprocess(self, tu_batch):
+        tu_list, meta_batch = tu_batch
+
+        src_tokens = []
+        src_detok = []
+        for tu in tu_list:
+            src_tok = tu.src_tok
+            src_tokens.append(src_tok.token_objects[0])
+            src_detok.append(tu.src_detok)
+
+        src_tokens_noisy = self._noiser.apply_noise_batch(src_tokens)
+
+        for detok, tok_noisy, tu in zip(src_detok, src_tokens_noisy, tu_list):
+            src_tok = tu.src_tok
+            tu.src_tok = (src_tok.tokenizer, [tok_noisy])
+            new_detok = tu.src_detok
+            if detok != new_detok:
+                if self._add_marker:
+                    tu.replace_tokens_side("source", (0, 0, ["⦅mrk_noisy⦆"]))
+                log_level = logging.INFO if self._verbose else logging.DEBUG
+                if logger.isEnabledFor(log_level):
+                    logger.log(
+                        log_level,
+                        "'%s' operator modifies source in preprocess.\nSRC BEFORE : %s\nSRC AFTER : %s",
+                        self.name,
+                        detok,
+                        new_detok,
+                    )
+
+        return tu_list, meta_batch
diff --git a/nmtwizard/preprocess/sampler.py b/nmtwizard/preprocess/sampler.py
index 5228b23b..ef0dac11 100644
--- a/nmtwizard/preprocess/sampler.py
+++ b/nmtwizard/preprocess/sampler.py
@@ -256,7 +256,7 @@ def _select_lines(f):
         # 1 if full sample (lines_kept == lines_count or no gsample)
         # >1 if oversampling (lines_kept > lines_count)
         # 0 if undersampling (lines_kept < lines_count)
-        min_occurrence = not gsample or int(f.lines_kept / f.lines_count)
+        min_occurrence = int(f.lines_kept / f.lines_count) or int(not gsample)
 
         if min_occurrence:
             random_sample = {i: min_occurrence for i in range(f.lines_count)}
diff --git a/test/test_operators.py b/test/test_operators.py
index 65366e00..676f07b9 100644
--- a/test/test_operators.py
+++ b/test/test_operators.py
@@ -153,7 +153,6 @@ def test_tokenization_with_non_iso_639_lang():
             "hello.",
             ["h ello.", "he llo.", "hel lo.", "hell o."],
         ),
-        (dict(insert_space_prob=1, drop_space_prob=1), True, "hello.", ["hello."]),
         (dict(substitute_char_prob=1), True, "pp", ["oo", "ol", "lo", "ll"]),
         (
             dict(drop_space_prob=1, add_marker=True),
@@ -172,6 +171,20 @@ def test_tokenization_with_non_iso_639_lang():
             "⦅mrk_noisy⦆ hell o.",
         ],
     ),
+        (dict(duplicate_word_prob=1), True, "hello.", ["hello hello.."]),
+        (dict(swap_word_prob=1), True, "hello.", [". hello"]),
+        (
+            dict(
+                substitute_word={
+                    "prob": 1,
+                    "word_embedding_file": "/nfs/SSALING-DATA/segal/dev/systran-docker/nmt-wizard-docker/test/corpus/resources/embeddings/dbpedia.ftz",
+                    "nearest_neighbors_num": 5,
+                }
+            ),
+            True,
+            "hello.",
+            ["translator.", "dichotomy.", "violin.", "clarinetist.", "luce."],
+        ),
     ],
 )
 def test_noise(config, training, text, expected):
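
To try the new `Noiser` outside the preprocessing pipeline, the sketch below may help. It is a minimal example, assuming pyonmttok's `Tokenizer`/`Token` API as used in the diff; the probabilities are made up, and the `substitute_word` block is left out so no fastText model is required.

```python
import pyonmttok

from nmtwizard.preprocess.operators.noise import Noiser

# Word-level noise only; all char-level probabilities default to 0.
noiser = Noiser(
    {
        "drop_word_prob": 0.1,
        "duplicate_word_prob": 0.1,
        "swap_word_prob": 0.1,
    }
)

tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
tokens = tokenizer.tokenize("Hello world!", as_token_objects=True)

# apply_noise_batch expects a batch (a list of token lists),
# mirroring how Noise._preprocess feeds it one token list per TU.
noisy_batch = noiser.apply_noise_batch([tokens])
print(tokenizer.detokenize(noisy_batch[0]))
```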
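On the sampler change: the operand order matters when a file is oversampled while no global `gsample` is set. The old expression short-circuited on `not gsample` and returned `True` (i.e. 1), so the factor promised by the in-code comment (">1 if oversampling") was never produced in that case; the new expression computes the factor first and only falls back to 1 when there is no gsample. A quick illustration with made-up counts:

```python
gsample = 0        # no global sample size configured
lines_count = 100  # lines in the file
lines_kept = 300   # a 3x oversampling request

old = not gsample or int(lines_kept / lines_count)       # True -> each line kept once
new = int(lines_kept / lines_count) or int(not gsample)  # 3 -> each line kept 3 times
print(old, new)  # True 3
```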