diff --git a/docs/ext/autoclassmembersdiagram.py b/docs/ext/autoclassmembersdiagram.py index c603202c..cb975f1f 100644 --- a/docs/ext/autoclassmembersdiagram.py +++ b/docs/ext/autoclassmembersdiagram.py @@ -39,6 +39,7 @@ class MagicTraits(object): "__getattr__": "attributes", "__getattribute__": "attributes", "__len__": "len", + "__hash__": "hashable", "__subclasshook__": False, "__repr__": False, "__str__": False, diff --git a/setup.py b/setup.py index e8c5ba16..73a2f602 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ def filter_requirements(line): 'jsonrpcserver>=4.0.1', 'gunicorn>=19.9.0', 'docutils>=0.14', + 'edit_distance>=1.0.4', 'editdistance>=0.5.3', 'Unidecode>=1.1.2', ], diff --git a/src/benchmarkstt/diff/__init__.py b/src/benchmarkstt/diff/__init__.py index 889d4ef5..3da540a5 100644 --- a/src/benchmarkstt/diff/__init__.py +++ b/src/benchmarkstt/diff/__init__.py @@ -2,11 +2,39 @@ Responsible for calculating differences. """ -from abc import ABC, abstractmethod +from abc import ABC, ABCMeta, abstractmethod from benchmarkstt.factory import CoreFactory +from collections import namedtuple -class Differ(ABC): +OpcodeCounts = namedtuple('OpcodeCounts', + ('equal', 'replace', 'insert', 'delete')) + + +def get_opcode_counts(opcodes) -> OpcodeCounts: + counts = OpcodeCounts(0, 0, 0, 0)._asdict() + for tag, alo, ahi, blo, bhi in opcodes: + if tag == 'equal': + counts[tag] += ahi - alo + elif tag == 'insert': + counts[tag] += bhi - blo + elif tag == 'delete': + counts[tag] += ahi - alo + elif tag == 'replace': + ca = ahi - alo + cb = bhi - blo + if ca < cb: + counts['insert'] += cb - ca + counts['replace'] += ca + elif ca > cb: + counts['delete'] += ca - cb + counts['replace'] += cb + else: + counts[tag] += ahi - alo + return OpcodeCounts(counts['equal'], counts['replace'], counts['insert'], counts['delete']) + + +class DifferInterface(ABC): @abstractmethod def __init__(self, a, b): """ @@ -32,5 +60,31 @@ def get_opcodes(self): """ raise NotImplementedError() + @abstractmethod + def get_opcode_counts(self): + raise NotImplementedError() + + @abstractmethod + def get_error_rate(self): + raise NotImplementedError() + + +class Differ(DifferInterface, metaclass=ABCMeta): + """ + Provides pre-made (probably sub-optimal) implementations of + get_opcode_counts() and get_error_rate() + """ + + def get_opcode_counts(self): + return get_opcode_counts(self.get_opcodes()) + + def get_error_rate(self): + counts = self.get_opcode_counts() + + changes = counts.replace + counts.delete + counts.insert + total = counts.equal + counts.replace + counts.delete + + return changes / total + -factory = CoreFactory(Differ, False) +factory = CoreFactory(DifferInterface, False) diff --git a/src/benchmarkstt/diff/core.py b/src/benchmarkstt/diff/core.py index 69efe442..17e98021 100644 --- a/src/benchmarkstt/diff/core.py +++ b/src/benchmarkstt/diff/core.py @@ -4,6 +4,8 @@ from difflib import SequenceMatcher from benchmarkstt.diff import Differ +import edit_distance +import editdistance class RatcliffObershelp(Differ): @@ -12,9 +14,7 @@ class RatcliffObershelp(Differ): From difflib.SequenceMatcher_ (Copyright_ 2001-2020, Python Software Foundation.) - SequenceMatcher is a flexible class for comparing pairs of sequences of - any type, so long as the sequence elements are hashable. The basic - algorithm predates, and is a little fancier than, an algorithm + The basic algorithm predates, and is a little fancier than, an algorithm published in the late 1980's by Ratcliff and Obershelp under the hyperbolic name "gestalt pattern matching". The basic idea is to find the longest contiguous matching subsequence that contains no "junk" @@ -29,11 +29,56 @@ class RatcliffObershelp(Differ): """ def __init__(self, a, b, **kwargs): - if 'autojunk' not in kwargs: - kwargs['autojunk'] = False kwargs['a'] = a kwargs['b'] = b - self.matcher = SequenceMatcher(**kwargs) + self._kwargs = kwargs + self._matcher = SequenceMatcher(**self._kwargs) def get_opcodes(self): - return self.matcher.get_opcodes() + return self._matcher.get_opcodes() + + +class Levenshtein(Differ): + """ + Levenshtein_ distance is the minimum edit distance. + + .. _Levenshtein: https://en.wikipedia.org/wiki/Levenshtein_distance + """ + + def __init__(self, a, b, **kwargs): + kwargs['a'] = a + kwargs['b'] = b + if 'action_function' not in kwargs: + kwargs['action_function'] = edit_distance.highest_match_action + self._kwargs = kwargs + self._matcher = edit_distance.SequenceMatcher(**self._kwargs) + + def get_opcodes(self): + raise NotImplementedError("not supported by %r" % (self,)) + + def get_error_rate(self): + a = self._kwargs['a'] + b = self._kwargs['b'] + len_a = len(a) + if len_a == 0: + return 0 if len(b) == 0 else 1 + return editdistance.eval(a, b) / len_a + + @staticmethod + def simplify_opcodes(opcodes): + new_codes = [] + prev = None + for cur in opcodes: + if prev is None: + prev = cur + elif cur[0] == prev[0]: + prev[2] = cur[2] + prev[4] = cur[4] + else: + new_codes.append(tuple(prev)) + prev = cur + + if prev is not None: + new_codes.append(tuple(prev)) + + return new_codes diff --git a/src/benchmarkstt/metrics/core.py b/src/benchmarkstt/metrics/core.py index 99fcc79f..3d55ce0c 100644 --- a/src/benchmarkstt/metrics/core.py +++ b/src/benchmarkstt/metrics/core.py @@ -1,8 +1,9 @@ from benchmarkstt.schema import Schema import logging -import json -from benchmarkstt.diff import Differ -from benchmarkstt.diff.core import RatcliffObershelp +from collections import namedtuple +from typing import Union +from benchmarkstt.diff import DifferInterface, factory as differ_factory +from benchmarkstt.diff.core import RatcliffObershelp, Levenshtein from benchmarkstt.diff.formatter import format_diff from benchmarkstt.metrics import Metric from collections import namedtuple @@ -13,40 +14,21 @@ OpcodeCounts = namedtuple('OpcodeCounts', ('equal', 'replace', 'insert', 'delete')) +type_schema = Union[Schema, list] +type_differ = DifferInterface + def traversible(schema, key=None): if key is None: key = 'item' - return [word[key] for word in schema] - - -def get_opcode_counts(opcodes) -> OpcodeCounts: - counts = OpcodeCounts(0, 0, 0, 0)._asdict() - for tag, alo, ahi, blo, bhi in opcodes: - if tag == 'equal': - counts[tag] += ahi - alo - elif tag == 'insert': - counts[tag] += bhi - blo - elif tag == 'delete': - counts[tag] += ahi - alo - elif tag == 'replace': - ca = ahi - alo - cb = bhi - blo - if ca < cb: - counts['insert'] += cb - ca - counts['replace'] += ca - elif ca > cb: - counts['delete'] += ca - cb - counts['replace'] += cb - else: - counts[tag] += ahi - alo - return OpcodeCounts(counts['equal'], counts['replace'], counts['insert'], counts['delete']) + return [item if type(item) is str else item[key] for item in schema] -def get_differ(a, b, differ_class: Differ): - if differ_class is None: - # differ_class = HuntMcIlroy +def get_differ(a, b, differ_class: type_differ): + if differ_class is None or differ_class == '': differ_class = RatcliffObershelp + elif type(differ_class) is str: + differ_class = differ_factory[differ_class] return differ_class(traversible(a), traversible(b)) @@ -54,16 +36,17 @@ class WordDiffs(Metric): """ Present differences on a per-word basis + :param differ_class: see :py:mod:`benchmarkstt.diff.core` :param dialect: Presentation format. Default is 'ansi'. + :example differ_class: 'levenshtein' :example dialect: 'html' - :param differ_class: For future use. """ - def __init__(self, dialect=None, differ_class: Differ = None): + def __init__(self, differ_class: type_differ = None, dialect: str = None): self._differ_class = differ_class self._dialect = dialect - def compare(self, ref: Schema, hyp: Schema): + def compare(self, ref: type_schema, hyp: type_schema): differ = get_differ(ref, hyp, differ_class=self._differ_class) a = traversible(ref) b = traversible(hyp) @@ -92,58 +75,44 @@ class WER(Metric): See https://docs.python.org/3/library/difflib.html - [Mode: 'levenshtein'] In the context of WER, Levenshtein - distance is the minimum edit distance computed at the - word level. This implementation uses the Editdistance - c++ implementation by Hiroyuki Tanaka: - https://github.com/aflc/editdistance. See: - https://en.wikipedia.org/wiki/Levenshtein_distance - :param mode: 'strict' (default), 'hunt' or 'levenshtein'. - :param differ_class: For future use. + :param differ_class: see :py:mod:`benchmarkstt.diff.core` """ # WER modes MODE_STRICT = 'strict' MODE_HUNT = 'hunt' - MODE_LEVENSHTEIN = 'levenshtein' DEL_PENALTY = 1 INS_PENALTY = 1 SUB_PENALTY = 1 - def __init__(self, mode=None, differ_class: Differ = None): + def __init__(self, mode=None, differ_class: Union[str, type_differ, None] = None): self._mode = mode - if mode == self.MODE_LEVENSHTEIN: - return if differ_class is None: differ_class = RatcliffObershelp self._differ_class = differ_class + if mode == self.MODE_HUNT: self.DEL_PENALTY = self.INS_PENALTY = .5 - def compare(self, ref: Schema, hyp: Schema) -> float: - if self._mode == self.MODE_LEVENSHTEIN: - ref_list = [i['item'] for i in ref] - hyp_list = [i['item'] for i in hyp] - total_ref = len(ref_list) - if total_ref == 0: - return 0 if len(hyp_list) == 0 else 1 - return editdistance.eval(ref_list, hyp_list) / total_ref - + def compare(self, ref: type_schema, hyp: type_schema) -> float: diffs = get_differ(ref, hyp, differ_class=self._differ_class) - counts = get_opcode_counts(diffs.get_opcodes()) + try: + counts = diffs.get_opcode_counts() - changes = counts.replace * self.SUB_PENALTY + \ - counts.delete * self.DEL_PENALTY + \ - counts.insert * self.INS_PENALTY + changes = counts.replace * self.SUB_PENALTY + \ + counts.delete * self.DEL_PENALTY + \ + counts.insert * self.INS_PENALTY - total = counts.equal + counts.replace + counts.delete - if total == 0: - return 1 if changes else 0 - return changes / total + total = counts.equal + counts.replace + counts.delete + if total == 0: + return 1 if changes else 0 + return changes / total + except NotImplementedError: + return diffs.get_error_rate() class CER(Metric): @@ -173,40 +142,36 @@ class CER(Metric): will first be split into words, ['aa','bb','cc'], and then merged into a final string for evaluation: 'aabbcc'. - :param mode: 'levenshtein' (default). - :param differ_class: For future use. + :param differ_class: see :py:mod:`benchmarkstt.diff.core` """ - # CER modes - MODE_LEVENSHTEIN = 'levenshtein' + def __init__(self, differ_class: Union[str, type_differ, None] = None): + self._differ_class = Levenshtein if differ_class is None else differ_class - def __init__(self, mode=None, differ_class=None): - self._mode = mode - - def compare(self, ref: Schema, hyp: Schema): - ref_str = ''.join([i['item'] for i in ref]) - hyp_str = ''.join([i['item'] for i in hyp]) - total_ref = len(ref_str) + def compare(self, ref: type_schema, hyp: type_schema): + ref_str = ''.join(traversible(ref)) + hyp_str = ''.join(traversible(hyp)) - if total_ref == 0: + if len(ref_str) == 0: return 0 if len(hyp_str) == 0 else 1 - return editdistance.eval(ref_str, hyp_str) / total_ref + diffs = get_differ(ref_str, hyp_str, differ_class=self._differ_class) + return diffs.get_error_rate() class DiffCounts(Metric): """ Get the amount of differences between reference and hypothesis + + :param differ_class: see :py:mod:`benchmarkstt.diff.core` """ - def __init__(self, differ_class: Differ = None): - if differ_class is None: - differ_class = RatcliffObershelp + def __init__(self, differ_class: Union[str, type_differ, None] = None): self._differ_class = differ_class - def compare(self, ref: Schema, hyp: Schema) -> OpcodeCounts: + def compare(self, ref: type_schema, hyp: type_schema) -> OpcodeCounts: diffs = get_differ(ref, hyp, differ_class=self._differ_class) - return get_opcode_counts(diffs.get_opcodes()) + return diffs.get_opcode_counts() class BEER(Metric): diff --git a/src/benchmarkstt/schema.py b/src/benchmarkstt/schema.py index af581b2f..7c67bfca 100644 --- a/src/benchmarkstt/schema.py +++ b/src/benchmarkstt/schema.py @@ -3,7 +3,6 @@ """ import json from collections.abc import Mapping -from typing import Union from collections import defaultdict @@ -51,6 +50,9 @@ def __iter__(self): def __repr__(self): return 'Item(%s)' % (self.json(),) + def __hash__(self): + return hash(self._val['item']) + def json(self, **kwargs): return Schema.dumps(self, **kwargs) diff --git a/tests/benchmarkstt/test_benchmarkstt.py b/tests/benchmarkstt/test_benchmarkstt.py index f7a9d5a1..f7469f16 100644 --- a/tests/benchmarkstt/test_benchmarkstt.py +++ b/tests/benchmarkstt/test_benchmarkstt.py @@ -11,16 +11,6 @@ def _(): return _ -class ToDefer: - def __init__(self, value): - self.value = value - self.cb_count = 0 - - def __repr__(self): - self.cb_count += 1 - return '' % (repr(self.value),) - - def test_deferred_str(): callback = cb('test') deferred = DeferredCallback(callback) diff --git a/tests/benchmarkstt/test_cli.py b/tests/benchmarkstt/test_cli.py index 6d1c4061..46c8bb4f 100644 --- a/tests/benchmarkstt/test_cli.py +++ b/tests/benchmarkstt/test_cli.py @@ -71,6 +71,16 @@ ']}\n]\n' ], ['normalization -i ./resources/test/_data/candide.txt ./resources/test/_data/candide.txt -o /dev/null', 2], + ['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument ' + '--worddiffs ratcliffobershelp --output-format json', + '[\n\t{"title": "worddiffs", "result": [' + '{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, ' + '{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, ' + '{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, ' + '{"type": "equal", "reference": "OF", "hypothesis": "OF"}, ' + '{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}' + ']}\n]\n' + ], ['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument ' '--worddiffs --output-format json', '[\n\t{"title": "worddiffs", "result": [' diff --git a/tests/benchmarkstt/test_diff.py b/tests/benchmarkstt/test_diff.py index 742038c7..4999b07e 100644 --- a/tests/benchmarkstt/test_diff.py +++ b/tests/benchmarkstt/test_diff.py @@ -1,33 +1,77 @@ from benchmarkstt import diff +from benchmarkstt.diff.core import RatcliffObershelp, Levenshtein import pytest -differs = [differ.cls for differ in diff.factory] +differs = [differ.cls for differ in diff.factory if differ.name != 'levenshtein'] differs_decorator = pytest.mark.parametrize('differ', differs) +all_differs = [differ.cls for differ in diff.factory] +all_differs_decorator = pytest.mark.parametrize('differ', all_differs) + + +def clean_opcode(opcode): + kind, alo, ahi, blo, bhi = opcode + if kind == 'delete': # blo and bhi are irrelevant + blo = bhi = None + elif kind == 'insert': + ahi = None + return kind, alo, ahi, blo, bhi + + +def clean_opcodes(opcodes): + return list(map(clean_opcode, opcodes)) + + +# def test_simple_levenshtein_ratcliff_similarity(): +# a = list('012345') +# b = list('023x45') +# assert(clean_opcodes(Levenshtein(a, b).get_opcodes()) == +# clean_opcodes(RatcliffObershelp(a, b).get_opcodes())) + + +@differs_decorator +def test_simple(differ): + sm = differ( + '0123456HIJkopq', + '0123456HIJKlmnopq' + ) + assert(clean_opcodes(sm.get_opcodes()) == + clean_opcodes([('equal', 0, 10, 0, 10), + ('replace', 10, 11, 10, 14), + ('equal', 11, 14, 14, 17)])) + @differs_decorator def test_one_insert(differ): sm = differ('b' * 100, 'a' + 'b' * 100) - assert list(sm.get_opcodes()) == [('insert', 0, 0, 0, 1), - ('equal', 0, 100, 1, 101)] + assert(clean_opcodes(sm.get_opcodes()) == + clean_opcodes([('insert', 0, 0, 0, 1), + ('equal', 0, 100, 1, 101)])) + sm = differ('b' * 100, 'b' * 50 + 'a' + 'b' * 50) - assert list(sm.get_opcodes()) == [('equal', 0, 50, 0, 50), - ('insert', 50, 50, 50, 51), - ('equal', 50, 100, 51, 101)] - ref = "a b c d e f" - hyp = "a b d e kfmod fgdjn idf giudfg diuf dufg idgiudgd" - sm = differ(ref, hyp) - assert list(sm.get_opcodes()) == [('equal', 0, 3, 0, 3), - ('delete', 3, 5, 3, 3), - ('equal', 5, 10, 3, 8), - ('insert', 10, 10, 8, 9), - ('equal', 10, 11, 9, 10), - ('insert', 11, 11, 10, 49)] + assert(clean_opcodes(sm.get_opcodes()) == + clean_opcodes([('equal', 0, 50, 0, 50), + ('insert', 50, 50, 50, 51), + ('equal', 50, 100, 51, 101)])) @differs_decorator def test_one_delete(differ): sm = differ('a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40) - assert list(sm.get_opcodes()) == [('equal', 0, 40, 0, 40), - ('delete', 40, 41, 40, 40), - ('equal', 41, 81, 40, 80)] + assert(clean_opcodes(sm.get_opcodes()) == + clean_opcodes([('equal', 0, 40, 0, 40), + ('delete', 40, 41, 40, 40), + ('equal', 41, 81, 40, 80)])) + + +def test_ratcliffobershelp(): + ref = "a b c d e f" + hyp = "a b d e kfmod fgdjn idf giudfg diuf dufg idgiudgd" + sm = RatcliffObershelp(ref, hyp) + assert(clean_opcodes(sm.get_opcodes()) == + clean_opcodes([('equal', 0, 3, 0, 3), + ('delete', 3, 5, 3, 3), + ('equal', 5, 10, 3, 8), + ('insert', 10, 10, 8, 9), + ('equal', 10, 11, 9, 10), + ('insert', 11, 11, 10, 49)])) diff --git a/tests/benchmarkstt/test_metrics_core.py b/tests/benchmarkstt/test_metrics_core.py index a00ffe45..711c5f9c 100644 --- a/tests/benchmarkstt/test_metrics_core.py +++ b/tests/benchmarkstt/test_metrics_core.py @@ -38,7 +38,7 @@ def test_wer(a, b, exp): assert WER(mode=WER.MODE_STRICT).compare(PlainText(a), PlainText(b)) == wer_strict assert WER(mode=WER.MODE_HUNT).compare(PlainText(a), PlainText(b)) == wer_hunt - assert WER(mode=WER.MODE_LEVENSHTEIN).compare(PlainText(a), PlainText(b)) == wer_levenshtein + assert WER(differ_class='levenshtein').compare(PlainText(a), PlainText(b)) == wer_levenshtein @pytest.mark.parametrize('a,b,entities_list,weights,exp_beer,exp_occ', [ @@ -112,4 +112,4 @@ def test_wa_beer(a, b, entities_list, weights, exp): def test_cer(a, b, exp): cer_levenshtein, = exp - assert CER(mode=CER.MODE_LEVENSHTEIN).compare(PlainText(a), PlainText(b)) == cer_levenshtein + assert CER(differ_class='levenshtein').compare(PlainText(a), PlainText(b)) == cer_levenshtein