Add get_opcode_counts to Differ classes (moved from metrics)

MikeSmithEU · MikeSmithEU · commit 950c08122c75 · 2021-01-23T16:53:07.000+01:00
diff --git a/src/benchmarkstt/diff/__init__.py b/src/benchmarkstt/diff/__init__.py
@@ -2,11 +2,39 @@
 Responsible for calculating differences.
 """
 
-from abc import ABC, abstractmethod
+from abc import ABC, ABCMeta, abstractmethod
 from benchmarkstt.factory import CoreFactory
+from collections import namedtuple
 
 
-class Differ(ABC):
+OpcodeCounts = namedtuple('OpcodeCounts',
+                          ('equal', 'replace', 'insert', 'delete'))
+
+
+def get_opcode_counts(opcodes) -> OpcodeCounts:
+    """
+    Tally how many items are equal, replaced, inserted and deleted.
+
+    A 'replace' with unequal spans counts the overlap as replacements and
+    the surplus as insertions (b longer) or deletions (a longer).
+
+    :param opcodes: iterable of (tag, alo, ahi, blo, bhi) tuples
+    """
+    totals = dict.fromkeys(OpcodeCounts._fields, 0)
+    for tag, alo, ahi, blo, bhi in opcodes:
+        len_a, len_b = ahi - alo, bhi - blo
+        if tag in ('equal', 'delete'):
+            totals[tag] += len_a
+        elif tag == 'insert':
+            totals[tag] += len_b
+        elif tag == 'replace':
+            totals['replace'] += min(len_a, len_b)
+            totals['insert'] += max(len_b - len_a, 0)
+            totals['delete'] += max(len_a - len_b, 0)
+    return OpcodeCounts(**totals)
+
+
+class DifferInterface(ABC):
     @abstractmethod
     def __init__(self, a, b):
         """
@@ -32,5 +60,8 @@ def get_opcodes(self):
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def get_opcode_counts(self):  # NOTE(review): metrics treat the result as OpcodeCounts
+        raise NotImplementedError()
 
-factory = CoreFactory(Differ, False)
+factory = CoreFactory(DifferInterface, False)
diff --git a/src/benchmarkstt/metrics/core.py b/src/benchmarkstt/metrics/core.py
@@ -19,29 +19,6 @@ def traversible(schema, key=None):
     return [word[key] for word in schema]
 
 
-def get_opcode_counts(opcodes) -> OpcodeCounts:
-    counts = OpcodeCounts(0, 0, 0, 0)._asdict()
-    for tag, alo, ahi, blo, bhi in opcodes:
-        if tag == 'equal':
-            counts[tag] += ahi - alo
-        elif tag == 'insert':
-            counts[tag] += bhi - blo
-        elif tag == 'delete':
-            counts[tag] += ahi - alo
-        elif tag == 'replace':
-            ca = ahi - alo
-            cb = bhi - blo
-            if ca < cb:
-                counts['insert'] += cb - ca
-                counts['replace'] += ca
-            elif ca > cb:
-                counts['delete'] += ca - cb
-                counts['replace'] += cb
-            else:
-                counts[tag] += ahi - alo
-    return OpcodeCounts(counts['equal'], counts['replace'], counts['insert'], counts['delete'])
-
-
 def get_differ(a, b, differ_class: Differ):
     if differ_class is None or differ_class == '':
         differ_class = RatcliffObershelp
@@ -125,7 +102,7 @@ def compare(self, ref: Schema, hyp: Schema) -> float:
 
         diffs = get_differ(ref, hyp, differ_class=self._differ_class)
 
-        counts = get_opcode_counts(diffs.get_opcodes())
+        counts = diffs.get_opcode_counts()
 
         changes = counts.replace * self.SUB_PENALTY + \
             counts.delete * self.DEL_PENALTY + \
@@ -199,7 +176,7 @@ def __init__(self, differ_class: Differ = None):
 
     def compare(self, ref: Schema, hyp: Schema) -> OpcodeCounts:
         diffs = get_differ(ref, hyp, differ_class=self._differ_class)
-        return get_opcode_counts(diffs.get_opcodes())
+        return diffs.get_opcode_counts()
 
 
 class BEER(Metric):
diff --git a/tests/benchmarkstt/test_cli.py b/tests/benchmarkstt/test_cli.py
@@ -71,28 +71,26 @@
      ']}\n]\n'
      ],
     ['normalization -i ./resources/test/_data/candide.txt ./resources/test/_data/candide.txt -o /dev/null', 2],
-    [
-        'metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
-        '--worddiffs levenshtein --output-format json',
-        '[\n\t{"title": "worddiffs", "result": ['
-        '{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
-        '{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
-        '{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
-        '{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
-        '{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
-        ']}\n]\n'
-    ],
-    [
-        'metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
-        '--worddiffs --output-format json',
-        '[\n\t{"title": "worddiffs", "result": ['
-        '{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
-        '{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
-        '{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
-        '{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
-        '{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
-        ']}\n]\n'
-    ],
+    ['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
+     '--worddiffs ratcliffobershelp --output-format json',
+     '[\n\t{"title": "worddiffs", "result": ['
+     '{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
+     '{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
+     '{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
+     '{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
+     '{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
+     ']}\n]\n'
+     ],
+    ['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
+     '--worddiffs --output-format json',
+     '[\n\t{"title": "worddiffs", "result": ['
+     '{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
+     '{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
+     '{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
+     '{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
+     '{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
+     ']}\n]\n'
+     ],
     ['metrics -r "HELLO CRUEL WORLD OF MINE" -h "GOODBYE WORLD OF MINE" -rt argument -ht argument '
      '--worddiffs --output-format json',
      '[\n\t{"title": "worddiffs", "result": ['
diff --git a/tests/benchmarkstt/test_diff.py b/tests/benchmarkstt/test_diff.py
@@ -1,22 +1,32 @@
 from benchmarkstt import diff
-from benchmarkstt.diff.core import RatcliffObershelp
+from benchmarkstt.diff.core import RatcliffObershelp, Levenshtein
 import pytest
 
 differs = [differ.cls for differ in diff.factory if differ.name != 'levenshtein']
 differs_decorator = pytest.mark.parametrize('differ', differs)
 
+all_differs = [differ.cls for differ in diff.factory]
+all_differs_decorator = pytest.mark.parametrize('differ', all_differs)
 
-@differs_decorator
-def test_simplest(differ):
-    sm = differ(
-        list('012345'),
-        list('023345')
-    )
-    assert list(sm.get_opcodes()) == [('equal', 0, 1, 0, 1),
-                                      ('delete', 1, 2, 1, 1),
-                                      ('equal', 2, 3, 1, 2),
-                                      ('insert', 3, 3, 2, 3),
-                                      ('equal', 3, 6, 3, 6)]
+
+def clean_opcode(opcode):
+    """Set to None the indices that are irrelevant for the opcode's kind."""
+    tag, a_from, a_to, b_from, b_to = opcode
+    if tag == 'delete':
+        return tag, a_from, a_to, None, None
+    end_of_a = None if tag == 'insert' else a_to
+    return tag, a_from, end_of_a, b_from, b_to
+
+
+def clean_opcodes(opcodes):
+    return [clean_opcode(opcode) for opcode in opcodes]
+
+
+def test_simple_levenshtein_ratcliff_similarity():
+    ref, hyp = list('012345'), list('023x45')
+    # both differs must agree once the irrelevant indices are blanked out
+    levenshtein = clean_opcodes(Levenshtein(ref, hyp).get_opcodes())
+    assert levenshtein == clean_opcodes(RatcliffObershelp(ref, hyp).get_opcodes())
 
 
 @differs_decorator
@@ -25,37 +35,42 @@ def test_simple(differ):
         '0123456HIJkopq',
         '0123456HIJKlmnopq'
     )
-    assert list(sm.get_opcodes()) == [('equal', 0, 10, 0, 10),
-                                      ('replace', 10, 11, 10, 14),
-                                      ('equal', 11, 14, 14, 17)]
+    assert clean_opcodes(sm.get_opcodes()) == \
+           clean_opcodes([('equal', 0, 10, 0, 10),
+                          ('replace', 10, 11, 10, 14),
+                          ('equal', 11, 14, 14, 17)])
 
 
 @differs_decorator
 def test_one_insert(differ):
     sm = differ('b' * 100, 'a' + 'b' * 100)
-    assert list(sm.get_opcodes()) == [('insert', 0, 0, 0, 1),
-                                      ('equal', 0, 100, 1, 101)]
+    assert clean_opcodes(sm.get_opcodes()) == \
+           clean_opcodes([('insert', 0, 0, 0, 1),
+                          ('equal', 0, 100, 1, 101)])
     sm = differ('b' * 100, 'b' * 50 + 'a' + 'b' * 50)
-    assert list(sm.get_opcodes()) == [('equal', 0, 50, 0, 50),
-                                      ('insert', 50, 50, 50, 51),
-                                      ('equal', 50, 100, 51, 101)]
+    assert clean_opcodes(sm.get_opcodes()) == \
+           clean_opcodes([('equal', 0, 50, 0, 50),
+                          ('insert', 50, 50, 50, 51),
+                          ('equal', 50, 100, 51, 101)])
 
 
 @differs_decorator
 def test_one_delete(differ):
     sm = differ('a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
-    assert list(sm.get_opcodes()) == [('equal', 0, 40, 0, 40),
-                                      ('delete', 40, 41, 40, 40),
-                                      ('equal', 41, 81, 40, 80)]
+    assert clean_opcodes(sm.get_opcodes()) == \
+           clean_opcodes([('equal', 0, 40, 0, 40),
+                          ('delete', 40, 41, 40, 40),
+                          ('equal', 41, 81, 40, 80)])
 
 
 def test_ratcliffobershelp():
     ref = "a b c d e f"
     hyp = "a b d e kfmod fgdjn idf giudfg diuf dufg idgiudgd"
     sm = RatcliffObershelp(ref, hyp)
-    assert list(sm.get_opcodes()) == [('equal', 0, 3, 0, 3),
-                                      ('delete', 3, 5, 3, 3),
-                                      ('equal', 5, 10, 3, 8),
-                                      ('insert', 10, 10, 8, 9),
-                                      ('equal', 10, 11, 9, 10),
-                                      ('insert', 11, 11, 10, 49)]
+    assert clean_opcodes(sm.get_opcodes()) == \
+           clean_opcodes([('equal', 0, 3, 0, 3),
+                          ('delete', 3, 5, 3, 3),
+                          ('equal', 5, 10, 3, 8),
+                          ('insert', 10, 10, 8, 9),
+                          ('equal', 10, 11, 9, 10),
+                          ('insert', 11, 11, 10, 49)])