Skip to content

Commit 950c081

Browse files
committed
add get_opcode_counts to Diff classes (from metrics)
1 parent ec23dba commit 950c081

File tree

4 files changed

+100
-79
lines changed

4 files changed

+100
-79
lines changed

src/benchmarkstt/diff/__init__.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,39 @@
22
Responsible for calculating differences.
33
"""
44

5-
from abc import ABC, abstractmethod
5+
from abc import ABC, ABCMeta, abstractmethod
66
from benchmarkstt.factory import CoreFactory
7+
from collections import namedtuple
78

89

9-
class Differ(ABC):
10+
OpcodeCounts = namedtuple('OpcodeCounts',
                          ('equal', 'replace', 'insert', 'delete'))


def get_opcode_counts(opcodes) -> OpcodeCounts:
    """
    Tally the number of equal, replaced, inserted and deleted items.

    A ``replace`` opcode whose two spans differ in length is split: the
    overlapping part counts as replacements, the surplus on the ``b`` side
    counts as insertions and the surplus on the ``a`` side as deletions.

    :param opcodes: iterable of ``(tag, alo, ahi, blo, bhi)`` tuples, where
        ``tag`` is one of ``'equal'``, ``'replace'``, ``'insert'`` or
        ``'delete'``. Opcodes with any other tag are ignored.
    :return: the accumulated totals as an :class:`OpcodeCounts`
    """
    equal = replace = insert = delete = 0
    for tag, alo, ahi, blo, bhi in opcodes:
        len_a = ahi - alo
        len_b = bhi - blo
        if tag == 'equal':
            equal += len_a
        elif tag == 'insert':
            insert += len_b
        elif tag == 'delete':
            delete += len_a
        elif tag == 'replace':
            # The shared length is substituted one-for-one; whatever is
            # left over on either side is a pure insertion or deletion.
            replace += min(len_a, len_b)
            if len_b > len_a:
                insert += len_b - len_a
            elif len_a > len_b:
                delete += len_a - len_b
    return OpcodeCounts(equal, replace, insert, delete)
35+
36+
37+
class DifferInterface(ABC):
1038
@abstractmethod
1139
def __init__(self, a, b):
1240
"""
@@ -32,5 +60,8 @@ def get_opcodes(self):
3260
"""
3361
raise NotImplementedError()
3462

63+
@abstractmethod
64+
def get_opcode_counts(self):
65+
raise NotImplementedError()
3566

36-
factory = CoreFactory(Differ, False)
67+
factory = CoreFactory(DifferInterface, False)

src/benchmarkstt/metrics/core.py

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,29 +19,6 @@ def traversible(schema, key=None):
1919
return [word[key] for word in schema]
2020

2121

22-
def get_opcode_counts(opcodes) -> OpcodeCounts:
23-
counts = OpcodeCounts(0, 0, 0, 0)._asdict()
24-
for tag, alo, ahi, blo, bhi in opcodes:
25-
if tag == 'equal':
26-
counts[tag] += ahi - alo
27-
elif tag == 'insert':
28-
counts[tag] += bhi - blo
29-
elif tag == 'delete':
30-
counts[tag] += ahi - alo
31-
elif tag == 'replace':
32-
ca = ahi - alo
33-
cb = bhi - blo
34-
if ca < cb:
35-
counts['insert'] += cb - ca
36-
counts['replace'] += ca
37-
elif ca > cb:
38-
counts['delete'] += ca - cb
39-
counts['replace'] += cb
40-
else:
41-
counts[tag] += ahi - alo
42-
return OpcodeCounts(counts['equal'], counts['replace'], counts['insert'], counts['delete'])
43-
44-
4522
def get_differ(a, b, differ_class: Differ):
4623
if differ_class is None or differ_class == '':
4724
differ_class = RatcliffObershelp
@@ -125,7 +102,7 @@ def compare(self, ref: Schema, hyp: Schema) -> float:
125102

126103
diffs = get_differ(ref, hyp, differ_class=self._differ_class)
127104

128-
counts = get_opcode_counts(diffs.get_opcodes())
105+
counts = diffs.get_opcode_counts()
129106

130107
changes = counts.replace * self.SUB_PENALTY + \
131108
counts.delete * self.DEL_PENALTY + \
@@ -199,7 +176,7 @@ def __init__(self, differ_class: Differ = None):
199176

200177
def compare(self, ref: Schema, hyp: Schema) -> OpcodeCounts:
201178
diffs = get_differ(ref, hyp, differ_class=self._differ_class)
202-
return get_opcode_counts(diffs.get_opcodes())
179+
return diffs.get_opcode_counts()
203180

204181

205182
class BEER(Metric):

tests/benchmarkstt/test_cli.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -71,28 +71,26 @@
7171
']}\n]\n'
7272
],
7373
['normalization -i ./resources/test/_data/candide.txt ./resources/test/_data/candide.txt -o /dev/null', 2],
74-
[
75-
'metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
76-
'--worddiffs levenshtein --output-format json',
77-
'[\n\t{"title": "worddiffs", "result": ['
78-
'{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
79-
'{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
80-
'{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
81-
'{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
82-
'{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
83-
']}\n]\n'
84-
],
85-
[
86-
'metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
87-
'--worddiffs --output-format json',
88-
'[\n\t{"title": "worddiffs", "result": ['
89-
'{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
90-
'{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
91-
'{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
92-
'{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
93-
'{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
94-
']}\n]\n'
95-
],
74+
['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
75+
'--worddiffs ratcliffobershelp --output-format json',
76+
'[\n\t{"title": "worddiffs", "result": ['
77+
'{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
78+
'{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
79+
'{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
80+
'{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
81+
'{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
82+
']}\n]\n'
83+
],
84+
['metrics -r "HELLO WORLD OF MINE" --hypothesis "GOODBYE CRUEL WORLD OF MINE" -rt argument -ht argument '
85+
'--worddiffs --output-format json',
86+
'[\n\t{"title": "worddiffs", "result": ['
87+
'{"type": "replace", "reference": "HELLO", "hypothesis": "GOODBYE"}, '
88+
'{"type": "insert", "reference": null, "hypothesis": "CRUEL"}, '
89+
'{"type": "equal", "reference": "WORLD", "hypothesis": "WORLD"}, '
90+
'{"type": "equal", "reference": "OF", "hypothesis": "OF"}, '
91+
'{"type": "equal", "reference": "MINE", "hypothesis": "MINE"}'
92+
']}\n]\n'
93+
],
9694
['metrics -r "HELLO CRUEL WORLD OF MINE" -h "GOODBYE WORLD OF MINE" -rt argument -ht argument '
9795
'--worddiffs --output-format json',
9896
'[\n\t{"title": "worddiffs", "result": ['

tests/benchmarkstt/test_diff.py

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,32 @@
11
from benchmarkstt import diff
2-
from benchmarkstt.diff.core import RatcliffObershelp
2+
from benchmarkstt.diff.core import RatcliffObershelp, Levenshtein
33
import pytest
44

55
differs = [differ.cls for differ in diff.factory if differ.name != 'levenshtein']
66
differs_decorator = pytest.mark.parametrize('differ', differs)
77

8+
all_differs = [differ.cls for differ in diff.factory]
9+
all_differs_decorator = pytest.mark.parametrize('differ', all_differs)
810

9-
@differs_decorator
10-
def test_simplest(differ):
11-
sm = differ(
12-
list('012345'),
13-
list('023345')
14-
)
15-
assert list(sm.get_opcodes()) == [('equal', 0, 1, 0, 1),
16-
('delete', 1, 2, 1, 1),
17-
('equal', 2, 3, 1, 2),
18-
('insert', 3, 3, 2, 3),
19-
('equal', 3, 6, 3, 6)]
11+
12+
def clean_opcode(opcode):
    """
    Return the opcode with the indices the tests treat as irrelevant
    replaced by ``None``, so opcodes can be compared on the rest.

    For a ``'delete'`` both ``b`` indices are blanked; for an ``'insert'``
    only the end index on the ``a`` side is blanked. Any other opcode is
    returned with all five fields intact.
    """
    tag, a_start, a_end, b_start, b_end = opcode
    if tag == 'insert':
        return tag, a_start, None, b_start, b_end
    if tag == 'delete':
        return tag, a_start, a_end, None, None
    return tag, a_start, a_end, b_start, b_end
19+
20+
21+
def clean_opcodes(opcodes):
    """Apply :func:`clean_opcode` to every opcode and return them as a list."""
    return [clean_opcode(opcode) for opcode in opcodes]
23+
24+
25+
def test_simple_levenshtein_ratcliff_similarity():
    """Both differs yield the same cleaned opcodes for a short sequence pair."""
    reference = list('012345')
    hypothesis = list('023x45')
    levenshtein_ops = clean_opcodes(
        Levenshtein(reference, hypothesis).get_opcodes())
    ratcliff_ops = clean_opcodes(
        RatcliffObershelp(reference, hypothesis).get_opcodes())
    assert levenshtein_ops == ratcliff_ops
2030

2131

2232
@differs_decorator
@@ -25,37 +35,42 @@ def test_simple(differ):
2535
'0123456HIJkopq',
2636
'0123456HIJKlmnopq'
2737
)
28-
assert list(sm.get_opcodes()) == [('equal', 0, 10, 0, 10),
29-
('replace', 10, 11, 10, 14),
30-
('equal', 11, 14, 14, 17)]
38+
assert clean_opcodes(sm.get_opcodes()) == \
39+
clean_opcodes([('equal', 0, 10, 0, 10),
40+
('replace', 10, 11, 10, 14),
41+
('equal', 11, 14, 14, 17)])
3142

3243

3344
@differs_decorator
3445
def test_one_insert(differ):
3546
sm = differ('b' * 100, 'a' + 'b' * 100)
36-
assert list(sm.get_opcodes()) == [('insert', 0, 0, 0, 1),
37-
('equal', 0, 100, 1, 101)]
47+
assert clean_opcodes(sm.get_opcodes()) == \
48+
clean_opcodes([('insert', 0, 0, 0, 1),
49+
('equal', 0, 100, 1, 101)])
3850
sm = differ('b' * 100, 'b' * 50 + 'a' + 'b' * 50)
39-
assert list(sm.get_opcodes()) == [('equal', 0, 50, 0, 50),
40-
('insert', 50, 50, 50, 51),
41-
('equal', 50, 100, 51, 101)]
51+
assert clean_opcodes(sm.get_opcodes()) == \
52+
clean_opcodes([('equal', 0, 50, 0, 50),
53+
('insert', 50, 50, 50, 51),
54+
('equal', 50, 100, 51, 101)])
4255

4356

4457
@differs_decorator
4558
def test_one_delete(differ):
4659
sm = differ('a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
47-
assert list(sm.get_opcodes()) == [('equal', 0, 40, 0, 40),
48-
('delete', 40, 41, 40, 40),
49-
('equal', 41, 81, 40, 80)]
60+
assert clean_opcodes(sm.get_opcodes()) == \
61+
clean_opcodes([('equal', 0, 40, 0, 40),
62+
('delete', 40, 41, 40, 40),
63+
('equal', 41, 81, 40, 80)])
5064

5165

5266
def test_ratcliffobershelp():
5367
ref = "a b c d e f"
5468
hyp = "a b d e kfmod fgdjn idf giudfg diuf dufg idgiudgd"
5569
sm = RatcliffObershelp(ref, hyp)
56-
assert list(sm.get_opcodes()) == [('equal', 0, 3, 0, 3),
57-
('delete', 3, 5, 3, 3),
58-
('equal', 5, 10, 3, 8),
59-
('insert', 10, 10, 8, 9),
60-
('equal', 10, 11, 9, 10),
61-
('insert', 11, 11, 10, 49)]
70+
assert clean_opcodes(sm.get_opcodes()) == \
71+
clean_opcodes([('equal', 0, 3, 0, 3),
72+
('delete', 3, 5, 3, 3),
73+
('equal', 5, 10, 3, 8),
74+
('insert', 10, 10, 8, 9),
75+
('equal', 10, 11, 9, 10),
76+
('insert', 11, 11, 10, 49)])

0 commit comments

Comments
 (0)