Skip to content

Commit 2401e15

Browse files
authored
Fix #367 escape/unescape strings in simplemrs (#369)
1 parent 338418d commit 2401e15

File tree

3 files changed

+82
-8
lines changed

3 files changed

+82
-8
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66

77
* Bump versions of `actions/setup-python` (and others) in actions ([#364])
88

9+
### Fixed
10+
11+
* Escape/unescape quoted strings in SimpleMRS ([#367])
12+
913

1014
## [v1.8.0]
1115

@@ -1592,3 +1596,4 @@ information about changes, except for
15921596
[#357]: https://github.com/delph-in/pydelphin/issues/357
15931597
[#360]: https://github.com/delph-in/pydelphin/issues/360
15941598
[#364]: https://github.com/delph-in/pydelphin/issues/364
1599+
[#367]: https://github.com/delph-in/pydelphin/issues/367

delphin/codecs/simplemrs.py

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from pathlib import Path
6+
from typing import Optional
67

78
from delphin.util import Lexer
89
from delphin import predicate
@@ -184,7 +185,7 @@ def _decode_mrs(lexer):
184185
variables = {}
185186
lexer.expect_type(LBRACK)
186187
lnk = _decode_lnk(lexer)
187-
surface = lexer.accept_type(DQSTRING)
188+
surface = _decode_dqstring(lexer.accept_type(DQSTRING))
188189
feature = lexer.accept_type(FEATURE)
189190
while feature is not None:
190191
feature = feature.upper()
@@ -223,6 +224,12 @@ def _decode_lnk(lexer):
223224
return lnk
224225

225226

227+
def _decode_dqstring(dqstring: Optional[str]) -> Optional[str]:
228+
if dqstring is not None:
229+
dqstring = _unescape(dqstring)
230+
return dqstring
231+
232+
226233
def _decode_variable(lexer, variables):
227234
var = lexer.expect_type(SYMBOL).lower()
228235
if var not in variables:
@@ -243,17 +250,16 @@ def _decode_rel(lexer, variables):
243250
args = {}
244251
surface = None
245252
lexer.expect_type(LBRACK)
246-
pred = predicate.normalize(
247-
lexer.choice_type(DQSTRING, SQSYMBOL, PREDICATE, SYMBOL)[1])
253+
pred = _decode_predicate(lexer)
248254
lnk = _decode_lnk(lexer)
249-
surface = lexer.accept_type(DQSTRING)
255+
surface = _decode_dqstring(lexer.accept_type(DQSTRING))
250256
_, label = lexer.expect((FEATURE, 'LBL'), (SYMBOL, None))
251257
# any remaining are arguments or a constant
252258
role = lexer.accept_type(FEATURE)
253259
while role is not None:
254260
role = role.upper()
255261
if role == 'CARG':
256-
value = lexer.expect_type(DQSTRING)
262+
value = _decode_dqstring(lexer.expect_type(DQSTRING))
257263
else:
258264
value = _decode_variable(lexer, variables)
259265
args[role] = value
@@ -267,6 +273,15 @@ def _decode_rel(lexer, variables):
267273
base=None)
268274

269275

276+
def _decode_predicate(lexer) -> str:
    """Read the next predicate token from *lexer* and normalize it.

    A double-quoted string is tried first and has its escapes removed;
    if none is present, the token must be one of SQSYMBOL, PREDICATE,
    or SYMBOL and is used as-is.
    """
    quoted = lexer.accept_type(DQSTRING)
    if quoted is None:
        # Index 1 of the choice_type() result is used as the token text
        # (presumably a (type, value) pair -- confirm against Lexer).
        raw = lexer.choice_type(SQSYMBOL, PREDICATE, SYMBOL)[1]
    else:
        raw = _decode_dqstring(quoted)
    return predicate.normalize(raw)
283+
284+
270285
def _decode_cons(lexer, cls, variables):
271286
lhs = _decode_variable(lexer, variables)
272287
relation = lexer.expect_type(SYMBOL).lower()
@@ -312,7 +327,7 @@ def _encode_surface_info(m, lnk):
312327
if m.lnk:
313328
tokens.append(str(m.lnk))
314329
if m.surface is not None:
315-
tokens.append('"{}"'.format(m.surface))
330+
tokens.append('"{}"'.format(_escape(m.surface)))
316331
return tokens
317332

318333

@@ -351,12 +366,12 @@ def _encode_rels(rels, varprops, lnk, indent):
351366
pred += str(rel.lnk)
352367
reltoks = ['[', pred]
353368
if lnk and rel.surface is not None:
354-
reltoks.append('"{}"'.format(rel.surface))
369+
reltoks.append('"{}"'.format(_escape(rel.surface)))
355370
reltoks.extend(('LBL:', rel.label))
356371
for role in sorted(rel.args, key=role_priority):
357372
arg = rel.args[role]
358373
if role == CONSTANT_ROLE:
359-
arg = '"{}"'.format(arg)
374+
arg = '"{}"'.format(_escape(arg))
360375
else:
361376
arg = _encode_variable(arg, varprops)
362377
reltoks.extend((role + ':', arg))
@@ -383,3 +398,37 @@ def _encode_icons(icons, varprops):
383398
if tokens:
384399
tokens = ['ICONS: <'] + [' '.join(tokens)] + ['>']
385400
return tokens
401+
402+
403+
# Character Escaping
404+
405+
406+
_ESCAPES = {
407+
'\\': '\\\\',
408+
'"': '\\"',
409+
}
410+
411+
412+
_UNESCAPES = {
413+
'\\\\': '\\',
414+
'\\"': '"',
415+
}
416+
417+
418+
def _escape(s: str) -> str:
    """Return *s* with every character listed in _ESCAPES replaced by
    its escaped form; all other characters are left untouched."""
    return s.translate(str.maketrans(_ESCAPES))
420+
421+
422+
def _unescape(s: str) -> str:
423+
if not s:
424+
return s
425+
cs = []
426+
i = 0
427+
while i < len(s):
428+
if s[i] == '\\' and (i + 1) < len(s):
429+
cs.append(s[i+1])
430+
i += 2
431+
else:
432+
cs.append(s[i])
433+
i += 1
434+
return "".join(cs)

tests/codecs/simplemrs_test.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,23 @@ def assert_predicate(p):
6060
assert_predicate(r'_24/7_n_1')
6161
assert_predicate(r'_foo<bar_n_1')
6262
assert_predicate(r'_foo_n_<1:3')
63+
64+
65+
def test_escapes_issue_367():
    """A surface string containing double quotes survives a round trip."""
    # https://github.com/delph-in/pydelphin/issues/367
    source = (
        '[ TOP: h0'
        ' INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ]'
        ' RELS: < [ udef_q<0:6> LBL: h4 ARG0: x3 [ x PERS: 3 NUM: sg ] RSTR: h5 BODY: h6 ]'
        ' [ _blue_a_1<0:6> LBL: h7 ARG0: x3 ARG1: i8 ]'
        ' [ _in_p_loc<10:12> LBL: h1 ARG0: e2 ARG1: x3 ARG2: x9 [ x PERS: 3 NUM: sg IND: + ] ]'
        ' [ _this_q_dem<13:17> LBL: h10 ARG0: x9 RSTR: h11 BODY: h12 ]'
        ' [ _folder_n_of<18:25> LBL: h13 ARG0: x9 ARG1: i14 ] >'
        ' HCONS: < h0 qeq h1 h5 qeq h7 h11 qeq h13 > ]'
    )
    original = simplemrs.decode(source)
    original.surface = '"Blue" is in this folder.'

    # The embedded quotes must come out backslash-escaped when encoding ...
    encoded = simplemrs.encode(original)
    assert '\\"Blue\\" is in this folder.' in encoded

    # ... and decoding the encoded text must give back an equal MRS.
    restored = simplemrs.decode(encoded)
    assert original == restored
    assert original.surface == restored.surface

0 commit comments

Comments
 (0)