Skip to content

Commit b6f3910

Browse files
committed
Refactor SExpr parser to be less complex
Also fixed a bug or two along the way; new tests for this.
1 parent a81d373 commit b6f3910

File tree

2 files changed

+99
-65
lines changed

2 files changed

+99
-65
lines changed

delphin/util.py

Lines changed: 93 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -293,72 +293,101 @@ def _SExpr_unescape_string(s):
293293
return re.sub(r'\\(["\\])', r'\1', s)
294294

295295

296-
class _SExprParser(object):
297-
def parse(self, s):
298-
i = 0
299-
n = len(s)
300-
while i < n and s[i].isspace():
301-
i += 1
302-
if i == n:
303-
return SExprResult([], '')
304-
assert s[i] == '('
305-
i += 1
306-
while i < n and s[i].isspace():
296+
def _SExpr_parse(s: str) -> SExprResult:
    """Parse the first parenthesized S-expression found in *s*.

    Leading whitespace is ignored. An empty (or all-whitespace) input
    yields ``SExprResult([], '')``. Otherwise the input must start with
    ``(``; parsing stops at the matching ``)`` and whatever follows it
    is returned as the second field of the result.

    A list of exactly three items whose middle item is ``'.'`` is
    simplified to a 2-tuple of its first and last items.

    Raises:
        ValueError: if the input does not start with ``(``, if the
            parentheses are unbalanced (the input ends before the
            outermost list is closed), or if a token cannot be parsed.
    """
    s = s.lstrip()
    data: _Cons = []
    if not s:
        return SExprResult(data, '')
    # An explicit check rather than ``assert``: assertions are stripped
    # under ``python -O`` and the first character would then be skipped
    # silently.
    if not s.startswith('('):
        raise ValueError('Invalid S-Expression: ' + s)

    i = 1
    n = len(s)
    stack: List[List[_SExpr]] = []  # enclosing lists still being built
    vals: List[_SExpr] = []         # items of the innermost open list
    while i < n:
        c = s[i]
        # numbers
        if c.isdigit() or (c == '-' and i + 1 < n and s[i + 1].isdigit()):
            num, i = _SExpr_parse_number(s, i)
            vals.append(num)
        # quoted strings
        elif c == '"':
            string, i = _SExpr_parse_string(s, i)
            vals.append(string)
        # start new list
        elif c == '(':
            stack.append(vals)
            vals = []
            i += 1
        # end list
        elif c == ')':
            if len(vals) == 3 and vals[1] == '.':
                data = (vals[0], vals[2])  # simplify dotted pair
            else:
                data = vals
            if not stack:
                # outermost list closed: the rest of the input is unparsed
                return SExprResult(data, s[i+1:])
            vals = stack.pop()
            vals.append(data)
            i += 1
        # ignore whitespace
        elif c.isspace():
            i += 1
        # any other symbol
        else:
            sym, i = _SExpr_parse_symbol(s, i)
            vals.append(sym)

    # The input ended while at least one list was still open; returning
    # here would silently hand back partial (or empty) data.
    raise ValueError('Invalid S-Expression: ' + s)
342+
343+
344+
def _SExpr_parse_number(s: str, i: int) -> Tuple[Union[int, float], int]:
    """Parse the number that starts at index *i* of *s*.

    The caller is expected to have checked that ``s[i]`` is a digit, or
    a ``-`` immediately followed by a digit.

    Returns:
        A pair ``(value, j)`` where *value* is an ``int`` when the
        number has neither a fractional part nor an exponent and a
        ``float`` otherwise, and *j* is the index of the first character
        after the number. Reaching the end of *s* simply ends the
        number; it does not raise ``IndexError``.

    Raises:
        ValueError: from ``float()`` when the scanned text is not a
            valid float, e.g. an exponent marker with no digits.
    """
    n = len(s)

    def skip_digits(k: int) -> int:
        # advance past a (possibly empty) run of digits, never past n
        while k < n and s[k].isdigit():
            k += 1
        return k

    # s[i] is the first digit or the sign, so scanning starts after it
    j = skip_digits(i + 1)

    if j == n or s[j] not in '.eE':  # int
        return int(s[i:j]), j

    # float
    if s[j] == '.':
        j = skip_digits(j + 1)

    if j < n and s[j] in 'eE':
        j += 1
        if j < n and s[j] in '+-':
            j += 1
        j = skip_digits(j)

    return float(s[i:j]), j
368+
369+
370+
def _SExpr_parse_string(s: str, i: int) -> Tuple[str, int]:
    """Parse the double-quoted string that starts at index *i* of *s*.

    ``s[i]`` is taken to be the opening quote. A backslash escapes the
    character after it, so an escaped quote does not end the string.

    Returns:
        A pair ``(text, j)`` where *text* is the content between the
        quotes after passing through ``_SExpr_unescape_string`` and *j*
        is the index just past the closing quote.

    Raises:
        ValueError: if *s* ends before a closing quote is found
            (including when it ends right after a backslash).
    """
    n = len(s)
    j = i + 1
    while j < n and s[j] != '"':
        # skip the escaped character together with its backslash
        j += 2 if s[j] == '\\' else 1
    if j >= n:
        # unterminated string: report it like other malformed input
        # instead of failing with an IndexError
        raise ValueError('Invalid S-Expression: ' + s)
    return _SExpr_unescape_string(s[i+1:j]), j + 1
378+
379+
380+
def _SExpr_parse_symbol(s: str, i: int) -> Tuple[str, int]:
    """Parse the symbol that starts at index *i* of *s*.

    The symbol is whatever ``_SExpr_symbol_re`` matches at position *i*.

    Returns:
        A pair of the matched text, passed through
        ``_SExpr_unescape_symbol``, and the index where the match ends.

    Raises:
        ValueError: if the pattern does not match at position *i*.
    """
    match = _SExpr_symbol_re.match(s, pos=i)
    if match is not None:
        return _SExpr_unescape_symbol(match.group(0)), match.end()
    raise ValueError('Invalid S-Expression: ' + s)
385+
386+
387+
class _SExprParser(object):
388+
389+
def parse(self, s: str) -> SExprResult:
    """Parse *s* by delegating to ``_SExpr_parse``.

    Leading whitespace is removed from *s* before it is handed over;
    the ``SExprResult`` from ``_SExpr_parse`` is returned unchanged.
    """
    stripped = s.lstrip()
    return _SExpr_parse(stripped)
362391

363392
def format(self, d):
364393
if isinstance(d, tuple) and len(d) == 2:

tests/util_test.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def test_safe_int():
1313
assert safe_int('-12345') == -12345
1414
assert safe_int('1a') == '1a'
1515

16+
1617
def test_SExpr():
1718
# atoms outside of parens
1819
# assert SExpr.parse('a').data == 'a'
@@ -23,6 +24,9 @@ def test_SExpr():
2324
assert SExpr.parse('(a)').data == ['a']
2425
assert SExpr.parse('(1)').data == [1]
2526
assert SExpr.parse('(1.0)').data == [1.0]
27+
assert SExpr.parse('(1e2)').data == [100]
28+
assert SExpr.parse('(1.2e2)').data == [120]
29+
assert SExpr.parse('(1.2e-2)').data == [0.012]
2630
assert SExpr.parse('("a")').data == ['a'] # same as symbol?
2731
assert SExpr.parse('( a . b )').data == ('a', 'b')
2832
assert SExpr.parse('( :a (b) )').data == [':a', ['b']]
@@ -42,11 +46,12 @@ def test_SExpr():
4246
# other kinds of whitespace
4347
assert SExpr.parse('(\ta\n.\n\n b)').data == ('a', 'b')
4448

49+
4550
def test_SExpr_format():
4651
assert SExpr.format([]) == '()'
4752
assert SExpr.format([1]) == '(1)'
4853
assert SExpr.format([1.0]) == '(1.0)'
49-
assert SExpr.format((1,2)) == '(1 . 2)'
54+
assert SExpr.format((1, 2)) == '(1 . 2)'
5055
assert SExpr.format(['a-a', ('b', 'c')]) == '(a-a (b . c))'
5156

5257
# unescape_string is disabled in delphin.util

0 commit comments

Comments
 (0)