13
13
import io
14
14
import json
15
15
from functools import partial
16
+ import logging
16
17
17
18
from delphin import itsdb , tsql
18
19
from delphin .mrs import xmrs
19
20
from delphin .util import safe_int , SExpr
21
+ from delphin .exceptions import PyDelphinException
20
22
21
23
24
+ logging .basicConfig ()
25
+
22
26
###############################################################################
23
27
### CONVERT ###################################################################
24
28
@@ -91,7 +95,36 @@ def convert(path, source_fmt, target_fmt, select='result:mrs',
91
95
kwargs ['predicate_modifiers' ] = predicate_modifiers
92
96
kwargs ['properties' ] = properties
93
97
94
- return dumps (xs , ** kwargs )
98
+ # this is not a great way to improve robustness when converting
99
+ # many representations, but it'll do until v1.0.0. Also, it only
100
+ # improves robustness on the output, not the input.
101
+ # Note that all the code below is to replace the following:
102
+ # return dumps(xs, **kwargs)
103
+ head , joiner , tail = _get_output_details (target_fmt )
104
+ parts = []
105
+ if pretty_print :
106
+ joiner = joiner .strip () + '\n '
107
+ def _trim (s ):
108
+ if head and s .startswith (head ):
109
+ s = s [len (head ):].lstrip ('\n ' )
110
+ if tail and s .endswith (tail ):
111
+ s = s [:- len (tail )].rstrip ('\n ' )
112
+ return s
113
+ for x in xs :
114
+ try :
115
+ s = dumps ([x ], ** kwargs )
116
+ except (PyDelphinException , KeyError , IndexError ):
117
+ logging .exception ('could not convert representation' )
118
+ else :
119
+ s = _trim (s )
120
+ parts .append (s )
121
+ # set these after so head and tail are used correctly in _trim
122
+ if pretty_print :
123
+ if head :
124
+ head += '\n '
125
+ if tail :
126
+ tail = '\n ' + tail
127
+ return head + joiner .join (parts ) + tail
95
128
96
129
97
130
def _get_codec (codec , load = True ):
@@ -146,6 +179,21 @@ def _get_codec(codec, load=True):
146
179
raise ValueError ('invalid target format: ' + codec )
147
180
148
181
182
+ def _get_output_details (codec ):
183
+ if codec == 'mrx' :
184
+ return ('<mrs-list' , '' , '</mrs-list>' )
185
+
186
+ elif codec == 'dmrx' :
187
+ from delphin .mrs import dmrx
188
+ return ('<dmrs-list>' , '' , '</dmrs-list>' )
189
+
190
+ elif codec in ('mrs-json' , 'dmrs-json' , 'eds-json' ):
191
+ return ('[' , ',' , ']' )
192
+
193
+ else :
194
+ return ('' , ' ' , '' )
195
+
196
+
149
197
# simulate json codecs for MRS and DMRS
150
198
151
199
class _MRS_JSON (object ):
@@ -329,11 +377,13 @@ def mkprof(destination, source=None, relations=None, where=None,
329
377
dts = itsdb .TestSuite (path = destination , relations = relations )
330
378
# input is sentences on stdin
331
379
if source is None :
332
- dts .write ({'item' : _lines_to_rows (sys .stdin )}, gzip = gzip )
380
+ dts .write ({'item' : _lines_to_rows (sys .stdin , dts .relations )},
381
+ gzip = gzip )
333
382
# input is sentence file
334
383
elif os .path .isfile (source ):
335
384
with open (source ) as fh :
336
- dts .write ({'item' : _lines_to_rows (fh )}, gzip = gzip )
385
+ dts .write ({'item' : _lines_to_rows (fh , dts .relations )},
386
+ gzip = gzip )
337
387
# input is source testsuite
338
388
elif os .path .isdir (source ):
339
389
sts = itsdb .TestSuite (source )
@@ -372,20 +422,32 @@ def mkprof(destination, source=None, relations=None, where=None,
372
422
print (fmt .format (stat .st_size , _red (filename + '.gz' )))
373
423
374
424
375
def _lines_to_rows(lines, relations):
    """
    Yield one `item` record for each line in *lines*.

    A line beginning with `*` gets `i-wf` 0 and has that first
    character dropped from its input; any other line gets `i-wf` 1.
    The `i-input` value is the remaining text with surrounding
    whitespace stripped, and `i-id` is the line's zero-based position
    multiplied by 10. All other fields of the record are `None`.

    Args:
        lines: iterable of strings, one per item
        relations: mapping whose `'item'` entry lists the item fields
            and supports `.index()` and `len()`
    Yields:
        itsdb.Record objects built over the `item` fields
    """
    item_fields = relations['item']
    # the column positions are the same for every row, so resolve them
    # once here rather than per record (hence no
    # itsdb.Record.from_dict())
    id_pos = item_fields.index('i-id')
    wf_pos = item_fields.index('i-wf')
    input_pos = item_fields.index('i-input')
    width = len(item_fields)

    for n, text in enumerate(lines):
        if text.startswith('*'):
            wellformed, sentence = 0, text[1:]
        else:
            wellformed, sentence = 1, text
        values = [None] * width
        values[id_pos] = n * 10
        values[wf_pos] = wellformed
        values[input_pos] = sentence.strip()
        yield itsdb.Record(item_fields, values)
381
443
382
444
383
445
###############################################################################
384
446
### PROCESS ###################################################################
385
447
386
448
def process (grammar , testsuite , source = None , select = None ,
387
- generate = False , transfer = False ,
388
- all_items = False , result_id = None ):
449
+ generate = False , transfer = False , options = None ,
450
+ all_items = False , result_id = None , gzip = False ):
389
451
"""
390
452
Process (e.g., parse) a [incr tsdb()] profile.
391
453
@@ -413,10 +475,15 @@ def process(grammar, testsuite, source=None, select=None,
413
475
(default: `False`)
414
476
transfer (bool): if `True`, transfer instead of parse
415
477
(default: `False`)
478
+ options (list): list of ACE command-line options to use when
479
+ invoking the ACE subprocess; unsupported options will
480
+ give an error message
416
481
all_items (bool): if `True`, don't exclude ignored items
417
482
(those with `i-wf==2`) when parsing
418
483
result_id (int): if given, only keep items with the specified
419
484
`result-id`
485
+ gzip (bool): if `True`, non-empty tables will be compressed
486
+ with gzip
420
487
"""
421
488
from delphin .interfaces import ace
422
489
@@ -441,18 +508,14 @@ def process(grammar, testsuite, source=None, select=None,
441
508
target = itsdb .TestSuite (testsuite )
442
509
column , tablename , condition = _interpret_selection (select , source )
443
510
table = itsdb .Table (
444
- tablename ,
445
511
source [tablename ].fields ,
446
512
tsql .select (
447
513
'* from {} {}' .format (tablename , condition ),
448
514
source ,
449
515
cast = False ))
450
516
451
- with processor (grammar ) as cpu :
452
- target .process (cpu , tablename + ':' + column , source = table )
453
-
454
- target .write ()
455
-
517
+ with processor (grammar , cmdargs = options ) as cpu :
518
+ target .process (cpu , ':' + column , source = table , gzip = gzip )
456
519
457
520
def _interpret_selection (select , source ):
458
521
queryobj = tsql .inspect_query ('select ' + select )
0 commit comments