3
3
from collections import OrderedDict
4
4
import errno
5
5
from io import StringIO
6
- import os . path
6
+ from os import path , devnull , strerror
7
7
import sys
8
8
from typing import Dict , List , Tuple
9
9
10
10
from lxml import etree
11
11
12
+ WITH_COUNT = False
13
+ MAX_ITEMS = 100000
14
+ OUT_FD = sys .stdout
12
15
13
16
def usage ():
14
17
helpstr = '''
15
- pyxml2xpath <file path> [mode] [initial xpath expression]
18
+ pyxml2xpath <file path> [mode] [initial xpath expression] [with element count: yes|true] [max elements: int] [no banner: yes|true]
16
19
17
20
mode: str
18
21
path : print elements xpath expressions (default)
@@ -27,9 +30,16 @@ def usage():
27
30
Examples:
28
31
pyxml2xpath tests/resources/soap.xml
29
32
30
- pyxml2xpath tests/resources/HL7.xml '' '//*[local-name()= "act"]'
33
+ pyxml2xpath tests/resources/HL7.xml none '//*[local-name()= "act"]'
31
34
32
35
pyxml2xpath tests/resources/HL7.xml 'values' '//*[local-name()= "act"]'
36
+
37
+ # mode : all
38
+ # starting at xpath : none
39
+ # count elements : False
40
+ # Limit elements : 11
41
+ # Do not show banner (just xpaths): true
42
+ pyxml2xpath ~/tmp/test.html all none none 11 true
33
43
'''
34
44
print (helpstr )
35
45
@@ -81,7 +91,7 @@ def build_path_from_parts(xmap, xp, qname, revns):
81
91
elif f'{ last } /*{ p } ' in xmap :
82
92
last = f'{ last } /*{ p } '
83
93
84
- def parse_mixed_ns (tree : etree ._ElementTree , nsmap : Dict , xpath_base : str = '//*' ) -> OrderedDict [str , Tuple [str , int , List [str ]]]:
94
+ def parse_mixed_ns (tree : etree ._ElementTree , nsmap : Dict , xpath_base : str = '//*' , * , with_count : bool = WITH_COUNT , max_items : int = MAX_ITEMS ) -> OrderedDict [str , Tuple [str , int , List [str ]]]:
85
95
'''Parse XML document that may contain anonymous namespace.
86
96
Returns a dict with original xpath as keys, xpath with qualified names and count of elements found with the latter.
87
97
xmap = {
@@ -98,16 +108,17 @@ def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*
98
108
namespaces dictionary from current document'''
99
109
100
110
revns = {v :k or 'ns' for k ,v in nsmap .items ()}
101
- elst = tree .xpath (xpath_base , namespaces = nsmap )
111
+ elements = tree .xpath (xpath_base , namespaces = nsmap )
102
112
xmap = OrderedDict ()
103
- for ele in elst :
113
+ for ele in elements [: max_items ] :
104
114
xp = tree .getpath (ele )
105
- qname = etree .QName (ele .tag )
106
115
#print(f"DEBUG: {xp}", file=sys. stderr)
107
116
if xp in xmap :
108
117
# Do not update an existing element. Should not enter here, but ...
109
118
print (f"ERROR: duplicated path: { xp } " ,file = sys . stderr )
110
119
continue
120
+
121
+ qname = etree .QName (ele .tag )
111
122
if '*' not in xp :
112
123
# xpath expression is already qualified
113
124
# e.g.:
@@ -143,16 +154,17 @@ def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*
143
154
xmap [xp ][2 ].extend (ele .attrib .keys ())
144
155
145
156
# count elements found with these xpath expressions
146
- for k , v in xmap .items ():
147
- # Define a nodeset with qualified expression: (/ns98:feed/ns98:entry/ns98:author)
148
- # and get the first element or none defined by the count of unqualified expression: count(/*/*[9]/*[6])
149
- # (/ns98:feed/ns98:entry/ns98:author)[count(/*/*[9]/*[6])]
150
- # for example: count((author author author)[1])
151
- # the count of that will be 1 and it means both expressions were validated to return results.
152
- xmap [k ]= v [0 ], int (tree .xpath (f"count(({ v [0 ]} )[count({ k } )])" , namespaces = nsmap )), v [2 ]
157
+ if with_count :
158
+ for k , v in xmap .items ():
159
+ # Define a nodeset with qualified expression: (/ns98:feed/ns98:entry/ns98:author)
160
+ # and get the first element or none defined by the count of unqualified expression: count(/*/*[9]/*[6])
161
+ # (/ns98:feed/ns98:entry/ns98:author)[count(/*/*[9]/*[6])]
162
+ # for example: count((author author author)[1])
163
+ # the count of that will be 1 and it means both expressions were validated to return results.
164
+ xmap [k ]= v [0 ], int (tree .xpath (f"count(({ v [0 ]} )[count({ k } )])" , namespaces = nsmap )), v [2 ]
153
165
return xmap
154
166
155
- def print_xpaths (xmap : Dict , mode : str = "path" ):
167
+ def print_xpaths (xmap : Dict , mode : str = "path" , * , with_count : bool = WITH_COUNT , out_fd = OUT_FD ):
156
168
'''Print xpath expressions and validate by count of elements found with it.
157
169
mode: str
158
170
path : print elements xpath expressions (default)
@@ -166,21 +178,20 @@ def print_xpaths(xmap: Dict, mode: str ="path"):
166
178
167
179
if mode in ["path" , "all" ]:
168
180
for unq_xpath , qxpath_lst in xmap .items ():
169
- if qxpath_lst [1 ] > 0 and mode != "none" :
170
- print (qxpath_lst [0 ])
171
- elif qxpath_lst [1 ] <= 0 :
181
+ print (qxpath_lst [0 ])
182
+ if qxpath_lst [1 ] <= 0 and with_count :
172
183
# built xpath didn't find elements
173
- print (f"ERROR: { int (qxpath_lst [1 ])} elements found with { qxpath_lst [0 ]} xpath expression.\n Original xpath: { unq_xpath } " , file = sys . stderr )
184
+ print (f"ERROR: { int (qxpath_lst [1 ])} elements found with { qxpath_lst [0 ]} xpath expression.\n Original xpath: { unq_xpath } " , file = sys .stderr )
174
185
175
186
if mode == "all" :
176
187
#Print xpath for attributes
177
188
for unq_xpath , qxpath_lst in xmap .items ():
178
189
if qxpath_lst [2 ] is None :
179
190
continue
180
- if qxpath_lst [1 ] > 0 :
181
- for a in qxpath_lst [2 ]:
182
- print (f"{ qxpath_lst [0 ]} /@{ a } " )
183
- acount += 1
191
+ # if qxpath_lst[1] > 0:
192
+ for a in qxpath_lst [2 ]:
193
+ print (f"{ qxpath_lst [0 ]} /@{ a } " )
194
+ acount += 1
184
195
acountmsg = f"Found { acount :3} xpath expressions for attributes\n "
185
196
elif mode == "raw" :
186
197
for key , value in xmap .items ():
@@ -189,7 +200,7 @@ def print_xpaths(xmap: Dict, mode: str ="path"):
189
200
for key , value in xmap .items ():
190
201
print (value )
191
202
192
- print (f"\n Found { len (xmap .keys ()):3} xpath expressions for elements\n { acountmsg } " )
203
+ print (f"\n Found { len (xmap .keys ()):3} xpath expressions for elements\n { acountmsg } " , file = out_fd )
193
204
194
205
def build_namespace_dict (tree ):
195
206
'''Build a namespaces dictionary with prefix for default namespaces.
@@ -214,11 +225,11 @@ def build_namespace_dict(tree):
214
225
nsmap [ns ] = v
215
226
return nsmap
216
227
217
- def fromstring (xmlstr : str , * , xpath_base : str = '//*' ) -> (etree ._ElementTree , Dict [str , str ], OrderedDict [str , Tuple [str , int , List [str ]]]):
228
+ def fromstring (xmlstr : str , * , xpath_base : str = '//*' , with_count : bool = WITH_COUNT , max_items : int = MAX_ITEMS ) -> (etree ._ElementTree , Dict [str , str ], OrderedDict [str , Tuple [str , int , List [str ]]]):
218
229
doc = etree .parse (StringIO (xmlstr ))
219
- return parse (file = None , itree = doc , xpath_base = xpath_base )
230
+ return parse (file = None , itree = doc , xpath_base = xpath_base , with_count = with_count , max_items = max_items )
220
231
221
- def parse (file : str , * , itree : etree ._ElementTree = None , xpath_base : str = '//*' ) -> (etree ._ElementTree , Dict [str , str ], OrderedDict [str , Tuple [str , int , List [str ]]]):
232
+ def parse (file : str , * , itree : etree ._ElementTree = None , xpath_base : str = '//*' , with_count : bool = WITH_COUNT , max_items : int = MAX_ITEMS ) -> (etree ._ElementTree , Dict [str , str ], OrderedDict [str , Tuple [str , int , List [str ]]]):
222
233
'''Parse given xml file, find xpath expressions in it and return
223
234
- The ElementTree for further usage
224
235
- The sanitized namespaces map (no None keys)
@@ -235,22 +246,24 @@ def parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*
235
246
itree: lxml.etree._ElementTree
236
247
ElementTree object
237
248
xpath_base: xpath expression so start searching xpaths for.
249
+ with_count: Include count of elements found with each expression. Default: False
250
+ max_items: limit the number of parsed elements. Default: 100000
238
251
'''
239
252
240
253
try :
241
254
tree = itree
242
255
if tree is None :
243
- if not os . path .isfile (file ):
244
- raise FileNotFoundError (errno .ENOENT , os . strerror (errno .ENOENT ), file )
256
+ if not path .isfile (file ):
257
+ raise FileNotFoundError (errno .ENOENT , strerror (errno .ENOENT ), file )
245
258
with open (file , "r" ) as fin :
246
259
tree = etree .parse (fin )
247
260
248
261
nsmap = build_namespace_dict (tree )
249
262
#print(f"Namespaces found: {nsmap}")
250
- xmap = parse_mixed_ns (tree , nsmap , xpath_base )
263
+ xmap = parse_mixed_ns (tree , nsmap , xpath_base , with_count = with_count , max_items = max_items )
251
264
return (tree , nsmap , xmap )
252
265
except Exception as e :
253
- print ("ERROR." , type (e ).__name__ , "–" , e )
266
+ print ("ERROR." , type (e ).__name__ , "–" , e , file = sys . stderr )
254
267
raise (e )
255
268
256
269
def main ():
@@ -259,25 +272,35 @@ def main():
259
272
sys .exit ()
260
273
261
274
file = sys .argv [1 ]
275
+ mode = "path"
276
+ xpath_base = "//*"
277
+ with_count = WITH_COUNT
278
+ max_items = MAX_ITEMS
279
+ out_fd = OUT_FD
262
280
263
- if not os . path .isfile (file ):
264
- print (f"[Errno { errno .ENOENT } ] { os . strerror (errno .ENOENT )} " )
281
+ if not path .isfile (file ):
282
+ print (f"[Errno { errno .ENOENT } ] { strerror (errno .ENOENT )} " , file = sys . stderr )
265
283
sys .exit (errno .ENOENT )
266
284
267
- if len (sys .argv ) > 2 and sys .argv [2 ] != '' :
268
- mode = sys .argv [2 ]
269
- else :
270
- mode = "path"
271
-
272
- if len (sys .argv ) > 3 :
273
- xpath_base = sys .argv [3 ]
274
- else :
275
- xpath_base = "//*"
285
+ for i , arg in enumerate (sys .argv ):
286
+ if str (arg ).lower () in ['' , 'none' ]:
287
+ continue
288
+
289
+ if i == 2 :
290
+ mode = arg
291
+ elif i == 3 :
292
+ xpath_base = arg
293
+ elif i == 4 and str (arg ).lower () in ['yes' , 'true' ]:
294
+ with_count = True
295
+ elif i == 5 :
296
+ max_items = int (arg )
297
+ elif i == 6 and str (arg ).lower () in ['yes' , 'true' ]:
298
+ out_fd = open (devnull , 'w' )
276
299
277
- print (f"Running...\n { 'file' :10} : { file } \n { 'mode' :10} : { mode } \n { 'xpath_base' :10} : '{ xpath_base } '" )
278
- nsmap , xmap = parse (file , xpath_base = xpath_base )[1 :]
279
- print (f"namespaces: { nsmap } \n " )
280
- print_xpaths (xmap , mode )
300
+ print (f"Running...\n { 'file' :10} : { file } \n { 'mode' :10} : { mode } \n { 'xpath_base' :10} : '{ xpath_base } '\n { 'with_count' :10 } : { with_count } \n { 'max_items' :10 } : { max_items } " , file = out_fd , flush = True )
301
+ nsmap , xmap = parse (file , xpath_base = xpath_base , with_count = with_count , max_items = max_items )[1 :]
302
+ print (f"namespaces: { nsmap } \n " , file = out_fd , flush = True )
303
+ print_xpaths (xmap , mode , out_fd = out_fd )
281
304
282
305
if __name__ == "__main__" :
283
306
main ()
0 commit comments