1
- '''Find all xpath expressions on XML document'''
1
+ '''Generate xpath expressions from an XML document'''
2
2
3
3
4
4
from collections import OrderedDict
5
- from io import StringIO
5
+ from io import StringIO , BytesIO
6
6
from os import path , devnull , strerror
7
- from typing import Dict , List , Tuple
7
+ from typing import Dict , List , Tuple , Union
8
8
import errno
9
9
import sys
10
+ import json
10
11
11
12
from lxml import etree
12
13
15
16
WITH_COUNT = False
16
17
MAX_ITEMS = 100000
17
18
OUT_FD = sys .stdout
18
- modes = ['xpath' , 'all' , 'raw' , 'values' ]
19
+ modes = ['xpath' , 'all' , 'raw' , 'values' , 'raw_json' , 'raw_json_pretty' ]
19
20
20
21
def usage ():
21
22
helpstr = '''
22
23
pyxml2xpath <file path> [mode] [initial xpath expression] [with element count: yes|true] [max elements: int] [no banner: yes|true]
23
24
25
+ File path: pass '-' to read from stdin
26
+
24
27
mode: str
25
- path : print elements xpath expressions (default)
28
+ xpath : print elements xpath expressions (default)
26
29
all : also print attribute xpath expressions
27
30
raw : print unqualified xpath and found values (list)
28
31
values: print list of found values only
32
+ raw_json: json string with internal xpath and namespace maps.
33
+ {"xpath": { ... }, "namespaces": { ... }}
34
+ raw_json_pretty: same as raw_json with pretty print
29
35
30
36
Initial xpath expression: str
31
37
Start at some element defined by an xpath expression.
@@ -54,6 +60,16 @@ def _get_qualified_name(qname, revns):
54
60
lname = f"{ revns .get (qname .namespace )} :{ qname .localname } "
55
61
return lname
56
62
63
def get_encoding_from_fragment(xml_frag):
    '''
    Get encoding attribute value from xml declaration if present.

    :param xml_frag: leading bytes of the XML document (e.g. the first 100).
        Must be ``bytes``; it is searched for the ``?>`` that closes the declaration.
    :return: the encoding lxml reports for the declaration, or ``None`` when
        ``xml_frag`` does not start with an XML declaration.
    '''
    # No leading declaration: there is nothing to inspect, and gluing the
    # fragment to '?><root/>' below would not produce a well-formed document.
    if not xml_frag.startswith(b'<?xml'):
        return None
    declstr = xml_frag.split(b'?>')[0]
    # Fake a minimal doc to parse declaration ensuring encoding is valid for lxml
    frdoc = etree.parse(BytesIO(declstr + b'?><root/>'))
    return frdoc.docinfo.encoding
72
+
57
73
def _get_dict_list_value (value , element ):
58
74
'''Initialize tuple for xpath dictionary values.
59
75
Items:
@@ -110,24 +126,23 @@ def parse_mixed_ns(tree: etree._ElementTree,
110
126
'''Parse XML document that may contain anonymous namespace.
111
127
Returns a dict with original xpath as keys, xpath with qualified names and
112
128
count of elements found with the latter or None if an error occurred.
113
- xmap = {
129
+ xmap = `` {
114
130
"/some/xpath/*[1]": ("/some/xpath/ns:ele1", 1, ["id", "class"])
115
- }
131
+ }``
116
132
To get the qualified xpath:
117
- xmap['/some/xpath/*[1]'][0]
118
-
119
- Parameters
120
- ----------
121
- tree: lxml.etree._ElementTree
133
+ ``xmap['/some/xpath/*[1]'][0]``
134
+
135
+ :param tree: lxml.etree._ElementTree
122
136
ElementTree from current document
123
- nsmap: dict
137
+ :param nsmap: dict
124
138
namespaces dictionary from current document
125
- xpath_base: str
139
+ :param xpath_base: str
126
140
Xpath expression to start from
127
- with_count: bool
141
+ :param with_count: bool
128
142
add count of found elements (performance cost on large documents).
129
- max_items: int
130
- max number of elements to parse. Default: 100000'''
143
+ :param max_items: int
144
+ max number of elements to parse. Default: 100000
145
+ '''
131
146
132
147
revns = {v :k or 'ns' for k ,v in nsmap .items ()}
133
148
elements = tree .xpath (xpath_base , namespaces = nsmap )
@@ -203,36 +218,68 @@ def parse_mixed_ns(tree: etree._ElementTree,
203
218
xmap [xp ] = xmap [xp ][0 ], xcount , xmap [xp ][2 ]
204
219
return xmap
205
220
221
+
222
def get_json_response(xmap, nsmap, mode):
    '''
    Serialize the xpath map, plus the namespaces map when it is not empty, as a json string:
    {"xpath": { ... },
     "namespaces": { ... }}

    :param xmap: map of xpath expressions, stored under the "xpath" key
    :param nsmap: namespaces map; the "namespaces" key is left out when this is empty or None
    :param mode: 'raw_json_pretty' indents the output by 4 spaces,
        any other value produces single-line json
    :return: json string
    '''
    payload = {'xpath': xmap}
    if nsmap:
        payload['namespaces'] = nsmap
    return json.dumps(payload, indent=4 if mode == 'raw_json_pretty' else None)
239
+
206
240
def print_xpaths (xmap : Dict ,
207
- mode : str = "path " ,
241
+ mode : str = "xpath " ,
208
242
* ,
209
- out_fd = OUT_FD ):
210
- '''Print xpath expressions and validate by count of elements found with it.
243
+ out_fd = OUT_FD , nsmap = None ):
244
+ '''
245
+ Print xpath expressions according to 'mode' argument.
246
+
247
+ :param xmap: map of xpath expressions
248
+ :param mode: output mode
249
+ :param out_fd: output file descriptor
250
+ :param nsmap: namespaces map
251
+
211
252
mode: str
212
- path : print elements xpath expressions (default)
253
+ xpath : print elements xpath expressions (default)
213
254
all : also print attribute xpath expressions
214
255
raw : print unqualified xpath and found values (list)
256
+ raw_json: json string with internal xpath and namespace maps.
257
+ {"xpath": { ... },
258
+ "namespaces": { ... }}
259
+ raw_json_pretty: same as raw_json with pretty print
215
260
values: print tuple of found values only
216
261
'''
217
262
218
263
acount = 0
219
264
acountmsg = ''
220
265
221
- for unq_xpath , qual_xpath_lst in xmap .items ():
222
- if mode not in ['raw' , 'values' ]:
223
- print (qual_xpath_lst [0 ])
224
-
225
- if mode == "all" :
226
- #Print xpath for attributes
227
- if qual_xpath_lst [2 ] is not []:
228
- for a in qual_xpath_lst [2 ]:
229
- print (f"{ qual_xpath_lst [0 ]} /@{ a } " )
230
- acount += 1
231
- acountmsg = f"Found { acount :3} xpath expressions for attributes\n "
232
- if mode == "raw" :
233
- print (unq_xpath , qual_xpath_lst )
234
- elif mode == "values" :
235
- print (qual_xpath_lst )
266
+ if mode in ['raw_json' , 'raw_json_pretty' ]:
267
+ print (get_json_response (xmap , nsmap , mode ))
268
+ else :
269
+ for unq_xpath , qual_xpath_lst in xmap .items ():
270
+ if mode in ['xpath' , 'all' ]:
271
+ print (qual_xpath_lst [0 ])
272
+ if mode == "all" :
273
+ #Print xpath for attributes
274
+ if qual_xpath_lst [2 ] is not []:
275
+ for a in qual_xpath_lst [2 ]:
276
+ print (f"{ qual_xpath_lst [0 ]} /@{ a } " )
277
+ acount += 1
278
+ acountmsg = f"Found { acount :3} xpath expressions for attributes\n "
279
+ elif mode == "raw" :
280
+ print (unq_xpath , qual_xpath_lst )
281
+ elif mode == "values" :
282
+ print (qual_xpath_lst )
236
283
237
284
print (f"\n Found { len (xmap .keys ()):3} xpath expressions for elements\n { acountmsg } " , file = out_fd )
238
285
@@ -259,11 +306,25 @@ def build_namespace_dict(tree: etree._ElementTree) -> Dict[str, str]:
259
306
nsmap [ns ] = v
260
307
return nsmap
261
308
262
def fromstring(xmlstr: Union[str, bytes], *,
               xpath_base: str = '//*',
               with_count: bool = WITH_COUNT,
               max_items: int = MAX_ITEMS,
               encoding='utf-8') -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
    '''
    Parse an XML document held in memory and build its xpath map.

    A ``str`` document is encoded to bytes with ``encoding`` before parsing;
    ``bytes`` are handed to lxml unchanged.
    If the document starts with an XML declaration but has no ``encoding`` attribute,
    ``utf-8`` will be used by default.

    :param xmlstr: document or fragment as string or bytes.
    :param xpath_base: xpath expression to start searching xpaths for.
    :param with_count: Include count of elements found with each expression. Default: False
    :param max_items: limit the number of parsed elements. Default: 100000
    :param encoding: encoding applied to a ``str`` document, ``utf-8`` by default.
        Use :func:`get_encoding_from_fragment` to get encoding.
    :return: the result of :func:`parse` called with the parsed tree.
    '''
    # Only text needs converting; bytes already are what BytesIO expects.
    raw = xmlstr.encode(encoding) if isinstance(xmlstr, str) else xmlstr
    return parse(file=None,
                 itree=etree.parse(BytesIO(raw)),
                 xpath_base=xpath_base,
                 with_count=with_count,
                 max_items=max_items)
268
329
269
330
def parse (file : str , * ,
@@ -283,20 +344,20 @@ def parse(file: str, *,
283
344
284
345
Parameters
285
346
----------
286
- file: file path string
287
- itree: lxml.etree._ElementTree
347
+ :param file: file path string
348
+ :param itree: lxml.etree._ElementTree
288
349
ElementTree object
289
- xpath_base: xpath expression to start searching xpaths for.
290
- with_count: Include count of elements found with each expression. Default: False
291
- max_items: limit the number of parsed elements. Default: 100000
350
+ :param xpath_base: xpath expression to start searching xpaths for.
351
+ :param with_count: Include count of elements found with each expression. Default: False
352
+ :param max_items: limit the number of parsed elements. Default: 100000
292
353
'''
293
354
294
355
try :
295
356
tree = itree
296
357
if tree is None :
297
358
if not path .isfile (file ):
298
359
raise FileNotFoundError (errno .ENOENT , strerror (errno .ENOENT ), file )
299
- with open (file , "r " ) as fin :
360
+ with open (file , "rb " ) as fin :
300
361
tree = etree .parse (fin )
301
362
302
363
nsmap = build_namespace_dict (tree )
@@ -308,20 +369,24 @@ def parse(file: str, *,
308
369
raise (e )
309
370
310
371
def main ():
372
+ if len (sys .argv ) == 1 :
373
+ print (f"ERROR: No arguments passed.\n Run with -h for help." , file = sys .stderr )
374
+ sys .exit (1 )
375
+
311
376
if sys .argv [1 ] in ["-h" , "--help" ]:
312
377
usage ()
313
378
sys .exit ()
314
379
315
380
file = sys .argv [1 ]
316
- mode = "path "
381
+ mode = "xpath "
317
382
xpath_base = XPATH_ALL
318
383
with_count = WITH_COUNT
319
384
max_items = MAX_ITEMS
320
385
out_fd = OUT_FD
321
386
no_banners = False
322
387
warns = None
323
388
324
- if not path .isfile (file ):
389
+ if file != '-' and not path .isfile (file ):
325
390
print (f"[Errno { errno .ENOENT } ] { strerror (errno .ENOENT )} " , file = sys .stderr )
326
391
sys .exit (errno .ENOENT )
327
392
@@ -357,10 +422,14 @@ def main():
357
422
print (f"{ 'no_banners' :10} : { no_banners } " , file = out_fd , flush = True )
358
423
if warns is not None :
359
424
print (f"\n { warns } \n " , file = sys .stderr )
360
- nsmap , xmap = parse (file , xpath_base = xpath_base , with_count = with_count , max_items = max_items )[1 :]
425
+ if file != '-' :
426
+ nsmap , xmap = parse (file , xpath_base = xpath_base , with_count = with_count , max_items = max_items )[1 :]
427
+ else :
428
+ xmlstr = sys .stdin .buffer .read ()
429
+ nsmap , xmap = fromstring (xmlstr , xpath_base = xpath_base , with_count = with_count , max_items = max_items )[1 :]
361
430
if xmap is not None :
362
431
print (f"namespaces: { nsmap } \n " , file = out_fd , flush = True )
363
- print_xpaths (xmap , mode , out_fd = out_fd )
432
+ print_xpaths (xmap , mode , out_fd = out_fd , nsmap = nsmap )
364
433
else :
365
434
sys .exit (1 )
366
435
0 commit comments