Skip to content

Commit 78271f5

Browse files
authored
Merge pull request #21 from mluis7/pxx-14
pxx-14 Performance improvement - make count of elements optional
2 parents 11f2a3c + 32c30bd commit 78271f5

File tree

4 files changed

+84
-52
lines changed

4 files changed

+84
-52
lines changed

README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ pyxml2xpath tests/resources/soap.xml
6868
pyxml2xpath tests/resources/HL7.xml '' '//*[local-name()= "act"]'
6969

7070
pyxml2xpath tests/resources/HL7.xml 'values' '//*[local-name()= "act"]'
71+
72+
# mode : all
73+
# starting at xpath : none
74+
# count elements : False
75+
# Limit elements : 11
76+
# Do not show banner (just xpaths): true
77+
pyxml2xpath ~/tmp/test.html all none none 11 true
7178
```
7279

7380

@@ -142,7 +149,7 @@ Found 6 xpath expressions for elements
142149
```
143150

144151
### Method parse(...)
145-
Signature: `parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*')`
152+
Signature: `parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*', with_count: bool = WITH_COUNT, max_items: int = MAX_ITEMS)`
146153

147154
Parse given xml file or `lxml` tree, find xpath expressions in it and return:
148155

@@ -174,6 +181,8 @@ If there are more than 1 default namespace, prefix will be incremental:
174181
- `file: str` file path string.
175182
- `itree: lxml.etree._ElementTree` ElementTree object.
176183
- `xpath_base: str` xpath expression to start searching xpaths from.
184+
- `with_count: bool` Include count of elements found with each expression. Default: False
185+
- `max_items: int` limit the number of parsed elements. Default: 100000
177186

178187
## Print result modes
179188
Print xpath expressions and validate by count of elements found with it.

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.2.1rc1
1+
0.3.0rc1

src/xml2xpath/xml2xpath.py

Lines changed: 69 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,19 @@
33
from collections import OrderedDict
44
import errno
55
from io import StringIO
6-
import os.path
6+
from os import path, devnull, strerror
77
import sys
88
from typing import Dict, List, Tuple
99

1010
from lxml import etree
1111

12+
WITH_COUNT = False
13+
MAX_ITEMS = 100000
14+
OUT_FD = sys.stdout
1215

1316
def usage():
1417
helpstr='''
15-
pyxml2xpath <file path> [mode] [initial xpath expression]
18+
pyxml2xpath <file path> [mode] [initial xpath expression] [with element count: yes|true] [max elements: int] [no banner: yes|true]
1619
1720
mode: str
1821
path : print elements xpath expressions (default)
@@ -27,9 +30,16 @@ def usage():
2730
Examples:
2831
pyxml2xpath tests/resources/soap.xml
2932
30-
pyxml2xpath tests/resources/HL7.xml '' '//*[local-name()= "act"]'
33+
pyxml2xpath tests/resources/HL7.xml none '//*[local-name()= "act"]'
3134
3235
pyxml2xpath tests/resources/HL7.xml 'values' '//*[local-name()= "act"]'
36+
37+
# mode : all
38+
# starting at xpath : none
39+
# count elements : False
40+
# Limit elements : 11
41+
# Do not show banner (just xpaths): true
42+
pyxml2xpath ~/tmp/test.html all none none 11 true
3343
'''
3444
print(helpstr)
3545

@@ -81,7 +91,7 @@ def build_path_from_parts(xmap, xp, qname, revns):
8191
elif f'{last}/*{p}' in xmap:
8292
last = f'{last}/*{p}'
8393

84-
def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*') -> OrderedDict[str, Tuple[str, int, List[str]]]:
94+
def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*', *, with_count: bool = WITH_COUNT, max_items: int = MAX_ITEMS) -> OrderedDict[str, Tuple[str, int, List[str]]]:
8595
'''Parse XML document that may contain anonymous namespace.
8696
Returns a dict with original xpath as keys, xpath with qualified names and count of elements found with the latter.
8797
xmap = {
@@ -98,16 +108,17 @@ def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*
98108
namespaces dictionary from current document'''
99109

100110
revns = {v:k or 'ns' for k,v in nsmap.items()}
101-
elst = tree.xpath(xpath_base, namespaces=nsmap)
111+
elements = tree.xpath(xpath_base, namespaces=nsmap)
102112
xmap = OrderedDict()
103-
for ele in elst:
113+
for ele in elements[:max_items]:
104114
xp = tree.getpath(ele)
105-
qname = etree.QName(ele.tag)
106115
#print(f"DEBUG: {xp}", file=sys. stderr)
107116
if xp in xmap:
108117
# Do not update an existing element. Should not enter here, but ...
109118
print(f"ERROR: duplicated path: {xp}",file=sys. stderr)
110119
continue
120+
121+
qname = etree.QName(ele.tag)
111122
if '*' not in xp:
112123
# xpath expression is already qualified
113124
# e.g.:
@@ -143,16 +154,17 @@ def parse_mixed_ns(tree: etree._ElementTree, nsmap: Dict, xpath_base: str = '//*
143154
xmap[xp][2].extend(ele.attrib.keys())
144155

145156
# count elements found with these xpath expressions
146-
for k, v in xmap.items():
147-
# Define a nodeset with qualified expression: (/ns98:feed/ns98:entry/ns98:author)
148-
# and get the first element or none defined by the count of unqualified expression: count(/*/*[9]/*[6])
149-
# (/ns98:feed/ns98:entry/ns98:author)[count(/*/*[9]/*[6])]
150-
# for example: count((author author author)[1])
151-
# the count of that will be 1 and it means both expressions were validated to return results.
152-
xmap[k]= v[0], int(tree.xpath(f"count(({v[0]})[count({k})])", namespaces=nsmap)), v[2]
157+
if with_count:
158+
for k, v in xmap.items():
159+
# Define a nodeset with qualified expression: (/ns98:feed/ns98:entry/ns98:author)
160+
# and get the first element or none defined by the count of unqualified expression: count(/*/*[9]/*[6])
161+
# (/ns98:feed/ns98:entry/ns98:author)[count(/*/*[9]/*[6])]
162+
# for example: count((author author author)[1])
163+
# the count of that will be 1 and it means both expressions were validated to return results.
164+
xmap[k]= v[0], int(tree.xpath(f"count(({v[0]})[count({k})])", namespaces=nsmap)), v[2]
153165
return xmap
154166

155-
def print_xpaths(xmap: Dict, mode: str ="path"):
167+
def print_xpaths(xmap: Dict, mode: str ="path", *, with_count: bool = WITH_COUNT, out_fd = OUT_FD):
156168
'''Print xpath expressions and validate by count of elements found with it.
157169
mode: str
158170
path : print elements xpath expressions (default)
@@ -166,21 +178,20 @@ def print_xpaths(xmap: Dict, mode: str ="path"):
166178

167179
if mode in ["path", "all"]:
168180
for unq_xpath, qxpath_lst in xmap.items():
169-
if qxpath_lst[1] > 0 and mode != "none":
170-
print(qxpath_lst[0])
171-
elif qxpath_lst[1] <= 0:
181+
print(qxpath_lst[0])
182+
if qxpath_lst[1] <= 0 and with_count:
172183
# built xpath didn't find elements
173-
print(f"ERROR: {int(qxpath_lst[1])} elements found with {qxpath_lst[0]} xpath expression.\nOriginal xpath: {unq_xpath}", file=sys. stderr)
184+
print(f"ERROR: {int(qxpath_lst[1])} elements found with {qxpath_lst[0]} xpath expression.\nOriginal xpath: {unq_xpath}", file=sys.stderr)
174185

175186
if mode == "all":
176187
#Print xpath for attributes
177188
for unq_xpath, qxpath_lst in xmap.items():
178189
if qxpath_lst[2] is None:
179190
continue
180-
if qxpath_lst[1] > 0:
181-
for a in qxpath_lst[2]:
182-
print(f"{qxpath_lst[0]}/@{a}")
183-
acount += 1
191+
#if qxpath_lst[1] > 0:
192+
for a in qxpath_lst[2]:
193+
print(f"{qxpath_lst[0]}/@{a}")
194+
acount += 1
184195
acountmsg = f"Found {acount:3} xpath expressions for attributes\n"
185196
elif mode == "raw":
186197
for key, value in xmap.items():
@@ -189,7 +200,7 @@ def print_xpaths(xmap: Dict, mode: str ="path"):
189200
for key, value in xmap.items():
190201
print(value)
191202

192-
print(f"\nFound {len(xmap.keys()):3} xpath expressions for elements\n{acountmsg}")
203+
print(f"\nFound {len(xmap.keys()):3} xpath expressions for elements\n{acountmsg}", file=out_fd)
193204

194205
def build_namespace_dict(tree):
195206
'''Build a namespaces dictionary with prefix for default namespaces.
@@ -214,11 +225,11 @@ def build_namespace_dict(tree):
214225
nsmap[ns] = v
215226
return nsmap
216227

217-
def fromstring(xmlstr: str, *, xpath_base: str = '//*') -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
228+
def fromstring(xmlstr: str, *, xpath_base: str = '//*', with_count: bool = WITH_COUNT, max_items: int = MAX_ITEMS) -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
218229
doc = etree.parse(StringIO(xmlstr))
219-
return parse(file=None, itree=doc, xpath_base=xpath_base)
230+
return parse(file=None, itree=doc, xpath_base=xpath_base, with_count=with_count, max_items=max_items)
220231

221-
def parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*') -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
232+
def parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*', with_count: bool = WITH_COUNT, max_items: int = MAX_ITEMS) -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
222233
'''Parse given xml file, find xpath expressions in it and return
223234
- The ElementTree for further usage
224235
- The sanitized namespaces map (no None keys)
@@ -235,22 +246,24 @@ def parse(file: str, *, itree: etree._ElementTree = None, xpath_base: str = '//*
235246
itree: lxml.etree._ElementTree
236247
ElementTree object
237248
xpath_base: xpath expression to start searching xpaths from.
249+
with_count: Include count of elements found with each expression. Default: False
250+
max_items: limit the number of parsed elements. Default: 100000
238251
'''
239252

240253
try:
241254
tree = itree
242255
if tree is None:
243-
if not os.path.isfile(file):
244-
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file)
256+
if not path.isfile(file):
257+
raise FileNotFoundError(errno.ENOENT, strerror(errno.ENOENT), file)
245258
with open(file, "r") as fin:
246259
tree = etree.parse(fin)
247260

248261
nsmap = build_namespace_dict(tree)
249262
#print(f"Namespaces found: {nsmap}")
250-
xmap = parse_mixed_ns(tree, nsmap, xpath_base)
263+
xmap = parse_mixed_ns(tree, nsmap, xpath_base, with_count=with_count, max_items=max_items)
251264
return (tree, nsmap, xmap)
252265
except Exception as e:
253-
print("ERROR.", type(e).__name__, "–", e)
266+
print("ERROR.", type(e).__name__, "–", e, file=sys.stderr)
254267
raise(e)
255268

256269
def main():
@@ -259,25 +272,35 @@ def main():
259272
sys.exit()
260273

261274
file = sys.argv[1]
275+
mode = "path"
276+
xpath_base = "//*"
277+
with_count = WITH_COUNT
278+
max_items = MAX_ITEMS
279+
out_fd = OUT_FD
262280

263-
if not os.path.isfile(file):
264-
print(f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}")
281+
if not path.isfile(file):
282+
print(f"[Errno {errno.ENOENT}] {strerror(errno.ENOENT)}", file=sys.stderr)
265283
sys.exit(errno.ENOENT)
266284

267-
if len(sys.argv) > 2 and sys.argv[2] != '':
268-
mode = sys.argv[2]
269-
else:
270-
mode = "path"
271-
272-
if len(sys.argv) > 3:
273-
xpath_base = sys.argv[3]
274-
else:
275-
xpath_base = "//*"
285+
for i, arg in enumerate(sys.argv):
286+
if str(arg).lower() in ['', 'none']:
287+
continue
288+
289+
if i == 2:
290+
mode = arg
291+
elif i == 3:
292+
xpath_base = arg
293+
elif i == 4 and str(arg).lower() in ['yes', 'true']:
294+
with_count = True
295+
elif i == 5:
296+
max_items = int(arg)
297+
elif i == 6 and str(arg).lower() in ['yes', 'true']:
298+
out_fd = open(devnull, 'w')
276299

277-
print(f"Running...\n{'file':10}: {file}\n{'mode':10}: {mode}\n{'xpath_base':10}: '{xpath_base}'")
278-
nsmap, xmap = parse(file, xpath_base=xpath_base)[1:]
279-
print(f"namespaces: {nsmap}\n")
280-
print_xpaths(xmap, mode)
300+
print(f"Running...\n{'file':10}: {file}\n{'mode':10}: {mode}\n{'xpath_base':10}: '{xpath_base}'\n{'with_count':10}: {with_count}\n{'max_items':10}: {max_items}", file=out_fd, flush=True)
301+
nsmap, xmap = parse(file, xpath_base=xpath_base, with_count=with_count, max_items=max_items)[1:]
302+
print(f"namespaces: {nsmap}\n", file=out_fd, flush=True)
303+
print_xpaths(xmap, mode, out_fd=out_fd)
281304

282305
if __name__ == "__main__":
283306
main()

tests/test_01.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def test_all_samples_basic(self):
99
print("")
1010
for xfile in sample_paths:
1111
print(f"Testing '{xfile}'")
12-
xmap = xml2xpath.parse(xfile, xpath_base=xpath_base)[2]
12+
xmap = xml2xpath.parse(xfile, xpath_base=xpath_base, with_count=True)[2]
1313
print(f" --> Found {len(xmap.keys())} xpath expressions")
1414
assert len(xmap.keys()) > 0
1515
# assert all found expressions exist at least once in the document.
@@ -21,7 +21,7 @@ def test_parse_with_initial_xpath(self):
2121
xpath_base = '//*[local-name()="incident"]'
2222
print(f"\nTesting '{filepath}' starting at: '{xpath_base}'")
2323

24-
nsmap, xmap = xml2xpath.parse(filepath, xpath_base=xpath_base)[1:]
24+
nsmap, xmap = xml2xpath.parse(filepath, xpath_base=xpath_base, with_count=True)[1:]
2525
print(f" --> Found {len(xmap.keys())} xpath expressions")
2626
print(f" --> Found {len(nsmap.keys())} namespaces")
2727
print(f" --> nsmap: {nsmap}")
@@ -37,7 +37,7 @@ def test_fromstring(self):
3737
xmlstr = fd.read()
3838
print(f"\nTesting fromstring() from '{filepath}'")
3939

40-
nsmap, xmap = xml2xpath.fromstring(xmlstr)[1:]
40+
nsmap, xmap = xml2xpath.fromstring(xmlstr, with_count=True)[1:]
4141
print(f" --> Found {len(xmap.keys())} xpath expressions")
4242
print(f" --> Found {len(nsmap.keys())} namespaces")
4343
print(f" --> nsmap: {nsmap}")
@@ -54,7 +54,7 @@ def test_fromstring_and_xpath_base(self):
5454
xpath_base = '//*[local-name()="incident"]'
5555
print(f"\nTesting fromstring() from '{filepath}' starting at: '{xpath_base}'")
5656

57-
nsmap, xmap = xml2xpath.fromstring(xmlstr, xpath_base=xpath_base)[1:]
57+
nsmap, xmap = xml2xpath.fromstring(xmlstr, xpath_base=xpath_base, with_count=True)[1:]
5858
print(f" --> Found {len(xmap.keys())} xpath expressions")
5959
print(f" --> Found {len(nsmap.keys())} namespaces")
6060
print(f" --> nsmap: {nsmap}")

0 commit comments

Comments
 (0)