Skip to content

Commit 0fd5c9f

Browse files
authored
Merge pull request #31 from mluis7/feature_20_21
feature_20_21: Allow reading XML from stdin, provide raw mode output as json
2 parents c9b3a9b + 3a398bd commit 0fd5c9f

File tree

9 files changed

+202
-69
lines changed

9 files changed

+202
-69
lines changed

README.md

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,21 @@ It can be used as a [command line utility](#command-line-usage) or as a [module]
4949
## Installation
5050
Installing from PyPI
5151

52-
`pip3.9 install pyxml2xpath`
52+
`pip3.12 install pyxml2xpath`
5353

5454
Or building from source repo
5555

5656
```bash
5757
git clone https://github.com/mluis7/pyxml2xpath.git
5858
cd pyxml2xpath
59-
python3.9 -m build
60-
python3.9 -m pip install dist/pyxml2xpath-0.2.0-py3-none-any.whl --upgrade
59+
python3.12 -m build
60+
python3.12 -m pip install dist/pyxml2xpath-0.2.0-py3-none-any.whl --upgrade
6161
```
6262

6363
Alternative without cloning the repo yourself
6464

6565
```
66-
pip3.9 install git+https://github.com/mluis7/pyxml2xpath.git
66+
pip3.12 install git+https://github.com/mluis7/pyxml2xpath.git
6767
```
6868

6969
## Command line usage
@@ -84,7 +84,13 @@ pyxml2xpath tests/resources/HL7.xml 'values' '//*[local-name()= "act"]'
8484

8585
pyxml2xpath ~/tmp/test.html all none none 11 true
8686
```
87+
To read from stdin, pass `-` as the file path
8788

89+
```bash
90+
cat tests/resources/simple-ns.xml | pyxml2xpath - xpath none false 100 false
91+
92+
pyxml2xpath - xpath none false 100 false < tests/resources/simple-ns.xml
93+
```
8894

8995
## Module usage
9096

@@ -175,12 +181,16 @@ If there are more than 1 default namespace, prefix will be incremental:
175181
## Print result modes
176182
Print xpath expressions and validate them by the count of elements found with each one.
177183

178-
`mode` argument values (optional):
184+
**mode** argument values (optional):
179185

180-
- `path` : print elements xpath expressions (default)
181-
- `all` : also print attribute xpath expressions
182-
- `raw` : print unqualified xpath and found values (tuple)
183-
- `values`: print tuple of found values only
186+
| Mode | Description |
187+
| ---- | ----------- |
188+
| `xpath` | print elements xpath expressions (default) |
189+
| `all` | also print attribute xpath expressions |
190+
| `raw` | print unqualified xpath and found values (tuple) |
191+
| `values` | print tuple of found values only |
192+
| `raw_json` | json string with internal xpath and namespace maps. |
193+
| `raw_json_pretty` | same as raw_json with pretty print |
184194

185195
`pyxml2xpath ~/tmp/soap-ws-oasis.xml 'all'`
186196

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.3.4
1+
0.3.5

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ authors = [
1616

1717
description = "Generate xpath expressions from XML document."
1818
readme = "README.md"
19-
license = {text = "GPL-3.0"}
19+
license = "GPL-3.0"
2020
keywords = ["xpath", "xml"]
2121
classifiers = [
2222
"Development Status :: 5 - Production/Stable",

src/xml2xpath/xml2xpath.py

Lines changed: 118 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
'''Find all xpath expressions on XML document'''
1+
'''Generate xpath expressions from an XML document'''
22

33

44
from collections import OrderedDict
5-
from io import StringIO
5+
from io import StringIO, BytesIO
66
from os import path, devnull, strerror
7-
from typing import Dict, List, Tuple
7+
from typing import Dict, List, Tuple, Union
88
import errno
99
import sys
10+
import json
1011

1112
from lxml import etree
1213

@@ -15,17 +16,22 @@
1516
WITH_COUNT = False
1617
MAX_ITEMS = 100000
1718
OUT_FD = sys.stdout
18-
modes = ['xpath', 'all', 'raw', 'values']
19+
modes = ['xpath', 'all', 'raw', 'values', 'raw_json', 'raw_json_pretty']
1920

2021
def usage():
2122
helpstr='''
2223
pyxml2xpath <file path> [mode] [initial xpath expression] [with element count: yes|true] [max elements: int] [no banner: yes|true]
2324
25+
File path: pass '-' to read from stdin
26+
2427
mode: str
25-
path : print elements xpath expressions (default)
28+
xpath : print elements xpath expressions (default)
2629
all : also print attribute xpath expressions
2730
raw : print unqualified xpath and found values (list)
2831
values: print list of found values only
32+
raw_json: json string with internal xpath and namespace maps.
33+
{"xpath": { ... }, "namespaces": { ... }}
34+
raw_json_pretty: same as raw_json with pretty print
2935
3036
Initial xpath expression: str
3137
Start at some element defined by an xpath expression.
@@ -54,6 +60,16 @@ def _get_qualified_name(qname, revns):
5460
lname = f"{revns.get(qname.namespace)}:{qname.localname}"
5561
return lname
5662

63+
def get_encoding_from_fragment(xml_frag):
64+
'''
65+
Get encoding attribute value from xml declaration if present.
66+
:param xml_frag: first 100 characters from XML string.
67+
'''
68+
declstr = xml_frag.split(b'?>')[0]
69+
# Fake a minimal doc to parse declaration ensuring encoding is valid for lxml
70+
frdoc= etree.parse(BytesIO(declstr + b'?><root/>'))
71+
return frdoc.docinfo.encoding
72+
5773
def _get_dict_list_value(value, element):
5874
'''Initialize tuple for xpath dictionary values.
5975
Items:
@@ -110,24 +126,23 @@ def parse_mixed_ns(tree: etree._ElementTree,
110126
'''Parse XML document that may contain anonymous namespace.
111127
Returns a dict with original xpath as keys, xpath with qualified names and
112128
count of elements found with the latter or None if an error occurred.
113-
xmap = {
129+
xmap = ``{
114130
"/some/xpath/*[1]": ("/some/xpath/ns:ele1", 1, ["id", "class"])
115-
}
131+
}``
116132
To get the qualified xpath:
117-
xmap['/some/xpath/*[1]'][0]
118-
119-
Parameters
120-
----------
121-
tree: lxml.etree._ElementTree
133+
``xmap['/some/xpath/*[1]'][0]``
134+
135+
:param tree: lxml.etree._ElementTree
122136
ElementTree from current document
123-
nsmap: dict
137+
:param nsmap: dict
124138
namespaces dictionary from current document
125-
xpath_base: str
139+
:param xpath_base: str
126140
Xpath expression to start from
127-
with_count: bool
141+
:param with_count: bool
128142
add count of found elements (performance cost on large documents).
129-
max_items: int
130-
max number of elements to parse. Default: 100000'''
143+
:param max_items: int
144+
max number of elements to parse. Default: 100000
145+
'''
131146

132147
revns = {v:k or 'ns' for k,v in nsmap.items()}
133148
elements = tree.xpath(xpath_base, namespaces=nsmap)
@@ -203,36 +218,68 @@ def parse_mixed_ns(tree: etree._ElementTree,
203218
xmap[xp] = xmap[xp][0], xcount, xmap[xp][2]
204219
return xmap
205220

221+
222+
def get_json_response(xmap, nsmap, mode):
223+
'''
224+
Return a json string as
225+
{"xpath": { ... },
226+
"namespaces": { ... }}
227+
:param xmap:
228+
:param nsmap:
229+
:param mode:
230+
'''
231+
indent = None
232+
raw_json = {}
233+
raw_json['xpath'] = xmap
234+
if nsmap:
235+
raw_json['namespaces'] = nsmap
236+
if mode == 'raw_json_pretty':
237+
indent = 4
238+
return json.dumps(raw_json, indent=indent)
239+
206240
def print_xpaths(xmap: Dict,
207-
mode: str ="path",
241+
mode: str ="xpath",
208242
*,
209-
out_fd = OUT_FD):
210-
'''Print xpath expressions and validate by count of elements found with it.
243+
out_fd = OUT_FD, nsmap= None):
244+
'''
245+
Print xpath expressions according to 'mode' argument.
246+
247+
:param xmap: map of xpath expressions
248+
:param mode: output mode
249+
:param out_fd: output file descriptor
250+
:param nsmap: namespaces map
251+
211252
mode: str
212-
path : print elements xpath expressions (default)
253+
xpath : print elements xpath expressions (default)
213254
all : also print attribute xpath expressions
214255
raw : print unqualified xpath and found values (tuple)
256+
raw_json: json string with internal xpath and namespace maps.
257+
{"xpath": { ... },
258+
"namespaces": { ... }}
259+
raw_json_pretty: same as raw_json with pretty print
215260
values: print tuple of found values only
216261
'''
217262

218263
acount=0
219264
acountmsg=''
220265

221-
for unq_xpath, qual_xpath_lst in xmap.items():
222-
if mode not in ['raw', 'values']:
223-
print(qual_xpath_lst[0])
224-
225-
if mode == "all":
226-
#Print xpath for attributes
227-
if qual_xpath_lst[2] is not []:
228-
for a in qual_xpath_lst[2]:
229-
print(f"{qual_xpath_lst[0]}/@{a}")
230-
acount += 1
231-
acountmsg = f"Found {acount:3} xpath expressions for attributes\n"
232-
if mode == "raw":
233-
print(unq_xpath, qual_xpath_lst)
234-
elif mode == "values":
235-
print(qual_xpath_lst)
266+
if mode in ['raw_json', 'raw_json_pretty']:
267+
print(get_json_response(xmap, nsmap, mode))
268+
else:
269+
for unq_xpath, qual_xpath_lst in xmap.items():
270+
if mode in ['xpath', 'all']:
271+
print(qual_xpath_lst[0])
272+
if mode == "all":
273+
#Print xpath for attributes
274+
if qual_xpath_lst[2] is not []:
275+
for a in qual_xpath_lst[2]:
276+
print(f"{qual_xpath_lst[0]}/@{a}")
277+
acount += 1
278+
acountmsg = f"Found {acount:3} xpath expressions for attributes\n"
279+
elif mode == "raw":
280+
print(unq_xpath, qual_xpath_lst)
281+
elif mode == "values":
282+
print(qual_xpath_lst)
236283

237284
print(f"\nFound {len(xmap.keys()):3} xpath expressions for elements\n{acountmsg}", file=out_fd)
238285

@@ -259,11 +306,25 @@ def build_namespace_dict(tree: etree._ElementTree) -> Dict[str, str]:
259306
nsmap[ns] = v
260307
return nsmap
261308

262-
def fromstring(xmlstr: str, *,
309+
def fromstring(xmlstr: Union[str,bytes], *,
263310
xpath_base: str = '//*',
264311
with_count: bool = WITH_COUNT,
265-
max_items: int = MAX_ITEMS) -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
266-
doc = etree.parse(StringIO(xmlstr))
312+
max_items: int = MAX_ITEMS,
313+
encoding='utf-8') -> (etree._ElementTree, Dict[str, str], OrderedDict[str, Tuple[str, int, List[str]]]):
314+
'''
315+
Parse from string. If the document contains an encoding in XML declaration, use it.
316+
If it starts with an XML declaration but has no ``encoding`` attribute, ``utf-8`` will be used by default.
317+
318+
:param xmlstr: document or fragment as string or bytes.
319+
:param xpath_base: xpath expression to start searching xpaths for.
320+
:param with_count: Include count of elements found with each expression. Default: False
321+
:param max_items: limit the number of parsed elements. Default: 100000
322+
:param encoding: xml string encoding, ``utf-8`` by default. Use :func:`get_encoding_from_fragment` to get encoding.
323+
'''
324+
xmlbytes = xmlstr
325+
if isinstance(xmlstr, str):
326+
xmlbytes = xmlstr.encode(encoding)
327+
doc = etree.parse(BytesIO(xmlbytes))
267328
return parse(file=None, itree=doc, xpath_base=xpath_base, with_count=with_count, max_items=max_items)
268329

269330
def parse(file: str, *,
@@ -283,20 +344,20 @@ def parse(file: str, *,
283344
284345
Parameters
285346
----------
286-
file: file path string
287-
itree: lxml.etree._ElementTree
347+
:param file: file path string
348+
:param itree: lxml.etree._ElementTree
288349
ElementTree object
289-
xpath_base: xpath expression to start searching xpaths for.
290-
with_count: Include count of elements found with each expression. Default: False
291-
max_items: limit the number of parsed elements. Default: 100000
350+
:param xpath_base: xpath expression to start searching xpaths for.
351+
:param with_count: Include count of elements found with each expression. Default: False
352+
:param max_items: limit the number of parsed elements. Default: 100000
292353
'''
293354

294355
try:
295356
tree = itree
296357
if tree is None:
297358
if not path.isfile(file):
298359
raise FileNotFoundError(errno.ENOENT, strerror(errno.ENOENT), file)
299-
with open(file, "r") as fin:
360+
with open(file, "rb") as fin:
300361
tree = etree.parse(fin)
301362

302363
nsmap = build_namespace_dict(tree)
@@ -308,20 +369,24 @@ def parse(file: str, *,
308369
raise(e)
309370

310371
def main():
372+
if len(sys.argv) == 1:
373+
print(f"ERROR: No arguments passed.\nRun with -h for help.", file=sys.stderr)
374+
sys.exit(1)
375+
311376
if sys.argv[1] in ["-h", "--help"]:
312377
usage()
313378
sys.exit()
314379

315380
file = sys.argv[1]
316-
mode = "path"
381+
mode = "xpath"
317382
xpath_base = XPATH_ALL
318383
with_count = WITH_COUNT
319384
max_items = MAX_ITEMS
320385
out_fd = OUT_FD
321386
no_banners = False
322387
warns = None
323388

324-
if not path.isfile(file):
389+
if file != '-' and not path.isfile(file):
325390
print(f"[Errno {errno.ENOENT}] {strerror(errno.ENOENT)}", file=sys.stderr)
326391
sys.exit(errno.ENOENT)
327392

@@ -357,10 +422,14 @@ def main():
357422
print(f"{'no_banners':10}: {no_banners}", file=out_fd, flush=True)
358423
if warns is not None:
359424
print(f"\n{warns}\n", file=sys.stderr)
360-
nsmap, xmap = parse(file, xpath_base=xpath_base, with_count=with_count, max_items=max_items)[1:]
425+
if file != '-':
426+
nsmap, xmap = parse(file, xpath_base=xpath_base, with_count=with_count, max_items=max_items)[1:]
427+
else:
428+
xmlstr = sys.stdin.buffer.read()
429+
nsmap, xmap = fromstring(xmlstr, xpath_base=xpath_base, with_count=with_count, max_items=max_items)[1:]
361430
if xmap is not None:
362431
print(f"namespaces: {nsmap}\n", file=out_fd, flush=True)
363-
print_xpaths(xmap, mode, out_fd=out_fd)
432+
print_xpaths(xmap, mode, out_fd=out_fd, nsmap=nsmap)
364433
else:
365434
sys.exit(1)
366435

tests/resources/iso-8859-1.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<?xml version="1.0" encoding="ISO-8859-1"?>
2+
<doc>
3+
<más>más</más>
4+
<a>a</a>
5+
</doc>

tests/resources/shift-jis.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<?xml version="1.0" encoding="shift_jis"?>
2+
<doc>
3+
<‚±‚ê>&#x005C;&#x007E;&#x2015;&#x005C;&#x301C;
4+
&#x2016;&#x2212;&#x00A2;&#x00A3;&#x00AC;</‚±‚ê>
5+
<a>&#x005C;&#x007E;&#x2015;&#x005C;&#x301C;
6+
&#x2016;&#x2212;&#x00A2;&#x00A3;&#x00AC;</a>
7+
</doc>

0 commit comments

Comments
 (0)