diff --git a/htmlapi_client.py b/htmlapi_client.py
index 092cbeb..bd20c2d 100644
--- a/htmlapi_client.py
+++ b/htmlapi_client.py
@@ -16,11 +16,14 @@
import urllib
import urllib2
import urlparse
+
from lxml import etree
+
def _normalize_whitespace(s):
return ' '.join(s.split())
+
def _extract_text_help(root, acc):
if root.text is not None and root.text.strip():
acc.append(_normalize_whitespace(root.text.strip()))
@@ -30,8 +33,10 @@ def _extract_text_help(root, acc):
acc.append(_normalize_whitespace(child.tail.strip()))
return acc
+
def _extract_text(root):
    """Return the text gathered from ``root`` as one space-separated string.

    The individual fragments are collected by ``_extract_text_help`` into a
    fresh list, then joined with single spaces.
    """
    fragments = _extract_text_help(root, [])
    return ' '.join(fragments)
+
def _extract(elt, doc):
"""This function takes a given DOM node 'elt' and attempts to interpret
@@ -46,15 +51,18 @@ def _extract(elt, doc):
if target is not None: return _extract(target, doc)
else:
up = urlparse.urlparse(href)
- remote_doc = enter(urlparse.urlunparse((up.scheme, up.netloc, up.path, up.params, up.query, '')))
+ remote_doc = enter(urlparse.urlunparse((up.scheme, up.netloc,
+ up.path, up.params, up.query, '')))
if up.fragment:
- target = remote_doc._doc.getroot().find(".//*[@id='%s']" % up.fragment)
+ target = remote_doc._doc.getroot().find(".//*[@id='%s']" %
+ up.fragment)
if target is not None: return _extract(target, remote_doc)
if len(remote_doc.objects) == 1: return remote_doc.objects[0]
return _extract(remote_doc._doc.getroot(), remote_doc)
if tag == 'img': return elt.attrib['src']
return _extract_text(elt)
+
def _value_of(doc, fragment=''):
if fragment:
target = doc._doc.getroot().find(".//*[@id='%s']" % fragment)
@@ -63,9 +71,11 @@ def _value_of(doc, fragment=''):
if len(doc.objects) > 0: return doc.objects
return _extract(doc._doc.getroot(), doc)
+
class Link(object):
"""Links are basically a representation of HTML tags. The main
thing you can do with a Link is to follow it."""
+
def __init__(self, elt, doc):
self._elt = elt
self._doc = doc
@@ -73,7 +83,6 @@ def __init__(self, elt, doc):
def __repr__(self):
return "" % (self._elt.attrib['href'], id(self))
-
def follow(self):
href = self._elt.attrib['href']
resolved = urlparse.urljoin(self._doc._url, href)
@@ -88,6 +97,7 @@ def follow(self):
remote_doc = enter(resolved_base)
return _value_of(remote_doc, up.fragment)
+
class Form(object):
"""Forms are a representation of an HTML