Skip to content

Instantly share code, notes, and snippets.

@waltercacau
Created April 20, 2013 11:39
Show Gist options
  • Save waltercacau/5425707 to your computer and use it in GitHub Desktop.
Simple utility class that eases the use of lxml with CSS selectors through a jQuery-like interface.
# Copyright 2013 Walter Cacau <waltercacau@gmail.com>
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import re
import lxml.cssselect as cssselect
import lxml.html as htmlparser
# Matches one or more consecutive whitespace characters. Written as a raw
# string so that ``\s`` reaches the regex engine untouched instead of being
# treated as an (invalid) string escape sequence.
WHITE_SPACE_RE = re.compile(r"\s+")
def from_string(html):
    """Parse *html* as a full HTML document and wrap its root element.

    The string is handed to ``lxml.html.document_fromstring`` and the
    resulting element becomes the single node of a new ``JQueryLxml``.
    """
    document_root = htmlparser.document_fromstring(html)
    return JQueryLxml(document_root)
class JQueryLxml(object):
    """jQuery-like wrapper around an ordered collection of lxml nodes.

    The wrapped nodes are kept in ``self.nodes`` (a tuple). Query methods
    (``one``, ``all``, ``filter``, ``next``, ``parent``, indexing, iteration)
    return new ``JQueryLxml`` instances, while accessor methods (``attr``,
    ``text``, ``html``, ``val``, ...) read from the *first* wrapped node and
    therefore raise ``IndexError`` when the collection is empty.
    """

    @classmethod
    def one_from(cls, arg, root):
        """Return *arg* unchanged if it is already a wrapper, else look it up.

        When *arg* is not an instance of this class it is treated as a CSS
        selector and resolved through ``root.one(arg)``.
        """
        if isinstance(arg, cls):
            return arg
        return root.one(arg)

    def __init__(self, *nodes):
        """Wrap the given nodes; each positional argument is one node."""
        self.nodes = nodes

    def one(self, selector):
        """Return a wrapper around the first element matching *selector*.

        The wrapped nodes are searched in order and the first match found
        under any of them is returned.

        Raises:
            Exception: if no wrapped node yields a match.
        """
        sel = cssselect.CSSSelector(selector)
        for node in self.nodes:
            matches = sel(node)
            if matches:
                return JQueryLxml(matches[0])
        raise Exception('Could not find an element matching {}'.format(selector))

    def exists(self, selector):
        """Return True if *selector* matches under any wrapped node."""
        sel = cssselect.CSSSelector(selector)
        return any(sel(node) for node in self.nodes)

    def all(self, selector):
        """Return a wrapper around every element matching *selector*.

        Matches are concatenated in the order of the wrapped nodes; the
        result is an empty wrapper when nothing matches.
        """
        sel = cssselect.CSSSelector(selector)
        result_nodes = []
        for node in self.nodes:
            result_nodes.extend(sel(node))
        return JQueryLxml(*result_nodes)

    def hasAttr(self, name):
        """Return True if the first node carries the attribute *name*."""
        return name in self.nodes[0].attrib

    def hasClass(self, name):
        """Return True if *name* is one of the first node's CSS classes."""
        return name in WHITE_SPACE_RE.split(self.attr("class") or "")

    def attr(self, name):
        """Return the first node's attribute *name*, or None if absent."""
        try:
            return self.nodes[0].attrib[name]
        except KeyError:
            return None

    def text(self):
        """Return ``text_content()`` of the first node."""
        return self.nodes[0].text_content()

    def html(self):
        """Return the first node serialized by ``tostring(pretty_print=True)``."""
        return htmlparser.tostring(self.nodes[0], pretty_print=True)

    def val(self):
        """Return the ``value`` attribute of the first node."""
        return self.nodes[0].value

    def next(self):
        """Return a wrapper around the next sibling of each wrapped node.

        Nodes without a next sibling contribute nothing to the result.
        """
        siblings = (node.getnext() for node in self.nodes)
        return JQueryLxml(*[sib for sib in siblings if sib is not None])

    def _form_values_with_empties(self, node):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Inputs whose value is ``None`` are reported with an empty string
        rather than being dropped. Unnamed fields, unchecked checkable
        inputs and submit/image/reset inputs are skipped.
        """
        results = []
        for el in node.inputs:
            name = el.name
            if not name:
                continue
            tag = htmlparser._nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # A multiple select exposes an iterable of chosen values.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                results.append((name, value if value is not None else ""))
        return results

    def form_values(self, override=None):
        """Return the first node's form fields as ``(name, value)`` tuples.

        Args:
            override: optional mapping of field name to value. Any form
                field whose name appears here is dropped and the mapping's
                own items are appended at the end of the result.

        Returns:
            A list of ``(name, value)`` tuples.
        """
        # Avoid a shared mutable default; None means "no overrides".
        override = {} if override is None else override
        values = [
            (k, v)
            for k, v in self._form_values_with_empties(self.nodes[0])
            if k not in override
        ]
        # extend() accepts the dict view directly, so this works on both
        # Python 2 (list) and Python 3 (view) without list concatenation.
        values.extend(override.items())
        return values

    def is_checked(self):
        """Return the ``checked`` attribute of the first node."""
        # The wrapper itself has no ``checked`` attribute; delegate to the
        # underlying element.
        return self.nodes[0].checked

    def filter(self, match):
        """Return a wrapper with the nodes for which *match* is truthy.

        *match* is called with each node wrapped individually.
        """
        return JQueryLxml(*[
            node for node in self.nodes if match(JQueryLxml(node))
        ])

    def map(self, func):
        """Return a list of ``func(wrapper)`` for each individually wrapped node."""
        return [func(JQueryLxml(node)) for node in self.nodes]

    def parent(self):
        """Return a wrapper around the parent of each wrapped node.

        Nodes without a parent contribute nothing to the result.
        """
        parents = (node.getparent() for node in self.nodes)
        return JQueryLxml(*[par for par in parents if par is not None])

    def __iter__(self):
        """Iterate over the nodes, each wrapped in its own ``JQueryLxml``."""
        return (JQueryLxml(node) for node in self.nodes)

    def __getitem__(self, i):
        """Return a wrapper for the node at index *i*, or for a slice of nodes."""
        if isinstance(i, slice):
            # A slice yields a tuple of nodes; unpack it so each one is
            # wrapped as a node rather than the tuple itself.
            return JQueryLxml(*self.nodes[i])
        return JQueryLxml(self.nodes[i])

    def __len__(self):
        """Return the number of wrapped nodes."""
        return len(self.nodes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment