# XPathUtil.py
# A collection of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)
import re

import bs4
from bs4 import BeautifulSoup as bs

def is_clickable(xpath):
    # We consider any xpath that has an 'a', 'button',
    # or 'input' tag to be clickable, as it most likely
    # contains a link. It may make sense to also check
    # <input type="button"> or other tags...
    index_regex = re.compile(r"\[[^\]]*\]")  # match index and id brackets

    # check xpath for necessary tags
    temp = re.sub(index_regex, "", xpath)
    temp = temp.split("/")
    if "a" in temp or "button" in temp or "input" in temp:
        return True
    return False
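# Illustrative behavior (these sample XPaths are made up for this
# note, not taken from real scraping output):
#
#   >>> is_clickable("/html/body/div[2]/a[3]")
#   True
#   >>> is_clickable('//*/div[@id="nav"]/button')
#   True
#   >>> is_clickable("/html/body/div/span")
#   False
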
# ExtractXPath(element, use_id)
# - element: a bs4 tag node
# - use_id: defaults to True
#
# Traverses up the tag tree from a BeautifulSoup node
# and returns the XPath of that node.
#
# Using ids is preferred when the xpath will be used
# outside of BeautifulSoup. Since an id is unique within
# the tree, it allows a wildcard to replace all parent
# nodes, which minimizes the chance of incorrect indexing
# (as can occur when javascript changes a page during
# processing).
class ExtractXPathError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)

def check_previous_tags(node):
    # Count earlier siblings with the same tag name to get
    # this node's 1-based index among them.
    counter = 1
    for tag in node.previous_siblings:
        if not isinstance(tag, bs4.element.Tag):
            continue
        elif tag.name == node.name:
            counter += 1

    # XPath name, indexed only when same-name siblings precede it
    if counter > 1:
        xpath = node.name + "[%d]" % counter
    else:
        xpath = node.name
    return xpath
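# For example (illustrative markup): given
# '<div><span></span><p>a</p><p>b</p></div>', the second <p> has one
# earlier sibling of the same name, so check_previous_tags returns
# 'p[2]'; for the first <p> it returns just 'p', and for the lone
# <span> it returns 'span'.
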
def ExtractXPath(element, use_id=True):
    # Check that element is a tag node
    if not isinstance(element, bs4.element.Tag):
        raise ExtractXPathError(
            "%s is not a supported data type. "
            "Only tag nodes from the tag tree are accepted." % type(element)
        )

    # Starting node
    # Check id first
    if use_id and element.get("id") is not None:
        return "//*/" + element.name + '[@id="' + element.get("id") + '"]'
    xpath = check_previous_tags(element)

    # Parent Nodes
    for parent in element.parents:
        # End of XPath - exclude from string
        if parent.name == "[document]":
            break
        # Check id first
        if use_id and parent.get("id") is not None:
            return (
                "//*/" + parent.name + '[@id="' + parent.get("id") + '"]/' + xpath
            )  # noqa
        xpath = check_previous_tags(parent) + "/" + xpath
    xpath = "/" + xpath
    return xpath
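# A quick sketch of typical output (hypothetical markup):
#
#   >>> soup = bs('<html><body><div><p>a</p><p id="x">b</p></div></body></html>', "lxml")
#   >>> ExtractXPath(soup.find_all("p")[0])
#   '/html/body/div/p'
#   >>> ExtractXPath(soup.find_all("p")[1])  # id short-circuits to a wildcard path
#   '//*/p[@id="x"]'
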
# xp1_wildcard adds wildcard functionality to XPath 1.0
# strings using the limited function set supported by the 1.0
# implementation.
#
# xp1_lowercase likewise adds lowercase functionality.
#
# Hopefully you never need these...
def xp1_lowercase(string):
    return (
        "translate("
        + string
        + ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
    )  # noqa
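# For example, xp1_lowercase("text()") produces
#   "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
# which can be embedded in a predicate for a case-insensitive match,
# e.g. "[" + xp1_lowercase("text()") + "='submit']" (illustrative value).
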
# Converts a string with a wildcard in it to an XPath 1.0
# compatible predicate. *** ONLY SUPPORTS 1 WILDCARD ***
# string: string w/ wildcard that you are searching for
# attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)
def xp1_wildcard(attr, string, normalize=True):
    parts = string.split("*")
    if normalize:
        attr = "normalize-space(" + attr + ")"

    if len(parts) != 2:
        print("ERROR: This function is meant to support 1 wildcard")
        return "[" + attr + "='" + string + "']"
    else:
        pt1 = ""
        pt2 = ""
        if parts[0] != "":
            pt1 = "starts-with(" + attr + ", '" + parts[0] + "')"
        if parts[1] != "":
            # Match the text after the wildcard against the trailing
            # characters of attr.
            pt2 = (
                "contains(substring("
                + attr
                + ", string-length("
                + attr
                + ")-"
                + str(len(parts[1]) - 1)
                + "), '"
                + parts[1]
                + "')"
            )

        if pt1 == "" and pt2 != "":
            return "[" + pt2 + "]"
        elif pt1 != "" and pt2 == "":
            return "[" + pt1 + "]"
        elif pt1 != "" and pt2 != "":
            return "[" + pt1 + " and " + pt2 + "]"
        else:
            print("ERROR: The string is empty")
            return "[" + attr + "='" + string + "']"
def main():
    # Output some sample XPaths
    print("--- Sample XPaths ---")
    from random import choice
    from urllib.request import urlopen

    rsp = urlopen("http://www.reddit.com/")
    if rsp.getcode() == 200:
        soup = bs(rsp.read(), "lxml")
        elements = soup.find_all(string=re.compile("[A-Za-z0-9]{10,}"))
        for _ in range(5):
            element = choice(elements).parent
            print("HTML")
            print(element)
            print("XPath")
            print(ExtractXPath(element))
            print("**************")


if __name__ == "__main__":
    main()