# XPathUtil.py
# A collection of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)
import re

import bs4
from bs4 import BeautifulSoup as bs


def is_clickable(xpath):
    # We consider any xpath that has an 'a', 'button',
    # or 'input' tag to be clickable, as it most likely
    # contains a link. It may make sense to also check
    # <input type="button"> or other tags...
    index_regex = re.compile(r"\[[^\]]*\]")  # match index and id brackets
    # Strip index/id brackets, then check the xpath for clickable tags
    temp = re.sub(index_regex, "", xpath)
    temp = temp.split("/")
    return "a" in temp or "button" in temp or "input" in temp
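
# A quick doctest-style illustration of is_clickable; the paths below are
# made up for demonstration:
#   >>> is_clickable('/html/body/div[2]/a[1]')
#   True
#   >>> is_clickable('/html/body/div[@id="main"]/span')
#   False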


# ExtractXPath(element, use_id)
# - element: a bs4 tag node
# - use_id: defaults to True
#
# Traverses up the tag tree of a Beautiful Soup node
# to return the XPath of that node.
#
# Use of ids is preferred when the xpath will be used
# outside of BeautifulSoup. Since an id is unique across
# all elements of the tree, it allows the use of a
# wildcard for all parent nodes. This minimizes the
# chance of incorrect indexing (which can occur if
# javascript changes a page during processing).
class ExtractXPathError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


def check_previous_tags(node):
    # Count previous siblings that share this node's tag name to
    # determine the node's 1-based XPath index among same-name siblings
    counter = 1
    for tag in node.previous_siblings:
        if not isinstance(tag, bs4.element.Tag):
            continue
        elif tag.name == node.name:
            counter += 1
    # Only include an explicit index when same-name siblings exist
    if counter > 1:
        xpath = node.name + "[%d]" % counter
    else:
        xpath = node.name
    return xpath
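
# Illustration of the sibling-indexing behavior; the markup is made up
# for the example:
#   >>> soup = bs("<div><p>a</p><p>b</p></div>", "lxml")
#   >>> check_previous_tags(soup.find_all("p")[1])
#   'p[2]'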


def ExtractXPath(element, use_id=True):
    # Check that element is a tag node
    if not isinstance(element, bs4.element.Tag):
        raise ExtractXPathError(
            "%s is not a supported data type. "
            "Only tag nodes from the tag tree are accepted." % type(element)
        )
    # Starting node
    # Check id first
    if use_id and element.get("id") is not None:
        return "//*/" + element.name + '[@id="' + element.get("id") + '"]'
    xpath = check_previous_tags(element)
    # Parent nodes
    for parent in element.parents:
        # End of XPath - exclude from string
        if parent.name == "[document]":
            break
        # Check id first
        if use_id and parent.get("id") is not None:
            return "//*/" + parent.name + '[@id="' + parent.get("id") + '"]/' + xpath
        xpath = check_previous_tags(parent) + "/" + xpath
    xpath = "/" + xpath
    return xpath
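
# Example of ExtractXPath short-circuiting at an ancestor with an id; the
# markup is made up for the example:
#   >>> soup = bs('<div id="nav"><ul><li>x</li><li>y</li></ul></div>', "lxml")
#   >>> ExtractXPath(soup.find_all("li")[1])
#   '//*/div[@id="nav"]/ul/li[2]'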


# xp1_wildcard adds wildcard functionality to XPath 1.0
# strings using the limited function set supported by the 1.0
# implementation.
#
# xp1_lowercase likewise adds lowercase functionality.
#
# Hopefully you never need these...
def xp1_lowercase(string):
    return (
        "translate("
        + string
        + ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
    )
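
# Illustration of the translate() expression this emits:
#   >>> xp1_lowercase("text()")
#   "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"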


# Converts a string with a wildcard in it to an XPath 1.0
# compatible string. *** ONLY SUPPORTS 1 WILDCARD ***
# attr: tag attribute you are searching in (e.g. 'text()' or '@id' or ...)
# string: string w/ wildcard that you are searching for
def xp1_wildcard(attr, string, normalize=True):
    parts = string.split("*")
    if normalize:
        attr = "normalize-space(" + attr + ")"
    if len(parts) != 2:
        print("ERROR: This function is meant to support 1 wildcard")
        return "[" + attr + "=" + string + "]"
    else:
        pt1 = ""
        pt2 = ""
        # Text before the wildcard must be a prefix of the attribute value
        if parts[0] != "":
            pt1 = "starts-with(" + attr + ", '" + parts[0] + "')"
        # Text after the wildcard must be a suffix; XPath 1.0 lacks
        # ends-with(), so emulate it with substring() + string-length()
        if parts[1] != "":
            pt2 = (
                "contains(substring("
                + attr
                + ", string-length("
                + attr
                + ")-"
                + str(len(parts[1]) - 1)
                + "), '"
                + parts[1]
                + "')"
            )
        if pt1 == "" and pt2 != "":
            return "[" + pt2 + "]"
        elif pt1 != "" and pt2 == "":
            return "[" + pt1 + "]"
        elif pt1 != "" and pt2 != "":
            return "[" + pt1 + " and " + pt2 + "]"
        else:
            print("ERROR: The string is empty")
            return "[" + attr + "=" + string + "]"


def main():
    # Output some sample XPaths
    print("--- Sample XPaths ---")
    from random import choice
    from urllib.request import urlopen

    rsp = urlopen("http://www.reddit.com/")
    if rsp.getcode() != 200:
        print("ERROR: Received response code %d" % rsp.getcode())
        return
    soup = bs(rsp.read(), "lxml")
    elements = soup.find_all(string=re.compile("[A-Za-z0-9]{10,}"))
    for _ in range(5):
        element = choice(elements).parent
        print("HTML")
        print(element)
        print("XPath")
        print(ExtractXPath(element))
        print("**************")


if __name__ == "__main__":
    main()