Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from requests_html import HTMLSession
# Builds a DOM path by walking up the parent chain
def build_dom_path(element, path):
    """Append the tag of ``element`` and of each of its ancestors to ``path``.

    Tags are appended innermost-first: the tag of ``element`` itself, then
    its parent's tag, and so on until ``getparent()`` returns ``None``.
    Callers that want a root-first path must reverse the result.

    Args:
        element: Node exposing a ``tag`` attribute and a ``getparent()``
            method, or ``None`` (in which case ``path`` is returned as-is).
        path: List that receives the tags; it is mutated in place.

    Returns:
        The same ``path`` list that was passed in.
    """
    # Iterate instead of recursing so a very deep tree cannot exhaust
    # Python's recursion limit.
    node = element
    while node is not None:
        path.append(node.tag)
        node = node.getparent()
    return path
def get_canonical_path(url):
    """Fetch ``url`` and report the DOM path of its canonical ``<link>``.

    Prints the URL and the HTTP status code. When the status is 200 and the
    page contains a ``<link rel="canonical">`` element, also prints the
    root-first tag path to the first such element (e.g. ``/html/head/link``)
    and the elements found by evaluating that path as an XPath expression.

    Args:
        url: Address of the page to fetch.

    Returns:
        The tag path as a string, or ``None`` when the status is not 200 or
        no canonical link is present.
    """
    print(url)
    # Use the session as a context manager so it is closed even if the
    # request or the parsing raises.
    with HTMLSession() as session:
        r = session.get(url)
        print(r.status_code)
        if r.status_code != 200:
            return None

        out = r.html.xpath("//link[@rel='canonical']")
        if not out:
            return None

        # build_dom_path yields tags innermost-first; reverse to get a
        # root-first path.
        dom_path = build_dom_path(out[0].element, [])
        dom_path.reverse()
        fixed_dom_path = "/" + "/".join(dom_path)
        print(fixed_dom_path)

        # Re-run the generated path as XPath to show what it selects.
        canonical = r.html.xpath(fixed_dom_path)
        print("Found: " + str(canonical))
        return fixed_dom_path
# Example run against a sample page. The call is guarded so that importing
# this module does not trigger a network request.
url = "http://www.example.com"
if __name__ == "__main__":
    get_canonical_path(url)
#Output
#http://www.example.com
#200
#/html/head/link
#Found: [<Element 'link' rel=('canonical',) href='http://localhost'>]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.