Skip to content

Instantly share code, notes, and snippets.

@hamletbatista
Created November 3, 2020 00:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hamletbatista/ad99891ecbcc3f56e7e65e6d01311225 to your computer and use it in GitHub Desktop.
from requests_html import HTMLSession
# Builds a DOM path by walking up from an element to the root of its tree
def build_dom_path(element, path=None):
    """Collect the tag names from *element* up to the root of its tree.

    Args:
        element: Node exposing ``.tag`` and ``.getparent()``. ``None``
            contributes no tags.
        path: Optional list to append to. It is mutated in place and
            returned; a new list is created when omitted.

    Returns:
        The list with tags appended in leaf-to-root order, e.g.
        ``["link", "head", "html"]``. Reverse it for a root-first path.
    """
    if path is None:
        path = []
    # Iterate rather than recurse so the ancestor depth is not limited by
    # the interpreter's recursion limit.
    while element is not None:
        path.append(element.tag)
        element = element.getparent()
    return path
def get_canonical_path(url):
    """Fetch *url* and report the DOM path of its canonical ``<link>``.

    Prints the URL and the HTTP status code. When the response status is
    200 and a ``<link rel="canonical">`` element is present, also prints
    the root-first tag path to it (e.g. ``/html/head/link``) and the
    elements that path selects.

    Args:
        url: Address of the page to fetch.

    Returns:
        The DOM path string, or ``None`` when the status is not 200 or the
        page has no canonical link.
    """
    print(url)
    # The context manager closes the session even if fetching or parsing
    # raises.
    with HTMLSession() as session:
        r = session.get(url)
        print(r.status_code)
        if r.status_code != 200:
            return None

        out = r.html.xpath("//link[@rel='canonical']")
        if not out:
            return None

        # build_dom_path yields leaf-to-root order; flip it to root-first.
        dom_path = build_dom_path(out[0].element, [])
        dom_path.reverse()
        fixed_dom_path = "/" + "/".join(dom_path)
        print(fixed_dom_path)

        # Re-query with the tag-only path to show every element it selects.
        canonical = r.html.xpath(fixed_dom_path)
        print("Found: " + str(canonical))
        return fixed_dom_path
# Example invocation: executes at module level and triggers the HTTP request
# made inside get_canonical_path.
url = "http://www.example.com"
get_canonical_path(url)
# Output printed by the call above, as recorded by the author:
#http://www.example.com
#200
#/html/head/link
#Found: [<Element 'link' rel=('canonical',) href='http://localhost'>]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment