Hamlet Batista (hamletbatista)
#convert absolute URLs to relative
from urllib.parse import urlsplit, urlunsplit
#Absolute source URLs linking to 404s from Search Console API: webmasters.urlcrawlerrorssamples.list
linkedFromUrls = [
    "http://www.example.com/brand/swirly/shopby?sizecode=99",
    "https://www.example.com/brand/swirly"
]
#convert URL to lowercase
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.lower())
#Output -> https://www.example.com/about-chc/clinicians/audiology-technology/
#convert URL to uppercase
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.upper())
#Output -> HTTPS://WWW.EXAMPLE.COM/ABOUT-CHC/CLINICIANS/AUDIOLOGY-TECHNOLOGY/
#decode a URL-encoded URL
url_source = "/url?sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB"
u = urlsplit(url_source)
print(u.query)
#Output -> 'sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB'
#note the parameter 'url' is URL encoded because it includes a query string
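To go one step further and decode the embedded url parameter, parse_qs can unpack and unquote the query string produced by the split above (a small illustrative addition, not in the original gist):

from urllib.parse import parse_qs

params = parse_qs(u.query)
print(params["url"][0])
#Output -> https://support.google.com/webmasters/answer/35291?hl=en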
#Handling URL parameters
from urllib.parse import urlsplit, urlunsplit

def clean_url_params(url):
    print(url)
    u = urlsplit(url)
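The body of clean_url_params is truncated in this scrape; below is a minimal sketch of one common interpretation (dropping the query string and fragment), an assumption about what "clean" means here rather than the author's exact logic:

from urllib.parse import urlsplit, urlunsplit

def strip_url_params(url):  # hypothetical helper name, to avoid clashing with the truncated original
    u = urlsplit(url)
    # rebuild with scheme, host and path only; empty query and fragment
    return urlunsplit((u.scheme, u.netloc, u.path, "", ""))

print(strip_url_params("https://www.example.com/brand/swirly/shopby?sizecode=99"))
#Output -> https://www.example.com/brand/swirly/shopby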
#Reformating date strings
#Crawled and first discovered dates from the Search Console API: webmasters.urlcrawlerrorssamples.list
last_crawled= "2019-01-12T04:00:59.000Z" #ISO-8601 date
first_detected= "2018-11-19T02:59:25.000Z"
from datetime import datetime
#Here is how to parse dates the hard way. See https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
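The "hard way" parsing code itself is cut off here; a minimal sketch with strptime, using a format string that matches the timestamps above:

from datetime import datetime

crawl_date = datetime.strptime(last_crawled, "%Y-%m-%dT%H:%M:%S.%fZ")
detected_date = datetime.strptime(first_detected, "%Y-%m-%dT%H:%M:%S.%fZ")
print(crawl_date.strftime("%b %d, %Y"))
#Output -> Jan 12, 2019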
from urllib.parse import unquote_plus

#standardize keywords by removing unwanted characters, trimming whitespace, URL-decoding, and lowercasing
def normalize_keywords(keyword):
    bad_chars = "\'\"®" #add more as needed
    table = str.maketrans(dict.fromkeys(bad_chars))
    #if URL encoded, decode
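The remainder of normalize_keywords is missing from the scrape; here is a minimal sketch of how the pieces above could fit together (an assumption about the intended steps, not the original implementation):

from urllib.parse import unquote_plus

def normalize_keywords_sketch(keyword):  # hypothetical name so it does not shadow the truncated original
    bad_chars = "\'\"®"  # add more as needed
    table = str.maketrans(dict.fromkeys(bad_chars))
    keyword = unquote_plus(keyword)            # if URL encoded, decode
    keyword = keyword.translate(table)         # strip the unwanted characters
    return " ".join(keyword.split()).lower()   # collapse whitespace and lowercase

print(normalize_keywords_sketch("Blue%20Widgets®  \"Sale\""))
#Output -> blue widgets sale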
hamletbatista / create_url_regex.py (created March 26, 2019)
Track performance of URL group in Data Studio with a generated regex
categories = """https://www.site.com/category/name1
https://www.site.com/category/name2
https://www.site.com/category/name3
"""
from urllib.parse import urlparse
"|".join([ urlparse(x).path for x in categories.split()])
#example output: '/category/name1|/category/name2|/category/name3'
#create a landing page filter in Data Studio, select RegExp Contains and copy and paste the regex
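If any category path contained regex metacharacters (e.g. "+" or "?"), escaping the paths first would keep the generated pattern literal; an optional refinement, not part of the original gist (output shown for Python 3.7+, where re.escape only escapes special characters):

import re
from urllib.parse import urlparse

"|".join(re.escape(urlparse(x).path) for x in categories.split())
#example output: '/category/name1|/category/name2|/category/name3'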
// Grab an element from a Google web UI (rooted at #yDmH0d); each statement tries a primary selector and falls back to an alternate page layout via ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div');
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div');