This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#convert absolute URLs to relative
from urllib.parse import urlsplit, urlunsplit

#Absolute source URLs linking to 404s from Search Console API: webmasters.urlcrawlerrorssamples.list
#Sample input data: one URL with a query string, one without.
#NOTE(review): the snippet as shown only defines the sample data; the
#conversion step that would use urlsplit/urlunsplit is not visible here.
linkedFromUrls = [
    "http://www.example.com/brand/swirly/shopby?sizecode=99",
    "https://www.example.com/brand/swirly",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#convert URL to lowercase
#str.lower() lowercases the whole string: scheme, host and path alike.
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.lower())
#Output -> https://www.example.com/about-chc/clinicians/audiology-technology/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#convert URL to uppercase
#str.upper() uppercases the whole string: scheme, host and path alike.
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.upper())
#Output -> HTTPS://WWW.EXAMPLE.COM/ABOUT-CHC/CLINICIANS/AUDIOLOGY-TECHNOLOGY/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#decode URL-encoded URL
#Relative redirect-style link whose 'url' parameter carries another URL.
url_source = "/url?sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB"
#urlsplit separates the path ('/url') from the raw, still-encoded query string.
u = urlsplit(url_source)
print(u.query)
#Output -> sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB
#note the parameter 'url' is URL encoded (%3F is '?', %3D is '=') because it includes a query string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Handling URL parameters | |
from urllib.parse import urlsplit, urlunsplit | |
def clean_url_params(url): | |
print(url) | |
u = urlsplit(url) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Reformatting date strings
from datetime import datetime

#Crawled and first discovered dates from the Search Console API: webmasters.urlcrawlerrorssamples.list
#Both samples use the same layout: date, 'T', time with milliseconds, trailing 'Z'.
last_crawled = "2019-01-12T04:00:59.000Z"  #ISO-8601 date
first_detected = "2018-11-19T02:59:25.000Z"
#Here is how to parse dates the hard way. See https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import unquote_plus | |
#standardize a keyword by removing unwanted characters and extra spaces, URL-decoding it, and lowercasing it
def normalize_keywords(keyword): | |
bad_chars="\'\"®" #add more as needed | |
table = str.maketrans(dict.fromkeys(bad_chars)) | |
#if url encoded, decode | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlparse

#Whitespace-separated list of absolute category URLs, one per line.
categories = """https://www.site.com/category/name1
https://www.site.com/category/name2
https://www.site.com/category/name3
"""

#Keep only the path of each URL and join the paths with '|' so the result
#can be used as a regex alternation. The value is bound to a name and
#printed so it is not lost when this runs as a script rather than in a REPL.
landing_page_regex = "|".join(urlparse(x).path for x in categories.split())
print(landing_page_regex)
#example output: /category/name1|/category/name2|/category/name3
#create a landing page filter in Data Studio, select RegExp Contains and copy and paste the regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Evaluates to the first element matching the primary selector; if that
// finds nothing (querySelector returns null), falls back to the second,
// shorter selector. The result is null when neither selector matches.
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div');
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Evaluates to the first element matching the primary selector; if that
// finds nothing (querySelector returns null), falls back to the second,
// shorter selector. The result is null when neither selector matches.
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div');