Hamlet Batista (hamletbatista)
#convert absolute URLs to relative
from urllib.parse import urlsplit, urlunsplit
#Absolute source URLs linking to 404s from Search Console API: webmasters.urlcrawlerrorssamples.list
linkedFromUrls = [
    "http://www.example.com/brand/swirly/shopby?sizecode=99",
    "https://www.example.com/brand/swirly"
]
#convert URL to lowercase
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.lower())
#Output -> https://www.example.com/about-chc/clinicians/audiology-technology/
#convert URL to uppercase
crawled_url = "https://www.example.com/ABOUT-chc/clinicians/audiology-technology/"
print(crawled_url.upper())
#Output -> HTTPS://WWW.EXAMPLE.COM/ABOUT-CHC/CLINICIANS/AUDIOLOGY-TECHNOLOGY/
#decode a URL-encoded URL
url_source = "/url?sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB"
u = urlsplit(url_source)
print(u.query)
#Output -> 'sa=t&source=web&rct=j&url=https://support.google.com/webmasters/answer/35291%3Fhl%3Den&ved=2ahUKEwi42-aIwP3gAhUNON8KHf4EB-QQFjAIegQIChAB'
#note the parameter 'url' is URL encoded because it includes a query string
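To go one step further and decode the embedded url parameter, parse_qs can unpack and unquote the query string produced by the split above (a small illustrative addition, not in the original gist):

from urllib.parse import parse_qs

params = parse_qs(u.query)
print(params["url"][0])
#Output -> https://support.google.com/webmasters/answer/35291?hl=en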
#Handling URL parameters
from urllib.parse import urlsplit, urlunsplit

def clean_url_params(url):
    print(url)
    u = urlsplit(url)
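The body of clean_url_params is truncated in this scrape; below is a minimal sketch of one common interpretation (dropping the query string and fragment), an assumption about what "clean" means here rather than the author's exact logic:

from urllib.parse import urlsplit, urlunsplit

def strip_url_params(url):  # hypothetical helper name, to avoid clashing with the truncated original
    u = urlsplit(url)
    # rebuild with scheme, host and path only; empty query and fragment
    return urlunsplit((u.scheme, u.netloc, u.path, "", ""))

print(strip_url_params("https://www.example.com/brand/swirly/shopby?sizecode=99"))
#Output -> https://www.example.com/brand/swirly/shopby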
#Reformating date strings
#Crawled and first discovered dates from the Search Console API: webmasters.urlcrawlerrorssamples.list
last_crawled= "2019-01-12T04:00:59.000Z" #ISO-8601 date
first_detected= "2018-11-19T02:59:25.000Z"
from datetime import datetime
#Here is how to parse dates the hard way. See https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
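The "hard way" parsing code itself is cut off here; a minimal sketch with strptime, using a format string that matches the timestamps above:

from datetime import datetime

crawl_date = datetime.strptime(last_crawled, "%Y-%m-%dT%H:%M:%S.%fZ")
detected_date = datetime.strptime(first_detected, "%Y-%m-%dT%H:%M:%S.%fZ")
print(crawl_date.strftime("%b %d, %Y"))
#Output -> Jan 12, 2019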
from urllib.parse import unquote_plus

#standardize keywords by removing unwanted characters, trimming whitespace, URL-decoding, and lowercasing
def normalize_keywords(keyword):
    bad_chars = "\'\"®" #add more as needed
    table = str.maketrans(dict.fromkeys(bad_chars))
    #if URL encoded, decode
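The remainder of normalize_keywords is missing from the scrape; here is a minimal sketch of how the pieces above could fit together (an assumption about the intended steps, not the original implementation):

from urllib.parse import unquote_plus

def normalize_keywords_sketch(keyword):  # hypothetical name so it does not shadow the truncated original
    bad_chars = "\'\"®"  # add more as needed
    table = str.maketrans(dict.fromkeys(bad_chars))
    keyword = unquote_plus(keyword)            # if URL encoded, decode
    keyword = keyword.translate(table)         # strip the unwanted characters
    return " ".join(keyword.split()).lower()   # collapse whitespace and lowercase

print(normalize_keywords_sketch("Blue%20Widgets®  \"Sale\""))
#Output -> blue widgets sale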
hamletbatista / create_url_regex.py (created March 26, 2019)
Track performance of URL group in Data Studio with a generated regex
categories = """https://www.site.com/category/name1
https://www.site.com/category/name2
https://www.site.com/category/name3
"""
from urllib.parse import urlparse
"|".join([ urlparse(x).path for x in categories.split()])
#example output: '/category/name1|/category/name2|/category/name3'
#create a landing page filter in Data Studio, select RegExp Contains and copy and paste the regex
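If any category path contained regex metacharacters (e.g. "+" or "?"), escaping the paths first would keep the generated pattern literal; an optional refinement, not part of the original gist (output shown for Python 3.7+, where re.escape only escapes special characters):

import re
from urllib.parse import urlparse

"|".join(re.escape(urlparse(x).path) for x in categories.split())
#example output: '/category/name1|/category/name2|/category/name3'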
// Grab an element from a Google web UI (rooted at #yDmH0d); each statement tries a primary selector and falls back to an alternate page layout via ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(1) > div:nth-child(3) > div:nth-child(2) > div');
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > span > div > div.y3IDJd.rFZTte.Fx3kmc > content > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div') ||
document.querySelector('#yDmH0d > c-wiz:nth-child(20) > div > div.OoO4Vb > div.shSP > div > div > div:nth-child(3) > content > div > div > div > div.V8vvZb > div > div > content > div.I1NQYe > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > div');