Skip to content

Instantly share code, notes, and snippets.

View eliasdabbas's full-sized avatar
💭
https://bit.ly/100DaysOfCode_elias

Elias Dabbas eliasdabbas

💭
https://bit.ly/100DaysOfCode_elias
View GitHub Profile
@eliasdabbas
eliasdabbas / redirect_chains.py
Created July 9, 2023 21:52
Get redirect chains from an advertools crawl dataset
def redirect_chains(crawldf):
"""Create a tidy DataFrame for redirects with the columns:
url: All the URLs in the redirect chain.
status: The status code of each URL.
type: "requested", "intermediate", or "crawled".
order: 1, 2, 3... up to the number of urls in the redirect chain.
redirect_times: The number of redirects in the chain (URLs in the chain minus one).
"""
redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
import advertools as adv
import adviz
# get URLs of the sitemap index
nyt = adv.sitemap_to_df('https://nytimes.com/robots.txt', recursive=False)
# get URLs of the /sitemap.xml.gz sitemap index
nyt_sitemap_index = adv.sitemap_to_df('https://www.nytimes.com/sitemaps/new/sitemap.xml.gz', recursive=False)
nyt_2022 = []
@eliasdabbas
eliasdabbas / status_code_figure.py
Created October 13, 2022 23:30
Visualize a list of HTTP status codes as a treemap of two levels.
from http.client import responses
import pandas as pd
import plotly.express as px
import dash_bootstrap_components as dbc
from dash_bootstrap_templates import load_figure_template
themes = [theme for theme in dir(dbc.themes) if theme[0].isupper()]
load_figure_template(themes=themes)
def status_code_chart(
@eliasdabbas
eliasdabbas / crawl_multiple_header_combinations.py
Created October 13, 2022 09:59
Crawl a bunch of URLs using various combinations of request headers
import advertools as adv
import pandas as pd
pd.options.display.max_columns = None
headers_components = {
'User-agent': [
# Googlebot:
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
# iPhone 13:
import advertools as adv
import pandas as pd
key = 'YOUR_GOOGLE_KEY'
brands = [
'nike',
'adidas',
'puma',
'asics',
@eliasdabbas
eliasdabbas / company_marketcap_interactive_scatter.py
Last active June 11, 2022 12:40
Interactive emailable HTML chart of top 500 companies. Users can select which countries to display
import plotly.express as px
import pandas as pd
import requests
dflist = []
for i in range(1, 6):
resp = requests.get(f'https://companiesmarketcap.com/page/{i}/')
df = pd.read_html(resp.text)[0]
dflist.append(df)
import plotly.express as px
def treemap(traffic_df, metric='Users', path=['Medium', 'Source']):
"""Make an interactive treemap for two data dimensions/levels.
Parameters:
-----------
traffic_df : A DataFrame containing two dimensions, and one or more metrics
@eliasdabbas
eliasdabbas / dress_serp_heatmap.py
Last active April 29, 2023 19:17
Dress SERP heat-map: "dress type styles" and "shop dress type". 40 types. 4 countries: US, UK, CA, AU
import advertools as adv
import pandas as pd
import plotly
import plotly.graph_objects as go
pd.options.display.max_columns = None
cx = 'YOUR_CSE_ID'
key = 'YOUR_GOOGLE_DEV_KEY'
import advertools as adv
import pandas as pd
pd.options.display.max_columns = None
# Copied from https://en.wikipedia.org/wiki/List_of_cancer_types
cancers = {
"Chondrosarcoma": "Bone and muscle sarcoma" ,
"Ewing's sarcoma": "Bone and muscle sarcoma" ,