This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def redirect_chains(crawldf): | |
"""Create a tidy DataFrame for redirects with the columns: | |
url: All the URLs in the redirect chain. | |
status: The status code of each URL. | |
type: "requested", "intermediate", or "crawled". | |
order: 1, 2, 3... up to the number of urls in the redirect chain. | |
redirect_times: The number of redirects in the chain (URLs in the chain minus one). | |
""" | |
redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import adviz | |
# get URLs of the sitemap index | |
nyt = adv.sitemap_to_df('https://nytimes.com/robots.txt', recursive=False) | |
# get URLs of the /sitemap.xml.gz sitemap index | |
nyt_sitemap_index = adv.sitemap_to_df('https://www.nytimes.com/sitemaps/new/sitemap.xml.gz', recursive=False) | |
nyt_2022 = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from http.client import responses | |
import pandas as pd | |
import plotly.express as px | |
import dash_bootstrap_components as dbc | |
from dash_bootstrap_templates import load_figure_template | |
themes = [theme for theme in dir(dbc.themes) if theme[0].isupper()] | |
load_figure_template(themes=themes) | |
def status_code_chart( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import pandas as pd | |
pd.options.display.max_columns = None | |
headers_components = { | |
'User-agent': [ | |
# Googlebot: | |
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', | |
# iPhone 13: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import pandas as pd | |
key = 'YOUR_GOOGLE_KEY' | |
brands = [ | |
'nike', | |
'adidas', | |
'puma', | |
'asics', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import pandas as pd | |
pd.options.display.max_columns = None | |
homepage = 'https://example.com/' # <--- change this | |
domain = 'example.com' # <--- and this | |
adv.crawl(homepage, 'output_file.jl', follow_links=True, | |
custom_settings={'LOG_FILE': 'output_file.log'}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import plotly.express as px | |
import pandas as pd | |
import requests | |
dflist = [] | |
for i in range(1, 6): | |
resp = requests.get(f'https://companiesmarketcap.com/page/{i}/') | |
df = pd.read_html(resp.text)[0] | |
dflist.append(df) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import plotly.express as px | |
def treemap(traffic_df, metric='Users', path=['Medium', 'Source']): | |
"""Make an interactive treemap for two data dimensions/levels. | |
Parameters: | |
----------- | |
traffic_df : A DataFrame containing two dimensions, and one or more metrics | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import pandas as pd | |
import plotly | |
import plotly.graph_objects as go | |
pd.options.display.max_columns = None | |
cx = 'YOUR_CSE_ID' | |
key = 'YOUR_GOOGLE_DEV_KEY' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import advertools as adv | |
import pandas as pd | |
pd.options.display.max_columns = None | |
# Copied from https://en.wikipedia.org/wiki/List_of_cancer_types | |
cancers = { | |
"Chondrosarcoma": "Bone and muscle sarcoma" , | |
"Ewing's sarcoma": "Bone and muscle sarcoma" , |