Hamlet Batista (hamletbatista): selected gists
from jinja2 import Template

sitemap_template = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for page in pages %}
   <url>
      <loc>{{page[0]|safe}}</loc>
      <lastmod>{{page[1]}}</lastmod>
   </url>
{% endfor %}
</urlset>"""
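A minimal rendering sketch using the template above; the pages list and the sitemap.xml output filename are illustrative assumptions, only the pages variable name comes from the template itself.

# Sample (URL, last modified) pairs; the URLs are illustrative only.
pages = [
    ("https://www.example.com/", "2019-02-28"),
    ("https://www.example.com/products/sample-item", "2019-02-27"),
]
sitemap_xml = Template(sitemap_template).render(pages=pages)
with open("sitemap.xml", "w") as f:
    f.write(sitemap_xml)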
hamletbatista / crawl_redirects.py
Created February 28, 2019 21:58
crawl redirects
import requests

def get_redirects(url):
    try:
        # A HEAD request is enough to read the status code and Location header.
        # r = requests.get(url)
        r = requests.head(url)
    except requests.RequestException:
        return (url, None, "Error")
    if r.status_code in [301, 302, 307]:
        return (url, r.status_code, r.headers['Location'])
    elif r.status_code == 404:
        return (url, r.status_code, None)
    # Any other status code: no redirect target to report.
    return (url, r.status_code, None)
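The merge below expects a redirects dataframe with url and redirect_url columns; a sketch of building it from the crawl results, where the urls list and the status_code column name are assumptions:

import pandas as pd

# urls: list of URLs exported from Google Analytics (assumed to exist).
results = [get_redirects(u) for u in urls]
redirects = pd.DataFrame(results, columns=["url", "status_code", "redirect_url"])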
from urllib.parse import urlparse

# Left merge to append the redirect information to our original GA dataframe.
data_redirects = data.merge(redirects, on="url", how="left")
# Prefer the redirect target when there is one, otherwise keep the original GA path.
data_redirects['true_url'] = data_redirects['redirect_url'].combine_first(data_redirects['path'])
data_redirects['true_url'] = data_redirects['true_url'].apply(lambda x: urlparse(x).path)
data_redirects['ga:date'] = pd.to_datetime(data_redirects['ga:date'])
# Split the data at MIDPOINT_DATE, the date of the Shopify switch.
true_before = data_redirects[data_redirects['ga:date'] < pd.to_datetime(MIDPOINT_DATE)]
true_after = data_redirects[data_redirects['ga:date'] >= pd.to_datetime(MIDPOINT_DATE)]
# Traffic totals before the Shopify switch
true_totals_before = true_before[["true_url", "ga:newUsers"]]\
    .groupby("true_url").sum()
true_totals_before = true_totals_before.reset_index()\
    .sort_values("ga:newUsers", ascending=False)
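The comparison that follows also uses true_totals_after, which is not shown in this excerpt; presumably it mirrors the aggregation above on the post-switch slice, as in this sketch:

# Traffic totals after the Shopify switch (assumed to mirror the "before" aggregation)
true_totals_after = true_after[["true_url", "ga:newUsers"]]\
    .groupby("true_url").sum()
true_totals_after = true_totals_after.reset_index()\
    .sort_values("ga:newUsers", ascending=False)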
# Comparing pages from before and after the switch
true_change = true_totals_after.merge(true_totals_before,
                                      on="true_url",
                                      suffixes=["_after", "_before"],
                                      how="outer")
# fillna on a .loc slice does not modify the original frame, so assign back instead.
true_change[["ga:newUsers_after", "ga:newUsers_before"]] = \
    true_change[["ga:newUsers_after", "ga:newUsers_before"]].fillna(0)
# Check again that the total traffic adds up
true_change[["ga:newUsers_before", "ga:newUsers_after"]].sum().sum() == data['ga:newUsers'].sum()
# should be True
data_redirects['group'] = "N/A"
data_redirects.loc[data_redirects['true_url'].str.contains(r"/collections(?!.*products.*)(?!.*/product.*)"), "group"] = "Collections"
data_redirects.loc[data_redirects['true_url'].str.contains(r".*/products/.*|.*/product/.*"), "group"] = "Products"
grouped_data = data_redirects[['group', "ga:newUsers", "ga:date"]].groupby(["group", "ga:date"]).sum().reset_index()
# Before and after comparison, split at the switch date
grouped_before = grouped_data[grouped_data['ga:date'] < pd.to_datetime("2017-12-15")]
grouped_after = grouped_data[grouped_data['ga:date'] >= pd.to_datetime("2017-12-15")]
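The bar chart below reads grouped_change['difference'], which is not defined in this excerpt; one plausible construction, assuming the per-group totals are simply differenced:

# Per-group totals on each side of the switch, then the change in new users (assumed construction).
totals_before = grouped_before.groupby("group")["ga:newUsers"].sum()
totals_after = grouped_after.groupby("group")["ga:newUsers"].sum()
grouped_change = (totals_after - totals_before).rename("difference").reset_index()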
import plotly.graph_objs as go

plot_data = [
    go.Bar(
        x=grouped_change['group'].tolist(),
        y=grouped_change['difference'].tolist(),
        marker=dict(color='red'),
        name='Traffic Difference'
    ),
    # go.Bar(...)  # a second trace follows in the original snippet, truncated here
]
line_data = []
for group in grouped_data['group'].unique().tolist():
    line = go.Scatter(
        x=grouped_data.loc[grouped_data['group'] == group, 'ga:date'],
        y=grouped_data.loc[grouped_data['group'] == group, 'ga:newUsers'],
        name=group,
        mode="lines"
    )
    line_data.append(line)
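A short sketch of rendering the traces; the original rendering call is not shown, so plotly's offline notebook plotting is assumed here:

from plotly.offline import iplot

# One line per URL group: new users over time.
iplot(go.Figure(data=line_data))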
# Convert relative URLs to absolute
from urllib.parse import urljoin

# Relative 404 URLs from the Search Console API: webmasters.urlcrawlerrorssamples.list
pageUrl = "product/mad-for-plaid-flannel-dress"  # missing the leading forward slash
print(urljoin("https://www.example.com/", pageUrl))
# https://www.example.com/product/mad-for-plaid-flannel-dress
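As a follow-on sketch, the same join can be applied to a whole batch of Search Console samples; the sample_urls list is illustrative:

# Normalize a batch of crawl-error sample URLs against the site root (illustrative values).
sample_urls = ["product/mad-for-plaid-flannel-dress", "collections/dresses", "/about-us"]
absolute_urls = [urljoin("https://www.example.com/", u) for u in sample_urls]
print(absolute_urls)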