Skip to content

Instantly share code, notes, and snippets.

View hamletbatista's full-sized avatar

Hamlet Batista hamletbatista

View GitHub Profile
# Left-merge the redirect information onto the original GA dataframe, so every
# row of `data` is kept whether or not a redirect entry exists for its URL.
data_redirects = data.merge(
    redirects,
    left_on="url",
    right_on="url",
    how="left",
)

# Prefer the redirect target where one is present; otherwise fall back to the
# row's own `path`. Only the path component of the resolved URL is kept.
resolved_url = data_redirects['redirect_url'].combine_first(data_redirects['path'])
data_redirects['true_url'] = resolved_url.apply(lambda x: urlparse(x).path)

data_redirects['ga:date'] = pd.to_datetime(data_redirects['ga:date'])

# Split the rows at MIDPOINT_DATE: strictly earlier rows go to `true_before`,
# rows on or after that date go to `true_after`.
midpoint = pd.to_datetime(MIDPOINT_DATE)
ga_dates = data_redirects['ga:date']
true_before = data_redirects[ga_dates < midpoint]
true_after = data_redirects[ga_dates >= midpoint]
# Traffic totals before the Shopify switch: new users summed per resolved URL,
# sorted so the URLs with the most new users come first.
true_totals_before = (
    true_before[["true_url", "ga:newUsers"]]
    .groupby("true_url")
    .sum()
    .reset_index()
    .sort_values("ga:newUsers", ascending=False)
)
# Comparing pages from before and after the switch. The outer merge keeps URLs
# that appear in only one of the two periods; their missing side comes back NaN.
true_change = true_totals_after.merge(
    true_totals_before,
    on="true_url",
    suffixes=("_after", "_before"),
    how="outer",
)
# Treat a URL that is absent from one period as having 0 new users there.
# The filled columns must be assigned back: `.loc[:, [...]]` with a list of
# labels returns a copy, so `fillna(..., inplace=True)` on that result would
# leave `true_change` itself unchanged.
user_columns = ["ga:newUsers_after", "ga:newUsers_before"]
true_change[user_columns] = true_change[user_columns].fillna(0)
# Checking again that the total traffic adds up
true_change[["ga:newUsers_before", "ga:newUsers_after"]].sum().sum() == data['ga:newUsers'].sum()
# should be True
# Label every row with a page group derived from its resolved URL. Rows that
# match neither pattern keep the default "N/A".
data_redirects['group'] = "N/A"

# Collection pages: "/collections" with no "products" or "/product" after it.
is_collection = data_redirects['true_url'].str.contains(
    r"/collections(?!.*products.*)(?!.*/product.*)"
)
data_redirects.loc[is_collection, "group"] = "Collections"

# Product pages: the URL contains "/products/" or "/product/". This runs
# second, so it overrides the "Collections" label on any row matching both.
is_product = data_redirects['true_url'].str.contains(
    r".*/products/.*|.*/product/.*"
)
data_redirects.loc[is_product, "group"] = "Products"

# New users summed per (group, date) pair, flattened back to regular columns.
grouped_data = (
    data_redirects[['group', "ga:newUsers", "ga:date"]]
    .groupby(["group", "ga:date"])
    .sum()
    .reset_index()
)
# Before and after comparison: split the grouped totals at 2017-12-15, with
# rows dated on the switch day itself counted as "after".
switch_date = pd.to_datetime("2017-12-15")
grouped_dates = grouped_data['ga:date']
grouped_before = grouped_data[grouped_dates < switch_date]
grouped_after = grouped_data[grouped_dates >= switch_date]
plot_data = [
go.Bar(
x = grouped_change['group'].tolist(),
y = grouped_change['difference'].tolist(),
marker = dict(
color = 'red'
),
name = 'Traffic Difference'
),
go.Bar(
# Build one line trace per group: dates on the x axis, new users on the y axis.
line_data = []
for group in grouped_data['group'].unique().tolist():
    # Row mask for the current group, computed once and reused for both axes.
    in_group = grouped_data['group'] == group
    line = go.Scatter(
        x=grouped_data.loc[in_group, 'ga:date'],
        y=grouped_data.loc[in_group, 'ga:newUsers'],
        name=group,
        mode="lines",
    )
    # Collect the trace; otherwise each iteration overwrites `line` and
    # `line_data` stays empty.
    line_data.append(line)
import pandas as pd
# Expect 0.23 or later (presumably because of the `columns` argument passed to
# `from_dict` below -- confirm against the pandas release notes).
print(pd.__version__)

# One row per key of `sitemaps`, with the values in a single 'lastmod' column.
df = pd.DataFrame.from_dict(
    sitemaps,
    orient="index",
    columns=['lastmod'],
)
df.head(10)
#convert relative URLs to absolute
from urllib.parse import urljoin
# Relative 404 URL, in the form returned by the Search Console API
# (webmasters.urlcrawlerrorssamples.list). Note that it has no leading slash.
pageUrl = "product/mad-for-plaid-flannel-dress"

# Resolve the relative URL against the site root to get an absolute URL.
site_root = "https://www.example.com/"
print(urljoin(site_root, pageUrl))
#convert absolute URLs to relative
from urllib.parse import urlsplit, urlunsplit
# Absolute source URLs that link to 404 pages, in the form returned by the
# Search Console API (webmasters.urlcrawlerrorssamples.list).
linkedFromUrls = [
    "http://www.example.com/brand/swirly/shopby?sizecode=99",
    "https://www.example.com/brand/swirly",
]