Skip to content

Instantly share code, notes, and snippets.

@cordon-thiago
Created December 1, 2019 22:22
Show Gist options
  • Save cordon-thiago/0ac8e629cdec9692fd9f266e2a72ad9d to your computer and use it in GitHub Desktop.
Save cordon-thiago/0ac8e629cdec9692fd9f266e2a72ad9d to your computer and use it in GitHub Desktop.
Extract and transform e-mail domain.
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions
# Derive the 'emailDomain' column by running the project helper over every
# address in 'email'.
# NOTE(review): getEmailDomain lives in functions.aux_functions and is not
# shown here; presumably it returns the text after the '@' -- confirm.
hardbounce_2['emailDomain'] = hardbounce_2['email'].apply(
    aux_functions.getEmailDomain
)

# Number of non-null e-mails per domain, largest first, as a two-column
# frame ('emailDomain', 'email').
group_df = (
    hardbounce_2.groupby('emailDomain')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Horizontal bar chart of the 20 most frequent domains.
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.iloc[:20], y="emailDomain", x="email")

# Keep gmail.com and hotmail.com as their own categories and collapse every
# other domain into 'others'.
hardbounce_2["emailDomain_cat"] = hardbounce_2["emailDomain"].apply(
    lambda domain: domain if domain in ('gmail.com', 'hotmail.com') else 'others'
)

# Same count as above, now per domain category.
group_df = (
    hardbounce_2.groupby('emailDomain_cat')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of all domain categories.
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df, y="emailDomain_cat", x="email")

# Distribution of the target flag (flgHardBounce_n) within each domain
# category.
# NOTE(review): freqTable is a project helper; the positional arguments look
# like (row series, column series, margins flag, normalize axis) in the
# manner of pandas.crosstab -- confirm against aux_functions.
aux_functions.freqTable(
    [hardbounce_2["emailDomain_cat"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions
# Derive 'emailDomainPiece1' from the previously computed 'emailDomain'.
# NOTE(review): getPiece1EmailDomain is defined in functions.aux_functions
# and is not shown here; judging by the 'com' value tested below it
# presumably returns the first suffix after the domain name -- confirm.
hardbounce_2['emailDomainPiece1'] = hardbounce_2['emailDomain'].apply(
    aux_functions.getPiece1EmailDomain
)

# Number of non-null e-mails per first domain piece, largest first.
group_df = (
    hardbounce_2.groupby('emailDomainPiece1')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Horizontal bar chart of the 20 most frequent first pieces.
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.iloc[:20], y="emailDomainPiece1", x="email")

# Collapse every value other than 'com' into 'others' (per the original
# note, 'com' accounts for the majority of the dataset rows).
# The column is overwritten in place, so the raw pieces are lost after this.
hardbounce_2["emailDomainPiece1"] = hardbounce_2["emailDomainPiece1"].apply(
    lambda piece: piece if piece == 'com' else 'others'
)

# Distribution of the target flag (flgHardBounce_n) within each first-piece
# category.
# NOTE(review): freqTable is a project helper; the positional arguments look
# like (row series, column series, margins flag, normalize axis) in the
# manner of pandas.crosstab -- confirm against aux_functions.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece1"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions
# Derive 'emailDomainPiece2' from the previously computed 'emailDomain'.
# NOTE(review): getPiece2EmailDomain is defined in functions.aux_functions
# and is not shown here; judging by the 'br' and 'missing' values tested
# below it presumably returns the second suffix (e.g. a country code) or a
# 'missing' placeholder -- confirm.
hardbounce_2['emailDomainPiece2'] = hardbounce_2['emailDomain'].apply(
    aux_functions.getPiece2EmailDomain
)

# Number of non-null e-mails per second domain piece, largest first.
group_df = (
    hardbounce_2.groupby('emailDomainPiece2')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Horizontal bar chart of the 20 most frequent second pieces.
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.iloc[:20], y="emailDomainPiece2", x="email")

# Keep 'br' and 'missing' as their own categories and collapse everything
# else into 'others' (per the original note, those two values represent the
# majority of the dataset rows).
# The column is overwritten in place, so the raw pieces are lost after this.
hardbounce_2["emailDomainPiece2"] = hardbounce_2["emailDomainPiece2"].apply(
    lambda piece: piece if piece in ('missing', 'br') else 'others'
)

# Distribution of the target flag (flgHardBounce_n) within each second-piece
# category.
# NOTE(review): freqTable is a project helper; the positional arguments look
# like (row series, column series, margins flag, normalize axis) in the
# manner of pandas.crosstab -- confirm against aux_functions.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece2"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment