cordon-thiago/emailDomain.py

## emailDomain.py
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Derive the domain of every address in the 'email' column
hardbounce_2['emailDomain'] = hardbounce_2['email'].apply(
    aux_functions.getEmailDomain
)

# Non-null e-mail count per domain, largest count first
group_df = (
    hardbounce_2.groupby('emailDomain')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of the 20 most frequent domains (count on x, domain on y)
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.head(20), x="email", y="emailDomain")

# Keep gmail.com and hotmail.com as their own categories;
# every other domain is collapsed into 'others'
hardbounce_2["emailDomain_cat"] = hardbounce_2["emailDomain"].apply(
    lambda domain: domain if domain in ('gmail.com', 'hotmail.com') else 'others'
)

# Non-null e-mail count per domain category, largest count first
group_df = (
    hardbounce_2.groupby('emailDomain_cat')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of all domain categories
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df, x="email", y="emailDomain_cat")

# Distribution of the target flag (flgHardBounce_n) by domain category.
# The trailing True / "index" values are passed positionally to
# aux_functions.freqTable; see that helper for their exact meaning.
aux_functions.freqTable(
    [hardbounce_2["emailDomain_cat"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)

## emailDomainPiece1.py
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Extract the first piece of each e-mail domain via the project helper
hardbounce_2['emailDomainPiece1'] = hardbounce_2['emailDomain'].apply(
    aux_functions.getPiece1EmailDomain
)

# Non-null e-mail count per first domain piece, largest count first
group_df = (
    hardbounce_2.groupby('emailDomainPiece1')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of the 20 most frequent first domain pieces
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.head(20), x="email", y="emailDomainPiece1")

# Keep 'com' as its own category and collapse everything else into 'others'
# (original rationale: 'com' covers the majority of the dataset rows)
hardbounce_2["emailDomainPiece1"] = hardbounce_2["emailDomainPiece1"].apply(
    lambda piece: piece if piece == 'com' else 'others'
)

# Distribution of the target flag (flgHardBounce_n) by first domain piece.
# The trailing True / "index" values are passed positionally to
# aux_functions.freqTable; see that helper for their exact meaning.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece1"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)

## emailDomainPiece2.py
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Extract the second piece of each e-mail domain via the project helper
hardbounce_2['emailDomainPiece2'] = hardbounce_2['emailDomain'].apply(
    aux_functions.getPiece2EmailDomain
)

# Non-null e-mail count per second domain piece, largest count first
group_df = (
    hardbounce_2.groupby('emailDomainPiece2')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of the 20 most frequent second domain pieces
sns.set(style="whitegrid")
ax = sns.barplot(data=group_df.head(20), x="email", y="emailDomainPiece2")

# Keep 'missing' and 'br' as their own categories and collapse everything
# else into 'others' (original rationale: these two values represent the
# majority of the dataset rows)
hardbounce_2["emailDomainPiece2"] = hardbounce_2["emailDomainPiece2"].apply(
    lambda piece: piece if piece in ('missing', 'br') else 'others'
)

# Distribution of the target flag (flgHardBounce_n) by second domain piece.
# The trailing True / "index" values are passed positionally to
# aux_functions.freqTable; see that helper for their exact meaning.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece2"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
	# Import libraries
	import pandas as pd
	import numpy as np
	import seaborn as sns
	from functions import aux_functions

	# NOTE(review): this tab-indented section repeats the emailDomain.py steps
	# found at the top of this file, and its indentation follows column-0 code;
	# confirm whether it is intentional.

	# Derive the domain of every address in the 'email' column
	hardbounce_2['emailDomain'] = hardbounce_2['email'].apply(
		aux_functions.getEmailDomain
	)

	# Non-null e-mail count per domain, largest count first
	group_df = (
		hardbounce_2.groupby('emailDomain')['email']
		.count()
		.sort_values(ascending=False)
		.reset_index()
	)

	# Bar chart of the 20 most frequent domains (count on x, domain on y)
	sns.set(style="whitegrid")
	ax = sns.barplot(data=group_df.head(20), x="email", y="emailDomain")

	# Keep gmail.com and hotmail.com as their own categories;
	# every other domain is collapsed into 'others'
	hardbounce_2["emailDomain_cat"] = hardbounce_2["emailDomain"].apply(
		lambda domain: domain if domain in ('gmail.com', 'hotmail.com') else 'others'
	)

	# Non-null e-mail count per domain category, largest count first
	group_df = (
		hardbounce_2.groupby('emailDomain_cat')['email']
		.count()
		.sort_values(ascending=False)
		.reset_index()
	)

	# Bar chart of all domain categories
	sns.set(style="whitegrid")
	ax = sns.barplot(data=group_df, x="email", y="emailDomain_cat")

	# Distribution of the target flag (flgHardBounce_n) by domain category.
	# The trailing True / "index" values are passed positionally to
	# aux_functions.freqTable; see that helper for their exact meaning.
	aux_functions.freqTable(
		[hardbounce_2["emailDomain_cat"]],
		[hardbounce_2["flgHardBounce_n"]],
		True,
		"index",
	)