Skip to content

Instantly share code, notes, and snippets.

@cordon-thiago
cordon-thiago / import_removeMissing.py
Last active December 1, 2019 22:14
Import CSV and remove rows with predictor variable missing.
# Import libraries
import pandas as pd
import numpy as np
from functions import aux_functions
# Load the semicolon-delimited hard-bounce sample into a DataFrame.
hardbounce = pd.read_csv(
    filepath_or_buffer="datasets/hardbounce_sample.csv",
    sep=";",
)
# Run the project's percMissing helper on the freshly loaded dataset.
aux_functions.percMissing(hardbounce)
@cordon-thiago
cordon-thiago / datatypeConversion.py
Created December 1, 2019 21:35
Datatype Conversion
import pandas as pd
# Store converted copies under "<column>_n" so the raw columns stay untouched.
hardbounce_2['flgHardBounce_n'] = hardbounce_2.flgHardBounce.astype(dtype=int)
# Registration dates must all parse: an invalid value raises.
hardbounce_2['regDate_n'] = pd.to_datetime(
    hardbounce_2['regDate'],
    errors='raise',
)
# Birth dates that cannot be parsed become NaT instead of raising.
hardbounce_2['birthDate_n'] = pd.to_datetime(
    hardbounce_2['birthDate'],
    errors='coerce',
)
import pandas as pd
import numpy as np
from datetime import date
from functions import aux_functions
# Today's date as a pandas Timestamp, so it can be subtracted from the
# datetime column below.
curr_date = pd.to_datetime(date.today())
# Elapsed time since registration, expressed in average Gregorian months
# (365.2425 days / 12 = 30.436875 days).
# np.timedelta64(1, 'M') is a calendar unit without a fixed length; current
# pandas/NumPy refuse to divide a nanosecond timedelta by it, so a
# fixed-length Timedelta is used as the divisor instead. Rows whose
# regDate_n is NaT yield NaN.
hardbounce_2['monthsSinceRegDate'] = (
    (curr_date - hardbounce_2['regDate_n']) / (pd.Timedelta(days=365.2425) / 12)
)
# There are rows with monthsSinceRegDate missing
@cordon-thiago
cordon-thiago / age.py
Created December 1, 2019 21:53
Create age dataframe column
import pandas as pd
import numpy as np
from datetime import date
from functions import aux_functions
# Today's date as a pandas Timestamp, so it can be subtracted from the
# datetime column below.
curr_date = pd.to_datetime(date.today())
# Age in average Gregorian years (365.2425 days each).
# np.timedelta64(1, 'Y') is a calendar unit without a fixed length; current
# pandas/NumPy refuse to divide a nanosecond timedelta by it, so a
# fixed-length Timedelta is used as the divisor instead. Rows whose
# birthDate_n is NaT yield NaN.
hardbounce_2['age'] = (
    (curr_date - hardbounce_2['birthDate_n']) / pd.Timedelta(days=365.2425)
)
# There are ages missing
@cordon-thiago
cordon-thiago / hardbounce_aux_functions.py
Created December 1, 2019 22:08
Auxiliary functions for Hard bounce article.
import matplotlib.pyplot as plt
def plot_roc_curve(fpr, tpr, label=None):
"""
The ROC curve, modified from
Hands-On Machine learning with Scikit-Learn and TensorFlow; p.91
tpr = true positive rate
fpr = false positive rate
"""
@cordon-thiago
cordon-thiago / emailDomain.py
Created December 1, 2019 22:22
Extract and transform e-mail domain.
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions
# Apply the project's getEmailDomain helper to every value of the
# 'email' column and keep the result in a new 'emailDomain' column.
hardbounce_2['emailDomain'] = hardbounce_2['email'].apply(
    aux_functions.getEmailDomain
)
# count by domain
@cordon-thiago
cordon-thiago / emailUserCharQty.py
Created December 1, 2019 22:34
Create variables percNumbersInEmailUser, hasNumberInEmailUser and emailUserCharQty
# Import libraries
import pandas as pd
import numpy as np
from functions import aux_functions
def _email_user_length(address):
    """Return len() of what aux_functions.getEmailUser yields for *address*."""
    return len(aux_functions.getEmailUser(address))


# Character count of the e-mail user, computed per row of the 'email' column.
hardbounce_2['emailUserCharQty'] = hardbounce_2['email'].apply(_email_user_length)
@cordon-thiago
cordon-thiago / hardbounce_saveDF.py
Created December 1, 2019 22:38
Save dataframe in CSV file
# Columns written to the feature-engineering output file, in output order.
export_columns = [
    'emailDomain_cat', 'emailDomainPiece1', 'emailDomainPiece2',
    'regDate_n', 'birthDate_n',
    'monthsSinceRegDate', 'age',
    'percNumbersInEmailUser', 'hasNumberInEmailUser', 'emailUserCharQty',
    'flgHardBounce_n',
]
# Write only those columns as a semicolon-separated CSV, without the index.
hardbounce_2[export_columns].to_csv(
    'datasets/hardbounce_featureEngineering.csv',
    sep=';',
    index=False,
)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.