Created
May 26, 2017 17:02
-
-
Save abehmiel/68b19d787a05395cc587947050f6d672 to your computer and use it in GitHub Desktop.
Exploratory data analysis and model selection with TPOT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Conversion-rate exploratory analysis and ML algorithm / hyperparameter optimization with TPOT
import pandas as pd | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from tpot import TPOTClassifier | |
# Read the raw conversion records into a DataFrame.
df = pd.read_csv("conversion_data.csv")

# First look at the data: the leading rows and the size of each outcome class.
print(df.head())
converted_flag = df["converted"]
print("number of conversions", int((converted_flag == 1).sum()))
print("number not converted", int((converted_flag == 0).sum()))

# The summary is printed only after filtering, because df is changed below.
# describe() on the unfiltered data showed users with an age over 100.
# We could keep or ignore those rows; there are very few of them, so it is
# fine to remove them.
df = df.loc[df["age"] < 100]
print(df.describe())
# Notes from the original run:
# - the maximum age is now 79, which makes more sense.
# - the country and source columns are not numeric, so they are excluded
#   from the summary.
# - about 3.2% of site visits result in conversions, which also seems
#   plausible.
# Group the data in different ways and print the per-group means.
#
# numeric_only=True restricts the mean to numeric columns. Since pandas 2.0
# the string columns (country, source) are no longer dropped silently and
# mean() raises TypeError on them. The redundant axis=0 argument is omitted
# because it is deprecated for groupby in pandas 2.1+.

# any global differences in conversion or other features by country?
print(df.groupby("country").mean(numeric_only=True))
# Germany seems to convert well, China not so much.

# how about by source?
print(df.groupby("source").mean(numeric_only=True))
# this doesn't tell us very much.

# now try grouping by age:
print(df.groupby("age").mean(numeric_only=True))
# looks like younger users tend to convert more.

# now try grouping by new user status:
print(df.groupby("new_user").mean(numeric_only=True))
# 0 (repeat customer) tends to convert more. Makes sense.

# now try grouping by number of pages visited:
print(df.groupby("total_pages_visited").mean(numeric_only=True))
# here's a definite payday: more pages visited = more conversions.
# Plot the mean conversion rate against a few features and save each chart
# as a PNG. pandas can draw these directly from the grouped means.
# Each entry is (column to group by, chart kind, output file name).
conversion_plots = [
    ("total_pages_visited", "line", "total_pages_visited_vs_conversion.png"),
    ("country", "bar", "country_vs_conversion.png"),
    ("age", "line", "age_vs_conversion.png"),
]
for feature, chart_kind, output_file in conversion_plots:
    # numeric_only=True keeps mean() from raising TypeError on the string
    # columns (country, source) with pandas >= 2.0.
    group_means = df.groupby(feature).mean(numeric_only=True)
    ax = group_means.plot(y="converted", kind=chart_kind)
    # Save and close through the Axes that plot() returned, so we always act
    # on the figure just drawn and do not leave finished figures open.
    ax.figure.savefig(output_file)
    plt.close(ax.figure)
# With a reasonable feel for the dataset, build an ML model with scikit-learn
# and TPOT. A little cleaning is left to do first.

# One seed shared by the train/test split and the TPOT search so that a rerun
# reproduces the same split, score and exported pipeline.
RANDOM_SEED = 42

# Two columns hold string data, so convert them to one-hot indicator columns.
# dtype=int keeps the indicators integer: pandas >= 2.0 emits bool columns by
# default, and mixing bool with the integer columns would turn .values into
# an object array instead of a numeric one.
df = pd.get_dummies(df, dtype=int)

# features: every column except 'converted'
X = df.drop('converted', axis=1).values
# target: 'converted'
y = df.loc[:, 'converted'].values

# Split the dataset randomly into train (75%) and test (25%). Conversions are
# a small minority of the rows, so stratify on y to keep the same class ratio
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_SEED,
)

# use TPOT to automatically find the best classifier for the dataset
my_tpot = TPOTClassifier(generations=10, random_state=RANDOM_SEED)
my_tpot.fit(X_train, y_train)
print(my_tpot.score(X_test, y_test))

# export the best pipeline found as a standalone Python file
my_tpot.export('exported_pipeline.py')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment