abehmiel/exploratory_model_tpot

## exploratory_model_tpot
# conversion rate exploratory analysis and ML algorithm hyperparameter optimization

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Read the raw visit log; every later step of the script works on `df`.
df = pd.read_csv("conversion_data.csv")

# First impression of the dataset: the leading rows, then how many visits
# did and did not end in a conversion.
print(df.head())
print("number of conversions", int(df["converted"].eq(1).sum()))
print("number not converted", int(df["converted"].eq(0).sum()))
# NOTE: describe() is only printed after the age filter below, because the
# filter changes df.

# An unfiltered describe() shows some users with an age over 100. They could
# be kept or dropped; there are reportedly very few of them ("vd says"), so
# dropping them is acceptable.
df = df.loc[df["age"] < 100]
print(df.describe())

# After the filter the maximum age is 79, which is more plausible.
# country and source are not numeric, so describe() leaves them out.

# The summary statistics put the share of visits that convert at about 3.2%,
# which also seems plausible.

# let's look at grouping the data in different ways.
# numeric_only=True keeps the string columns (country, source) out of the
# averages: pandas >= 2.0 raises a TypeError when asked to take their mean.
# The explicit axis=0 is dropped; it is the default and the keyword is
# deprecated in recent pandas.

# any global differences in conversion or other features by country?
print(df.groupby("country").mean(numeric_only=True))

# Germany seems to convert well, China not so much.
# how about by source?
print(df.groupby("source").mean(numeric_only=True))

# doesn't tell us very much information there.
# now try grouping by age:
print(df.groupby("age").mean(numeric_only=True))

# looks like younger users tend to convert more.
# now try grouping by new user status:
print(df.groupby("new_user").mean(numeric_only=True))

# 0 (repeat customer) tends to convert more. Makes sense.
# now try grouping by number of pages visited:
print(df.groupby("total_pages_visited").mean(numeric_only=True))

# here's a definite payday: more pages visited = more conversions.
# what if we wanted to plot some of these?
# pandas can draw the mean conversion rate per group directly. Each entry is
# (grouping column, plot kind, output file).
conversion_plots = (
    ("total_pages_visited", "line", "total_pages_visited_vs_conversion.png"),
    ("country", "bar", "country_vs_conversion.png"),
    ("age", "line", "age_vs_conversion.png"),
)
for group_column, plot_kind, png_name in conversion_plots:
    # numeric_only=True: averaging the string columns raises on pandas >= 2.0.
    group_means = df.groupby(group_column).mean(numeric_only=True)
    axes = group_means.plot(y="converted", kind=plot_kind)
    # Save through the figure pandas just created, then close it. plt.clf()
    # would only clear it and leave one open figure behind per plot.
    axes.figure.savefig(png_name)
    plt.close(axes.figure)

# ok, now that we have a reasonable feel for what's in the dataset, we can
# start to build an ML-model in scikit-learn. There's a little cleaning left to do, though.

# One fixed seed for both the train/test split and TPOT's stochastic search,
# so that repeated runs of this script give the same result.
RANDOM_SEED = 42

# first, there are two columns in the dataframe that are composed of string data,
# so what we need to do is convert that to vectorized, or 'one-hot' representations.
# dtype=int keeps the indicator columns integer: newer pandas emits bool
# indicators by default, and mixing bool with the integer columns would turn
# the .values array below into an object array.
df = pd.get_dummies(df, dtype=int)

# here are the features we wish to use, i.e., every column except 'converted'
X = df.drop(columns='converted').values
# and the target, 'converted'
y = df['converted'].values

# split the dataset randomly into train (75%) and test (25%).
# stratify=y keeps the share of conversions (rare, about 3%) the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=y,
)

# use TPOT to automatically find the best classifier for the dataset
my_tpot = TPOTClassifier(generations=10, random_state=RANDOM_SEED)
my_tpot.fit(X_train, y_train)

# score of the fitted pipeline on the held-out test set
print(my_tpot.score(X_test, y_test))

# export the pipeline to a new python file (clutch!)
my_tpot.export('exported_pipeline.py')
	# conversion rate exploratory analysis and ML algorithm hyperparameter optimization

	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from tpot import TPOTClassifier

	# Read the raw visit log; every later step of the script works on `df`.
	df = pd.read_csv("conversion_data.csv")

	# First impression of the dataset: the leading rows, then how many visits
	# did and did not end in a conversion.
	print(df.head())
	print("number of conversions", int(df["converted"].eq(1).sum()))
	print("number not converted", int(df["converted"].eq(0).sum()))
	# NOTE: describe() is only printed after the age filter below, because the
	# filter changes df.

	# An unfiltered describe() shows some users with an age over 100. They could
	# be kept or dropped; there are reportedly very few of them ("vd says"), so
	# dropping them is acceptable.
	df = df.loc[df["age"] < 100]
	print(df.describe())

	# After the filter the maximum age is 79, which is more plausible.
	# country and source are not numeric, so describe() leaves them out.

	# The summary statistics put the share of visits that convert at about 3.2%,
	# which also seems plausible.

	# let's look at grouping the data in different ways.
	# numeric_only=True keeps the string columns (country, source) out of the
	# averages: pandas >= 2.0 raises a TypeError when asked to take their mean.
	# The explicit axis=0 is dropped; it is the default and the keyword is
	# deprecated in recent pandas.

	# any global differences in conversion or other features by country?
	print(df.groupby("country").mean(numeric_only=True))

	# Germany seems to convert well, China not so much.
	# how about by source?
	print(df.groupby("source").mean(numeric_only=True))

	# doesn't tell us very much information there.
	# now try grouping by age:
	print(df.groupby("age").mean(numeric_only=True))

	# looks like younger users tend to convert more.
	# now try grouping by new user status:
	print(df.groupby("new_user").mean(numeric_only=True))

	# 0 (repeat customer) tends to convert more. Makes sense.
	# now try grouping by number of pages visited:
	print(df.groupby("total_pages_visited").mean(numeric_only=True))

	# here's a definite payday: more pages visited = more conversions.
	# what if we wanted to plot some of these?
	# pandas can draw the mean conversion rate per group directly. Each entry is
	# (grouping column, plot kind, output file).
	conversion_plots = (
		("total_pages_visited", "line", "total_pages_visited_vs_conversion.png"),
		("country", "bar", "country_vs_conversion.png"),
		("age", "line", "age_vs_conversion.png"),
	)
	for group_column, plot_kind, png_name in conversion_plots:
		# numeric_only=True: averaging the string columns raises on pandas >= 2.0.
		group_means = df.groupby(group_column).mean(numeric_only=True)
		axes = group_means.plot(y="converted", kind=plot_kind)
		# Save through the figure pandas just created, then close it. plt.clf()
		# would only clear it and leave one open figure behind per plot.
		axes.figure.savefig(png_name)
		plt.close(axes.figure)

	# ok, now that we have a reasonable feel for what's in the dataset, we can
	# start to build an ML-model in scikit-learn. There's a little cleaning left to do, though.

	# One fixed seed for both the train/test split and TPOT's stochastic search,
	# so that repeated runs of this script give the same result.
	RANDOM_SEED = 42

	# first, there are two columns in the dataframe that are composed of string data,
	# so what we need to do is convert that to vectorized, or 'one-hot' representations.
	# dtype=int keeps the indicator columns integer: newer pandas emits bool
	# indicators by default, and mixing bool with the integer columns would turn
	# the .values array below into an object array.
	df = pd.get_dummies(df, dtype=int)

	# here are the features we wish to use, i.e., every column except 'converted'
	X = df.drop(columns='converted').values
	# and the target, 'converted'
	y = df['converted'].values

	# split the dataset randomly into train (75%) and test (25%).
	# stratify=y keeps the share of conversions (rare, about 3%) the same in both parts.
	X_train, X_test, y_train, y_test = train_test_split(
		X, y,
		train_size=0.75,
		test_size=0.25,
		random_state=RANDOM_SEED,
		stratify=y,
	)

	# use TPOT to automatically find the best classifier for the dataset
	my_tpot = TPOTClassifier(generations=10, random_state=RANDOM_SEED)
	my_tpot.fit(X_train, y_train)

	# score of the fitted pipeline on the held-out test set
	print(my_tpot.score(X_test, y_test))

	# export the pipeline to a new python file (clutch!)
	my_tpot.export('exported_pipeline.py')