Last active
May 20, 2019 00:05
-
-
Save ychennay/f99777287a12247134541916db51578f to your computer and use it in GitHub Desktop.
Script to quickly load anonymized sales conversion data from Kaggle into memory and preprocess it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# Downloads a hosted CSV of ad-conversion data, drops identifier/leakage
# columns, adds an intercept column, and builds a one-hot encoded feature
# matrix ``X`` plus a single-column target frame ``y``.

# Dropped to avoid potential data leakage into the features.
COLUMNS_TO_DROP = ["ad_id", "xyz_campaign_id", "fb_campaign_id", "Total_Conversion"]
# Kept as a list so that ``conversions_df[TARGET]`` yields a DataFrame
# (one column) rather than a Series.
TARGET = ["Approved_Conversion"]

# Upper bound, in seconds, on how long the download may stall before failing.
# Without a timeout ``requests.get`` can block forever on a dead connection.
REQUEST_TIMEOUT_SECONDS = 30

# A hosted version of sales optimization conversion data from an anonymous
# organization, originally from Kaggle at
# https://www.kaggle.com/loveall/clicks-conversion-tracking
data_url = "https://raw.githubusercontent.com/ychennay/ychennay.github.io/master/KAG_conversion_data.csv"

# Read the data into memory. Fail loudly on an HTTP error instead of handing
# an error page to the CSV parser.
response = requests.get(data_url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
data_string = response.content  # raw bytes of the CSV payload

raw_df = pd.read_csv(StringIO(data_string.decode("utf-8")))
conversions_df = raw_df.drop(columns=COLUMNS_TO_DROP)
conversions_df["bias"] = 1  # add a bias/intercept column

# Define the target and the features (every column that is not a target).
y = conversions_df[TARGET]
X = conversions_df.loc[:, ~conversions_df.columns.isin(TARGET)]

# Columns to treat as categorical before one-hot encoding.
convert_dict = {"gender": "category", "interest": "category", "age": "category"}

# Get dummy features for the categorical variables; ``drop_first=True`` drops
# one level per categorical column. Per the original author's note, X is
# 1143 x 47 and y is 1143 x 1 for this dataset.
X = pd.get_dummies(X.astype(convert_dict), drop_first=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment