# Avazu CTR prediction
# SGD Logistic regression + one hot encoder. Score: 0.414
from datetime import datetime, date, time

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

from OneHotEncoderPartial import OneHotEncoder
# Columns read from the raw data files. "hour" is not fed to the encoder
# directly: it is replaced by the derived "wd" (weekday) and "hr" (hour of
# day) columns before encoding.
cols = [
    "C1",
    "banner_pos",
    "site_category",
    "device_type",
    "device_conn_type",
    "C14",
    "C15",
    "C16",
    "C17",
    "C18",
    "C19",
    "C20",
    "C21",
    "hour",
]
# add two columns for hour and a weekday
def dayhour(timestr):
    """Split a ``YYMMDDHH`` timestamp into weekday and hour-of-day features.

    Args:
        timestr: Timestamp in ``%y%m%d%H`` form (e.g. ``14102100``). It may be
            a string or an integer; it is converted with ``str`` before
            parsing.

    Returns:
        A two-element list ``[weekday, hour]`` of floats, where weekday is
        ``datetime.weekday()`` (Monday is 0.0) and hour is the hour of day.

    Raises:
        ValueError: If the value does not match the ``%y%m%d%H`` format.
    """
    # Parse the function's own argument rather than a name from the caller's
    # scope, so the function works regardless of how it is invoked.
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]
enc = OneHotEncoder()
# Fit OneHotEncoder small batch at the time
# This implementation of encoder that supports partial fitting borrowed from Mahendra Kariya.
# Passing chunksize makes read_csv return an iterator of DataFrames, so the
# training file is never held in memory all at once.
train = pd.read_csv("train", chunksize=1000000)
for chunk in train:
    chunk = chunk[cols]
    # DataFrame.join aligns on the index. Every chunk after the first keeps
    # its row labels from the file, so the derived frame must carry the same
    # index or the wd/hr columns would come out as NaN.
    time_feats = pd.DataFrame(
        [dayhour(x) for x in chunk.hour],
        columns=["wd", "hr"],
        index=chunk.index,
    )
    # Non-mutating drop: avoids modifying a column selection in place.
    chunk = chunk.join(time_feats).drop("hour", axis=1)
    enc.partial_fit(np.array(chunk))
# Train the classifier
# NOTE(review): newer scikit-learn releases spell this loss "log_loss";
# switch the name if the installed version rejects "log".
clf = SGDClassifier(loss="log")
# Second pass over the training file, again in chunks, now that the encoder
# has been fitted.
train = pd.read_csv("train", chunksize=1000000)
# partial_fit is given the full set of labels on every call.
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    # DataFrame.join aligns on the index. Every chunk after the first keeps
    # its row labels from the file, so the derived frame must carry the same
    # index or the wd/hr columns would come out as NaN.
    time_feats = pd.DataFrame(
        [dayhour(x) for x in chunk.hour],
        columns=["wd", "hr"],
        index=chunk.index,
    )
    # Non-mutating drop: avoids modifying a column selection in place.
    chunk = chunk.join(time_feats).drop("hour", axis=1)
    Xcat = enc.transform(np.array(chunk))
    clf.partial_fit(Xcat, y_train, classes=all_classes)
# Create a submission file
# The id column is read as text so each identifier is written back exactly
# as it appears in the input, independent of any numeric conversion.
X_test = pd.read_csv("test", usecols=cols + ["id"], dtype={"id": str})
time_feats = pd.DataFrame(
    [dayhour(x) for x in X_test.hour],
    columns=["wd", "hr"],
    index=X_test.index,
)
# Build the feature matrix from the same columns, in the same order, that
# the encoder and classifier saw during training: select `cols` explicitly
# (which leaves "id" out), append wd/hr, then drop the raw "hour" column.
test_features = X_test[cols].join(time_feats).drop("hour", axis=1)
X_enc_test = enc.transform(np.array(test_features))
# Column 1 of predict_proba corresponds to the second class (click == 1,
# given the classes [0, 1] used during training).
y_pred = clf.predict_proba(X_enc_test)[:, 1]
# The with-statement closes the file; no explicit close() is needed.
with open("submission.csv", "w") as f:
    f.write("id,click\n")
    f.writelines(
        "{0},{1:.10f}\n".format(xid, prob)
        for xid, prob in zip(X_test.id, y_pred)
    )
# (GitHub page footer captured along with the gist; not part of the script:
# "Sign up for free to join this conversation on GitHub. Already have an
# account? Sign in to comment")