Skip to content

Instantly share code, notes, and snippets.

@yankov
Last active August 13, 2019 08:38
Show Gist options
  • Save yankov/c2ec3864c963da3b59f6 to your computer and use it in GitHub Desktop.
Save yankov/c2ec3864c963da3b59f6 to your computer and use it in GitHub Desktop.
# Avazu CTR prediction
# SGD Logistic regression + one hot encoder. Score: 0.414
import pandas as pd
import numpy as np
from datetime import datetime, date, time
from OneHotEncoderPartial import OneHotEncoder
from sklearn.linear_model import SGDClassifier
# Raw columns read from the data files; "hour" is later replaced by the
# derived weekday / hour-of-day columns.
cols = [
    "C1",
    "banner_pos",
    "site_category",
    "device_type",
    "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
    "hour",
]
# Derive two features from the raw hour column: the weekday and the hour of day
def dayhour(timestr):
    """Split a YYMMDDHH timestamp into weekday and hour-of-day features.

    Args:
        timestr: Timestamp in ``%y%m%d%H`` form; it is passed through
            ``str()`` first, so an integer such as ``14102100`` works too.

    Returns:
        ``[weekday, hour]`` as floats, where weekday follows
        ``datetime.weekday()`` (Monday is 0).

    Raises:
        ValueError: If the value does not match the ``%y%m%d%H`` format.
    """
    # Parse the argument itself, not a name picked up from the caller's scope.
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]
enc = OneHotEncoder()
# Fit the OneHotEncoder one small batch at a time, reading the training file
# in chunks instead of loading it whole.
# The encoder implementation that supports partial fitting is borrowed from
# Mahendra Kariya.
train = pd.read_csv("train", chunksize = 1000000, iterator = True)
for chunk in train:
    chunk = chunk[cols]
    # join() aligns on the index, and chunks after the first keep their
    # original row labels, so the derived columns must reuse the chunk's own
    # index; a default 0..n-1 index would leave wd/hr as NaN in those chunks.
    day_hour = pd.DataFrame(
        [dayhour(x) for x in chunk.hour],
        columns=["wd", "hr"],
        index=chunk.index,
    )
    # Non in-place drop: avoids mutating a column slice of the original chunk.
    chunk = chunk.join(day_hour).drop("hour", axis=1)
    enc.partial_fit(np.array(chunk))
# Train the classifier incrementally, one chunk of the training file at a time
# NOTE(review): loss="log" is kept as written; newer scikit-learn releases
# spell this loss "log_loss" -- confirm against the installed version.
clf = SGDClassifier(loss="log")
train = pd.read_csv("train", chunksize = 1000000, iterator = True)
# partial_fit is told the full set of labels up front via classes=.
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    # join() aligns on the index, and chunks after the first keep their
    # original row labels, so the derived columns must reuse the chunk's own
    # index; a default 0..n-1 index would leave wd/hr as NaN in those chunks.
    day_hour = pd.DataFrame(
        [dayhour(x) for x in chunk.hour],
        columns=["wd", "hr"],
        index=chunk.index,
    )
    # Non in-place drop: avoids mutating a column slice of the original chunk.
    chunk = chunk.join(day_hour).drop("hour", axis=1)
    Xcat = enc.transform(np.array(chunk))
    clf.partial_fit(Xcat, y_train, classes=all_classes)
# Create a submission file
X_test = pd.read_csv("test", usecols = cols + ["id"])
X_test = X_test.join(
    pd.DataFrame(
        [dayhour(x) for x in X_test.hour],
        columns=["wd", "hr"],
        index=X_test.index,  # keep the derived rows aligned for the join
    )
)
X_test.drop("hour", axis=1, inplace = True)
# Encode only the columns used for training, in the training order
# (cols without "hour", then wd and hr). "id" is kept in X_test solely to
# label the output rows and must not be fed to the encoder.
feature_cols = [c for c in cols if c != "hour"] + ["wd", "hr"]
X_enc_test = enc.transform(np.array(X_test[feature_cols]))
# Column 1 of predict_proba is the probability of the second class (click = 1).
y_pred = clf.predict_proba(X_enc_test)[:, 1]
# The with-block closes the file on exit, so no explicit close() is needed.
with open("submission.csv", "w") as f:
    f.write("id,click\n")
    for xid, prob in zip(X_test.id, y_pred):
        f.write(str(xid) + "," + "{0:.10f}".format(prob) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment