Created
April 7, 2017 20:01
-
-
Save jaume-ferrarons/951f6e958a2a270b12b5e13e741ef465 to your computer and use it in GitHub Desktop.
HackerRank - Indeed Machine Learning CodeSprint - Basic Keras solution using a MLP (https://www.hackerrank.com/contests/indeed-ml-codesprint-2017)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/datalogai/recurrentshop | |
# https://github.com/farizrahman4u/seq2seq | |
import pandas as pd | |
import numpy as np | |
import keras | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout, Activation | |
from keras.layers import LSTM | |
from keras.preprocessing.text import Tokenizer | |
from sklearn.preprocessing import MultiLabelBinarizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from keras import backend | |
# Hyper-parameters for model training
batch_size = 256
epochs = 100

# Data files should be in the following paths
TRAIN_DATA = "data/train.tsv"
TEST_DATA = "data/test.tsv"

# Define exclusive groups of tags: within each group at most one tag
# applies to a posting, so every group can be predicted as its own
# single-label (softmax) sub-problem.
TAGS_GROUPS = [
    set(['1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed']),
    set(['associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed']),
    set(['full-time-job', 'part-time-job']),
    set(['salary', 'hourly-wage']),
    set(['supervising-job'])
]

# Load data
df = pd.read_csv(TRAIN_DATA, sep="\t")
df_test = pd.read_csv(TEST_DATA, sep="\t")

# Prepare data: bag of word n-grams (1..5) over the description text,
# dropping n-grams that appear in fewer than 5 documents.
x_train = df["description"]
x_test = df_test["description"]
vectorizer = CountVectorizer(ngram_range=(1, 5), min_df=5)
# .toarray() instead of .todense(): todense() yields the deprecated
# np.matrix type, while Keras expects a plain ndarray.
x_train = vectorizer.fit_transform(x_train.values).toarray()
x_test = vectorizer.transform(x_test.values).toarray()
nwords = x_train.shape[1]
def compute_class_weights(binary_labels):
    """Return {class_index: weight} inversely proportional to class frequency.

    Each class gets weight total_positive_labels / positives_in_class,
    so rarer classes receive larger weights during training.
    """
    per_class = np.sum(binary_labels, axis=0)
    overall = float(np.sum(binary_labels))
    return {idx: weight for idx, weight in enumerate(overall / per_class)}
def predict_group(tag_group):
    """Train a fresh MLP for one exclusive tag group and predict the test set.

    Uses the module-level data (df, x_train, x_test, nwords, batch_size,
    epochs). An extra "no tag from this group" class is appended so the
    softmax output can express the absence of every tag in the group.

    Returns a list of tuples of predicted tag names (possibly empty),
    one per test row.
    """
    # The "tags" column is a space-separated string; missing values
    # become the string 'nan' after str().
    tags = [str(tags).split(" ") for tags in df["tags"].values.tolist()]
    # Remove the 'nan' placeholder and keep only tags of this group
    for i in range(len(tags)):
        tags[i] = [t for t in tags[i] if t != 'nan' and t in tag_group]
    # Vectorize labels
    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(tags)
    # One output per tag plus the extra "none" class
    num_classes = y_train.shape[1] + 1
    # Tags in a group are mutually exclusive, so each row of y_train sums
    # to 0 or 1; the extra column is 1 exactly when no tag applies.
    y_train = np.c_[y_train, 1 - np.sum(y_train, axis=1)]
    # py2/py3-compatible prints (the original py2 print statements are
    # syntax errors on Python 3); leftover debug print of row sums removed.
    print("Class weights: {}".format(compute_class_weights(y_train)))
    print('Building model...')
    model = Sequential()
    model.add(Dense(256, input_dim=nwords))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(256))
    model.add(Activation('tanh'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    # Fit model using all data, weighting rare classes up
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              class_weight=compute_class_weights(y_train))
    # Drop the artificial "none" column before decoding back to tag names
    prediction = model.predict(x_test)[:, :-1]
    # Release the TF graph/session memory between groups
    backend.clear_session()
    y_test = mlb.inverse_transform(np.round(prediction))
    return y_test
# Predict for every group; each group yields one tuple of tags per test
# row, which we concatenate across groups.
results = []
for tag_group in TAGS_GROUPS:
    predictions = predict_group(tag_group)
    if len(results) == 0:
        results = predictions
    else:
        results = [a + b for (a, b) in zip(results, predictions)]

# Show a quick sample of the combined predictions
# (py2/py3-compatible print call).
print(results[0:5])

# Write predictions to file, one space-separated tag list per row.
# 'with' guarantees the file is closed even if a write fails.
with open('out.tsv', 'w') as out:
    out.write("tags\n")
    for line in results:
        out.write(' '.join(line))
        out.write('\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment