Skip to content

Instantly share code, notes, and snippets.

@jaume-ferrarons
Created April 7, 2017 20:01
Show Gist options
  • Save jaume-ferrarons/951f6e958a2a270b12b5e13e741ef465 to your computer and use it in GitHub Desktop.
HackerRank - Indeed Machine Learning CodeSprint - Basic Keras solution using a MLP (https://www.hackerrank.com/contests/indeed-ml-codesprint-2017)
# https://github.com/datalogai/recurrentshop
# https://github.com/farizrahman4u/seq2seq
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from keras import backend
# Training hyper-parameters
batch_size = 256
epochs = 100

# Data files should be in the following paths
TRAIN_DATA = "data/train.tsv"
TEST_DATA = "data/test.tsv"

# Mutually exclusive tag groups: a posting carries at most one tag per group,
# so each group can be modelled as its own softmax classification problem.
TAGS_GROUPS = [
    {'1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed'},
    {'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed'},
    {'full-time-job', 'part-time-job'},
    {'salary', 'hourly-wage'},
    {'supervising-job'},
]
# Load data — TSV with one job posting per row.
# NOTE(review): assumes both files have a "description" column and the
# training file additionally has a "tags" column — verify against the data.
df = pd.read_csv(TRAIN_DATA, sep="\t")
# Prepare data
x_train = df["description"]
df_test = pd.read_csv(TEST_DATA, sep="\t")
x_test = df_test["description"]
# Bag-of-ngrams features (unigrams up to 5-grams); min_df=5 drops ngrams seen
# in fewer than 5 documents to bound the vocabulary size.
vectorizer = CountVectorizer(ngram_range=(1, 5), min_df=5)
# .todense() materialises the full dense matrix so Keras can consume it.
# NOTE(review): this can be very large for big corpora — confirm memory fits.
x_train = vectorizer.fit_transform(x_train.values).todense()
x_test = vectorizer.transform(x_test.values).todense()
# Vocabulary size = input dimension of the MLP built in predict_group().
nwords = x_train.shape[1]
def compute_class_weights(binary_labels):
    """Compute inverse-frequency class weights from a binary label matrix.

    Parameters
    ----------
    binary_labels : array-like of shape (n_samples, n_classes)
        Multi-hot 0/1 label matrix.

    Returns
    -------
    dict
        Maps class index -> total_positive_labels / class_positive_count, so
        rarer classes get proportionally larger weights (suitable for the
        Keras ``class_weight`` argument of ``model.fit``).
    """
    binary_labels = np.asarray(binary_labels)
    total = float(np.sum(binary_labels))
    label_count = np.sum(binary_labels, axis=0)
    # Guard against empty classes: the unguarded division produced inf
    # weights for any class with zero positives, which breaks model.fit.
    # An absent class now gets the maximum finite weight (`total`).
    safe_count = np.maximum(label_count, 1)
    return dict(enumerate(total / safe_count))
def predict_group(tag_group):
    """Predict per tag group

    Trains a fresh MLP on the module-level features (x_train/x_test, nwords)
    to classify postings among the tags of `tag_group` plus a synthetic
    "no tag from this group" class, and returns the decoded test-set tags
    as produced by MultiLabelBinarizer.inverse_transform (one tuple per row).
    """
    # Get tags: each row stores its tags as one space-separated string;
    # untagged rows read in as NaN and become the literal string 'nan'.
    tags = [str(tags).split(" ") for tags in df["tags"].values.tolist()]
    # Remove nan tag and select the ones to predict (only this group's tags)
    for i in range(len(tags)):
        tags[i] = [t for t in tags[i] if t != 'nan' and t in tag_group]
    # Vectorize labels into a multi-hot matrix, one column per group tag
    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(tags)
    # Get the shape; +1 reserves the extra "no tag" class appended below
    num_classes = y_train.shape[1] + 1
    # Create an extra class for the ones with no tag.
    # NOTE(review): 1 - row_sum assumes tags within a group are mutually
    # exclusive (row sums 0 or 1); a row with 2+ group tags would yield a
    # negative indicator — confirm the groups really are exclusive.
    print np.sum(y_train, axis=1)
    y_train = np.c_[y_train, 1 - np.sum(y_train, axis=1)]
    print "Class weights: ", compute_class_weights(y_train)
    print 'Building model...'
    # MLP over bag-of-ngram counts: 256 -> 512 -> 256 hidden units with
    # mixed activations, 0.3 dropout after each, softmax over the classes.
    model = Sequential()
    model.add(Dense(256, input_dim=nwords))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(256))
    model.add(Activation('tanh'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print model.summary()
    # Fit model using all data (no validation split), weighting classes by
    # inverse frequency to counter label imbalance
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              class_weight=compute_class_weights(y_train))
    # Drop the synthetic "no tag" column before decoding back to tag names
    prediction = model.predict(x_test)[:, :-1]
    # Release the TF graph/session so successive groups don't accumulate memory
    backend.clear_session()
    # Round probabilities to 0/1 and map columns back to tag strings
    y_test = mlb.inverse_transform(np.round(prediction))
    return y_test
#Predict for every group
results = []
for tag_group in TAGS_GROUPS:
predictions = predict_group(tag_group)
if len(results) == 0:
results = predictions
else:
results = [a + b for (a, b) in zip(results, predictions)]
# Compute results
print results[0:5]
# Write predictions to file: one space-joined tag list per posting, under a
# single "tags" header column. The context manager guarantees the file is
# flushed and closed even if writing raises (the original leaked the handle
# on error).
with open('out.tsv', 'w') as out:
    out.write("tags\n")
    for line in results:
        out.write(' '.join(line))
        out.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment