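# Cuisine prediction from recipe ingredients (the data layout matches the
# Kaggle "What's Cooking" competition: train.json / test.json records with
# 'id', 'ingredients' and, in training data, 'cuisine').
# Ingredients are cleaned and expanded into unigram/bigram features, encoded
# either as tf-idf vectors or as a sparse boolean matrix, then classified
# with scikit-learn models or a small Keras net.
# Python 2 code against scikit-learn 0.17-era and Keras 1.x APIs.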
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
import json
import re
import time
import sys
import scipy.sparse as sparse
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
import keras.utils.np_utils as kutils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
date_format = "%Y-%m-%d %H:%M:%S"

def timer(func, *args):
    return labelled_timer(func.__name__, func, *args)

def labelled_timer(label, func, *args):
    handle = start_timer(label)
    ret = func(*args)
    end_timer(label, handle)
    return ret

def start_timer(label):
    start = time.time()
    print "{} {}:start".format(time.strftime(date_format), label)
    return start

def end_timer(label, handle):
    diff = int(time.time()*1000 - handle*1000)
    print "{} {}:end time:{} ms".format(time.strftime(date_format), label, diff)

def expand(word):
    words = word.split(' ')
    # bigrams
    ret = {a + '_' + b for a, b in zip(words, words[1:])}
    # unigrams
    ret.update(words)
    # the full phrase itself
    ret.add('_'.join(words))
    return list(ret)
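
# e.g. expand('olive oil') yields ['olive', 'oil', 'olive_oil'] (in set order).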

def transform(ingrs):
    if type(ingrs) == list:
        return [item for ingr in ingrs for item in transform(ingr)]
    ingrs = ingrs.lower()
    ingrs = re.sub(r' & ', ' and ', ingrs)
    # drop parenthesised qualifiers, e.g. "(10 oz.) tomatoes"
    ingrs = re.sub(r'(\(.*\))', '', ingrs)
    # drop quantity/unit prefixes, e.g. "2 cups ", "1 lb. " (the original used
    # \S+ after the unit stem, which misses bare units like "2 oz ")
    ingrs = re.sub(r'\d+ (onz|ounc|oz|pound|lb|kg|g|cup|tsp|tbsp|ml)\S* ', '', ingrs)
    ingrs = re.sub(r'\'', '', ingrs)
    ingrs = re.sub(r'\-', '', ingrs)
    # the original pattern r'/S+' can never match lowercased text; presumably
    # it was meant to drop slash-joined fragments such as the "/2" in "1/2"
    ingrs = re.sub(r'/\S+', ' ', ingrs)
    # keep letters only
    ret = re.sub(r'[^a-z]+', ' ', ingrs)
    # drop stray standalone 's' tokens left over by the cleanup
    ret = re.sub(r' s($| )', ' ', ret)
    return expand(ret.strip())
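
# e.g. transform('2 cups all-purpose flour') yields
# ['allpurpose', 'flour', 'allpurpose_flour'] (in set order).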

def load_data(fname, use_tfidf):
    parsed = json.loads(open(fname).read())
    cuisines, ids, ingrs = [], [], []
    for item in parsed:
        # test records have no 'cuisine' field
        if 'cuisine' in item:
            cuisines.append(item['cuisine'])
        ids.append(item['id'])
        trans_ingrs = transform(item['ingredients'])
        if use_tfidf:
            # TfidfVectorizer expects one string per document
            trans_ingrs = ' '.join(trans_ingrs)
        ingrs.append(trans_ingrs)
    return cuisines, ids, ingrs

def transform_le(le, labels):
    # 'le' is either a TfidfVectorizer or a LabelEncoder
    if type(le) == TfidfVectorizer:
        return le.transform(labels)
    if type(labels) == list:
        return [transform_le(le, label) for label in labels]
    # LabelEncoder.transform raises on unseen labels; use a cached reverse
    # map instead and return -1 for unknown ingredients
    if 'rev_map' not in le.__dict__:
        print "populating rev_map for le"
        le.rev_map = {le.classes_[ind]: ind for ind in
                      xrange(len(le.classes_))}
    rev_map = le.rev_map
    return rev_map.get(labels, -1)

def to_boolean_matr(feat_lists, cnt):
    # one row per example, one column per ingredient feature; indices of -1
    # (unknown ingredients) are skipped
    ret = sparse.dok_matrix((len(feat_lists), cnt))
    for i in xrange(len(feat_lists)):
        for feat in feat_lists[i]:
            if feat < 0:
                continue
            ret[i, feat] = 1
    return ret
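
# e.g. to_boolean_matr([[0, 2], [1, -1]], 3) gives the 2x3 matrix
#   [[1, 0, 1],
#    [0, 1, 0]]
# (the -1, an unknown ingredient, is skipped).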

def load_train(fname, use_tfidf):
    cuisines, ids, ingrs = labelled_timer('load_train_data', load_data, fname, use_tfidf)
    all_ingrs = ingrs
    if not use_tfidf:
        # flatten the per-recipe lists so the LabelEncoder sees every feature
        all_ingrs = [item for sublist in ingrs for item in sublist]
    cuisine_le = LabelEncoder()
    ingr_le = TfidfVectorizer() if use_tfidf else LabelEncoder()
    transformed_cuisines = labelled_timer('fit_cuisines_le', cuisine_le.fit_transform, cuisines)
    labelled_timer('fit_ingrs_le', ingr_le.fit_transform, all_ingrs)
    transformed_ingrs = labelled_timer('transform_ingrs', transform_le, ingr_le, ingrs)
    class_cnt = len(ingr_le.vocabulary_) if use_tfidf else len(ingr_le.classes_)
    print "expanded ingredients:{}".format(class_cnt)
    ingrs_bool = transformed_ingrs
    if not use_tfidf:
        ingrs_bool = timer(to_boolean_matr, transformed_ingrs, len(ingr_le.classes_))
    return ids, transformed_cuisines, cuisine_le, ingrs_bool, ingr_le

def load_test(fname, ingr_le, use_tfidf):
    cuisines, ids, ingrs = labelled_timer('load_test_data', load_data, fname, use_tfidf)
    transformed_ingrs = labelled_timer('transform_ingrs', transform_le, ingr_le, ingrs)
    if use_tfidf:
        return ids, transformed_ingrs
    ingrs_bool = timer(to_boolean_matr, transformed_ingrs, len(ingr_le.classes_))
    return ids, ingrs_bool

def load_nn_model(feat_cnt):
    # neural net architecture: two relu hidden layers with dropout and a
    # softmax over the 20 cuisine classes (Keras 1.x API)
    mdl = Sequential()
    mdl.add(Dense(512, init='glorot_uniform', activation='relu',
                  input_shape=(feat_cnt,)))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(128, init='glorot_uniform', activation='relu'))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(20, activation='softmax'))
    mdl.compile(loss='categorical_crossentropy', optimizer='adadelta')
    return mdl

def predict(model, X, is_nn):
    if not is_nn:
        return model.predict(X)
    # Keras 1.x: convert class probabilities to class indices
    return kutils.probas_to_classes(model.predict(X))

def load_model():
    # A log of models tried; each assignment overrides the previous one, so
    # only the final LinearSVC is returned. The comments appear to record
    # (train, test) accuracies from earlier runs.
    model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=10,
                          random_state=42, n_jobs=-1)
    # 77.4
    model = SGDClassifier(loss='modified_huber', penalty='elasticnet', alpha=1e-3,
                          n_iter=10, random_state=42, n_jobs=-1)
    # model = RandomForestClassifier(max_features=200)
    # gradient boosting at max_depth 5 | 7:
    #  20 iter: 75.7, 71.4 | 79.2, 73.4
    #  50 iter: 81.7, 75.4 | 85.8, 76.3
    # 100 iter: 86.4, 77.1 | 91.1, 77.4
    model = GradientBoostingClassifier(verbose=1, max_depth=7,
                                       min_samples_leaf=10, n_estimators=100)
    # one-vs-one: 96, 75
    model = SVC(verbose=True, kernel='linear', decision_function_shape='ovo')
    # 96, 75
    model = SVC(verbose=True, kernel='linear', decision_function_shape='ovr')
    # 95, 75
    model = LinearSVC(verbose=True)
    # default rbf kernel: 43, 42 :-o
    model = SVC(verbose=True, decision_function_shape='ovr')
    # 77.4
    model = LinearSVC(verbose=True)
    # model = GradientBoostingClassifier(verbose=1, max_depth=7,
    #                                    min_samples_leaf=10, n_estimators=100)
    return model

def main(is_nn, use_tfidf):
    train_ids, y_all, cuisine_le, X_all, ingr_le = timer(load_train, 'train.json', use_tfidf)
    split_handle = start_timer('train_test_split')
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.25, random_state=42)
    end_timer('train_test_split', split_handle)
    print "======Loading train data done======="
    test_ids, test_ingrs = timer(load_test, 'test.json', ingr_le, use_tfidf)
    print "======Loading test data done======="
    train_handle = start_timer('training')
    print "Sizes of train, test"
    print X_train.shape, len(y_train)
    print X_test.shape, len(y_test)
    model = load_model() if not is_nn else load_nn_model(X_train.shape[1])
    if is_nn:
        # one-hot targets for the softmax output (Keras 1.x API)
        y_mod_train = kutils.to_categorical(y_train)
        print y_mod_train.shape
        model.fit(X_train.todense(), y_mod_train, nb_epoch=300, batch_size=4096,
                  show_accuracy=True)
        right = sum(1.0 for a, b in zip(predict(model, X_test.todense(), is_nn),
                                        y_test) if a == b)
        print "\nTest score:{}".format(right/len(y_test))
    else:
        # sanity check: only meaningful on the tf-idf path, where
        # inverse_transform maps matrix rows back to terms; it would fail
        # for a LabelEncoder
        print ingr_le.inverse_transform(X_train[:5])
        print cuisine_le.inverse_transform(y_train[:5])
        cv = StratifiedKFold(y_train, n_folds=5, shuffle=True)
        # note: this wraps a fresh SVC with an empty parameter grid,
        # discarding the model picked in load_model()
        model = GridSearchCV(SVC(), {}, cv=cv, n_jobs=1, verbose=1)
        model.fit(X_train.todense(), y_train)
        print "Train score:{}".format(model.score(X_train.todense(), y_train))
        print "Test score:{}".format(model.score(X_test.todense(), y_test))
    end_timer('training', train_handle)
    outfile = open('results', 'w')
    predictions = cuisine_le.inverse_transform(
        predict(model, test_ingrs.todense(), is_nn))
    outfile.write('cuisine,id\n')
    for i in xrange(len(test_ids)):
        outfile.write('{},{}\n'.format(predictions[i], test_ids[i]))
    outfile.close()
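
# main() is defined but never invoked above; a minimal, assumed entry point
# (the 'nn'/'tfidf' command-line flags are hypothetical, though 'sys' is
# already imported):
if __name__ == '__main__':
    main(is_nn='nn' in sys.argv, use_tfidf='tfidf' in sys.argv)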