# Cuisine classification from recipe ingredients (Kaggle "What's Cooking"
# style data). Written against 2016-era APIs: sklearn.cross_validation and
# sklearn.grid_search (renamed to model_selection in sklearn 0.18+) and an
# old Keras interface (init=, nb_epoch=, show_accuracy=).
import json
import re
import sys
import time

import scipy.sparse as sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
import keras.utils.np_utils as kutils

date_format = "%Y-%m-%d %H:%M:%S"

def timer(func, *args):
    return labelled_timer(func.__name__, func, *args)

def labelled_timer(label, func, *args):
    handle = start_timer(label)
    ret = func(*args)
    end_timer(label, handle)
    return ret

def start_timer(label):
    start = time.time()
    print "{} {}:start".format(time.strftime(date_format), label)
    return start

def end_timer(label, handle):
    diff = int(time.time()*1000 - handle*1000)
    print "{} {}:end time:{} ms".format(time.strftime(date_format), label, diff)

def expand(word):
    words = word.split(' ')
    # bigrams
    ret = {a + '_' + b for a, b in zip(words, words[1:])}
    # unigrams
    ret.update(words)
    # the full phrase itself
    ret.add('_'.join(words))
    return list(ret)
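
# For example, expand('romaine lettuce') yields (in arbitrary set order)
# ['romaine', 'lettuce', 'romaine_lettuce'].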

def transform(ingrs):
    if type(ingrs) == list:
        return [item for ingr in ingrs for item in transform(ingr)]
    ingrs = ingrs.lower()
    ingrs = re.sub(r' & ', ' and ', ingrs)
    # drop parenthesised notes like '(10 oz.)'
    ingrs = re.sub(r'(\(.*\))', '', ingrs)
    # drop leading quantities with units, e.g. '2 cups '
    ingrs = re.sub(r'\d+ (onz|ounc|oz|pound|lb|kg|g|cup|tsp|tbsp|ml)\S+ ', '', ingrs)
    ingrs = re.sub(r'\'', '', ingrs)
    ingrs = re.sub(r'\-', '', ingrs)
    # drop the remainder of slash-joined tokens, e.g. 'cream/milk' -> 'cream'
    ingrs = re.sub(r'/\S+', ' ', ingrs)
    # collapse anything non-alphabetic into single spaces
    ret = re.sub(r'[^a-z]+', ' ', ingrs)
    # drop a stray plural 's' left standing alone
    ret = re.sub(r' s($| )', ' ', ret)
    return expand(ret.strip())
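
# Illustration: transform('2 cups shredded Mozzarella cheese') strips the
# quantity and unit, then n-gram-expands the rest, giving (in some order)
# ['shredded', 'mozzarella', 'cheese', 'shredded_mozzarella',
#  'mozzarella_cheese', 'shredded_mozzarella_cheese'].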

def load_data(fname, use_tfidf):
    parsed = json.loads(open(fname).read())
    cuisines, ids, ingrs = [], [], []
    for item in parsed:
        # test records have no 'cuisine' key, so cuisines stays empty for them
        if 'cuisine' in item:
            cuisines.append(item['cuisine'])
        ids.append(item['id'])
        trans_ingrs = transform(item['ingredients'])
        if use_tfidf:
            # TfidfVectorizer expects one string per document
            trans_ingrs = ' '.join(trans_ingrs)
        ingrs.append(trans_ingrs)
    return cuisines, ids, ingrs
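
# The expected input is a JSON array of recipes in the Kaggle
# "What's Cooking" layout (the id below is a made-up example):
#   [{"id": 42, "cuisine": "greek",
#     "ingredients": ["romaine lettuce", "feta cheese crumbles"]}, ...]
# Test files look the same except that records lack the "cuisine" key.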

def transform_le(le, labels):
    # a TfidfVectorizer transforms a whole list of documents in one shot
    if type(le) == TfidfVectorizer:
        return le.transform(labels)
    if type(labels) == list:
        return [transform_le(le, label) for label in labels]
    # build (and cache) a reverse class->index map so unseen labels fall
    # back to -1 instead of raising, as LabelEncoder.transform would
    if 'rev_map' not in le.__dict__:
        print "populating rev_map for le"
        le.rev_map = {le.classes_[ind]: ind for ind in
                      xrange(len(le.classes_))}
    rev_map = le.rev_map
    return rev_map.get(labels, -1)
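
# E.g. with a LabelEncoder fit on ['salt', 'sugar'], transform_le(le, 'salt')
# returns 0, while an unseen 'pepper' maps to -1 and is later skipped when
# the boolean matrix is built.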

def to_boolean_matr(feat_lists, cnt):
    ret = sparse.dok_matrix((len(feat_lists), cnt))
    for i in xrange(len(feat_lists)):
        for feat in feat_lists[i]:
            if feat < 0:
                continue
            ret[i, feat] = 1
    return ret
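
# Sketch: to_boolean_matr([[0, 2], [1]], 3) builds the 2x3 indicator matrix
#   [[1, 0, 1],
#    [0, 1, 0]]
# (dok_matrix is a good fit for this cell-by-cell construction).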

def load_train(fname, use_tfidf):
    cuisines, ids, ingrs = labelled_timer('load_train_data', load_data, fname, use_tfidf)
    all_ingrs = ingrs
    if not use_tfidf:
        # flatten the per-recipe token lists so the LabelEncoder sees every token
        all_ingrs = [item for sublist in ingrs for item in sublist]
    cuisine_le = LabelEncoder()
    ingr_le = TfidfVectorizer() if use_tfidf else LabelEncoder()
    transformed_cuisines = labelled_timer('fit_cuisines_le', cuisine_le.fit_transform, cuisines)
    labelled_timer('fit_ingrs_le', ingr_le.fit_transform, all_ingrs)
    transformed_ingrs = labelled_timer('transform_ingrs', transform_le, ingr_le, ingrs)
    class_cnt = len(ingr_le.vocabulary_) if use_tfidf else len(ingr_le.classes_)
    print "expanded ingredients:{}".format(class_cnt)
    # the tf-idf path already yields a sparse feature matrix; the
    # label-encoder path still holds index lists and needs conversion
    cuisines_bool = transformed_ingrs
    if not use_tfidf:
        cuisines_bool = timer(to_boolean_matr, transformed_ingrs, len(ingr_le.classes_))
    return ids, transformed_cuisines, cuisine_le, cuisines_bool, ingr_le

def load_test(fname, ingr_le, use_tfidf):
    cuisines, ids, ingrs = labelled_timer('load_test_data', load_data, fname, use_tfidf)
    transformed_ingrs = labelled_timer('transform_ingrs', transform_le, ingr_le, ingrs)
    if use_tfidf:
        return ids, transformed_ingrs
    cuisines_bool = timer(to_boolean_matr, transformed_ingrs, len(ingr_le.classes_))
    return ids, cuisines_bool

def load_nn_model(feat_cnt):
    # neural net model architecture: two relu hidden layers with dropout,
    # then a 20-way softmax (one output per cuisine)
    mdl = Sequential()
    mdl.add(Dense(512, init='glorot_uniform', activation='relu',
                  input_shape=(feat_cnt,)))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(128, init='glorot_uniform', activation='relu'))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(20, activation='softmax'))
    mdl.compile(loss='categorical_crossentropy', optimizer='adadelta')
    return mdl

def predict(model, X, is_nn):
    if not is_nn:
        return model.predict(X)
    return kutils.probas_to_classes(model.predict(X))
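
# The Keras model outputs per-class probabilities; probas_to_classes (old
# Keras np_utils) collapses each row to its argmax, e.g.
#   [[0.1, 0.7, 0.2]] -> [1]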

def load_model():
    # Experiment log: each assignment below overrides the previous one, so
    # only the last model is returned; the score comments appear to record
    # accuracies observed in earlier runs.
    model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=10,
                          random_state=42, n_jobs=-1)
    # 77.4
    model = SGDClassifier(loss='modified_huber', penalty='elasticnet', alpha=1e-3,
                          n_iter=10, random_state=42, n_jobs=-1)
    # model = RandomForestClassifier(max_features=200)
    # max_depth = 5 vs 7 (each pair looks like train, test accuracy):
    #   20 iter  : 75.7, 71.4    79.2, 73.4
    #   50 iter  : 81.7, 75.4    85.8, 76.3
    #   100 iter : 86.4, 77.1    91.1, 77.4
    model = GradientBoostingClassifier(verbose=1, max_depth=7,
                                       min_samples_leaf=10, n_estimators=100)
    # one-vs-one
    # 96, 75
    model = SVC(verbose=True, kernel='linear', decision_function_shape='ovo')
    # 96, 75
    model = SVC(verbose=True, kernel='linear', decision_function_shape='ovr')
    # 95, 75
    model = LinearSVC(verbose=True)
    # 43, 42 :-o
    model = SVC(verbose=True, decision_function_shape='ovr')
    # 77.4
    model = LinearSVC(verbose=True)
    # model = GradientBoostingClassifier(verbose=1, max_depth=7,
    #                                    min_samples_leaf=10, n_estimators=100)
    return model

def main(is_nn, use_tfidf):
    train_ids, y_all, cuisine_le, X_all, ingr_le = timer(load_train, 'train.json', use_tfidf)
    split_handle = start_timer('train_test_split')
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.25, random_state=42)
    end_timer('train_test_split', split_handle)
    print "======Loading train data done======="
    test_ids, test_ingrs = timer(load_test, 'test.json', ingr_le, use_tfidf)
    print "======Loading test data done======="
    train_handle = start_timer('training')
    print "Sizes of train, test"
    print X_train.shape, len(y_train)
    print X_test.shape, len(y_test)
    model = load_model() if not is_nn else load_nn_model(X_train.shape[1])
    if is_nn:
        y_mod_train = kutils.to_categorical(y_train)
        print y_mod_train.shape
        model.fit(X_train.todense(), y_mod_train, nb_epoch=300, batch_size=4096,
                  show_accuracy=True)
        right = sum(1.0 for a, b in zip(predict(model, X_test.todense(), is_nn),
                                        y_test) if a == b)
        print "\nTest score:{}".format(right/len(y_test))
    else:
        # debug peek at a few rows (inverse_transform is meaningful for the
        # tf-idf vectorizer path)
        print ingr_le.inverse_transform(X_train[:5])
        print cuisine_le.inverse_transform(y_train[:5])
        cv = StratifiedKFold(y_train, n_folds=5, shuffle=True)
        # note: this discards the model chosen by load_model() and
        # cross-validates a default SVC instead (empty parameter grid)
        model = GridSearchCV(SVC(), {}, cv=cv, n_jobs=1, verbose=1)
        model.fit(X_train.todense(), y_train)
        print "Train score:{}".format(model.score(X_train.todense(), y_train))
        print "Test score:{}".format(model.score(X_test.todense(), y_test))
    end_timer('training', train_handle)
    outfile = open('results', 'w')
    predictions = cuisine_le.inverse_transform(
        predict(model, test_ingrs.todense(), is_nn))
    outfile.write('cuisine,id\n')
    for i in xrange(len(test_ids)):
        p = predictions[i]
        item_id = test_ids[i]
        outfile.write('{},{}\n'.format(p, item_id))
    outfile.close()
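
# The script defines main() but never invokes it; a minimal entry point
# (assumed here, since the gist does not show one; the script name and the
# flag words are illustrative) that picks the mode from the command line:
if __name__ == '__main__':
    # e.g. `python whats_cooking.py tfidf` or `python whats_cooking.py nn tfidf`
    main(is_nn='nn' in sys.argv, use_tfidf='tfidf' in sys.argv)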