Skip to content

Instantly share code, notes, and snippets.

@ashish01
Created December 21, 2015 03:10
Show Gist options
  • Save ashish01/3d1621848ab922191086 to your computer and use it in GitHub Desktop.
Save ashish01/3d1621848ab922191086 to your computer and use it in GitHub Desktop.
import json
import argparse
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
# Command-line interface. All three paths are needed further down, so argparse
# is asked to enforce them; otherwise a forgotten flag only surfaces later as
# an opaque ``open(None)`` TypeError.
parser = argparse.ArgumentParser(
    description='Train a logistic-regression cuisine classifier on '
                'ingredient n-grams and write predictions for a test set.')
parser.add_argument('--train', required=True,
                    help='training data file, one JSON object per line')
parser.add_argument('--test', required=True,
                    help='test data file, one JSON object per line')
parser.add_argument('--pred', required=True,
                    help='output file; receives one "id,cuisine" line per '
                         'test record')
args = parser.parse_args()

# Word tokenizer using CountVectorizer's default settings, built once and
# reused for every ingredient string.
tokenizer = CountVectorizer().build_tokenizer()
def gen_ngrams(x, order=1):
    """Yield every contiguous n-gram of *x* for n = 1 .. *order*.

    All 1-grams are produced first (left to right), then all 2-grams, and so
    on. Each n-gram is a slice of *x*, so it has the same sequence type as
    *x* (a list in, lists out). Sizes longer than ``len(x)`` contribute
    nothing, and an empty *x* or a non-positive *order* yields nothing.
    """
    for size in range(1, order + 1):
        # len(x) - size + 1 start positions fit a window of this size; a
        # non-positive count makes range() empty, so short inputs are safe.
        for start in range(len(x) - size + 1):
            yield x[start:start + size]
def ingredient_bag_of_words(ingredients):
    """Yield unigram and bigram feature strings for each ingredient.

    Each ingredient string is lower-cased, has '-' replaced by '_', and is
    split with the module-level ``tokenizer``. Every 1- and 2-token n-gram is
    then yielded as a single space-joined string. N-grams never span two
    different ingredients.
    """
    for ingredient in ingredients:
        # '_' counts as a word character for the tokenizer's default pattern,
        # so replacing '-' keeps a hyphenated word together as one token.
        tokens = tokenizer(ingredient.lower().replace('-', '_'))
        for ngram in gen_ngrams(tokens, 2):
            yield ' '.join(ngram)
def load(filename):
    """Read a JSON-lines recipe file into a list of records.

    Every non-blank line of *filename* must be a JSON object with an
    'ingredients' list and an 'id'; a 'cuisine' key is optional.

    Returns a list of ``(cuisine, features, id)`` tuples, in file order, where
    ``cuisine`` is ``None`` when the record has no 'cuisine' key and
    ``features`` maps each n-gram from ``ingredient_bag_of_words`` to 1.

    Raises ``json.JSONDecodeError`` on a malformed line and ``KeyError`` when
    'ingredients' or 'id' is missing.
    """
    D = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing inside json.loads.
                continue
            data = json.loads(line)
            # Binary presence features: repeated n-grams collapse to one key.
            features = {x: 1 for x in ingredient_bag_of_words(data['ingredients'])}
            D.append((data.get('cuisine'), features, data['id']))
    return D
# Each record is a (cuisine, features, id) tuple as produced by load().
D_train = load(args.train)
D_test = load(args.test)

# Turn the feature dicts into a sparse matrix and the cuisine names into
# integer class labels. Both encoders are fitted on the training data only.
featureizer = DictVectorizer()
labelizer = LabelEncoder()
X = featureizer.fit_transform([features for _, features, _ in D_train])
Y = labelizer.fit_transform([cuisine for cuisine, _, _ in D_train])
# transform() reuses the training vocabulary; n-grams that occur only in the
# test set are ignored.
X_test = featureizer.transform([features for _, features, _ in D_test])

clf = LogisticRegression()
clf.fit(X, Y)

# Predict the test set and map the integer classes back to cuisine names.
Y_test_pred = clf.predict(X_test)
Y_test_labels = labelizer.inverse_transform(Y_test_pred)

# One "id,cuisine" line per test record, in input order.
with open(args.pred, 'w') as f:
    for (_, _, record_id), label in zip(D_test, Y_test_labels):
        f.write('{},{}\n'.format(record_id, label))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment