Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Last active June 16, 2018 17:16
Show Gist options
  • Save bowbowbow/f990f3e9dc3e124e6646defab81a0442 to your computer and use it in GitHub Desktop.
Save bowbowbow/f990f3e9dc3e124e6646defab81a0442 to your computer and use it in GitHub Desktop.
TOP K=10, Mean Average Precision : 0.87821
import json
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
# Load the whole tweet collection. The context manager guarantees the file
# handle is closed, and the explicit encoding keeps non-ASCII tweet text
# intact regardless of the platform's default locale.
with open('./dataset.json', encoding='utf-8') as dataset_file:
    json_data = dataset_file.read()
tweets = json.loads(json_data)

# Randomly hold out tweets for evaluation: randint(1, 10) <= 1 is true for
# one of the ten possible draws, so about 10% of tweets land in the test set.
tweets_train, tweets_test = [], []
for tweet in tweets:
    if random.randint(1, 10) <= 1:
        tweets_test.append(tweet)
    else:
        tweets_train.append(tweet)
# Bag-of-words features over lowercase unigrams. The vocabulary is learned
# from the training tweets with every '#' character removed.
count_vec = CountVectorizer(lowercase=True, ngram_range=(1, 1))
count_vec.fit([tweet['text'].replace('#', '') for tweet in tweets_train])

# Build one (text, hashtag) training example per hashtag attached to a tweet,
# so a tweet with several hashtags appears once for each of them. Tweets
# without hashtags contribute to the vocabulary above but not to training.
x_train, y_train = [], []
label_encoder = preprocessing.LabelEncoder()
for tweet in tweets_train:
    # Strip '#' exactly as is done when fitting the vocabulary and when
    # vectorizing the test tweets; otherwise a string such as "#foo#bar"
    # tokenizes differently during training than at fit/test time.
    text = tweet['text'].replace('#', '')
    for hashtag in tweet['hashtags']:
        x_train.append(text)
        # Labels are case-insensitive: "#Python" and "#python" are one class.
        y_train.append(hashtag.lower())

# Map hashtag strings to integer class ids and texts to count vectors.
label_encoder.fit(y_train)
x_train = count_vec.transform(x_train)
y_train = label_encoder.transform(y_train)

# Multiclass classifier over hashtags, using scikit-learn's default settings.
cls = LogisticRegression()
cls.fit(x_train, y_train)
# Vectorize the held-out tweets, removing every '#' from the text first.
x_test = count_vec.transform(
    [entry['text'].replace('#', '') for entry in tweets_test]
)
# Class-probability estimates for each test tweet, as returned by the fitted
# classifier's predict_proba.
y_test_proba = cls.predict_proba(x_test)
# Mean Average Precision explanation:
# http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html
TOP_K = 10
average_precision_sum = 0
evaluated = 0  # number of test tweets that have at least one hashtag
for tweet, probabilities in zip(tweets_test, y_test_proba):
    # Ground-truth hashtags, compared case-insensitively. Using a set means a
    # hashtag repeated in the tweet (or differing only in case) counts once,
    # so the early exit and the normalization below use the true number of
    # distinct relevant items.
    relevant = {hashtag.lower() for hashtag in tweet['hashtags']}
    if not relevant:
        # Average precision is undefined when there is nothing to retrieve;
        # skipping the tweet avoids a ZeroDivisionError.
        continue
    evaluated += 1

    # Column indices ordered by descending probability; sorted() is stable,
    # so equal probabilities keep ascending index order. The indices are
    # passed to the label encoder as encoded class ids, which relies on the
    # classifier having been trained on the encoder's output.
    top_indices = sorted(
        range(len(probabilities)),
        key=lambda j: probabilities[j],
        reverse=True,
    )[:TOP_K]
    ranked = label_encoder.inverse_transform(top_indices)

    # Sum precision@rank at every rank where a relevant hashtag is retrieved.
    hit, precision_sum = 0, 0
    for rank, predicted in enumerate(ranked, start=1):
        if predicted in relevant:
            hit += 1
            precision_sum += hit / rank
            if hit == len(relevant):
                # Every relevant hashtag has been found; later ranks add nothing.
                break
    average_precision_sum += precision_sum / len(relevant)

# Guard against an empty test split (or one with no hashtagged tweets).
mean_average_precision = average_precision_sum / evaluated if evaluated else 0.0
print('TOP K={}, Mean Average Precision : {}'.format(TOP_K, mean_average_precision))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment