# bowbowbow/hashtag_baseline.py

## hashtag_baseline.py
import json
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Load the tweet dataset. The context manager closes the file handle
# deterministically (the bare open(...).read() left that to the garbage
# collector), and the explicit encoding keeps non-ASCII tweet text from
# depending on the platform's locale default.
with open('./dataset.json', encoding='utf-8') as dataset_file:
    json_data = dataset_file.read()
# Presumably a JSON array of tweet objects with 'text' and 'hashtags' keys --
# that is how the rest of the script accesses each element.
tweets = json.loads(json_data)

# Random hold-out split: every tweet draws an integer from 1..10 (inclusive)
# and goes to the test split only when the draw is 1, i.e. about one tweet
# in ten; all other draws send it to the training split.
tweets_train, tweets_test = [], []
for tweet in tweets:
    (tweets_test if random.randint(1, 10) <= 1 else tweets_train).append(tweet)

# Bag-of-words featurizer over lower-cased unigrams. The vocabulary is learned
# from the training tweets only, with every '#' character removed from the
# text first.
count_vec = CountVectorizer(ngram_range=(1, 1), lowercase=True).fit(
    [tweet['text'].replace('#', '') for tweet in tweets_train]
)

# Build one training example per (tweet, hashtag) pair: a tweet with several
# hashtags contributes its text once for each of them, and a tweet without
# hashtags contributes nothing. Hashtags are lower-cased so that case variants
# of one tag share a single class.
x_train, y_train = [], []
label_encoder = preprocessing.LabelEncoder()
for tweet in tweets_train:
    # Strip '#' exactly as the vocabulary fit and the test-set vectorization
    # do, so training and test features come from identically preprocessed
    # text (previously the raw text was used here only).
    stripped_text = tweet['text'].replace('#', '')
    for hashtag in tweet['hashtags']:
        x_train.append(stripped_text)
        y_train.append(hashtag.lower())

# Encode hashtag strings as integer class ids and texts as count vectors.
label_encoder.fit(y_train)
x_train = count_vec.transform(x_train)
y_train = label_encoder.transform(y_train)

# Fit a logistic-regression classifier with library-default settings on the
# encoded training examples.
cls = LogisticRegression().fit(x_train, y_train)

# Vectorize the held-out tweets (with '#' removed from the text) and obtain a
# per-class probability row for each of them.
x_test = count_vec.transform(
    [tweet['text'].replace('#', '') for tweet in tweets_test]
)
y_test_proba = cls.predict_proba(x_test)

# Mean Average Precision explanation:
# http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html
#
# For every held-out tweet, the TOP_K most probable hashtags are ranked and
# scored with average precision against the tweet's own hashtags; the mean
# over all scored tweets is printed.
TOP_K = 10
average_precision_sum = 0
evaluated_count = 0
for tweet, proba in zip(tweets_test, y_test_proba):
    # Distinct lower-cased ground-truth hashtags. Using a set keeps case
    # variants of one tag (e.g. 'AI' and 'ai') from counting as two relevant
    # items, which would make a full score unreachable.
    relevant = {hashtag.lower() for hashtag in tweet['hashtags']}
    if not relevant:
        # Nothing to retrieve: average precision is undefined for this tweet,
        # so skip it instead of dividing by zero.
        continue
    evaluated_count += 1

    # Column indices ordered by descending probability. sorted() is stable,
    # so tied probabilities keep ascending column order.
    ranked = sorted(range(len(proba)), key=lambda j: proba[j], reverse=True)[:TOP_K]
    # NOTE(review): treats column j as encoded label j; this relies on the
    # classifier having been trained on label_encoder's own output.
    ranked = label_encoder.inverse_transform(ranked)

    hit, precision_sum = 0, 0
    for rank, predicted in enumerate(ranked, start=1):
        if predicted in relevant:
            hit += 1
            # precision@rank, accumulated only at ranks that are hits
            precision_sum += hit / rank

        # Every relevant hashtag has been found; later ranks add nothing.
        if hit == len(relevant):
            break

    average_precision_sum += precision_sum / len(relevant)

# Guard against a test split that is empty or contains no hashtagged tweets.
mean_average_precision = average_precision_sum / evaluated_count if evaluated_count else 0.0
print('TOP K={}, Mean Average Precision : {}'.format(TOP_K, mean_average_precision))
	import json
	import random
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn import preprocessing

	# Load the tweet dataset. The context manager closes the file handle
	# deterministically (the bare open(...).read() left that to the garbage
	# collector), and the explicit encoding keeps non-ASCII tweet text from
	# depending on the platform's locale default.
	with open('./dataset.json', encoding='utf-8') as dataset_file:
		json_data = dataset_file.read()
	# Presumably a JSON array of tweet objects with 'text' and 'hashtags' keys --
	# that is how the rest of the script accesses each element.
	tweets = json.loads(json_data)

	# Random hold-out split: every tweet draws an integer from 1..10 (inclusive)
	# and goes to the test split only when the draw is 1, i.e. about one tweet
	# in ten; all other draws send it to the training split.
	tweets_train, tweets_test = [], []
	for tweet in tweets:
		if random.randint(1, 10) <= 1:
			tweets_test.append(tweet)
		else:
			tweets_train.append(tweet)

	# Bag-of-words featurizer over lower-cased unigrams. The vocabulary is learned
	# from the training tweets only, with every '#' character removed from the
	# text first.
	count_vec = CountVectorizer(ngram_range=(1, 1), lowercase=True).fit(
		[tweet['text'].replace('#', '') for tweet in tweets_train]
	)

	# Build one training example per (tweet, hashtag) pair: a tweet with several
	# hashtags contributes its text once for each of them, and a tweet without
	# hashtags contributes nothing. Hashtags are lower-cased so that case variants
	# of one tag share a single class.
	x_train, y_train = [], []
	label_encoder = preprocessing.LabelEncoder()
	for tweet in tweets_train:
		# Strip '#' exactly as the vocabulary fit and the test-set vectorization
		# do, so training and test features come from identically preprocessed
		# text (previously the raw text was used here only).
		stripped_text = tweet['text'].replace('#', '')
		for hashtag in tweet['hashtags']:
			x_train.append(stripped_text)
			y_train.append(hashtag.lower())

	# Encode hashtag strings as integer class ids and texts as count vectors.
	label_encoder.fit(y_train)
	x_train = count_vec.transform(x_train)
	y_train = label_encoder.transform(y_train)

	# Fit a logistic-regression classifier with library-default settings on the
	# encoded training examples.
	cls = LogisticRegression().fit(x_train, y_train)

	# Vectorize the held-out tweets (with '#' removed from the text) and obtain a
	# per-class probability row for each of them.
	x_test = count_vec.transform(
		[tweet['text'].replace('#', '') for tweet in tweets_test]
	)
	y_test_proba = cls.predict_proba(x_test)

	# Mean Average Precision explanation:
	# http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html
	#
	# For every held-out tweet, the TOP_K most probable hashtags are ranked and
	# scored with average precision against the tweet's own hashtags; the mean
	# over all scored tweets is printed.
	TOP_K = 10
	average_precision_sum = 0
	evaluated_count = 0
	for tweet, proba in zip(tweets_test, y_test_proba):
		# Distinct lower-cased ground-truth hashtags. Using a set keeps case
		# variants of one tag (e.g. 'AI' and 'ai') from counting as two relevant
		# items, which would make a full score unreachable.
		relevant = {hashtag.lower() for hashtag in tweet['hashtags']}
		if not relevant:
			# Nothing to retrieve: average precision is undefined for this
			# tweet, so skip it instead of dividing by zero.
			continue
		evaluated_count += 1

		# Column indices ordered by descending probability. sorted() is stable,
		# so tied probabilities keep ascending column order.
		ranked = sorted(range(len(proba)), key=lambda j: proba[j], reverse=True)[:TOP_K]
		# NOTE(review): treats column j as encoded label j; this relies on the
		# classifier having been trained on label_encoder's own output.
		ranked = label_encoder.inverse_transform(ranked)

		hit, precision_sum = 0, 0
		for rank, predicted in enumerate(ranked, start=1):
			if predicted in relevant:
				hit += 1
				# precision@rank, accumulated only at ranks that are hits
				precision_sum += hit / rank

			# Every relevant hashtag has been found; later ranks add nothing.
			if hit == len(relevant):
				break

		average_precision_sum += precision_sum / len(relevant)

	# Guard against a test split that is empty or contains no hashtagged tweets.
	mean_average_precision = average_precision_sum / evaluated_count if evaluated_count else 0.0
	print('TOP K={}, Mean Average Precision : {}'.format(TOP_K, mean_average_precision))