Honga1/tf–idf

## tf–idf
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Compute tf-idf phrase scores for the stemmed captions in mscoco.json.

The script loads ``mscoco.json``, builds one text document per entry of its
``'val'`` list by stemming every caption word, fits a 1- to 3-gram tf-idf
model over the first ``MAX_DOCUMENTS`` documents, prints a short summary for
the first document and writes every positive (image, phrase, score) triple
to ``out.csv``.

NOTE(review): each ``'val'`` entry is assumed to hold, at index 1, a list of
captions where every caption is a list of word strings -- confirm against
the data file.

This is Python 2 code (byte-encoded phrases are handed to ``csv``).
"""
import csv
import json

from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Only this many leading entries of the 'val' list are analysed.
MAX_DOCUMENTS = 4000

stemmer = LancasterStemmer()


def _stemmed_document(sentences):
    """Return all captions of one entry as one space-separated stem string.

    ``sentences`` is an iterable of captions, each an iterable of words.
    """
    # Captions are joined with a space too: joining them with '' would glue
    # the last word of one caption onto the first word of the next and feed
    # a non-existent token to the vectorizer.
    return ' '.join(' '.join(stemmer.stem(word) for word in sentence)
                    for sentence in sentences)


def _positive_scores(matrix, doc_index):
    """Return ``(word_id, score)`` pairs with score > 0 for one matrix row.

    The pairs are ordered by ascending ``word_id``.  Only the requested row
    of the sparse matrix is touched, so the full documents-by-vocabulary
    matrix never has to be materialised as a dense array.
    """
    row = matrix.getrow(doc_index)
    # tolist() turns numpy scalars into plain int/float values.
    pairs = zip(row.indices.tolist(), row.data.tolist())
    return sorted(pair for pair in pairs if pair[1] > 0)


with open('mscoco.json') as f:
    data = json.load(f)

# Slice before stemming so that discarded entries are never processed.
corpus = [_stemmed_document(image_desc[1])
          for image_desc in data['val'][:MAX_DOCUMENTS]]

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()
print(len(feature_names))
print(feature_names[50:70])
print(tfidf_matrix)

# Summary for the first document: number of scored phrases, then the best.
phrase_scores = _positive_scores(tfidf_matrix, 0)
print(len(phrase_scores))

# sorted() is stable, so equal scores stay in ascending word_id order.
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: -t[1])
print(sorted_phrase_scores[:5])
for word_id, score in sorted_phrase_scores[:20]:
    print('{0: <20} {1}'.format(feature_names[word_id], score))

# Python 2's csv module expects a file opened in binary mode; text mode
# produces '\r\r\n' line endings on Windows.
with open('out.csv', 'wb') as out_file:
    writer = csv.writer(out_file, delimiter=',')
    writer.writerow(['Image', 'Phrase', 'Score'])

    for doc_id in range(tfidf_matrix.shape[0]):
        print('Document %d' % doc_id)
        for word_id, score in _positive_scores(tfidf_matrix, doc_id):
            # Image numbers in the CSV are 1-based.
            writer.writerow([doc_id + 1,
                             feature_names[word_id].encode('utf-8'),
                             score])
	#!/usr/bin/python
	# -*- coding: utf-8 -*-
	"""Compute tf-idf phrase scores for the stemmed captions in mscoco.json.

	The script loads ``mscoco.json``, builds one text document per entry of
	its ``'val'`` list by stemming every caption word, fits a 1- to 3-gram
	tf-idf model over the first ``MAX_DOCUMENTS`` documents, prints a short
	summary for the first document and writes every positive
	(image, phrase, score) triple to ``out.csv``.

	NOTE(review): each ``'val'`` entry is assumed to hold, at index 1, a list
	of captions where every caption is a list of word strings -- confirm
	against the data file.

	This is Python 2 code (byte-encoded phrases are handed to ``csv``).
	"""
	import csv
	import json

	from nltk.stem.lancaster import LancasterStemmer
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Only this many leading entries of the 'val' list are analysed.
	MAX_DOCUMENTS = 4000

	stemmer = LancasterStemmer()


	def _stemmed_document(sentences):
	    """Return all captions of one entry as one space-separated stem string.

	    ``sentences`` is an iterable of captions, each an iterable of words.
	    """
	    # Captions are joined with a space too: joining them with '' would
	    # glue the last word of one caption onto the first word of the next
	    # and feed a non-existent token to the vectorizer.
	    return ' '.join(' '.join(stemmer.stem(word) for word in sentence)
	                    for sentence in sentences)


	def _positive_scores(matrix, doc_index):
	    """Return ``(word_id, score)`` pairs with score > 0 for one matrix row.

	    The pairs are ordered by ascending ``word_id``.  Only the requested
	    row of the sparse matrix is touched, so the full
	    documents-by-vocabulary matrix never has to be materialised as a
	    dense array.
	    """
	    row = matrix.getrow(doc_index)
	    # tolist() turns numpy scalars into plain int/float values.
	    pairs = zip(row.indices.tolist(), row.data.tolist())
	    return sorted(pair for pair in pairs if pair[1] > 0)


	with open('mscoco.json') as f:
	    data = json.load(f)

	# Slice before stemming so that discarded entries are never processed.
	corpus = [_stemmed_document(image_desc[1])
	          for image_desc in data['val'][:MAX_DOCUMENTS]]

	tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,
	                     stop_words='english')
	tfidf_matrix = tf.fit_transform(corpus)
	feature_names = tf.get_feature_names()
	print(len(feature_names))
	print(feature_names[50:70])
	print(tfidf_matrix)

	# Summary for the first document: number of scored phrases, then the best.
	phrase_scores = _positive_scores(tfidf_matrix, 0)
	print(len(phrase_scores))

	# sorted() is stable, so equal scores stay in ascending word_id order.
	sorted_phrase_scores = sorted(phrase_scores, key=lambda t: -t[1])
	print(sorted_phrase_scores[:5])
	for word_id, score in sorted_phrase_scores[:20]:
	    print('{0: <20} {1}'.format(feature_names[word_id], score))

	# Python 2's csv module expects a file opened in binary mode; text mode
	# produces '\r\r\n' line endings on Windows.
	with open('out.csv', 'wb') as out_file:
	    writer = csv.writer(out_file, delimiter=',')
	    writer.writerow(['Image', 'Phrase', 'Score'])

	    for doc_id in range(tfidf_matrix.shape[0]):
	        print('Document %d' % doc_id)
	        for word_id, score in _positive_scores(tfidf_matrix, doc_id):
	            # Image numbers in the CSV are 1-based.
	            writer.writerow([doc_id + 1,
	                             feature_names[word_id].encode('utf-8'),
	                             score])