Skip to content

Instantly share code, notes, and snippets.

@Honga1
Created July 21, 2018 14:44
Show Gist options
  • Save Honga1/25e1bb8f73b912e7be79adb493ca6f4a to your computer and use it in GitHub Desktop.
Save Honga1/25e1bb8f73b912e7be79adb493ca6f4a to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import csv
import json
import sys

from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

# Load the caption dump.  The context manager closes the file even when
# json.load() raises on malformed input.
with open('mscoco.json') as f:
    data = json.load(f)

# Build one text document per image: every caption is stemmed word by word,
# re-joined with spaces, and all captions of the image are concatenated.
# NOTE(review): assumes each data['val'] entry keeps its captions at index 1
# as a list of token lists -- confirm against the mscoco.json layout.
corpus = []
for image_desc in data['val']:
    tot_phrase = [
        ' '.join(stemmer.stem(word) for word in image_sentence)
        for image_sentence in image_desc[1]
    ]
    # Captions are joined with a space.  With an empty separator the last
    # word of one caption fuses with the first word of the next
    # ("horse" + "a" -> "horsea"), which plants bogus terms in the vocabulary.
    corpus.append(' '.join(tot_phrase))
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model over uni-, bi- and tri-grams of the first 4000 documents.
# min_df=1 keeps exactly the same terms as the former min_df=0 (a vocabulary
# term always occurs in at least one document) but is also accepted by
# scikit-learn releases that validate parameters and reject an integer
# min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1,
                     stop_words='english')
corpus = corpus[0:4000]
tfidf_matrix = tf.fit_transform(corpus)

# Newer scikit-learn only offers get_feature_names_out() (returns an array);
# older releases only offer get_feature_names() (returns a list).  Either way
# feature_names ends up as a plain list indexed by feature id.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(len(feature_names))
print(feature_names[50:70])
print(tfidf_matrix)

# Collect the (feature id, score) pairs of the first document straight from
# the CSR storage.  Calling todense() here would allocate
# documents x vocabulary floats only to read a single row.
csr_scores = tfidf_matrix.tocsr()
row_start, row_end = csr_scores.indptr[0], csr_scores.indptr[1]
phrase_scores = [
    (word_id, score)
    for word_id, score in sorted(zip(
        csr_scores.indices[row_start:row_end].tolist(),
        csr_scores.data[row_start:row_end].tolist()))
    if score > 0
]
print(len(phrase_scores))

# Highest score first; the sort is stable, so ties stay in ascending
# feature-id order.
sorted_phrase_scores = sorted(phrase_scores, key=lambda pair: -pair[1])
print(sorted_phrase_scores[:5])

# Show the 20 best-scoring phrases of the first document.
for word_id, score in sorted_phrase_scores[:20]:
    print('{0: <20} {1}'.format(feature_names[word_id], score))
# Export every non-zero (document, phrase, score) triple to out.csv.
#
# The csv module wants a binary file and byte strings on Python 2, but a text
# file opened with newline='' and text strings on Python 3.
_is_py2 = sys.version_info[0] == 2
if _is_py2:
    out_file = open('out.csv', 'wb')
else:
    out_file = open('out.csv', 'w', newline='', encoding='utf-8')

with out_file:
    writer = csv.writer(out_file, delimiter=',')
    writer.writerow(['Image', 'Phrase', 'Score'])

    # Walk the CSR storage row by row so that only stored entries are
    # visited.  Densifying the matrix would allocate documents x vocabulary
    # floats and loop over every vocabulary slot of every document.
    csr_rows = tfidf_matrix.tocsr()
    for doc_id in range(csr_rows.shape[0]):
        print('Document %d' % doc_id)
        row_start = csr_rows.indptr[doc_id]
        row_end = csr_rows.indptr[doc_id + 1]
        # sorted() keeps the rows in ascending feature-id order, matching a
        # left-to-right scan of the dense row.
        for word_id, score in sorted(zip(
                csr_rows.indices[row_start:row_end].tolist(),
                csr_rows.data[row_start:row_end].tolist())):
            if score > 0:
                word = feature_names[word_id]
                if _is_py2:
                    word = word.encode('utf-8')
                # The Image column is 1-based, the progress output 0-based.
                writer.writerow([doc_id + 1, word, score])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment