rajacsp/gist:8c265676f2c6a0268d4452a47407f96c

## gistfile1.txt
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# the above line is to avoid 'SyntaxError: Non-UTF-8 code starting with' error

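'''
Score a new tweet against a small set of recent tweets using gensim's
TF-IDF model and similarity index, and print one percentage per tweet.

Created on: (date not recorded)

@author: raja.raman

Tweet source:
    http://www.cnn.com/interactive/2017/politics/trump-tweets/

'''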
import gensim
from nltk.tokenize import word_tokenize


# Reference documents: main() tokenizes these and builds the dictionary,
# bag-of-words corpus and TF-IDF model from them.
recent_tweets = [
    "China is actually placing propaganda ads in the Des Moines Register and other papers, made to look like news. That’s because we are beating them on Trade, opening markets, and the farmers will make a fortune when this is over!",
    "Avenatti is a third rate lawyer who is good at making false accusations, like he did on me and like he is now doing on Judge Brett Kavanaugh. He is just looking for attention and doesn’t want people to look at his past record and relationships - a total low-life!",
    "Consumer confidence hits an 18 year high, close to breaking the all-time record. A big jump from last 8 years. People are excited about the USA again! We are getting Bigger and Richer and Stronger. WAY MORE TO GO!",
    "Consumer confidence rose in September, notching its highest level in about 18 years. The Consumer Board's index rose to 138.4 this month from 134.7 in August",
    "Will be speaking at the United Nations this morning. Our country is much stronger and much richer than it was when I took office less than two years ago. We are also MUCH safer!",
]

# Query document: main() scores this text against every entry above.
new_tweet = (
    "Despite requests, I have no plans to meet Iranian President Hassan Rouhani. Maybe someday in the future. I am sure he is an absolutely lovely man!"
)

def main():
    """Print how similar ``new_tweet`` is to each entry of ``recent_tweets``.

    Steps:
      1. Lower-case and tokenize every reference tweet with ``word_tokenize``.
      2. Build a gensim ``Dictionary`` and a bag-of-words corpus from the tokens.
      3. Fit a ``TfidfModel`` on that corpus and index the TF-IDF vectors
         with ``gensim.similarities.Similarity``.
      4. Convert ``new_tweet`` the same way and query the index.

    Output (stdout): the raw tweet list, the document count, the dictionary
    size, then one line per reference tweet of the form ``<index> ==> <pct>%``
    where ``pct`` is the similarity score times 100 with two decimals.

    Returns:
        None.
    """
    print(recent_tweets)

    print("Number of documents:", len(recent_tweets))

    # One lower-cased token list per reference tweet.
    gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in recent_tweets]

    dictionary = gensim.corpora.Dictionary(gen_docs)
    print("Number of words in dictionary:", len(dictionary))

    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    tf_idf = gensim.models.TfidfModel(corpus)

    # Passing None as the shard prefix lets gensim pick a random file in the
    # system temp directory, so the script no longer depends on a hard-coded
    # 'c:/test/' location being present and writable.
    sims = gensim.similarities.Similarity(None, tf_idf[corpus],
                                          num_features=len(dictionary))

    # The query goes through the same pipeline as the reference tweets:
    # tokens -> bag-of-words (via the existing dictionary) -> TF-IDF weights.
    query_doc = [w.lower() for w in word_tokenize(new_tweet)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]

    similarity = sims[query_doc_tf_idf]

    # One score per reference tweet, in the order of recent_tweets.
    for counter, x in enumerate(similarity):
        print("%d ==> %.2f%%" % (counter, x * 100))


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()