find_diverse.py
'''
Works on the current directory:
make sure the .txt data files are there,
and redirect the results to a file if needed.
'''
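# Usage sketch (assumed invocation, not part of the original gist;
# stdout is the only output, so redirect it):
#   python find_diverse.py > results.txt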
import os
import codecs
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def include_ext(ext):
    '''Return a predicate that tests a file name for the given extension.'''
    def compare(fn):
        return os.path.splitext(fn)[1] == ext
    return compare
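# Example (hypothetical file names): include_ext(".txt") builds a predicate, so
#   include_ext(".txt")("notes.txt")  -> True
#   include_ext(".txt")("image.png")  -> False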
def search_for_max(counts):
    '''Return the key with the largest value (None for an empty dict).'''
    best_count = 0
    best_key = None
    for key in counts.keys():
        if counts[key] > best_count:
            best_key = key
            best_count = counts[key]
    return best_key
# Quick sanity test of TfidfVectorizer (kept commented out):
# vect = TfidfVectorizer(min_df=1)
# tfidf = vect.fit_transform(["I'd like an apple",
#                             "An apple a day keeps the doctor away",
#                             "Never compare an apple to an orange",
#                             "I prefer scikit-learn to Orange"])
# print (tfidf * tfidf.T).A
text_files = filter(os.path.isfile, os.listdir(os.curdir))  # regular files only
text_files = filter(include_ext(".txt"), text_files)        # keep only .txt files
print text_files

# Read every document, silently dropping bytes that are not valid UTF-8.
documents = [codecs.open(f, 'r', encoding='utf-8', errors='ignore').read()
             for f in text_files]

tfidf = TfidfVectorizer(min_df=1).fit_transform(documents)
# No need to normalize: TfidfVectorizer returns l2-normalized rows,
# so tfidf * tfidf.T is already the cosine-similarity matrix.
cosine_sim = (tfidf * tfidf.T).toarray()
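# Sanity check (an added sketch, not in the original gist): sklearn's own
# cosine_similarity should reproduce tfidf * tfidf.T, because the vectorizer
# l2-normalizes each row by default.
from sklearn.metrics.pairwise import cosine_similarity
assert np.allclose(cosine_sim, cosine_similarity(tfidf))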
ms = []
for i in range(0, len(text_files)):
    # all document indices, sorted by similarity to document i,
    # most similar first (index i itself leads with self-similarity 1.0)
    most_similar = np.argsort(cosine_sim[:, i])[::-1]
    print most_similar
    ms.append(most_similar)
# candidates = ms[:100]  # or however many you desire
counts = dict()
results = []
for y in reversed(range(1, len(ms))):  # columns: rank positions, least similar first
    for x in range(0, len(ms)):        # rows: one ranking per document
        counts[str(ms[x][y])] = counts[str(ms[x][y])] + 1 if str(ms[x][y]) in counts else 1
    # the index voted least similar most often wins this column
    results.append(search_for_max(counts))
    counts = dict()
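# Worked example (hypothetical data, not from the gist): with rankings
#   ms = [[0, 2, 1], [1, 2, 0], [2, 0, 1]]
# the last column (each document's least similar neighbour) votes {1: 2, 0: 1},
# so document 1 wins that round and is appended to results first.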
print results[0:1999]  # results preserves insertion (vote) order
zz = [text_files[int(x)] for x in results[0:1999]]  # map indices back to file names
print zz
print set(zz)  # distinct "most diverse" files