Skip to content

Instantly share code, notes, and snippets.

@glamp
Last active December 11, 2015 07:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save glamp/4565505 to your computer and use it in GitHub Desktop.
Save glamp/4565505 to your computer and use it in GitHub Desktop.
from sklearn.cluster import AffinityPropagation, KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import odcdata
import pprint as pp
# Fetch a sample of upper-cased transaction descriptions; each returned row is
# indexed by the 'description' column alias used in the query.
q = "select UPPER(description) as description from odc_bank_transaction limit 1000"
data = odcdata.read_gp_as_dict(q)
descriptions = [row['description'] for row in data]

# Turn the raw description strings into a TF-IDF feature matrix.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(descriptions)

# Clustering algorithms to compare, keyed by the name shown in the output.
clusterers = {
    "affinity": AffinityPropagation(),
    "kmeans": KMeans(n_clusters=50),
    "mini-batch-kmeans": MiniBatchKMeans(n_clusters=50),
}

# `raw_input` only exists on Python 2; fall back to `input` on Python 3 so the
# interactive pauses below work on either interpreter.
try:
    prompt = raw_input
except NameError:
    prompt = input

for name, clusterer in clusterers.items():
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("*" * 80)
    print("fitting %s" % name)
    clusterer.fit(X)

    # Group the descriptions by the cluster label assigned to each one;
    # labels_ is aligned positionally with the rows of X (and so with
    # `descriptions`).
    clusters = {}
    for desc, label in zip(descriptions, clusterer.labels_):
        clusters.setdefault(label, []).append(desc)

    # Pause before and after dumping so the (potentially long) output of each
    # algorithm can be inspected on its own.
    prompt("ready to print results for %s?" % name)
    pp.pprint(clusters)
    prompt("ready for next clusterer?")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment