#!/usr/bin/env python
# Text clustering with NLTK and scikit-learn, based on:
# http://bogdan-ivanov.com/recipe-text-clustering-using-nltk-and-scikit-learn/

import collections
import string
from datetime import datetime
from math import ceil, sqrt

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from data.feeds import feed
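# `data.feeds.feed` is a local helper that is not part of this gist. Judging
# from its use below, it presumably yields one dict per CSV row with at least
# 'created_at', 'name' and 'channel' keys, e.g. (hypothetical row):
#
#   {'created_at': '2014-06-01 12:00:00.000',
#    'name': 'UMEK - Behind The Iron Curtain 153',
#    'channel': 'soundcloud'}
#
# NLTK's tokenizer and stop-word data must be downloaded once beforehand:
#   nltk.download('punkt'); nltk.download('stopwords')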
def process_text(text, stem=True):
    """Tokenize text and stem words, removing punctuation."""
    # str.translate() needs a mapping table; the original passed
    # string.punctuation directly, which is effectively a no-op. Build a
    # table that deletes punctuation characters instead.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens
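# Rough usage example (exact stems depend on the NLTK version's Porter stemmer):
#   process_text("Clustering texts!") -> ['cluster', 'text']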
def extended_stopwords():
    """English stop words plus noise terms common in this music feed."""
    words = stopwords.words('english')
    words += ['umek', 'dj', 'mix', 'remix', 'promo', 'episode', 'club', 'live']
    return words
def cluster_texts(texts, clusters=3):
    """Transform texts to tf-idf coordinates and cluster them with k-means."""
    # max_df/min_df drop terms that appear in more than 70% or fewer than
    # 10% of the documents; use_idf defaults to True
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=extended_stopwords(),
                                 max_df=0.7,
                                 min_df=0.1,
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)

    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)

    # Group document indices by the cluster label k-means assigned them
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
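# Returns a mapping from cluster label to indices into `texts`, e.g.
# {0: [0, 2], 1: [1]} for three texts split across two clusters.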
if __name__ == "__main__":
    items = [x for x in feed("./data/umek_music_feed.csv")]

    # Prefix each title with its "YYYY-MM" month so publishing time also
    # contributes to the clustering
    item_names = ["%s %s" % (datetime.strptime(x['created_at'].split(".")[0],
                                               "%Y-%m-%d %H:%M:%S").strftime("%Y-%m"),
                             x['name'])
                  for x in items]

    # Rule-of-thumb starting point, k = sqrt(n / 2), overridden by a
    # hand-tuned value below (earlier runs also tried k = 60)
    k = int(ceil(sqrt(len(item_names) / 2)))
    k = 26
    clusters = cluster_texts(item_names, k)
    i = len(clusters)
    # Optionally keep growing k until the number of non-empty clusters
    # stops changing between rounds
    optimize_k = False
    if optimize_k:
        ks, cs = [], []
        while True:
            print("K=%d, Clusters=%d" % (k, i))
            k += 2
            clusters = cluster_texts(item_names, k)
            i = len(clusters)
            ks.append(k)
            cs.append(i)
            # Stop once the last two rounds produced the same cluster count
            if len(cs) > 2 and cs[-2] == cs[-1]:
                break
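    # Not in the original recipe: a more principled way to pick k would be to
    # score each candidate with sklearn.metrics.silhouette_score on the
    # tf-idf matrix and keep the k that scores highest.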
    # Group the original feed items by cluster and print each cluster newest-first
    clusters_dict = dict(clusters)
    grouped = [[items[idx] for idx in clusters_dict[label]] for label in clusters_dict]
    for cluster in grouped:
        for item in sorted(cluster, key=lambda p: datetime.strptime(p['created_at'].split(".")[0], "%Y-%m-%d %H:%M:%S"), reverse=True):
            print("%s %s %s" % (item['channel'], datetime.strptime(item['created_at'].split(".")[0], "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"), item['name']))