#!/usr/bin/env python
# Text clustering with NLTK and scikit-learn, based on:
# http://bogdan-ivanov.com/recipe-text-clustering-using-nltk-and-scikit-learn/

import collections
import string
from datetime import datetime
from math import ceil, sqrt

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from data.feeds import feed
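# `data.feeds.feed` is a local helper that is not part of this gist. Judging
# from its use below, it presumably yields one dict per CSV row with at least
# 'created_at', 'name' and 'channel' keys, e.g. (hypothetical row):
#
#   {'created_at': '2014-06-01 12:00:00.000',
#    'name': 'UMEK - Behind The Iron Curtain 153',
#    'channel': 'soundcloud'}
#
# NLTK's tokenizer and stop-word data must be downloaded once beforehand:
#   nltk.download('punkt'); nltk.download('stopwords')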
def process_text(text, stem=True):
    """Tokenize text and stem words, removing punctuation."""
    # str.translate() needs a mapping table; the original passed
    # string.punctuation directly, which is effectively a no-op. Build a
    # table that deletes punctuation characters instead.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
    return tokens
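# Rough usage example (exact stems depend on the NLTK version's Porter stemmer):
#   process_text("Clustering texts!") -> ['cluster', 'text']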
def extended_stopwords():
    """English stop words plus noise terms common in this music feed."""
    words = stopwords.words('english')
    words += ['umek', 'dj', 'mix', 'remix', 'promo', 'episode', 'club', 'live']
    return words
def cluster_texts(texts, clusters=3):
    """Transform texts to tf-idf coordinates and cluster them with k-means."""
    # max_df/min_df drop terms that appear in more than 70% or fewer than
    # 10% of the documents; use_idf defaults to True
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=extended_stopwords(),
                                 max_df=0.7,
                                 min_df=0.1,
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)

    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)

    # Group document indices by the cluster label k-means assigned them
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
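# Returns a mapping from cluster label to indices into `texts`, e.g.
# {0: [0, 2], 1: [1]} for three texts split across two clusters.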
if __name__ == "__main__":
    items = [x for x in feed("./data/umek_music_feed.csv")]

    # Prefix each title with its "YYYY-MM" month so publishing time also
    # contributes to the clustering
    item_names = ["%s %s" % (datetime.strptime(x['created_at'].split(".")[0],
                                               "%Y-%m-%d %H:%M:%S").strftime("%Y-%m"),
                             x['name'])
                  for x in items]

    # Rule-of-thumb starting point, k = sqrt(n / 2), overridden by a
    # hand-tuned value below (earlier runs also tried k = 60)
    k = int(ceil(sqrt(len(item_names) / 2)))
    k = 26
    clusters = cluster_texts(item_names, k)
    i = len(clusters)
    # Optionally keep growing k until the number of non-empty clusters
    # stops changing between rounds
    optimize_k = False
    if optimize_k:
        ks, cs = [], []
        while True:
            print("K=%d, Clusters=%d" % (k, i))
            k += 2
            clusters = cluster_texts(item_names, k)
            i = len(clusters)
            ks.append(k)
            cs.append(i)
            # Stop once the last two rounds produced the same cluster count
            if len(cs) > 2 and cs[-2] == cs[-1]:
                break
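    # Not in the original recipe: a more principled way to pick k would be to
    # score each candidate with sklearn.metrics.silhouette_score on the
    # tf-idf matrix and keep the k that scores highest.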
    # Group the original feed items by cluster and print each cluster newest-first
    clusters_dict = dict(clusters)
    grouped = [[items[idx] for idx in clusters_dict[label]] for label in clusters_dict]
    for cluster in grouped:
        for item in sorted(cluster, key=lambda p: datetime.strptime(p['created_at'].split(".")[0], "%Y-%m-%d %H:%M:%S"), reverse=True):
            print("%s %s %s" % (item['channel'], datetime.strptime(item['created_at'].split(".")[0], "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"), item['name']))