Skip to content

Instantly share code, notes, and snippets.

@ysnrkdm
Created September 28, 2014 07:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ysnrkdm/701051ab608ace5b09cf to your computer and use it in GitHub Desktop.
Save ysnrkdm/701051ab608ace5b09cf to your computer and use it in GitHub Desktop.
scikit-learn example
# coding=utf-8
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from igo.tagger import Tagger
import numpy
import logging
import time
__author__ = 'yoshinori'
# Taggers keyed by dictionary directory. Building a Tagger is repeated work
# that does not depend on the text, so it is done once and then reused.
_TAGGER_CACHE = {}


def analyzer(text):
    """Tokenize *text* with the igo tagger and return the result.

    This function is used as the ``analyzer`` callable of the TF-IDF
    vectorizer, which invokes it once per document; the tagger is therefore
    created lazily on the first call and shared by all later calls instead
    of being rebuilt for every document.

    Args:
        text: the string to split.

    Returns:
        Whatever ``Tagger.wakati(text)`` returns (presumably a list of
        surface-form tokens -- confirm against the igo documentation).
    """
    dir_dict = 'naist-jdic'
    tagger = _TAGGER_CACHE.get(dir_dict)
    if tagger is None:
        # First use: load the dictionary from ``dir_dict`` and remember it.
        tagger = _TAGGER_CACHE[dir_dict] = Tagger(dir_dict)
    return tagger.wakati(text)
def analyze_responses(responses, num_clusters=10, max_df=0.5,
                      max_features=500, mini_batch=True):
    """Cluster response messages using TF-IDF features and k-means.

    The pipeline is: collect ``message`` from every response, vectorize the
    messages with ``TfidfVectorizer`` (tokenized by ``analyzer``), fit a
    k-means model, and group the messages per cluster ordered by the
    distance value that ``km.transform`` reports for the assigned cluster.

    Args:
        responses: iterable of objects exposing a ``message`` attribute.
        num_clusters: number of clusters to fit (default 10).
        max_df: ``max_df`` forwarded to ``TfidfVectorizer`` (default 0.5).
        max_features: ``max_features`` forwarded to ``TfidfVectorizer``
            (default 500).
        mini_batch: when true use ``MiniBatchKMeans``, otherwise ``KMeans``.

    Returns:
        A list with ``num_clusters`` entries; entry ``i`` is the list of
        messages labelled ``i``, sorted by ascending distance and, for equal
        distances, by their original position in ``responses``.

    Side effects:
        Logs progress and timings through the root ``logging`` logger and
        prints clustering metrics to standard output.

    NOTE(review): empty or very small inputs are not special-cased here;
    scikit-learn is expected to raise in those cases -- confirm.
    """
    logging.info('Start analyzing...')
    # time.time() is used instead of time.clock(): the latter no longer
    # exists on Python 3.8+, while time.time() is available everywhere.
    t0 = time.time()
    messages = [res.message for res in responses]
    logging.info('Done in %s sec(s)', time.time() - t0)

    logging.info('Feature extraction...')
    t0 = time.time()
    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=max_df,
                                 max_features=max_features)
    bow = vectorizer.fit_transform(messages)
    # The LSA step below is disabled, so the TF-IDF matrix is clustered
    # directly.
    bow_tn = bow
    logging.info('Done in %s sec(s)', time.time() - t0)

    # dimensionality reduction by LSA
    # lsa_dim = 100
    # lsa = TruncatedSVD(lsa_dim)
    # bow_t = lsa.fit_transform(bow)
    # bow_tn = Normalizer(copy=False).fit_transform(bow_t)

    logging.info('Clustering by KMeans...')
    t0 = time.time()
    # clustering by KMeans
    if mini_batch:
        km = MiniBatchKMeans(n_clusters=num_clusters,
                             init='k-means++',
                             batch_size=1000,
                             n_init=10,
                             max_no_improvement=10)
    else:
        km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1)
    km.fit(bow_tn)
    labels = km.labels_
    transformed = km.transform(bow_tn)
    # For every sample pick the transform column of its own cluster label.
    dists = transformed[numpy.arange(labels.shape[0]), labels]
    logging.info('Done in %s sec(s)', time.time() - t0)

    logging.info('Sort clusters by distance...')
    t0 = time.time()
    # sort by distance
    clusters = []
    for i in range(num_clusters):
        members = numpy.where(labels == i)[0]
        # ``members`` is ascending, so a stable sort on the distances breaks
        # ties by original index, exactly like sorting (distance, index)
        # pairs.
        order = numpy.argsort(dists[members], kind='mergesort')
        clusters.append([messages[j] for j in members[order].tolist()])
    logging.info('Done in %s sec(s)', time.time() - t0)

    # NOTE(review): ``labels`` is ``km.labels_``, so the four supervised
    # scores below compare the clustering with itself and always report a
    # perfect score. They only become meaningful once ground-truth labels
    # are supplied as the first argument. Kept to preserve existing output.
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        bow_tn, labels, sample_size=1000))
    return clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment