Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Groups (clusters) similar lines together from a text file using k-means clustering algorithm.
"""
Groups (clusters) similar lines together from a text file
using k-means clustering algorithm.
Also does some simple cleaning (such as removing white space and replacing numbers with (N)).
Example:
python cluster_lines.py --clusters 20 invalid_dates.txt
Required libs:
click
sklearn
"""
import click
import re
import numpy
import random
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
@click.command()
@click.argument('filename')
@click.option('--clusters', default=10, help='Number of clusters')
@click.option('--sample', default=10, help='Number of samples to print')
def cluster_lines(filename, clusters, sample):
lines = numpy.array(list(_get_lines(filename)))
doc_feat = TfidfVectorizer().fit_transform(lines)
km = KMeans(clusters).fit(doc_feat)
k = 0
clusters = defaultdict(list)
for i in km.labels_:
clusters[i].append(lines[k])
k += 1
s_clusters = sorted(clusters.values(), key=lambda l: -len(l))
for cluster in s_clusters:
print 'Cluster [%s]:' % len(cluster)
if len(cluster) > sample:
cluster = random.sample(cluster, sample)
for line in cluster:
print line
print '--------'
def _clean_line(line):
line = line.strip().lower()
line = re.sub('\d+', '(N)', line)
return line
def _get_lines(filename):
for line in open(filename).readlines():
yield _clean_line(line)
if __name__ == '__main__':
cluster_lines()
@jiuweizhao

This comment has been minimized.

Copy link

commented Jan 29, 2018

Sorry, Could you tell me how the data are arranged.
Is the data 3 dimensions (x,y, time-different time length of each record)?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.