Last active
December 24, 2015 22:09
-
-
Save zpconn/6870616 to your computer and use it in GitHub Desktop.
Naive clustering based on gaps between successive numbers. Must have lists.csv in the same directory as cluster.py. lists.csv should contain several columns of real numbers with labels in the header row. This filters out zero values, which will obviously always be in the same cluster. Optional command line argument specifies a custom gap, which …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pairify(it):
    """Pair every item of *it* with the item that precedes it.

    The first item has no predecessor, so it is paired with itself:
    ``[a, b, c]`` produces ``(a, a), (a, b), (b, c)``.

    Args:
        it: Any iterable (a one-shot iterator is fine).

    Returns:
        The result of ``zip`` over ``(previous, current)`` tuples; empty
        when *it* is empty.
    """
    # Imported locally because the file only imports itertools under the
    # ``__main__`` guard; without this the function raises NameError when
    # the module is imported instead of run as a script.
    import itertools

    it0, it1 = itertools.tee(it, 2)
    try:
        first = next(it0)
    except StopIteration:
        # Empty input: return an empty pairing instead of leaking
        # StopIteration, which turns into RuntimeError when the caller is
        # a generator (PEP 479).
        return zip((), ())
    # it0 is now one step ahead of it1; re-inserting ``first`` twice makes
    # the left-hand stream lag one step behind it1 and start with (a, a).
    return zip(itertools.chain([first, first], it0), it1)
def cluster(sequence, maxgap):
    """Split *sequence* into runs of successive values that are close together.

    Values are taken in the order given (no sorting is performed). A new
    cluster starts whenever the absolute difference between a value and the
    one immediately before it is at least *maxgap*.

    Args:
        sequence: Iterable of numbers; may be a one-shot iterator.
        maxgap: Gap threshold; a difference ``>= maxgap`` closes the
            current cluster.

    Yields:
        Non-empty lists of consecutive values, one list per cluster.
        Nothing is yielded for an empty *sequence*.
    """
    values = iter(sequence)
    try:
        prev = next(values)
    except StopIteration:
        # Empty input: finish cleanly rather than letting StopIteration
        # escape the generator body (a RuntimeError under PEP 479).
        return

    # Seed the first cluster directly instead of comparing the first value
    # with itself; that self-comparison emitted an empty cluster whenever
    # maxgap <= 0.
    batch = [prev]
    for val in values:
        if abs(val - prev) >= maxgap:
            yield batch
            batch = [val]
        else:
            batch.append(val)
        prev = val
    yield batch
if __name__ == '__main__':
    # Script entry point: read every column of lists.csv (header row gives
    # the column labels), drop blank and zero cells, and print the clusters
    # found in each column. An optional single command line argument
    # overrides the default gap of 0.05.
    import sys
    import csv
    import itertools  # module-level binding; pairify() looks it up globally
    from collections import defaultdict

    gap = 0.05
    if len(sys.argv) == 2:
        gap = float(sys.argv[1])

    # Universal-newline mode must be requested explicitly on Python 2, while
    # Python 3 applies it by default and rejects the 'U' flag from 3.11 on.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    columns = defaultdict(list)
    with open('lists.csv', mode) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            for (k, v) in row.items():
                columns[k].append(v)

    # fieldnames is None when the file has no header row at all.
    for column in reader.fieldnames or []:
        data = []
        for v in columns[column]:
            # Short rows leave None in the missing cells; skip those along
            # with blank cells.
            if v is None or v.strip() == '':
                continue
            number = float(v)
            # Compare numerically so every spelling of zero ("0", "0.0",
            # "0.00", "-0.00") is filtered, not just the literal "0.00".
            if number != 0:
                data.append(number)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s: ' % column)
        for group in cluster(data, maxgap=gap):
            print(group)
        print('\n\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment