Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
import os
import stat
# Group the files of one archive directory by modification time: cluster the
# mtimes with mean shift, then write each cluster's file paths to "<k>.txt"
# in the current working directory.

# next() with a default keeps a missing directory from raising IndexError;
# only the directory's immediate files are listed (no recursion).
_top_level = next(os.walk('archives/2017-07/'), (None, [], []))
files = ['archives/2017-07/{}'.format(f) for f in _top_level[2]]

# [mtime in whole seconds, path] for every file.
ar = [[int(os.stat(f).st_mtime), f] for f in files]
# The same pairs with each mtime expressed as an offset from the first file's.
arr = [[a - ar[0][0], b] for a, b in ar]
nums = [a[0] for a in arr]
x = nums

res = []
groups = {}
if files:
    # The samples are built as 2-D rows: each offset is padded with a zero
    # second column. The builtin int replaces the np.int alias, which
    # NumPy 1.24 removed.
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=int)
    bandwidth = estimate_bandwidth(X, quantile=0.05)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    for k in range(n_clusters_):
        my_members = labels == k
        print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
        res.append(X[my_members, 0].tolist())
        # Pick the rows by label instead of matching offsets back against
        # arr: files sharing an mtime would otherwise be listed repeatedly.
        groups[k] = [tup for tup, member in zip(arr, my_members) if member]

# One output file per cluster, one path per line.
for g, tups in groups.items():
    with open('{}.txt'.format(g), 'w') as ofile:
        for t in tups:
            ofile.write('{}\n'.format(t[1]))
import sys
import numpy as np
import io as _io
import fileinput
import argparse
import itertools
import operator
import collections
from scipy.cluster.vq import kmeans, vq
from sklearn.cluster import MeanShift, estimate_bandwidth
def cluster2(nlist, num_groups):
    """Split *nlist* into consecutive runs that share a k-means cluster.

    The values are converted to floats and clustered with SciPy's ``kmeans``
    using ``num_groups`` as the number of centroids, then each value is
    assigned to a centroid with ``vq``. The result does not hold one list per
    cluster: a new sub-list is started every time the assigned cluster
    changes between neighbouring elements, so the input order is kept.

    Args:
        nlist: Sequence of values convertible to float (e.g. numeric strings).
        num_groups: Number of centroids requested from ``kmeans``.

    Returns:
        A list of lists containing the original elements of *nlist*; the
        same list is also printed to stdout.
    """
    if len(nlist) == 0:
        # kmeans cannot run on zero observations; there is nothing to group.
        res = []
        print (res)
        return res

    observations = np.array(nlist, dtype=float)
    codebook, _ = kmeans(observations, num_groups)
    cluster_indices, _ = vq(observations, codebook)

    # groupby only merges adjacent equal keys, which is exactly the
    # "consecutive run" semantics wanted here (no sorting beforehand).
    labelled = zip(cluster_indices.tolist(), nlist)
    res = [
        [value for _label, value in run]
        for _key, run in itertools.groupby(labelled, key=operator.itemgetter(0))
    ]
    print (res)
    return res
def cluster(nlist, quantile):
    """Cluster integers with mean shift and print the cluster of each value.

    Every element of *nlist* is converted with ``int()``. The bandwidth is
    obtained from ``estimate_bandwidth`` with the given *quantile* and the
    values are clustered with ``MeanShift``.

    Output, all on stdout: the estimated bandwidth, one ``cluster k: [...]``
    line per cluster, and finally one ``index<TAB>value`` line per value,
    where ``index`` is the position of the value's cluster once the clusters
    (lists of ints) have been sorted.

    Args:
        nlist: Iterable of values accepted by ``int()``.
        quantile: Forwarded to ``estimate_bandwidth`` as ``quantile``.

    Returns:
        None. Nothing is printed when *nlist* is empty.
    """
    nlist = [int(x) for x in nlist]
    if not nlist:
        # The clustering below needs at least one sample.
        return

    # The samples are built as 2-D rows: each value is padded with a zero
    # second column. The builtin int replaces the np.int alias, which
    # NumPy 1.24 removed.
    X = np.array(list(zip(nlist, np.zeros(len(nlist)))), dtype=int)
    bandwidth = estimate_bandwidth(X, quantile=quantile)
    print (bandwidth)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    n_clusters_ = len(np.unique(labels))

    res = []
    for k in range(n_clusters_):
        my_members = labels == k
        print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
        res.append(X[my_members, 0].tolist())

    # Sorting the lists renumbers the clusters by their contents instead of
    # by the label order MeanShift happened to produce.
    for i, r in enumerate(sorted(res)):
        for el in r:
            sys.stdout.write('{}\t{}\n'.format(i, el))
# Command-line entry point: read one number per line from the given files
# (or stdin) and hand them to the selected clustering routine.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dummy', help='dummy argument')
    # type= lets argparse reject malformed numbers with a usage error
    # instead of a traceback later on.
    parser.add_argument('-q', '--quantile', default=0.1, type=float, dest='quantile', help='quantile')
    # choices= turns an unknown type into an error; it used to be ignored
    # silently and the script produced no clustering output at all.
    parser.add_argument('-t', '--type', default='cluster', choices=('cluster', 'cluster2'), dest='type', help='type')
    parser.add_argument('-n', '--num', default=5, type=int, dest='num_groups', help='num_groups')
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
    args = parser.parse_args()
    print(args)
    # If you would call fileinput.input() without files it would try to process all arguments.
    # We pass '-' as only file when argparse got no files which will cause fileinput to read from stdin
    with fileinput.input(files=args.files or ('-', )) as lines:
        # Blank lines (e.g. a trailing newline) cannot be parsed as numbers,
        # so they are dropped here rather than crashing the clustering.
        ar = [stripped for stripped in (line.strip() for line in lines) if stripped]
    if args.type == 'cluster':
        cluster(ar, args.quantile)
    elif args.type == 'cluster2':
        cluster2(ar, args.num_groups)
#parser = argparse.ArgumentParser()
#parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
# default=sys.stdin)
#parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
# default=sys.stdout)
#parser.parse_args(['input.txt', 'output.txt'])
#Namespace(infile=<_io.TextIOWrapper name='input.txt' encoding='UTF-8'>,
# outfile=<_io.TextIOWrapper name='output.txt' encoding='UTF-8'>)
#parser.parse_args([])
#Namespace(infile=<_io.TextIOWrapper name='<stdin>' encoding='UTF-8'>,
# outfile=<_io.TextIOWrapper name='<stdout>' encoding='UTF-8'>)
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment