# Pinak-Chakraborty/K-means implementation

## K-means implementation
import re, random, numpy as np

def centers(X, K):
    """Run k-means on the points in X, starting from K random centers.

    Two independent random samples of K points seed the current and the
    previous centers. Assignment (``cluster_points``) and re-evaluation
    (``reevaluate_centers``) then alternate until ``has_converged`` reports
    that the set of centers did not change.

    Args:
        X: Iterable of data points (for example the rows of a 2-D array).
        K: Number of initial centers to draw from X.

    Returns:
        A ``(mu, clusters)`` tuple: the final list of centers and the dict
        returned by the last ``cluster_points`` call.

    Raises:
        ValueError: If K is larger than the number of points (raised by
            ``random.sample``).
    """
    points = list(X)
    # Initialize to K random centers
    oldmu = random.sample(points, K)
    mu = random.sample(points, K)

    # Always perform at least one pass. The two random samples can already
    # compare equal as sets (always when K == len(X), and easily when X holds
    # duplicate points); without a forced first pass ``clusters`` would never
    # be assigned and the return statement would fail.
    clusters = None
    while clusters is None or not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)
    return (mu, clusters)

def cluster_points(X, mu):
    """Assign every point in X to its nearest center.

    Distance is ``np.linalg.norm`` of the difference between the point and
    the center; on a tie the center with the lowest index wins.

    Args:
        X: Iterable of points that support subtraction with a center.
        mu: Sequence of centers.

    Returns:
        Dict mapping the index of a center in ``mu`` to the list of points
        closest to it. A center that attracts no point has no key.

    Raises:
        ValueError: If ``mu`` is empty, or a point and a center cannot be
            subtracted (shape mismatch). Such errors used to be swallowed,
            which silently filed the point under the previous point's
            cluster.
    """
    clusters = {}
    for x in X:
        # For each data point, find the best mu and assign that cluster
        bestmukey, _ = min(
            ((idx, np.linalg.norm(x - center)) for idx, center in enumerate(mu)),
            key=lambda pair: pair[1],
        )
        # Allocate the data point to its best mu
        clusters.setdefault(bestmukey, []).append(x)
    return clusters

def reevaluate_centers(mu, clusters):
    """Compute a new center for every cluster.

    Args:
        mu: Previous centers. Accepted for interface compatibility; the
            function does not read it.
        clusters: Dict mapping a cluster key to the list of its points.

    Returns:
        List with one ``np.mean(points, axis=0)`` per cluster, ordered by
        ascending cluster key.
    """
    # The new center of a cluster is the mean of the points it holds.
    return [np.mean(clusters[key], axis=0) for key in sorted(clusters)]

def has_converged(mu, oldmu):
    """Return True when both center lists contain the same set of centers.

    Each center is converted to a tuple and the two resulting sets are
    compared, so order and repetition of centers are ignored and coordinates
    must match exactly.
    """
    current = {tuple(center) for center in mu}
    previous = {tuple(center) for center in oldmu}
    return current == previous

def wordTokenizier(line):
    """Split a line of text into tokens.

    The pattern tries, in order: a run of two or more capitals not followed
    by a lowercase letter; a capitalised word immediately followed by another
    capital; a run of word characters, apostrophes and hyphens; a single
    punctuation character.

    Args:
        line: Text to tokenize.

    Returns:
        List of matched tokens in order of appearance; whitespace and
        unmatched characters are dropped.
    """
    # Raw string: the previous plain literal relied on invalid escapes such
    # as "\w" and "\-", which Python now warns about. The pattern text
    # itself is unchanged.
    delimiters = r"""[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[.,!;:"()^*'-/]"""
    return re.findall(delimiters, line)

def cooccur_cluster(fname, max_words=250, n_clusters=50):
    """Cluster words by their bigram co-occurrence counts and print the result.

    Every line of the file is tokenized with ``wordTokenizier``. The first
    token is parsed as an integer count, the second and third tokens are the
    two words of the bigram. Counts are accumulated in a square matrix whose
    rows are indexed by first words and whose columns are indexed by second
    words. The rows are then clustered with ``centers`` and, per cluster,
    the first word of every row that belongs to it is printed.

    Args:
        fname: Path of the UTF-8 text file to read.
        max_words: Side length of the count matrix (default 250, the former
            hard-coded size).
        n_clusters: Number of initial centers passed to ``centers`` (default
            50, the former hard-coded value).

    Raises:
        IndexError: If the file holds more than ``max_words`` distinct first
            words or distinct second words.
        ValueError: If the first token of a line is not an integer.
    """
    idxr = {}  # first word of a bigram -> row index
    idxc = {}  # second word of a bigram -> column index
    word_list = [[0] * max_words for _ in range(max_words)]

    # The with-block closes the file even if a line fails to parse.
    with open(fname, mode='r', encoding="UTF-8") as infile:
        for line in infile:
            # -- Find words from bigrams ----------------------------------
            words = wordTokenizier(line.rstrip())
            if len(words) < 3:
                print("length less than 3, length = ", len(words), "record = ", words)
                continue

            # ---- note the occurrence count of this bigram ---------------
            count, word1, word2 = words[0], words[1], words[2]
            if word1 not in idxr:
                idxr[word1] = len(idxr)
            if word2 not in idxc:
                idxc[word2] = len(idxc)
            word_list[idxr[word1]][idxc[word2]] += int(count)

    word_array = np.asarray(word_list)
    print ("converted to array ")
    del word_list

    _, c = centers(word_array, n_clusters)

    # Rows are indexed by first words, so a row must be translated back
    # through idxr (not idxc, which numbers the columns).
    row_words = {row: word for word, row in idxr.items()}
    # Rows already reported. Several words can have identical count vectors;
    # each clustered point claims its own row so that every such word is
    # printed once instead of the first match being printed repeatedly.
    claimed = set()

    for clust in c:
        print("========== cluster =================== ", clust)
        for point in c[clust]:
            j = next(
                (r for r in range(len(word_array))
                 if r not in claimed and np.array_equal(point, word_array[r])),
                None,
            )
            if j is None:
                continue
            claimed.add(j)
            # Padding rows beyond the known words carry no word.
            if j in row_words:
                print ("word found ", row_words[j])

#------------------------------------------------------------------------
# Raw string keeps the Windows path backslashes literal; the plain literal
# relied on invalid escape sequences ("\P", "\D", "\i").
fname = r"C:\Python34\Data\inp-engbkup.txt"
cooccur_cluster(fname)
	import re, random, numpy as np

	def centers(X, K):
	    """Run k-means on the points in X, starting from K random centers.

	    Two independent random samples of K points seed the current and the
	    previous centers. Assignment (``cluster_points``) and re-evaluation
	    (``reevaluate_centers``) then alternate until ``has_converged``
	    reports that the set of centers did not change.

	    Args:
	        X: Iterable of data points (for example the rows of a 2-D array).
	        K: Number of initial centers to draw from X.

	    Returns:
	        A ``(mu, clusters)`` tuple: the final list of centers and the
	        dict returned by the last ``cluster_points`` call.

	    Raises:
	        ValueError: If K is larger than the number of points (raised by
	            ``random.sample``).
	    """
	    points = list(X)
	    # Initialize to K random centers
	    oldmu = random.sample(points, K)
	    mu = random.sample(points, K)

	    # Always perform at least one pass. The two random samples can
	    # already compare equal as sets; without a forced first pass
	    # ``clusters`` would never be assigned and the return would fail.
	    clusters = None
	    while clusters is None or not has_converged(mu, oldmu):
	        oldmu = mu
	        # Assign all points in X to clusters
	        clusters = cluster_points(X, mu)
	        # Reevaluate centers
	        mu = reevaluate_centers(oldmu, clusters)
	    return (mu, clusters)

	def cluster_points(X, mu):
	    """Assign every point in X to its nearest center.

	    Distance is ``np.linalg.norm`` of the difference between the point
	    and the center; on a tie the center with the lowest index wins.

	    Args:
	        X: Iterable of points that support subtraction with a center.
	        mu: Sequence of centers.

	    Returns:
	        Dict mapping the index of a center in ``mu`` to the list of
	        points closest to it. A center that attracts no point has no key.

	    Raises:
	        ValueError: If ``mu`` is empty, or a point and a center cannot
	            be subtracted (shape mismatch). Such errors used to be
	            swallowed, which silently filed the point under the previous
	            point's cluster.
	    """
	    clusters = {}
	    for x in X:
	        # For each data point, find the best mu and assign that cluster
	        bestmukey, _ = min(
	            ((idx, np.linalg.norm(x - center)) for idx, center in enumerate(mu)),
	            key=lambda pair: pair[1],
	        )
	        # Allocate the data point to its best mu
	        clusters.setdefault(bestmukey, []).append(x)
	    return clusters

	def reevaluate_centers(mu, clusters):
	    """Compute a new center for every cluster.

	    Args:
	        mu: Previous centers. Accepted for interface compatibility; the
	            function does not read it.
	        clusters: Dict mapping a cluster key to the list of its points.

	    Returns:
	        List with one ``np.mean(points, axis=0)`` per cluster, ordered by
	        ascending cluster key.
	    """
	    # find new mu for all data points within a cluster
	    newmu = []
	    for k in sorted(clusters.keys()):
	        newmu.append(np.mean(clusters[k], axis=0))
	    return newmu

	def has_converged(mu, oldmu):
	    """Return True when both center lists contain the same set of centers.

	    Each center is converted to a tuple and the two resulting sets are
	    compared, so order and repetition of centers are ignored and
	    coordinates must match exactly.
	    """
	    return set(tuple(a) for a in mu) == set(tuple(a) for a in oldmu)

	def wordTokenizier(line):
	    """Split a line of text into tokens.

	    The pattern tries, in order: a run of two or more capitals not
	    followed by a lowercase letter; a capitalised word immediately
	    followed by another capital; a run of word characters, apostrophes
	    and hyphens; a single punctuation character.

	    Args:
	        line: Text to tokenize.

	    Returns:
	        List of matched tokens in order of appearance; whitespace and
	        unmatched characters are dropped.
	    """
	    # The alternatives are separated by a plain "|". An escaped "\|"
	    # would match a literal pipe character and fuse the four alternatives
	    # into one pattern. A raw string avoids invalid escapes like "\w".
	    delimiters = r"""[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[.,!;:"()^*'-/]"""
	    return re.findall(delimiters, line)

	def cooccur_cluster(fname, max_words=250, n_clusters=50):
	    """Cluster words by bigram co-occurrence counts and print the result.

	    Every line of the file is tokenized with ``wordTokenizier``. The
	    first token is parsed as an integer count, the second and third
	    tokens are the two words of the bigram. Counts are accumulated in a
	    square matrix whose rows are indexed by first words and whose columns
	    are indexed by second words. The rows are then clustered with
	    ``centers`` and, per cluster, the first word of every row that
	    belongs to it is printed.

	    Args:
	        fname: Path of the UTF-8 text file to read.
	        max_words: Side length of the count matrix (default 250, the
	            former hard-coded size).
	        n_clusters: Number of initial centers passed to ``centers``
	            (default 50, the former hard-coded value).

	    Raises:
	        IndexError: If the file holds more than ``max_words`` distinct
	            first words or distinct second words.
	        ValueError: If the first token of a line is not an integer.
	    """
	    idxr = {}  # first word of a bigram -> row index
	    idxc = {}  # second word of a bigram -> column index
	    word_list = [[0] * max_words for _ in range(max_words)]

	    # The with-block closes the file even if a line fails to parse.
	    with open(fname, mode='r', encoding="UTF-8") as infile:
	        for line in infile:
	            # -- Find words from bigrams ------------------------------
	            words = wordTokenizier(line.rstrip())
	            if len(words) < 3:
	                print("length less than 3, length = ", len(words), "record = ", words)
	                continue

	            # ---- note the occurrence count of this bigram -----------
	            count, word1, word2 = words[0], words[1], words[2]
	            if word1 not in idxr:
	                idxr[word1] = len(idxr)
	            if word2 not in idxc:
	                idxc[word2] = len(idxc)
	            word_list[idxr[word1]][idxc[word2]] += int(count)

	    word_array = np.asarray(word_list)
	    print ("converted to array ")
	    del word_list

	    _, c = centers(word_array, n_clusters)

	    # Rows are indexed by first words, so a row must be translated back
	    # through idxr (not idxc, which numbers the columns).
	    row_words = {row: word for word, row in idxr.items()}
	    # Rows already reported. Several words can have identical count
	    # vectors; each clustered point claims its own row so that every
	    # such word is printed once.
	    claimed = set()

	    for clust in c:
	        print("========== cluster =================== ", clust)
	        for point in c[clust]:
	            j = next(
	                (r for r in range(len(word_array))
	                 if r not in claimed and np.array_equal(point, word_array[r])),
	                None,
	            )
	            if j is None:
	                continue
	            claimed.add(j)
	            # Padding rows beyond the known words carry no word.
	            if j in row_words:
	                print ("word found ", row_words[j])

	#------------------------------------------------------------------------
	# Raw string keeps the Windows path backslashes literal; the plain
	# literal relied on invalid escape sequences ("\P", "\D", "\i").
	fname = r"C:\Python34\Data\inp-engbkup.txt"
	cooccur_cluster(fname)