Skip to content

Instantly share code, notes, and snippets.

@Pinak-Chakraborty
Created August 20, 2014 21:21
Show Gist options
  • Save Pinak-Chakraborty/ed314b5386f2dca53452 to your computer and use it in GitHub Desktop.
Save Pinak-Chakraborty/ed314b5386f2dca53452 to your computer and use it in GitHub Desktop.
K-means implementation for word co-occurrence
import re, random, numpy as np
def centers(X, K):
    """Cluster the rows of X with K-means and return centers and clusters.

    Args:
        X: Iterable of equal-length numeric vectors (e.g. a 2-D numpy array).
        K: Number of initial centers, drawn at random from the points in X.

    Returns:
        A ``(mu, clusters)`` tuple. ``mu`` is the list of final centers and
        ``clusters`` maps a center index to the list of points assigned to it.

    Raises:
        ValueError: Propagated from ``random.sample`` when K is larger than
            the number of points in X (or negative).
    """
    # Seed the centers with K distinct points drawn from the data.
    mu = random.sample(list(X), K)
    # Always run at least one assignment step so `clusters` is bound even
    # when the very first update leaves the centers unchanged.
    while True:
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        newmu = reevaluate_centers(mu, clusters)
        if has_converged(newmu, mu):
            return (newmu, clusters)
        mu = newmu
def cluster_points(X, mu):
    """Assign each point in X to the index of its nearest center.

    Args:
        X: Iterable of numeric vectors (numpy arrays or rows of a 2-D array).
        mu: Iterable of centers with the same dimensionality as the points.

    Returns:
        A dict mapping a center index to the list of points whose Euclidean
        distance to that center is smallest. Centers that attract no point
        have no key in the result.

    Raises:
        ValueError: If ``mu`` is empty while X is not, or if a point and a
            center cannot be subtracted (mismatched shapes). Errors are
            raised rather than swallowed, so a failed distance computation
            can never silently reuse the previous point's cluster.
    """
    clusters = {}
    for x in X:
        # min() keeps the first center on ties, so equidistant points go to
        # the lowest center index.
        bestmukey = min(
            enumerate(mu),
            key=lambda pair: np.linalg.norm(x - pair[1]),
        )[0]
        # Allocate the data point to its best mu
        clusters.setdefault(bestmukey, []).append(x)
    return clusters
def reevaluate_centers(mu, clusters):
    """Compute the new center of every cluster.

    Args:
        mu: Current list of centers; position ``k`` belongs to cluster ``k``.
        clusters: Dict mapping a center index to the list of points assigned
            to that center.

    Returns:
        A list with one center per entry of ``mu``, in the same order. A
        cluster with points gets the mean of those points; a cluster with no
        points keeps its previous center, so the number of centers and their
        indices stay stable between iterations.
    """
    newmu = []
    for k, old_center in enumerate(mu):
        members = clusters.get(k)
        if members is not None and len(members) > 0:
            newmu.append(np.mean(members, axis=0))
        else:
            # Empty cluster: dropping it would shift every later index.
            newmu.append(old_center)
    return newmu
def has_converged(mu, oldmu):
    """Return True when both center collections hold the same set of points.

    Each center is turned into a tuple and the two resulting sets are
    compared, so ordering and duplicate centers are ignored.
    """
    current = set(map(tuple, mu))
    previous = set(map(tuple, oldmu))
    return current == previous
def wordTokenizier(line):
    """Split a line of text into word and punctuation tokens.

    The pattern tries four alternatives, in order, at each position:
    a run of two or more capitals not followed by a lowercase letter,
    a capitalised word directly followed by another capital, a run of word
    characters, apostrophes and hyphens, and finally a single punctuation
    character. Whitespace and anything unmatched is skipped.

    Args:
        line: The text to tokenize.

    Returns:
        A list of token strings in the order they occur in ``line``.
    """
    # Raw string so the regex escapes reach `re` untouched instead of being
    # treated as (invalid) string escapes. Note that `'-/` inside the last
    # character class is a range covering ' ( ) * + , - . /
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[.,!;:\"()^*'-/]"
    tokenList = re.findall(delimiters, line)
    return tokenList
def cooccur_cluster(fname, k=50):
    """Build a word co-occurrence matrix from a bigram file and cluster it.

    Each line of the file is tokenized; the first token is read as an
    integer count, the second as the row word and the third as the column
    word. Counts are accumulated into a matrix whose rows are then clustered
    with ``centers``. For every cluster, the words whose rows were assigned
    to it are printed.

    Args:
        fname: Path of the UTF-8 text file to read.
        k: Number of clusters to request from ``centers`` (default 50, the
            value that used to be hard-coded).

    Returns:
        None. Results are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first token of a line is not an integer, or if
            ``k`` exceeds the number of matrix rows.
    """
    idxr = {}    # row word -> row index
    idxc = {}    # column word -> column index
    counts = {}  # (row index, column index) -> accumulated count

    # Open & read file in a loop; `with` guarantees the handle is closed.
    with open(fname, mode='r', encoding="UTF-8") as infile:
        for line in infile:
            #-- Find words from bigrams --------------------------------------
            words = wordTokenizier(line.rstrip())
            if len(words) < 3:
                print("length less than 3, length = ", len(words), "record = ", words)
                continue
            #---- note the occurrence count of this bigram ----
            count, word1, word2 = words[0], words[1], words[2]
            row = idxr.setdefault(word1, len(idxr))
            col = idxc.setdefault(word2, len(idxc))
            counts[(row, col)] = counts.get((row, col), 0) + int(count)

    # Keep the historical 250x250 minimum (so small inputs behave as before)
    # but grow past it instead of failing with IndexError on larger
    # vocabularies.
    n_rows = max(250, len(idxr))
    n_cols = max(250, len(idxc))
    word_array = np.zeros((n_rows, n_cols), dtype=int)
    for (row, col), total in counts.items():
        word_array[row, col] = total
    print ("converted to array ")

    # Matrix rows are indexed by `idxr`, so that is the mapping to invert
    # when turning a matched row number back into a word.
    row_words = {index: word for word, index in idxr.items()}

    _, c = centers(word_array, k)
    for clust in c:
        print("========== cluster =================== ", clust)
        for point in c[clust]:
            # Locate the first matrix row equal to this point. Rows with
            # identical contents (including zero padding rows) all resolve
            # to that first match.
            j = 0
            while not np.allclose(point, word_array[j]):
                j += 1
            if j in row_words:
                print ("word found ", row_words[j])
#------------------------------------------------------------------------
if __name__ == "__main__":
    # Raw string keeps the Windows backslashes literal; the guard stops the
    # clustering job from running when this module is merely imported.
    fname = r"C:\Python34\Data\inp-engbkup.txt"
    cooccur_cluster(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment