Created
November 24, 2022 02:54
-
-
Save nickumia-reisys/89de774350b42ec2d94fe1e18fdaecd6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import wordnet | |
import json | |
import re | |
def compare_two_words(a, b):
    """Return the average WordNet shortest-path distance between two words.

    Every synset of *a* is compared against every synset of *b*; pairs with
    no connecting path (distance ``None``) are skipped.

    Args:
        a: First word (string looked up via ``wordnet.synsets``).
        b: Second word.

    Returns:
        float: Mean shortest-path distance over all connected synset pairs,
        or ``0`` when no pair of synsets is connected (or a word has no
        synsets at all).
    """
    aw = wordnet.synsets(a)
    bw = wordnet.synsets(b)
    total = 0
    links = 0
    for y in aw:
        for z in bw:
            dist = y.shortest_path_distance(z)
            if dist is not None:
                # Reuse the already-computed distance instead of calling
                # shortest_path_distance a second time per pair.
                total += dist
                links += 1
    # Guard against division by zero when nothing was comparable;
    # total is still 0 in that case, so the result is 0.
    if links == 0:
        links = 1
    return total / links
def min_index(a):
    """Return the index of the smallest element of *a*.

    Ties resolve to the first (lowest-index) occurrence, matching the
    original strict ``<`` scan. An empty sequence returns 0, preserving
    the original function's behavior.

    Args:
        a: A sequence supporting ``len`` and integer indexing.

    Returns:
        int: Index of the minimum element, or 0 if *a* is empty.
    """
    if not a:
        # Original implementation returned 0 for an empty input; keep that.
        return 0
    # Idiomatic argmin: min over indices keyed by the element at each index.
    return min(range(len(a)), key=a.__getitem__)
def categorize(groups, unseen):
    # Assign each word in `unseen` to an existing group (or seed group 0),
    # using average WordNet path distance via compare_two_words().
    # `groups` is mutated in place and also returned; the second return
    # value is the set of words that could not be placed this pass.
    visited = set()          # words successfully placed (or split) this pass
    multi = set()
    for word in unseen:
        # Split compound keywords on '-', '/', '_' and space into fragments.
        multi = set(re.split('-|/|_| ', word))
        if len(multi) > 1:
            # Compound word: mark it handled; its fragments are re-queued below.
            visited.add(word)
        if groups == {}:
            # First word ever seen seeds category 0.
            groups[0] = set([word])
            visited.add(word)
        else:
            # For each existing category, collect the distance from `word`
            # to every member word.
            assessment = {}
            for c,w in groups.items():
                temp = []
                for x in w:
                    temp.append(compare_two_words(word, x))
                assessment[c] = temp
            # Place `word` in every category whose closest member is within
            # distance 10 (0 means "no path found", so it is excluded).
            for c,a in assessment.items():
                if min(a) < 10 and min(a) != 0:
                    groups[c].add(word)
                    visited.add(word)
    # NOTE(review): `multi` holds only the fragments of the LAST word in the
    # iteration order, so only that word's fragments are re-added to `unseen`.
    # Presumably the intent was to accumulate fragments of every compound
    # word — TODO confirm before relying on this.
    unseen |= multi
    return groups, (unseen-visited)
def test():
    """Smoke-test categorize() on a small hand-picked vocabulary.

    Repeatedly feeds the remaining words through categorize(); whenever a
    pass makes no progress, one leftover word is promoted to a brand-new
    singleton category so the loop always terminates. Prints progress and
    the final grouping.
    """
    words = set(['ocean', 'water', 'bank', 'clothes', 'plant', 'floor', 'building'])
    groups = {}
    while words:
        before = len(words)
        groups, words = categorize(groups, words)
        after = len(words)
        print(before, after)
        if before == after:
            # Stalled: seed a fresh category with one of the leftovers.
            groups[len(groups)] = set([words.pop()])
    print(groups)
def data_gov_keywords():
    """Cluster the keywords found in ``tags.json`` and write ``output.json``.

    Reads the top-level keys of ``tags.json`` as the keyword set, repeatedly
    runs categorize() over the unplaced keywords, and — whenever a pass makes
    no progress — promotes one leftover keyword to a new singleton category so
    the loop terminates. The resulting ``{category: [words]}`` mapping is
    serialized to ``output.json``.

    Side effects: reads ``tags.json``, writes ``output.json``, prints progress.
    """
    # Context manager closes the file even if json.load raises
    # (the original open()/close() pair leaked the handle on error).
    with open('tags.json') as f:
        tag_data = json.load(f)
    keywords = set(tag_data.keys())
    groups = {}
    while len(keywords) > 0:
        print(len(keywords), len(groups))
        old_len = len(keywords)
        groups, keywords = categorize(groups, keywords)
        new_len = len(keywords)
        if old_len == new_len:
            # Stalled: seed a brand-new category with one leftover keyword.
            groups[len(groups)] = set([keywords.pop()])
    # Sets are not JSON-serializable; convert each group to a list first.
    output = {i: list(j) for i, j in groups.items()}
    with open('output.json', 'w') as o:
        o.write(json.dumps(output))
    print(groups)
    print(len(groups))
# Script entry point: run the full data.gov keyword clustering.
if __name__ == "__main__":
    data_gov_keywords()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment