Created
November 24, 2022 02:54
-
-
Save nickumia-reisys/89de774350b42ec2d94fe1e18fdaecd6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import wordnet | |
import json | |
import re | |
def compare_two_words(a, b):
    """Return the average WordNet shortest-path distance between two words.

    Every synset of *a* is compared against every synset of *b*; pairs with
    no connecting path (distance ``None``) are skipped.

    Args:
        a: First word (string looked up via ``wordnet.synsets``).
        b: Second word.

    Returns:
        float: Mean shortest-path distance over all connected synset pairs,
        or ``0`` when no pair of synsets is connected (or a word has no
        synsets at all).
    """
    aw = wordnet.synsets(a)
    bw = wordnet.synsets(b)
    total = 0
    links = 0
    for y in aw:
        for z in bw:
            dist = y.shortest_path_distance(z)
            if dist is not None:
                # Reuse the already-computed distance instead of calling
                # shortest_path_distance a second time per pair.
                total += dist
                links += 1
    # Guard against division by zero when nothing was comparable;
    # total is still 0 in that case, so the result is 0.
    if links == 0:
        links = 1
    return total / links
def min_index(a):
    """Return the index of the smallest element of *a*.

    Ties resolve to the first (lowest-index) occurrence, matching the
    original strict ``<`` scan. An empty sequence returns 0, preserving
    the original function's behavior.

    Args:
        a: A sequence supporting ``len`` and integer indexing.

    Returns:
        int: Index of the minimum element, or 0 if *a* is empty.
    """
    if not a:
        # Original implementation returned 0 for an empty input; keep that.
        return 0
    # Idiomatic argmin: min over indices keyed by the element at each index.
    return min(range(len(a)), key=a.__getitem__)
def categorize(groups, unseen):
    # Assign each word in `unseen` to an existing group (or seed group 0),
    # using average WordNet path distance via compare_two_words().
    # `groups` is mutated in place and also returned; the second return
    # value is the set of words that could not be placed this pass.
    visited = set()          # words successfully placed (or split) this pass
    multi = set()
    for word in unseen:
        # Split compound keywords on '-', '/', '_' and space into fragments.
        multi = set(re.split('-|/|_| ', word))
        if len(multi) > 1:
            # Compound word: mark it handled; its fragments are re-queued below.
            visited.add(word)
        if groups == {}:
            # First word ever seen seeds category 0.
            groups[0] = set([word])
            visited.add(word)
        else:
            # For each existing category, collect the distance from `word`
            # to every member word.
            assessment = {}
            for c,w in groups.items():
                temp = []
                for x in w:
                    temp.append(compare_two_words(word, x))
                assessment[c] = temp
            # Place `word` in every category whose closest member is within
            # distance 10 (0 means "no path found", so it is excluded).
            for c,a in assessment.items():
                if min(a) < 10 and min(a) != 0:
                    groups[c].add(word)
                    visited.add(word)
    # NOTE(review): `multi` holds only the fragments of the LAST word in the
    # iteration order, so only that word's fragments are re-added to `unseen`.
    # Presumably the intent was to accumulate fragments of every compound
    # word — TODO confirm before relying on this.
    unseen |= multi
    return groups, (unseen-visited)
def test():
    """Smoke-test categorize() on a small hand-picked vocabulary.

    Repeatedly feeds the remaining words through categorize(); whenever a
    pass makes no progress, one leftover word is promoted to a brand-new
    singleton category so the loop always terminates. Prints progress and
    the final grouping.
    """
    words = set(['ocean', 'water', 'bank', 'clothes', 'plant', 'floor', 'building'])
    groups = {}
    while words:
        before = len(words)
        groups, words = categorize(groups, words)
        after = len(words)
        print(before, after)
        if before == after:
            # Stalled: seed a fresh category with one of the leftovers.
            groups[len(groups)] = set([words.pop()])
    print(groups)
def data_gov_keywords():
    """Cluster the keywords found in ``tags.json`` and write ``output.json``.

    Reads the top-level keys of ``tags.json`` as the keyword set, repeatedly
    runs categorize() over the unplaced keywords, and — whenever a pass makes
    no progress — promotes one leftover keyword to a new singleton category so
    the loop terminates. The resulting ``{category: [words]}`` mapping is
    serialized to ``output.json``.

    Side effects: reads ``tags.json``, writes ``output.json``, prints progress.
    """
    # Context manager closes the file even if json.load raises
    # (the original open()/close() pair leaked the handle on error).
    with open('tags.json') as f:
        tag_data = json.load(f)
    keywords = set(tag_data.keys())
    groups = {}
    while len(keywords) > 0:
        print(len(keywords), len(groups))
        old_len = len(keywords)
        groups, keywords = categorize(groups, keywords)
        new_len = len(keywords)
        if old_len == new_len:
            # Stalled: seed a brand-new category with one leftover keyword.
            groups[len(groups)] = set([keywords.pop()])
    # Sets are not JSON-serializable; convert each group to a list first.
    output = {i: list(j) for i, j in groups.items()}
    with open('output.json', 'w') as o:
        o.write(json.dumps(output))
    print(groups)
    print(len(groups))
# Script entry point: run the full data.gov keyword clustering.
if __name__ == "__main__":
    data_gov_keywords()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment