Skip to content

Instantly share code, notes, and snippets.

@nickumia-reisys
Created November 24, 2022 02:54
Show Gist options
  • Save nickumia-reisys/89de774350b42ec2d94fe1e18fdaecd6 to your computer and use it in GitHub Desktop.
Save nickumia-reisys/89de774350b42ec2d94fe1e18fdaecd6 to your computer and use it in GitHub Desktop.
from nltk.corpus import wordnet
import json
import re
def compare_two_words(a, b):
    """Return the mean WordNet path distance between two words.

    Every synset returned by ``wordnet.synsets(a)`` is paired with every
    synset returned by ``wordnet.synsets(b)``. Pairs for which
    ``shortest_path_distance`` returns ``None`` are skipped; the distances of
    the remaining pairs are averaged.

    Args:
        a: First word to look up.
        b: Second word to look up.

    Returns:
        The average distance over all pairs that produced a distance, or
        ``0.0`` when no pair did (which includes the case where either word
        has no synsets). A result of ``0.0`` therefore does not by itself
        distinguish "no connection found" from "distance zero".
    """
    synsets_a = wordnet.synsets(a)
    synsets_b = wordnet.synsets(b)

    total = 0
    links = 0
    for syn_a in synsets_a:
        for syn_b in synsets_b:
            dist = syn_a.shortest_path_distance(syn_b)
            if dist is not None:
                # Reuse the distance already computed instead of querying
                # WordNet a second time for the same pair.
                total += dist
                links += 1

    if links == 0:
        # Nothing to average; keep the historical "0" result as a float.
        return 0.0
    return total / links
def min_index(a):
    """Return the index of the smallest element of ``a``.

    Args:
        a: An indexable sequence supporting ``len()`` whose elements can be
            compared with ``<``.

    Returns:
        The position of the minimum element. On ties the earliest position
        wins, because only a strictly smaller element replaces the current
        best. An empty sequence yields ``0``.
    """
    best = 0
    for candidate in range(1, len(a)):
        if a[candidate] < a[best]:
            best = candidate
    return best
def categorize(groups, unseen, max_distance=10):
    """Place each unseen word into every existing group it is close to.

    For each word in ``unseen``:

    * The word is split on ``-``, ``/``, ``_`` and space. If that yields more
      than one distinct fragment, the word is marked as visited and its
      fragments are queued so they can be categorized on a later pass.
    * If ``groups`` is still empty, the word seeds group ``0``.
    * Otherwise the word is compared (via ``compare_two_words``) with every
      member of every group and added to each group whose closest member is
      at a non-zero distance below ``max_distance``. A distance of ``0`` is
      treated as "no match", since ``compare_two_words`` returns it when no
      connection was found.

    Args:
        groups: Dict mapping a group key to a set of words. Mutated in place.
        unseen: Set of words still to be categorized. The fragments of all
            words are merged into it in place after the loop.
        max_distance: Exclusive upper bound on the closest distance for a
            word to join a group. Defaults to the previous hard-coded ``10``.

    Returns:
        A ``(groups, remaining)`` tuple, where ``remaining`` is ``unseen``
        (including the queued fragments) minus every word visited this pass.
    """
    visited = set()
    fragments = set()
    for word in unseen:
        parts = set(re.split('-|/|_| ', word))
        # Accumulate fragments across *all* words. Merging them into `unseen`
        # has to wait until the loop ends, because a set must not change size
        # while it is being iterated.
        fragments |= parts
        if len(parts) > 1:
            visited.add(word)

        if not groups:
            groups[0] = {word}
            visited.add(word)
            continue

        for members in groups.values():
            # Scores are fully computed before the group is modified.
            scores = [compare_two_words(word, member) for member in members]
            # default=0 makes an empty group a non-match instead of an error.
            closest = min(scores, default=0)
            if closest != 0 and closest < max_distance:
                members.add(word)
                visited.add(word)

    unseen |= fragments
    return groups, (unseen - visited)
def test():
    """Run the grouping loop on a small fixed vocabulary, printing progress.

    Each pass prints the number of pending words before and after calling
    ``categorize``. When a pass leaves that number unchanged, one pending word
    is popped and used to start a new group. The final groups are printed
    once nothing is pending.
    """
    pending = {'ocean', 'water', 'bank', 'clothes', 'plant', 'floor', 'building'}
    groups = {}
    while pending:
        before = len(pending)
        groups, pending = categorize(groups, pending)
        after = len(pending)
        print(before, after)
        if before != after:
            continue
        # The pass made no headway: promote an arbitrary word to a new group.
        seed = pending.pop()
        groups[len(groups)] = {seed}
    print(groups)
def data_gov_keywords(tags_path='tags.json', output_path='output.json'):
    """Group the keywords found in a tags file and write the groups as JSON.

    The keys of the JSON document at ``tags_path`` are used as the keyword
    set (assumes the document is a JSON object keyed by keyword — confirm
    against the data file). ``categorize`` is called repeatedly; whenever a
    pass leaves the number of pending keywords unchanged, one keyword is
    popped and starts a new group. Progress and the final result are printed.

    Args:
        tags_path: Path of the JSON file to read. Defaults to ``tags.json``.
        output_path: Path of the JSON file to write. Defaults to
            ``output.json``.

    Returns:
        None. The groups are written to ``output_path`` as a JSON object
        mapping each group key to a list of its words.
    """
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(tags_path, encoding='utf-8') as source:
        tags = json.load(source)

    keywords = set(tags)
    groups = {}
    while keywords:
        print(len(keywords), len(groups))
        before = len(keywords)
        groups, keywords = categorize(groups, keywords)
        if len(keywords) == before:
            # No change in the pending count: seed a new category so the
            # loop is guaranteed to make progress.
            groups[len(groups)] = {keywords.pop()}

    # Sets are not JSON-serializable, so convert each group to a list.
    output = {key: list(members) for key, members in groups.items()}
    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(output, sink)

    print(groups)
    print(len(groups))
# Run the keyword grouping only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    data_gov_keywords()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment