Download all keywords from catalog.data.gov
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
import json
from datagov_model_convert import gen_key
MODEL = nps.LocalStorage()
MODEL.restore('model.dict.pkl')
# print(len(MODEL.save))
MODEL_KEYS = nps.LocalStorage()
MODEL_KEYS.restore('model_keys.pkl')
def compare_two_words(a, b):
    # Look up the co-occurrence score for the word pair (a, b);
    # words missing from the model score 0.
    if a in MODEL_KEYS.save:
        a_id = MODEL_KEYS.save[a]
    else:
        a_id = -1
    if b in MODEL_KEYS.save:
        b_id = MODEL_KEYS.save[b]
    else:
        b_id = -1
    if a_id == -1 or b_id == -1:
        return 0
    key = gen_key(a_id, b_id)
    if key in MODEL.save:
        return MODEL.save[key]
    return 0
def categorize(groups, unseen, similarity=0):
    # Add each unseen word to every group containing at least one word whose
    # co-occurrence score with it exceeds the similarity threshold.
    visited = set()
    for word in unseen:
        assessment = {}
        for c, w in groups.items():
            temp = []
            for x in w:
                temp.append(compare_two_words(word, x))
            assessment[c] = temp
        for c, a in assessment.items():
            if max(a) > similarity:
                groups[c].add(word)
                visited.add(word)
    return groups, (unseen - visited)
f = open('tags.json')
a = json.load(f)
f.close()
groups = {}
keywords = set(a.keys())
relatedness = 1001
change_in_r = 0
times_with_no_change = 0
groups['0'] = set([keywords.pop()])
lower_keywords = set([])
# Iteratively assign keywords to groups, gradually lowering the relatedness
# threshold and seeding new single-keyword groups when progress stalls.
while len(keywords) > 0:
    print(len(keywords), len(lower_keywords), len(groups), relatedness, change_in_r)
    old_len = len(keywords)
    groups, keywords = categorize(groups, keywords, similarity=relatedness)
    relatedness -= 10
    change_in_r += 10
    groups, keywords = categorize(groups, keywords, similarity=relatedness)
    new_len = len(keywords)
    if old_len == new_len:
        times_with_no_change += 1
        relatedness += 10
    if times_with_no_change > 1:
        new_category = keywords.pop()
        groups[len(groups)] = set([new_category])
        times_with_no_change = 0
    if change_in_r > 100 and len(keywords) > 0:
        new_category = keywords.pop()
        groups[len(groups)] = set([new_category])
        change_in_r = 0
        relatedness -= 10
    # groupc = groups.copy()
    # for k, v in groupc.items():
    #     if len(v) == 1:
    #         # lower_keywords.add(v.pop())
    #         keywords.add(v.pop())
    #         del groups[k]
    # if len(groups) > 10:
    #     keywords.update(lower_keywords)
    #     lower_keywords = set([])
output = {i:list(j) for i,j in groups.items()}
# with open('output.json', 'w') as o:
# o.write(json.dumps(output))
# print(len(groups))
x = [float(m) for m in groups.keys()]
y = [float(len(z)) for z in groups.values()]
print(sum(y))
mp.bar(x, y)
mp.show()
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
MODEL = nps.LocalStorage()
MODEL.restore('model.dict.pkl')
# print(len(MODEL.save))
MODEL_KEYS = nps.LocalStorage()
MODEL_KEYS.restore('model_keys.pkl')
condensed_keys = set([])
def dec_key(key):
    # Inverse of gen_key(): split an "i,j" string back into the two integer ids.
    i = key.split(',')
    return int(i[0]), int(i[1])


for k, v in MODEL.save.items():
    i, j = dec_key(k)
    condensed_keys.add(i)
    condensed_keys.add(j)
print(len(condensed_keys))
# From the 93691 unique keys that appear more than once,
# there are connections between 91145 keys.
# Since this is still too large to plot an NxN colormap matrix,
# just do the clustering on the memory-optimized data.
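# Rough arithmetic behind that choice, as a sketch: with the 93691 keys noted
# above and the int16 dtype used when building model.pkl, a dense NxN matrix
# would need roughly 93691 * 93691 * 2 bytes, while the "i,j"-keyed dictionary
# only stores the nonzero pairs.
dense_bytes = 93691 * 93691 * 2
print(dense_bytes / 1024 ** 3)  # about 16.3 GiB (~17.6 GB) for the dense matrix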
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
def gen_key(i, j):
    # Order the pair so that (i, j) and (j, i) map to the same "i,j" key.
    if i < j:
        return "%d,%d" % (i, j)
    else:
        return "%d,%d" % (j, i)


def update_dict(key, v):
    # Accumulate the co-occurrence count for this pair in the sparse dictionary.
    global MODEL_DICT
    if key in MODEL_DICT.save.keys():
        MODEL_DICT.save[key] += v
    else:
        MODEL_DICT.save[key] = v
if __name__ == '__main__':
    # Convert the dense NxN co-occurrence matrix into a sparse dictionary
    # keyed by "i,j" so it fits comfortably in memory.
    MODEL = nps.LocalStorage()
    MODEL.restore('model.pkl')
    MODEL_DICT = nps.LocalStorage()
    MODEL_DICT.save = {}
    nonzero = np.nonzero(MODEL.save)
    print(len(nonzero[0]))  # number of nonzero cells (len(nonzero) is only the dimension count)
    for i in np.transpose(nonzero):
        update_dict(gen_key(i[0], i[1]), MODEL.save[i[0]][i[1]])
    MODEL_DICT.backup("model.dict.pkl")
import json
import numpy as np
import nlp.processing.storage as nps
import sys
global OUTPUT, OUTPUT_INDEX
INPUT = nps.LocalStorage()
OUTPUT = nps.LocalStorage()
OUTPUT_INDEX = nps.LocalStorage()
OUTPUT_INDEX.save = {}
a = open('more_than_2.json')
REFERENCE = json.load(a)
a.close()
INPUT.save = {}
current_index = 0
def lookup_index(key):
    # Assign a stable integer id (starting at 1) to each keyword on first sight.
    global current_index, OUTPUT_INDEX
    if key in OUTPUT_INDEX.save.keys():
        return OUTPUT_INDEX.save[key]
    current_index += 1
    OUTPUT_INDEX.save[key] = current_index
    return current_index
try:
    INPUT.restore('datasets.pkl')
    # unique_keywords = 292043
    unique_keywords = 93691
except Exception:
    # Fall back to a tiny example input if the pickle is missing.
    INPUT.save = {
        1: ['a', 'b', 'c'],
        2: ['a', 'c'],
        3: ['c', 'd'],
        4: ['e']
    }
    unique_keywords = 5

# +1 because the ids from lookup_index() start at 1, not 0.
OUTPUT.save = np.zeros((unique_keywords + 1, unique_keywords + 1), dtype=np.int16)
total = len(INPUT.save)
now = 0
# Count co-occurrences between the first keyword of each dataset and the rest,
# restricted to keywords present in the REFERENCE list.
for dataset, kw in INPUT.save.items():
    now += 1
    print(now / total)
    if len(kw) > 1:
        if kw[0] in REFERENCE.keys():
            a = lookup_index(kw[0])
            for k in kw[1:]:
                if k in REFERENCE.keys():
                    b = lookup_index(k)
                    OUTPUT.save[a][b] += 1
                    if OUTPUT.save[a][b] < 0:
                        # int16 overflow wraps to negative values.
                        print('Data type not large enough :(')
                        print(now / total, OUTPUT.save[a][b])
                        sys.exit(1)
OUTPUT.backup('model.pkl')
OUTPUT_INDEX.backup('model_keys.pkl')
# print(OUTPUT.save)
print(OUTPUT_INDEX.save)
import nlp.processing.storage as nps
import json
# INPUT = nps.LocalStorage()
# INPUT.restore('datasets.pkl')
# print(len(INPUT.save))
# count = 0
# for k, v in INPUT.save.items():
#     print(k, v)
#     count += 1
#     if count > 2:
#         break
jso = {}
a = open('output_top_483.json')
jso = json.load(a)
a.close()
more_than_1 = 0
for k, v in jso.items():
    if len(v) > 1:
        more_than_1 += 1
print(more_than_1)
import requests
import nlp.processing.storage as nps
import threading
from threading import Lock
fil = 'keywords.pkl'
STORAGE = nps.LocalStorage()
STORAGE.save = {}
current = 0
concurrent = 0
batch = 100
all_threads = []
def get_total_count():
    # Returns the total number of datasets on catalog.data.gov
    return requests.get('https://catalog.data.gov/api/action/package_search').json()['result']['count']


def get_next(rows, start):
    # Fetch one page of package_search results (see the usage sketch after
    # parse_dataset); roughly equivalent to:
    # curl -sL
    #   'https://catalog.data.gov/api/action/package_search?rows=10&start=9'
    #   | jq '.result.results[].id'
    #   | jq '.result.results[].tags[].display_name'
    r = requests.get('https://catalog.data.gov/api/action/package_search?rows=%s&start=%s' % (rows, start)).json()
    return r['result']['results']


def parse_dataset(result):
    # Extract the dataset id and its tag display names from a single result.
    r_id = result['id']
    r_tags = [i['display_name'] for i in result['tags']]
    return r_id, r_tags
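# A minimal single-threaded sketch of how the helpers above fit together,
# for illustration only (the name preview_first_page is hypothetical and is
# not used by the threaded pipeline below):
def preview_first_page(rows=10):
    # Map dataset id -> tag display names for the first page of results.
    preview = {}
    for result in get_next(rows, 0):
        dataset_id, tags = parse_dataset(result)
        preview[dataset_id] = tags
    return preview
# e.g. print(len(preview_first_page())) prints the number of datasets fetched.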
class Work(threading.Thread):
    # One lock shared by all workers so concurrent threads do not interleave
    # their updates to STORAGE (a per-instance lock would not synchronize them).
    lock = Lock()

    def __init__(self, threadID, name):
        threading.Thread.__init__(self)
        self.name = name
        self.tid = threadID

    def run(self):
        global concurrent
        print("Starting...", self.name)
        results_list = get_next(batch, self.tid * batch)
        self.lock.acquire()
        for i in results_list:
            i_s, i_t = parse_dataset(i)
            if i_s not in STORAGE.save:
                STORAGE.save[i_s] = i_t
        if int(self.name) % batch == 0:
            # Periodically checkpoint the scraped keywords to disk.
            STORAGE.backup(fil)
        concurrent -= 1
        self.lock.release()
        print("Completed...", self.name)
if __name__ == "__main__":
total = get_total_count()
try:
STORAGE.restore(fil)
except:
pass
while current*batch < total:
if concurrent < 5:
t = Work(current, str(current))
all_threads.append(t)
t.start()
concurrent += 1
current += 1
[i.join() for i in all_threads]
STORAGE.backup(fil)
import nlp.processing.storage as nps
fil = 'test.pkl'
STORAGE = nps.LocalStorage()
STORAGE.save = {}
keywords = {}
if __name__ == "__main__":
try:
STORAGE.restore(fil)
except:
pass
# This should be equal to the total number of datasets
# print(len(STORAGE.save))
for dataset, kw in STORAGE.save.items():
for k in kw:
if k in keywords:
keywords[k] += 1
else:
keywords[k] = 1
most_frequent = dict(sorted(keywords.items(), key=lambda item: item[1]))
print(most_frequent)
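# The same tally can also be written with collections.Counter; a sketch for
# comparison (the helper name count_keywords is illustrative only):
from collections import Counter

def count_keywords(datasets):
    # datasets: dict mapping dataset id -> list of keyword strings
    return Counter(k for kw in datasets.values() for k in kw)
# e.g. count_keywords(STORAGE.save).most_common(10) gives the ten most used keywords.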
# Reference: https://networkx.org/documentation/stable/auto_examples/drawing/plot_weighted_graph.html
import matplotlib
matplotlib.use('TkAgg')
import json
import matplotlib.pyplot as mp
import networkx as nx
import nlp.processing.storage as nps
from datagov_related_words import dec_key
MODEL = nps.LocalStorage()
MODEL.restore('model.dict.pkl')
MODEL_KEYS = nps.LocalStorage()
MODEL_KEYS.restore('model_keys.pkl')
INVERTED_KEYS = {j:i for i,j in MODEL_KEYS.save.items()}
print(len(MODEL.save))
a = open('keywords_1000.json')
keywords = json.load(a)
a.close()
top_1000 = {}
acceptable_words = set(keywords.keys())
print(len(acceptable_words))
for k, v in MODEL.save.items():
    i, j = dec_key(k)
    if INVERTED_KEYS[i] in acceptable_words or INVERTED_KEYS[j] in acceptable_words:
        if v > 100:
            top_1000[k] = v
print(len(top_1000))
G = nx.Graph()
# For entire graph
# for i, kv in enumerate(MODEL.save.items()):
#     k = kv[0]
#     v = kv[1]
#     if i < 2000:
#         a, b = dec_key(k)
#         G.add_edge(INVERTED_KEYS[a], INVERTED_KEYS[b], weight=v)
#     else:
#         break
# For specific graph
for k, v in top_1000.items():
    a, b = dec_key(k)
    G.add_edge(INVERTED_KEYS[a], INVERTED_KEYS[b], weight=v)
print("Created list of nodes")
# Note: top_1000 was prefiltered to weights above 100, so esmall ends up empty here.
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] > 100]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] <= 100]
print("Separated large and small edges")
pos = nx.spring_layout(G, seed=7)
print("Created spring layout")
nx.draw_networkx_nodes(G, pos, node_size=200)
print("Created all nodes")
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=3)
nx.draw_networkx_edges(
    G, pos, edgelist=esmall, width=3, alpha=0.5, edge_color="b", style="dashed"
)
print("Created edges on graph")
nx.draw_networkx_labels(G, pos, font_size=5, font_family="sans-serif")
edge_labels = nx.get_edge_attributes(G, "weight")
nx.draw_networkx_edge_labels(G, pos, edge_labels)
print("Drew all labels")
ax = mp.gca()
ax.margins(0.08)
mp.axis("off")
mp.tight_layout()
mp.show()