Download all keywords from catalog.data.gov
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
import json
from datagov_model_convert import gen_key
MODEL = nps.LocalStorage()
MODEL.restore('model.dict.pkl')
# print(len(MODEL.save))
MODEL_KEYS = nps.LocalStorage()
MODEL_KEYS.restore('model_keys.pkl')
def compare_two_words(a, b):
    # Look up the co-occurrence score for two keywords; 0 if either word
    # is unknown or the pair never appeared together.
    if a in MODEL_KEYS.save:
        a_id = MODEL_KEYS.save[a]
    else:
        a_id = -1
    if b in MODEL_KEYS.save:
        b_id = MODEL_KEYS.save[b]
    else:
        b_id = -1
    if a_id == -1 or b_id == -1:
        return 0
    key = gen_key(a_id, b_id)
    if key in MODEL.save:
        return MODEL.save[key]
    return 0


def categorize(groups, unseen, similarity=0):
    # Add each unseen keyword to every existing group containing at least
    # one word that scores above `similarity`; return the updated groups
    # and the keywords that still did not fit anywhere.
    visited = set()
    for word in unseen:
        assessment = {}
        for c, w in groups.items():
            temp = []
            for x in w:
                temp.append(compare_two_words(word, x))
            assessment[c] = temp
        for c, a in assessment.items():
            if max(a) > similarity:
                groups[c].add(word)
                visited.add(word)
    return groups, (unseen - visited)
f = open('tags.json')
a = json.load(f)
f.close()
groups = {}
keywords = set(a.keys())
relatedness = 1001
change_in_r = 0
times_with_no_change = 0
groups['0'] = set([keywords.pop()])
lower_keywords = set([])
# Repeatedly try to place the remaining keywords into existing groups while
# gradually lowering the relatedness threshold.  If progress stalls, or the
# threshold has dropped by more than 100 since the last new group was
# created, seed a new group with one leftover keyword.
while len(keywords) > 0:
    print(len(keywords), len(lower_keywords), len(groups), relatedness, change_in_r)
    old_len = len(keywords)
    groups, keywords = categorize(groups, keywords, similarity=relatedness)
    relatedness -= 10
    change_in_r += 10
    groups, keywords = categorize(groups, keywords, similarity=relatedness)
    new_len = len(keywords)
    if old_len == new_len:
        times_with_no_change += 1
        relatedness += 10
    if times_with_no_change > 1:
        new_category = keywords.pop()
        groups[len(groups)] = set([new_category])
        times_with_no_change = 0
    if change_in_r > 100 and len(keywords) > 0:
        new_category = keywords.pop()
        groups[len(groups)] = set([new_category])
        change_in_r = 0
        relatedness -= 10
    # groupc = groups.copy()
    # for k, v in groupc.items():
    #     if len(v) == 1:
    #         # lower_keywords.add(v.pop())
    #         keywords.add(v.pop())
    #         del groups[k]
    # if len(groups) > 10:
    #     keywords.update(lower_keywords)
    #     lower_keywords = set([])
output = {i:list(j) for i,j in groups.items()}
# with open('output.json', 'w') as o:
# o.write(json.dumps(output))
# print(len(groups))
x = [float(m) for m in groups.keys()]
y = [float(len(z)) for z in groups.values()]
print(sum(y))
mp.bar(x, y)
mp.show()
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
MODEL = nps.LocalStorage()
MODEL.restore('model.dict.pkl')
# print(len(MODEL.save))
MODEL_KEYS = nps.LocalStorage()
MODEL_KEYS.restore('model_keys.pkl')
condensed_keys = set([])
def dec_key(key):
    # Inverse of gen_key: split an "i,j" string back into its two integer ids.
    i = key.split(',')
    return int(i[0]), int(i[1])


for k, v in MODEL.save.items():
    i, j = dec_key(k)
    condensed_keys.add(i)
    condensed_keys.add(j)
print(len(condensed_keys))
# From the 93691 unique keys that appear more than once,
# there are connections between 91145 keys.
# Since this is still too large to plot an NxN colormap matrix,
# just do the clustering on the memory-optimized data.
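# Rough arithmetic behind that decision (illustrative, assuming the same
# int16 dtype used when the dense matrix was built): a 91145 x 91145 matrix
# needs 91145 * 91145 * 2 bytes, i.e. roughly 16.6 GB, before any plotting
# copies are made, whereas the "i,j" -> count dict only stores the nonzero
# pairs.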
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
import numpy as np
def gen_key(i, j):
    # Encode an unordered pair of ids as a single "small,large" string so
    # that (i, j) and (j, i) map to the same dictionary key.
    if i < j:
        return "%d,%d" % (i, j)
    else:
        return "%d,%d" % (j, i)


def update_dict(key, v):
    # Accumulate the co-occurrence count for a pair key.
    global MODEL_DICT
    if key in MODEL_DICT.save.keys():
        MODEL_DICT.save[key] += v
    else:
        MODEL_DICT.save[key] = v
if __name__ == '__main__':
    MODEL = nps.LocalStorage()
    MODEL.restore('model.pkl')
    MODEL_DICT = nps.LocalStorage()
    MODEL_DICT.save = {}
    # Collapse the dense NxN matrix into a sparse dict keyed by "i,j",
    # keeping only the nonzero cells.  np.nonzero returns a tuple of index
    # arrays, so the number of nonzero cells is the length of one of them.
    nonzero = np.nonzero(MODEL.save)
    print(len(nonzero[0]))
    for i in np.transpose(nonzero):
        update_dict(gen_key(i[0], i[1]), MODEL.save[i[0]][i[1]])
    MODEL_DICT.backup("model.dict.pkl")
import json
import numpy as np
import nlp.processing.storage as nps
import sys
global OUTPUT, OUTPUT_INDEX
INPUT = nps.LocalStorage()
OUTPUT = nps.LocalStorage()
OUTPUT_INDEX = nps.LocalStorage()
OUTPUT_INDEX.save = {}
a = open('more_than_2.json')
REFERENCE = json.load(a)
a.close()
INPUT.save = {}
current_index = 0
def lookup_index(key):
    # Assign each keyword a stable integer index the first time it is seen;
    # subsequent lookups return the same index.
    global current_index, OUTPUT_INDEX
    if key in OUTPUT_INDEX.save.keys():
        return OUTPUT_INDEX.save[key]
    current_index += 1
    OUTPUT_INDEX.save[key] = current_index
    return current_index
try:
    INPUT.restore('datasets.pkl')
    # unique_keywords = 292043
    unique_keywords = 93691
except:
    INPUT.save = {
        1: ['a', 'b', 'c'],
        2: ['a', 'c'],
        3: ['c', 'd'],
        4: ['e']
    }
    unique_keywords = 5
OUTPUT.save = np.zeros((unique_keywords, unique_keywords), dtype=np.int16)
total = len(INPUT.save)
now = 0
for dataset, kw in INPUT.save.items():
    now += 1
    print(now/total)
    if len(kw) > 1:
        # Pair the first tag of the dataset with each of its remaining tags,
        # counting only tags that appear in the REFERENCE list.
        if kw[0] in REFERENCE.keys():
            a = lookup_index(kw[0])
            for k in kw[1:]:
                if k in REFERENCE.keys():
                    b = lookup_index(k)
                    OUTPUT.save[a][b] += 1
                    # int16 wraps to a negative value on overflow, so a
                    # negative count means the dtype is too small.
                    if OUTPUT.save[a][b] < 0:
                        print('Data type not large enough :(')
                        print(now/total, OUTPUT.save[a][b])
                        sys.exit(1)
OUTPUT.backup('model.pkl')
OUTPUT_INDEX.backup('model_keys.pkl')
# print(OUTPUT.save)
print(OUTPUT_INDEX.save)
import nlp.processing.storage as nps
import json
# INPUT = nps.LocalStorage()
# INPUT.restore('datasets.pkl')
# print(len(INPUT.save))
# count = 0
# for k,v in INPUT.save.items():
# print(k,v)
# count += 1
# if count > 2:
# break
jso = {}
a = open('output_top_483.json')
jso = json.load(a)
a.close()
more_than_1 = 0
for k, v in jso.items():
    if len(v) > 1:
        more_than_1 += 1
print(more_than_1)
import requests
import nlp.processing.storage as nps
import threading
from threading import Lock
fil = 'keywords.pkl'
STORAGE = nps.LocalStorage()
STORAGE.save = {}
current = 0
concurrent = 0
batch = 100
all_threads = []
def get_total_count():
    # Returns the total number of datasets on catalog.data.gov
    return requests.get('https://catalog.data.gov/api/action/package_search').json()['result']['count']


def get_next(rows, start):
    # Equivalent shell pipeline (for rows=10, start=9):
    # curl -sL 'https://catalog.data.gov/api/action/package_search?rows=10&start=9'
    #   | jq '.result.results[].id'
    #   | jq '.result.results[].tags[].display_name'
    r = requests.get('https://catalog.data.gov/api/action/package_search?rows=%s&start=%s' % (rows, start)).json()
    return r['result']['results']


def parse_dataset(result):
    # Pull out the dataset id and the display names of its tags.
    r_id = result['id']
    r_tags = [i['display_name'] for i in result['tags']]
    return r_id, r_tags
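# Illustrative sketch (not part of the original gist, and not called by the
# threaded download below): fetch a single package_search page, mirroring
# the curl/jq comment above, so the id/tag extraction can be checked by hand
# before launching all the workers.
def preview_single_page(rows=10, start=0):
    page = get_next(rows, start)
    for result in page:
        dataset_id, tags = parse_dataset(result)
        print(dataset_id, tags)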
class Work(threading.Thread):
    # One lock shared by all workers (class attribute): a per-instance lock
    # would not serialize access to the shared STORAGE dict or the global
    # `concurrent` counter.
    lock = Lock()

    def __init__(self, threadID, name):
        threading.Thread.__init__(self)
        self.name = name
        self.tid = threadID

    def run(self):
        global concurrent
        print("Starting...", self.name)
        results_list = get_next(batch, self.tid * batch)
        self.lock.acquire()
        for i in results_list:
            i_s, i_t = parse_dataset(i)
            if i_s not in STORAGE.save:
                STORAGE.save[i_s] = i_t
        # Periodically checkpoint the keyword map to disk.
        if int(self.name) % batch == 0:
            STORAGE.backup(fil)
        concurrent -= 1
        self.lock.release()
        print("Completed...", self.name)
if __name__ == "__main__":
    total = get_total_count()
    try:
        STORAGE.restore(fil)
    except:
        pass
    # Launch up to 5 workers at a time, each fetching one batch of datasets.
    while current * batch < total:
        if concurrent < 5:
            t = Work(current, str(current))
            all_threads.append(t)
            t.start()
            concurrent += 1
            current += 1
    [i.join() for i in all_threads]
    STORAGE.backup(fil)
import nlp.processing.storage as nps
fil = 'test.pkl'
STORAGE = nps.LocalStorage()
STORAGE.save = {}
keywords = {}
if __name__ == "__main__":
    try:
        STORAGE.restore(fil)
    except:
        pass
    # This should be equal to the total number of datasets
    # print(len(STORAGE.save))
    for dataset, kw in STORAGE.save.items():
        for k in kw:
            if k in keywords:
                keywords[k] += 1
            else:
                keywords[k] = 1
    most_frequent = dict(sorted(keywords.items(), key=lambda item: item[1]))
    print(most_frequent)
import matplotlib.pyplot as mp
import nlp.processing.storage as nps
MODEL = nps.LocalStorage()
MODEL.restore('model.pkl')
# mp.plot(MODEL.save)
# mp.show()
# Spot-check a few individual cells, including both orderings of one pair.
print(MODEL.save[0][0])
print(MODEL.save[10][5])
print(MODEL.save[5][10])
See my work fork for the complete working example:
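Every script above relies on nlp.processing.storage.LocalStorage, which is not included in this gist. Judging only from how it is used here (a .save attribute plus restore(path) and backup(path)), a minimal pickle-backed stand-in along these lines should be enough to run the snippets; the real class in the linked fork may do more:

import pickle

class LocalStorage:
    # Minimal stand-in for nlp.processing.storage.LocalStorage, inferred
    # from how this gist uses it; the actual implementation may differ.
    def __init__(self):
        self.save = None  # payload: a dict or numpy array in these scripts

    def restore(self, path):
        # Load a previously pickled payload into .save
        with open(path, 'rb') as f:
            self.save = pickle.load(f)

    def backup(self, path):
        # Pickle the current payload to disk
        with open(path, 'wb') as f:
            pickle.dump(self.save, f)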
