Skip to content

Instantly share code, notes, and snippets.

@avdata99
Last active January 24, 2020 14:54
Show Gist options
  • Save avdata99/fcf8f17cb90fd0f1cbd81529e06e4a2b to your computer and use it in GitHub Desktop.
Save avdata99/fcf8f17cb90fd0f1cbd81529e06e4a2b to your computer and use it in GitHub Desktop.
import json
import os
import requests
from slugify import slugify
# Base URL of the CKAN instance that is queried.
instance_url = 'https://ckan.io'

# CKAN action API paths, appended to ``instance_url`` when building requests.
package_list_url = 'api/3/action/package_list'
package_show_url = 'api/3/action/package_show'

# Human-readable topic names. Their slugified forms are compared against the
# tag names of every dataset.
expected_topics = [
    'College Costs',
    'Student Aid',
    'Colleges/Universities',
    'Early Childhood',
    'English Language Learners',
    'Postsecondary',
    'K-12',
    'Elementary/Secondary',
    'Students',
    'Public Schools',
    'Private Schools',
    'Teachers and Administrators',
    'Safety/Bullying',
    'Special Education',
    'STEM',
    'Families',
    'Student Demographics',
    'Student Outcomes/Graduation Rates',
    'Suspension/Discipline',
    'Technical/Adult Education',
    'Assessments',
    'Data Systems',
    'School Geography',
    'International',
    'Library Programs',
    'Reports',
    'Contracts',
    'Programs',
]
# Fetch the names of every dataset published on the instance.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a missing 'result' key.
results = requests.get(f'{instance_url}/{package_list_url}', timeout=30)
results.raise_for_status()
data = results.json()
datasets = data['result']

# Accumulators filled in by the per-dataset analysis loop.
categorized = 0                    # datasets that belong to at least one group
non_categorized = 0                # datasets that belong to no group
groups_in_use = {}                 # group name -> number of datasets using it
categorized_datasets = []          # full records of the categorized datasets
datasets_with_expected_tags = []   # one {'dataset', 'tag'} entry per match
tags_found = {}                    # tag name -> number of occurrences counted
datasets_with_tags_found = set()   # datasets having at least one expected tag
# Slugs of the expected topics. They do not depend on the dataset, so they are
# built once; a set gives constant-time membership checks.
expected_topics_slug = {slugify(topic) for topic in expected_topics}

# Analyze every dataset: load its full record (from the local JSON cache when
# available, otherwise from the API) and update the tag/group accumulators.
for dataset in datasets:
    print(f'Analyzing {dataset}')

    cache_path = f'{dataset}.json'
    if not os.path.isfile(cache_path):
        dataset_response = requests.get(
            f'{instance_url}/{package_show_url}',
            params={'id': dataset},
            timeout=30,
        )
        # Fail before writing so an error response never poisons the cache.
        dataset_response.raise_for_status()
        data = dataset_response.json()
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write(dataset_response.text)
    else:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

    full_dataset = data['result']

    # `or []` also covers a key that is present but set to None.
    tags = full_dataset.get('tags') or []
    for tag in tags:
        name = tag['name']
        if name in expected_topics_slug:
            datasets_with_tags_found.add(dataset)
            print(f'Dataset {dataset} has {name} TAG')
            datasets_with_expected_tags.append({'dataset': dataset, 'tag': name})
            # NOTE(review): the original indentation was lost; the counter is
            # assumed to track only expected tags - confirm.
            tags_found[name] = tags_found.get(name, 0) + 1

    groups = full_dataset.get('groups') or []
    if groups:
        categorized += 1
        categorized_datasets.append(full_dataset)
    else:
        non_categorized += 1

    for group in groups:
        name = group['name']
        groups_in_use[name] = groups_in_use.get(name, 0) + 1
# Print the summary report built from the accumulators.
total = categorized + non_categorized
# An empty catalogue would otherwise raise ZeroDivisionError here.
perc_cat = round(categorized / total * 100, 2) if total else 0.0
print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')

print('Groups in use')
print(groups_in_use)

print('Categorized datasets:')
for cd in categorized_datasets:
    title = cd['title']
    groups = ', '.join(x['name'] for x in cd['groups'])
    print(f' - Dataset: {title}')
    print(f' + Groups: {groups}')

print(f'Datasets with expected tags: {len(datasets_with_expected_tags)}')
print(datasets_with_expected_tags)
print(f'Tags found: {len(tags_found)}')
print(tags_found)
print(f'Datasets with tags found: {len(datasets_with_tags_found)}')
print(datasets_with_tags_found)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment