# avdata99/analyze_ckan_datasets_in_groups.py

## analyze_ckan_datasets_in_groups.py
import json
import os
import requests
from slugify import slugify

# Base URL of the CKAN instance to analyze, plus the relative paths of the
# two Action API endpoints this script calls.
instance_url = "https://ckan.io"
package_list_url = "api/3/action/package_list"
package_show_url = "api/3/action/package_show"

# Human-readable topic names.  Their slugified forms are what gets compared
# against each dataset's tag names.
expected_topics = [
    "College Costs", "Student Aid", "Colleges/Universities",
    "Early Childhood", "English Language Learners", "Postsecondary",
    "K-12", "Elementary/Secondary", "Students",
    "Public Schools", "Private Schools", "Teachers and Administrators",
    "Safety/Bullying", "Special Education", "STEM",
    "Families", "Student Demographics", "Student Outcomes/Graduation Rates",
    "Suspension/Discipline", "Technical/Adult Education", "Assessments",
    "Data Systems", "School Geography", "International",
    "Library Programs", "Reports", "Contracts",
    "Programs",
]

# Fetch the identifiers of every dataset published on the instance.
# The timeout keeps the script from hanging forever on an unresponsive
# server, and raise_for_status() reports HTTP errors here instead of as a
# confusing JSON/KeyError on the lines that follow.
results = requests.get(f'{instance_url}/{package_list_url}', timeout=60)
results.raise_for_status()

data = results.json()
datasets = data['result']

# Running totals of datasets with / without at least one group.
categorized = 0
non_categorized = 0

# Accumulators filled in while each dataset is inspected.
groups_in_use = {}  # group name -> number of datasets listing that group
categorized_datasets = []  # full dataset dicts that have at least one group
datasets_with_expected_tags = []  # one {'dataset', 'tag'} dict per match
tags_found = {}  # matched tag name -> number of times it was seen
datasets_with_tags_found = set()  # datasets with at least one matched tag

# Slugified topic names, built once: the topic list does not change between
# datasets, and a set gives constant-time membership tests.
expected_topics_slug = {slugify(topic) for topic in expected_topics}

# Defined up front so the summary after the loop still works when the
# package list is empty and the loop body never runs.
total = 0
perc_cat = 0.0

for dataset in datasets:
    print(f'Analyzing {dataset}')

    # Each package_show response is cached on disk as "<dataset>.json" so
    # that re-running the script does not hit the API again.
    cache_path = f'{dataset}.json'
    if not os.path.isfile(cache_path):
        dataset_response = requests.get(
            f'{instance_url}/{package_show_url}',
            params={'id': dataset},  # let requests URL-encode the id
            timeout=60,
        )
        # Fail before writing, so an error response never poisons the cache.
        dataset_response.raise_for_status()
        data = dataset_response.json()
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write(dataset_response.text)
    else:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

    full_dataset = data['result']

    # Record every tag whose name matches one of the expected topic slugs.
    tags = full_dataset.get('tags', [])
    for tag in tags:
        name = tag['name']
        if name not in expected_topics_slug:
            continue
        datasets_with_tags_found.add(dataset)
        print(f'Dataset {dataset} has {name} TAG')
        datasets_with_expected_tags.append({'dataset': dataset, 'tag': name})
        tags_found[name] = tags_found.get(name, 0) + 1

    # A dataset counts as "categorized" when it belongs to at least one group.
    groups = full_dataset.get('groups', [])
    if groups:
        categorized += 1
        categorized_datasets.append(full_dataset)
    else:
        non_categorized += 1

    for group in groups:
        name = group['name']
        groups_in_use[name] = groups_in_use.get(name, 0) + 1

    # Progress line with the running totals so far.
    total = categorized + non_categorized
    perc_cat = round(categorized / total * 100, 2)
    print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')

# Final summary.  The totals are recomputed here instead of reusing values
# assigned inside the per-dataset loop, which would not exist (NameError)
# if the package list were empty; the percentage falls back to 0.0 in that
# case to avoid dividing by zero.
total = categorized + non_categorized
perc_cat = round(categorized / total * 100, 2) if total else 0.0
print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')
print('Groups in use')
print(groups_in_use)

# One entry per dataset that belongs to at least one group.
print('Categorized datasets:')
for cd in categorized_datasets:
    title = cd['title']
    groups = ', '.join(x['name'] for x in cd['groups'])
    print(f' - Dataset: {title}')
    print(f'   + Groups: {groups}')

print(f'Datasets with expected tags: {len(datasets_with_expected_tags)}')
print(datasets_with_expected_tags)

print(f'Tags found: {len(tags_found)}')
print(tags_found)

print(f'Datasets with tags found: {len(datasets_with_tags_found)}')
print(datasets_with_tags_found)
	import json
	import os
	import requests
	from slugify import slugify

	# Base URL of the CKAN instance to analyze, plus the relative paths of the
	# two Action API endpoints this script calls.
	instance_url = "https://ckan.io"
	package_list_url = "api/3/action/package_list"
	package_show_url = "api/3/action/package_show"

	# Human-readable topic names.  Their slugified forms are what gets compared
	# against each dataset's tag names.
	expected_topics = [
	    "College Costs", "Student Aid", "Colleges/Universities",
	    "Early Childhood", "English Language Learners", "Postsecondary",
	    "K-12", "Elementary/Secondary", "Students",
	    "Public Schools", "Private Schools", "Teachers and Administrators",
	    "Safety/Bullying", "Special Education", "STEM",
	    "Families", "Student Demographics", "Student Outcomes/Graduation Rates",
	    "Suspension/Discipline", "Technical/Adult Education", "Assessments",
	    "Data Systems", "School Geography", "International",
	    "Library Programs", "Reports", "Contracts",
	    "Programs",
	]

	# Fetch the identifiers of every dataset published on the instance.
	# The timeout keeps the script from hanging forever on an unresponsive
	# server, and raise_for_status() reports HTTP errors here instead of as a
	# confusing JSON/KeyError on the lines that follow.
	results = requests.get(f'{instance_url}/{package_list_url}', timeout=60)
	results.raise_for_status()

	data = results.json()
	datasets = data['result']

	# Running totals of datasets with / without at least one group.
	categorized = 0
	non_categorized = 0

	# Accumulators filled in while each dataset is inspected.
	groups_in_use = {}  # group name -> number of datasets listing that group
	categorized_datasets = []  # full dataset dicts that have at least one group
	datasets_with_expected_tags = []  # one {'dataset', 'tag'} dict per match
	tags_found = {}  # matched tag name -> number of times it was seen
	datasets_with_tags_found = set()  # datasets with at least one matched tag

	# Slugified topic names, built once: the topic list does not change between
	# datasets, and a set gives constant-time membership tests.
	expected_topics_slug = {slugify(topic) for topic in expected_topics}

	# Defined up front so the summary after the loop still works when the
	# package list is empty and the loop body never runs.
	total = 0
	perc_cat = 0.0

	for dataset in datasets:
	    print(f'Analyzing {dataset}')

	    # Each package_show response is cached on disk as "<dataset>.json" so
	    # that re-running the script does not hit the API again.
	    cache_path = f'{dataset}.json'
	    if not os.path.isfile(cache_path):
	        dataset_response = requests.get(
	            f'{instance_url}/{package_show_url}',
	            params={'id': dataset},  # let requests URL-encode the id
	            timeout=60,
	        )
	        # Fail before writing, so an error response never poisons the cache.
	        dataset_response.raise_for_status()
	        data = dataset_response.json()
	        with open(cache_path, 'w', encoding='utf-8') as f:
	            f.write(dataset_response.text)
	    else:
	        with open(cache_path, 'r', encoding='utf-8') as f:
	            data = json.load(f)

	    full_dataset = data['result']

	    # Record every tag whose name matches one of the expected topic slugs.
	    tags = full_dataset.get('tags', [])
	    for tag in tags:
	        name = tag['name']
	        if name not in expected_topics_slug:
	            continue
	        datasets_with_tags_found.add(dataset)
	        print(f'Dataset {dataset} has {name} TAG')
	        datasets_with_expected_tags.append({'dataset': dataset, 'tag': name})
	        tags_found[name] = tags_found.get(name, 0) + 1

	    # A dataset counts as "categorized" when it belongs to at least one group.
	    groups = full_dataset.get('groups', [])
	    if groups:
	        categorized += 1
	        categorized_datasets.append(full_dataset)
	    else:
	        non_categorized += 1

	    for group in groups:
	        name = group['name']
	        groups_in_use[name] = groups_in_use.get(name, 0) + 1

	    # Progress line with the running totals so far.
	    total = categorized + non_categorized
	    perc_cat = round(categorized / total * 100, 2)
	    print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')

	# Final summary.  The totals are recomputed here instead of reusing values
	# assigned inside the per-dataset loop, which would not exist (NameError)
	# if the package list were empty; the percentage falls back to 0.0 in that
	# case to avoid dividing by zero.
	total = categorized + non_categorized
	perc_cat = round(categorized / total * 100, 2) if total else 0.0
	print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')
	print('Groups in use')
	print(groups_in_use)

	# One entry per dataset that belongs to at least one group.
	print('Categorized datasets:')
	for cd in categorized_datasets:
	    title = cd['title']
	    groups = ', '.join(x['name'] for x in cd['groups'])
	    print(f' - Dataset: {title}')
	    print(f' + Groups: {groups}')

	print(f'Datasets with expected tags: {len(datasets_with_expected_tags)}')
	print(datasets_with_expected_tags)

	print(f'Tags found: {len(tags_found)}')
	print(tags_found)

	print(f'Datasets with tags found: {len(datasets_with_tags_found)}')
	print(datasets_with_tags_found)