Skip to content

Instantly share code, notes, and snippets.

@defvol
Last active August 29, 2015 14:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save defvol/08b8d978ae788565ec9a to your computer and use it in GitHub Desktop.
Save defvol/08b8d978ae788565ec9a to your computer and use it in GitHub Desktop.
ALL YOUR DATASETS ARE BELONG TO US (deprecated, see https://github.com/mxabierto/ckanops)

ckanops.py

ALL YOUR DATASETS ARE BELONG TO US

Overview

August 22, 2014

A collection of everyday tasks on CKAN.

In the box

ckanops.py

  1. Update datasets owner
  2. Clear datasets licenses
  3. Update resources format based on URI extensions
  4. Update datasets spatial coverage based on keywords
  5. Get publication stats from organizations

Using the cli

  1. Backup
$ ckanapi dump datasets --all -q -r http://sourceckan.example.com > backup-`date "+%Y%m%dT%H%M%S"`

Run instructions

% python ckanops.py
#!/usr/bin/env python
import operator
import os

import ckanapi
from ckanapi.errors import CKANAPIError

try:
    from urlparse import urlparse  # Python 2
except ImportError:
    from urllib.parse import urlparse  # Python 3
# CKAN connection settings, read from the environment at import time.
# A missing variable raises KeyError before any function in this file runs.
# `host` is handed to ckanapi.RemoteCKAN in main(); `token` is sent as the
# `apikey` of every update call.
host = os.environ['CKAN_HOST']
token = os.environ['CKAN_API_TOKEN']
def update_dataset(remote, dataset, attributes):
    """Merge ``attributes`` into ``dataset`` and push it with ``package_update``.

    The payload is built on a copy, so the caller's ``dataset`` is not
    modified.  Exact duplicate entries in the merged ``extras`` list are
    dropped (first occurrence wins, order is kept) before the call.

    Args:
        remote: object exposing
            ``call_action(name, data_dict=..., apikey=...)``.
        dataset: package dict to update.
        attributes: dict whose entries override those of ``dataset``.

    Returns:
        True when the call succeeded, False when it raised CKANAPIError
        (the error is printed).
    """
    payload = dict(dataset)
    payload.update(attributes)

    # Remove duplicate metadata fields.
    # NOTE: This happened in a few experiments in the extra fields.
    # Entries are compared as dicts instead of going through str()/eval(),
    # so no text received from the server is ever evaluated, and the
    # de-duplication also covers extras supplied through `attributes`.
    if payload.get('extras'):
        unique_extras = []
        for extra in payload['extras']:
            if extra not in unique_extras:
                unique_extras.append(extra)
        payload['extras'] = unique_extras

    try:
        remote.call_action('package_update',
                           data_dict=payload,
                           apikey=token)
    except CKANAPIError as e:
        print("CKANAPIError for dataset '%s'" % payload.get('title', ''))
        print(e)
        return False
    return True
def update_resource(remote, resource, attributes):
    """Merge ``attributes`` into ``resource`` and push it with ``resource_update``.

    The payload is built on a copy, so the caller's ``resource`` is not
    modified.

    Args:
        remote: object exposing
            ``call_action(name, data_dict=..., apikey=...)``.
        resource: resource dict to update.
        attributes: dict whose entries override those of ``resource``.

    Returns:
        True when the call succeeded, False when it raised CKANAPIError
        (the error is printed).
    """
    # Merge new attributes and update resource
    payload = dict(resource)
    payload.update(attributes)

    try:
        remote.call_action('resource_update',
                           data_dict=payload,
                           apikey=token)
    except CKANAPIError as e:
        print("CKANAPIError for resource '%s'" % payload.get('url', ''))
        print(e)
        return False
    return True
# Extract DCAT publisher name from metadata
def get_dcat_publisher(dataset):
    """Return the value of the ``dcat_publisher_name`` extra of ``dataset``.

    Args:
        dataset: package dict; its optional ``extras`` entry is a list of
            ``{'key': ..., 'value': ...}`` dicts.

    Returns:
        The publisher name, or ``''`` when the extra is absent or its value
        is empty/None.  A notice is printed when the extra is absent.
    """
    for extra in dataset.get('extras') or []:
        if extra.get('key') == 'dcat_publisher_name':
            # Callers lower-case the result, so never hand back None.
            return extra.get('value') or ''
    print("Couldn't find dcat fields metadata for '%s'"
          % dataset.get('title', ''))
    return ''
# Update the owner package to be the same as DCAT publisher
# NOTE: publisher name comes in uppercase, org name in lowercase
def update_dataset_owner_as_dcat_publisher(remote, dataset):
    """Make the organization named by the DCAT publisher own ``dataset``.

    Nothing is done when the dataset has no publisher, when the publisher
    already owns it, or when no organization with that (lower-cased) name
    exists on ``remote``.

    Args:
        remote: object exposing ``action.organization_list()`` and whatever
            ``update_dataset`` needs.
        dataset: package dict.
    """
    publisher = get_dcat_publisher(dataset).lower()
    if not publisher:
        return

    # owner_org holds the organization id, so comparing it with the publisher
    # name alone never matches.  Also check the name of the embedded
    # organization dict -- assumes the package carries an 'organization'
    # entry with a 'name'; when it does not, this falls back to the old check.
    current_org = dataset.get('organization') or {}
    if publisher in (dataset.get('owner_org'), current_org.get('name')):
        return

    # Only hit the API for the organization list once an update is plausible.
    if publisher not in remote.action.organization_list():
        return

    if update_dataset(remote, dataset, {'owner_org': publisher}):
        print("%s now owns '%s'"
              % (publisher.upper(), dataset.get('title', '')))
def clear_dataset_license(remote, dataset):
    """Set the license of ``dataset`` to ``notspecified``.

    Args:
        remote: passed through to ``update_dataset``.
        dataset: package dict.

    Returns:
        True when the license already was (or now is) ``notspecified``,
        False when the update failed.
    """
    previous = dataset.get('license_id')
    if previous == 'notspecified':
        # Already cleared: skip the API call.
        return True

    updated = update_dataset(remote, dataset, {'license_id': 'notspecified'})
    if updated:
        print("Updated %s to notspecified for '%s'"
              % (previous, dataset.get('title', '')))
    return updated
def update_resources_format_based_on_extension(remote, dataset):
    """Fill in the empty ``format`` of each resource from its URL extension.

    Resources that already have a format are left alone.  Resources whose
    extension is not in the table below are reported and skipped.

    Args:
        remote: passed through to ``update_resource``.
        dataset: package dict with a ``resources`` list; each resource has a
            ``url`` and an optional ``format``.
    """
    formats = {
        'csv': 'CSV',
        'geojson': 'GeoJSON',
        'gif': 'GIF',
        'json': 'JSON',
        'kml': 'KML',
        'kml+xml': 'KML',
        'kmz': 'KMZ',
        'pdf': 'PDF',
        'png': 'PNG',
        'xls': 'XLS',
        'xlsx': 'XLSX',
        'xml': 'XML',
        'zip': 'ZIP',
    }
    for r in dataset.get('resources') or []:
        # A missing or None format counts as empty, same as ''.
        if r.get('format'):
            continue
        extension = get_extension_from_url(r['url'])
        # Look the extension up explicitly rather than catching KeyError, so
        # a KeyError raised inside update_resource is not misreported here.
        f = formats.get(extension)
        if f is None:
            print("%s is not a known file extension" % extension)
            continue
        update_resource(remote, r, {'format': f})
def get_extension_from_url(u):
    """Return the lower-cased file extension of the path of URL ``u``.

    Only the last path segment is considered; query string and fragment are
    ignored.  Returns ``''`` when that segment contains no dot.
    """
    # SEE: https://docs.python.org/2/library/urlparse.html
    path = urlparse(u)[2]
    filename = path.rsplit('/', 1)[-1]
    if '.' not in filename:
        # Without this guard a dot-less name would yield the whole path.
        return ''
    return filename.rsplit('.', 1)[-1].strip().lower()
def update_bbox(remote, dataset, region):
    """Set the spatial extras of ``dataset`` to those of a known ``region``.

    Existing extras that use one of the region's keys are replaced rather
    than duplicated, so running this twice yields the same extras.  The
    caller's ``dataset`` is not modified.

    Args:
        remote: passed through to ``update_dataset``.
        dataset: package dict.
        region: name of an entry in the table below (currently ``'Mexico'``).

    Returns:
        The result of ``update_dataset`` (True on success, False otherwise).

    Raises:
        KeyError: ``region`` is not in the table.
    """
    regions = {
        'Mexico': [
            {'key':'spatial-text', 'value':'Mexico'},
            {'key':'spatial', 'value':'{"type":"Polygon","coordinates":[[[-118.30078125,13.667338259654947],[-118.30078125,33.35806161277885],[-85.95703125,33.35806161277885],[-85.95703125,13.667338259654947],[-118.30078125,13.667338259654947]]]}'},
            {'key':'spatial-uri', 'value':'http://www.geonames.org/3996063'}
        ]
    }
    # Fail on an unknown region before announcing or sending anything.
    region_extras = regions[region]

    print("Updating '%s'" % dataset.get('title', ''))

    region_keys = set(extra['key'] for extra in region_extras)
    extras = [extra for extra in dataset.get('extras') or []
              if extra.get('key') not in region_keys]
    extras.extend(dict(extra) for extra in region_extras)
    return update_dataset(remote, dataset, {'extras': extras})
def tags_covered_by_an_organization(o):
    """Return the UTF-8 encoded tag names of every package of organization ``o``.

    Args:
        o: organization dict with an optional ``packages`` list; each package
            has an optional ``tags`` list of ``{'name': ...}`` dicts.

    Returns:
        A flat list of encoded tag names in package order (repeats are kept);
        empty when there are no packages or no tags.
    """
    # A single flattening comprehension replaces reduce(operator.add, ...),
    # which is not a builtin on Python 3 and re-copies the list per package.
    return [
        tag['name'].encode('UTF8')
        for package in o.get('packages') or []
        for tag in package.get('tags') or []
    ]
def all_tags_by_organization(remote):
    """Map every organization name on ``remote`` to the tags its packages use.

    Issues one ``organization_list`` call plus one ``organization_show`` call
    per organization.

    Returns:
        dict of organization name -> list returned by
        ``tags_covered_by_an_organization`` for that organization.
    """
    return dict(
        (org_name,
         tags_covered_by_an_organization(
             remote.action.organization_show(id=org_name)))
        for org_name in remote.action.organization_list()
    )
def main():
    """Run the enabled maintenance task over every dataset of the CKAN host.

    Connects with the module-level ``host`` and ``token``, lists all
    packages, and applies the uncommented task below to each package whose
    type is ``dataset``.  A package that cannot be fetched is reported and
    skipped so one failure does not abort the whole batch.
    """
    remote = ckanapi.RemoteCKAN(host, user_agent='ckanops/1.0', apikey=token)
    datasets = remote.action.package_list()
    # all_tags = all_tags_by_organization(remote)
    # datasets = [d['name'] for d in remote.action.package_search(q='nacional')['results']]
    print("Will update %d datasets" % len(datasets))
    for d in datasets:
        # Get dataset metadata
        try:
            pkg = remote.action.package_show(id=d)
        except CKANAPIError as e:
            print("CKANAPIError for dataset '%s'" % d)
            print(e)
            continue
        # Could stumble upon harvesters
        if pkg.get('type') != 'dataset':
            continue
        # Tasks are switched on and off by (un)commenting them here.
        # update_dataset_owner_as_dcat_publisher(remote, pkg)
        # clear_dataset_license(remote, pkg)
        update_resources_format_based_on_extension(remote, pkg)
        # update_bbox(remote, pkg, 'Mexico')


if __name__ == "__main__":
    main()
@defvol
Copy link
Author

defvol commented Jul 9, 2014

(ckan)➜  foo git:(master) ✗ python dataset.py
CENAPRED now owns 'Actividad volcánica'
CONAGUA now owns 'Avisos de ciclón tropical'
Couldn't find dcat fields metadata for 'Estaciones de ecobici'
SALUD now owns 'Mortalidad Materna'
SALUD now owns 'Recursos en salud'
CENAPRED now owns 'Regiones susceptibles de deslizamientos'
CENAPRED now owns 'Zona de peligro por tsunamis lejanos y locales'
CENAPRED now owns 'Zona regional de hundimientos y agrietamientos'

@defvol
Copy link
Author

defvol commented Jul 10, 2014

(ckan)➜  foo git:(master) ✗ python dataset.py
Updated notspecified to notspecified for 'Demarcaciones geográficas'
Updated notspecified to notspecified for 'Estaciones de ecobici'
Updated notspecified to notspecified for 'FONDEN'
Updated cc-nc to notspecified for 'Inventario CONEVAL de Programas y Acciones Federales de Desarrollo Social'

@urkonn
Copy link

urkonn commented Jul 14, 2014

Se puede modificar el texto de license_id para que no aparezca: "No se ha especificado la licencia"

Selection 018

@defvol
Copy link
Author

defvol commented Nov 18, 2014

@defvol
Copy link
Author

defvol commented Dec 4, 2014

Los ids de todos los datasets de organizaciones que empiecen con "ayuntamiento-"

(ckan)➜ ckanops git:(master) ✗ ckanapi action organization_list -r http://192.168.33.20:8080 | jq 'map(select(. | startswith("ayuntamiento-")))' -c | awk '{ gsub(/[[]"]/,"", $0); print }' | awk '{ gsub(/,/, "\n", $0); print }' | parallel ckanapi action organization_show -r http://192.168.33.20:8080 id={} | jq '[.packages[].id]'

[
"78c59ffc-57f5-4365-ac3a-5044271ac704",
"0623ab48-8e8a-45ba-8783-0f87495f090a"
]
[
"36b87f5d-a729-4b28-89ad-0acc845d50be",
"12563446-78a8-467e-8af0-f1dd252d8053",
"b5b731aa-0664-432d-af22-4c4124a8d388",
"6be9b9d8-9b45-4fbe-ad8e-c4c14274de26",
"f058fab7-29c3-4a03-9f5a-627b3f698703",
"01ca820c-a341-4f81-9598-50cc883d5941",
"ea5e1baa-34aa-4d9f-8556-38cbcd1d9284"
]

@Article{Tange2011a,
title = {GNU Parallel - The Command-Line Power Tool},
author = {O. Tange},
address = {Frederiksberg, Denmark},
journal = {;login: The USENIX Magazine},
month = {Feb},
number = {1},
volume = {36},
url = {http://www.gnu.org/s/parallel},
year = {2011},
pages = {42-47}
}

@defvol
Copy link
Author

defvol commented Dec 12, 2014

Add org's datasets to a group
ckanapi action organization_show -r http://foo.bar id=tu-gobierno-en-mapas | jq '.packages[].id' | awk '{ gsub(/"/, "", $0); printf "%s ",$0 }' | xargs -t python ckanops.py --group "tgm"

Look for all datasets with org_id "c462b461-beef-41a0-8476-462c9ceeb994" and set owner_org to empty
python ckanops.py --replace dataset owner_org c462b461-beef-41a0-8476-462c9ceeb994 ""

@defvol
Copy link
Author

defvol commented Jan 7, 2015

List dataset names from organizations starting with "estado-"

NOTE: markdown may break awk's escaping

✗ ckanapi action organization_list -r http://192.168.33.20:8080 | jq 'map(select(. | startswith("estado-")))' -c | awk '{ gsub(/[\[\]\"]/,"", $0); print }' | awk '{ gsub(/,/, "\n", $0); print }' | parallel ckanapi action organization_show -r http://192.168.33.20:8080 id={} | jq '.packages[].name' > estatales.txt

Add a new attribute to these datasets
You need to remove the quotes before running the next command (hint: in vim, :%s/"//g)

cat estatales.txt | parallel "ckanapi action package_show id={} -r http://192.168.33.20:8080 | jq '.gov_type=[\"Estatal\"]' | ckanapi action package_update -i -r http://192.168.33.20:8080 -a 9e21c4ed-7af9-492a-bd25-5de8a1949f71"

Note:
better yet, use env vars

ckanapi action organization_list -r `echo $CKAN_HOST` | jq 'map(select(. | startswith("ayuntamiento-")))' -c | awk '{ gsub(/[\[\]\"]/,"", $0); print }' | awk '{ gsub(/,/, "\n", $0); print }' | parallel ckanapi action organization_show -r `echo $CKAN_HOST` id={} | jq '.packages[].name' > municipales.txt

cat municipales.txt | parallel "ckanapi action package_show id={} -r `echo $CKAN_HOST` | jq '.gov_type=[\"Municipal\"]' | ckanapi action package_update -i -r `echo $CKAN_HOST` -a `echo $CKAN_API_TOKEN`"

The fxxxx feds

Find fed orgs

ckanapi action organization_list -r `echo $CKAN_HOST` | jq . -c | awk '{ gsub(/[\[\]\"]/,"", $0); print }' | awk '{ gsub(/,/, "\n", $0); print }' | awk '{ gsub(/^(ayuntamiento-|estado-|gobierno-).*/, "", $0); print }' > ffeds

Get packages

cat ffeds | parallel ckanapi action organization_show -r `echo $CKAN_HOST` id={} | jq '.packages[].name' > federales

Update

cat federales | parallel "ckanapi action package_show id={} -r $CKAN_HOST | jq '.gov_type=[\"Federal\"]' | ckanapi action package_update -i -r $CKAN_HOST -a $CKAN_API_TOKEN"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment