Skip to content

Instantly share code, notes, and snippets.

@max-mapper
Last active April 3, 2017 21:09
Show Gist options
  • Save max-mapper/7f36ba7663b65a73c81d9c0cc6841702 to your computer and use it in GitHub Desktop.
Save max-mapper/7f36ba7663b65a73c81d9c0cc6841702 to your computer and use it in GitHub Desktop.
CKAN metadata downloader

On data.gov, at least, dataset metadata has a two-level hierarchy. Top-level (parent) items include individual datasets as well as collection items. Child items include all datasets published inside a collection. It is unclear whether this is a general CKAN concept.

var request = require('request')
var fs = require('fs')
var ndjson = require('ndjson')
// Paging configuration.
var rows = 1000 // page size, sent as the `rows` query parameter
var delay = 1000 // milliseconds to wait before requesting the next page
var limit = 2000000 // stop once the offset has moved past this value
var current = 1601000 // offset of the next page to request

// Output pipeline: objects written to `serialize` are encoded by ndjson and
// streamed into ./meta-children.json.
// NOTE(review): fs.createWriteStream defaults to flags 'w', which truncates an
// existing file; starting from a non-zero `current` therefore replaces any
// earlier output -- confirm that is intended.
var serialize = ndjson.serialize()
var write = serialize.pipe(fs.createWriteStream('./meta-children.json'))
/**
 * Build the package_search URL for one page of collection children.
 *
 * The page size comes from the module-level `rows`; the `fq` filter keeps
 * only packages that have a `collection_package_id`.
 *
 * @param {number} start - offset of the first result to return
 * @returns {string} the request URL
 */
function page (start) {
  var endpoint = 'http://catalog.data.gov/api/3/action/package_search'
  var filter = 'fq=collection_package_id:*'
  var size = '&rows='.concat(rows)
  var offset = '&start='.concat(start)
  return endpoint + '?' + filter + size + offset
}
/**
 * Fetch one page of search results, write every result to `serialize`, then
 * either schedule the next page or finish.
 *
 * Uses the module-level `current` offset to build the URL and advances it by
 * `rows` after a successful page. Once `current` exceeds `limit` the
 * serializer is ended; otherwise `go` runs again after `delay` milliseconds.
 *
 * The request callback throws on a transport error, on a non-200 status, and
 * when the response carries no results array or an empty one. Error messages
 * include the URL and the response body so the failure can be diagnosed.
 */
function go () {
  var url = page(current)
  console.log('GET', url)

  // Render a response body for an error message. Passing a parsed JSON
  // object straight to `new Error` would produce "[object Object]".
  function describe (body) {
    return typeof body === 'string' ? body : JSON.stringify(body)
  }

  request({url: url, json: true}, function (err, resp, body) {
    if (err) throw err
    if (resp.statusCode !== 200) {
      throw new Error('GET ' + url + ' returned HTTP ' + resp.statusCode + ': ' + describe(body))
    }

    // With `json: true` an unparseable body is left as-is, so check the shape
    // before touching it instead of failing with an opaque TypeError.
    var results = body && body.result && body.result.results
    if (!Array.isArray(results) || !results.length) {
      throw new Error('GET ' + url + ' returned no results: ' + describe(body))
    }

    current += rows
    results.forEach(function (r) {
      serialize.write(r)
    })

    if (current > limit) {
      console.log('done')
      serialize.end()
    } else {
      setTimeout(go, delay)
    }
  })
}
// Start the crawl by requesting the first page; go reschedules itself for
// later pages from inside its request callback.
go()
var request = require('request')
var fs = require('fs')
var ndjson = require('ndjson')
// Paging configuration.
var rows = 1000 // page size, sent as the `rows` query parameter
var delay = 1000 // milliseconds to wait before requesting the next page
var limit = 200000 // stop once the offset has moved past this value
var current = 164000 // offset of the next page to request

// Output pipeline: objects written to `serialize` are encoded by ndjson and
// streamed into ./meta.json.
// NOTE(review): fs.createWriteStream defaults to flags 'w', which truncates an
// existing file; starting from a non-zero `current` therefore replaces any
// earlier output -- confirm that is intended.
var serialize = ndjson.serialize()
var write = serialize.pipe(fs.createWriteStream('./meta.json'))
/**
 * Build the package_search URL for one page of the unfiltered catalog.
 *
 * The page size comes from the module-level `rows`.
 *
 * @param {number} start - offset of the first result to return
 * @returns {string} the request URL
 */
function page (start) {
  var endpoint = 'http://catalog.data.gov/api/3/action/package_search'
  var size = 'rows='.concat(rows)
  var offset = '&start='.concat(start)
  return endpoint + '?' + size + offset
}
/**
 * Fetch one page of search results, write every result to `serialize`, then
 * either schedule the next page or finish.
 *
 * Uses the module-level `current` offset to build the URL and advances it by
 * `rows` after a successful page. Once `current` exceeds `limit` the
 * serializer is ended; otherwise `go` runs again after `delay` milliseconds.
 *
 * The request callback throws on a transport error, on a non-200 status, and
 * when the response carries no results array or an empty one. Error messages
 * include the URL and the response body so the failure can be diagnosed.
 */
function go () {
  var url = page(current)
  console.log('GET', url)

  // Render a response body for an error message. Passing a parsed JSON
  // object straight to `new Error` would produce "[object Object]".
  function describe (body) {
    return typeof body === 'string' ? body : JSON.stringify(body)
  }

  request({url: url, json: true}, function (err, resp, body) {
    if (err) throw err
    if (resp.statusCode !== 200) {
      throw new Error('GET ' + url + ' returned HTTP ' + resp.statusCode + ': ' + describe(body))
    }

    // With `json: true` an unparseable body is left as-is, so check the shape
    // before touching it instead of failing with an opaque TypeError.
    var results = body && body.result && body.result.results
    if (!Array.isArray(results) || !results.length) {
      throw new Error('GET ' + url + ' returned no results: ' + describe(body))
    }

    current += rows
    results.forEach(function (r) {
      serialize.write(r)
    })

    if (current > limit) {
      console.log('done')
      serialize.end()
    } else {
      setTimeout(go, delay)
    }
  })
}
// Start the crawl by requesting the first page; go reschedules itself for
// later pages from inside its request callback.
go()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment