On data.gov, at least, there is a two-level hierarchy of dataset metadata. Top-level (parent) items include individual datasets as well as collection items. Child items include all datasets published inside a collection. I am not sure whether this is a general CKAN concept.
Last active
April 3, 2017 21:09
-
-
Save max-mapper/7f36ba7663b65a73c81d9c0cc6841702 to your computer and use it in GitHub Desktop.
ckan metadata downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request')
var fs = require('fs')
var ndjson = require('ndjson')

// Everything written to `serialize` is piped into ./meta-children.json.
var serialize = ndjson.serialize()
var write = fs.createWriteStream('./meta-children.json')
serialize.pipe(write)

// Paging state and tunables read by page() and go().
var current = 1601000 // `start` offset of the next request (non-zero; presumably resumes an earlier run)
var rows = 1000 // page size sent as the `rows` query parameter
var delay = 1000 // milliseconds to wait before requesting the next page
var limit = 2000000 // stop once `current` exceeds this offset
/**
 * Build the package_search URL for one page of datasets that have a
 * `collection_package_id` (the `fq=collection_package_id:*` filter).
 *
 * The page size comes from the module-level `rows` variable.
 *
 * @param {number} start - Offset of the first record to request.
 * @returns {string} The URL to GET.
 */
function page (start) {
  return `http://catalog.data.gov/api/3/action/package_search?fq=collection_package_id:*&rows=${rows}&start=${start}`
}
/**
 * Fetch the page of results that starts at the module-level offset `current`,
 * write every returned record to `serialize`, then either schedule the next
 * page after `delay` milliseconds or end the stream.
 *
 * The stream is ended when the server returns an empty page (nothing left to
 * download) or when `current` has moved past `limit`.
 *
 * Failures are thrown from inside the request callback; nothing in this
 * function catches them.
 *
 * @throws {Error} On a transport error, a non-200 status, or a response body
 *   that has no `result.results` array.
 */
function go () {
  const url = page(current)
  console.log('GET', url)
  // `json: true` asks `request` to parse the response body as JSON.
  request({url: url, json: true}, function (err, resp, body) {
    if (err) throw err
    if (resp.statusCode !== 200) {
      // Passing `body` straight to Error() would stringify to "[object Object]".
      throw new Error(`GET ${url} returned HTTP ${resp.statusCode}: ${JSON.stringify(body)}`)
    }
    const results = body && body.result && body.result.results
    if (!Array.isArray(results)) {
      throw new Error(`Unexpected response from ${url}: ${JSON.stringify(body)}`)
    }
    if (results.length === 0) {
      // Ran past the last record: finish cleanly so the output stream is ended.
      console.log('done')
      serialize.end()
      return
    }
    results.forEach(function (record) {
      serialize.write(record)
    })
    // Advance by what was actually received so a short page never skips records.
    current += results.length
    if (current > limit) {
      console.log('done')
      serialize.end()
    } else {
      setTimeout(go, delay)
    }
  })
}
// Request the first page; go() schedules each following page itself.
go()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request')
var fs = require('fs')
var ndjson = require('ndjson')

// Everything written to `serialize` is piped into ./meta.json.
var serialize = ndjson.serialize()
var write = fs.createWriteStream('./meta.json')
serialize.pipe(write)

// Paging state and tunables read by page() and go().
var current = 164000 // `start` offset of the next request (non-zero; presumably resumes an earlier run)
var rows = 1000 // page size sent as the `rows` query parameter
var delay = 1000 // milliseconds to wait before requesting the next page
var limit = 200000 // stop once `current` exceeds this offset
/**
 * Build the package_search URL for one page of results, with no filter query.
 *
 * The page size comes from the module-level `rows` variable.
 *
 * @param {number} start - Offset of the first record to request.
 * @returns {string} The URL to GET.
 */
function page (start) {
  return `http://catalog.data.gov/api/3/action/package_search?rows=${rows}&start=${start}`
}
/**
 * Fetch the page of results that starts at the module-level offset `current`,
 * write every returned record to `serialize`, then either schedule the next
 * page after `delay` milliseconds or end the stream.
 *
 * The stream is ended when the server returns an empty page (nothing left to
 * download) or when `current` has moved past `limit`.
 *
 * Failures are thrown from inside the request callback; nothing in this
 * function catches them.
 *
 * @throws {Error} On a transport error, a non-200 status, or a response body
 *   that has no `result.results` array.
 */
function go () {
  const url = page(current)
  console.log('GET', url)
  // `json: true` asks `request` to parse the response body as JSON.
  request({url: url, json: true}, function (err, resp, body) {
    if (err) throw err
    if (resp.statusCode !== 200) {
      // Passing `body` straight to Error() would stringify to "[object Object]".
      throw new Error(`GET ${url} returned HTTP ${resp.statusCode}: ${JSON.stringify(body)}`)
    }
    const results = body && body.result && body.result.results
    if (!Array.isArray(results)) {
      throw new Error(`Unexpected response from ${url}: ${JSON.stringify(body)}`)
    }
    if (results.length === 0) {
      // Ran past the last record: finish cleanly so the output stream is ended.
      console.log('done')
      serialize.end()
      return
    }
    results.forEach(function (record) {
      serialize.write(record)
    })
    // Advance by what was actually received so a short page never skips records.
    current += results.length
    if (current > limit) {
      console.log('done')
      serialize.end()
    } else {
      setTimeout(go, delay)
    }
  })
}
// Request the first page; go() schedules each following page itself.
go()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment