View index.sh
# Extract bicycle-friendly paths from an OSM line extract and convert them to
# an ESRI Shapefile.
# Input: portland_oregon_osm_line.geojson, a GeoJSON lines file from
# https://mapzen.com/data/metro-extracts/
# Uses the jsonfilter, ndjson-reduce, ndjson-map and ogr2ogr commands.

# Split the FeatureCollection into one feature per line. The pattern is quoted
# so the shell cannot glob-expand it against files named features.*.
jsonfilter 'features.*' < portland_oregon_osm_line.geojson > lines.ndjson

# Keep features whose "bicycle" property is set to anything other than 'no'...
jsonfilter --match="this.properties.bicycle && this.properties.bicycle !== 'no'" < lines.ndjson > sharedpaths.json
# ...plus every feature line whose text contains "cycleway".
grep "cycleway" lines.ndjson >> sharedpaths.json

# A feature can match both filters, so drop duplicate lines. The original file
# is only replaced when the sort/uniq pipeline succeeded.
sort sharedpaths.json | uniq > dedupe.json && mv dedupe.json sharedpaths.json

# Re-assemble the surviving features into a single FeatureCollection.
ndjson-reduce < sharedpaths.json | ndjson-map '{type: "FeatureCollection", features: d}' > sharedpaths.geojson

# Write the Shapefile into ./shp. -p tolerates an existing directory, and the
# script stops if the directory cannot be entered instead of writing elsewhere.
mkdir -p shp
cd shp || exit 1
# NOTE(review): the trailing OGRGeoJSON argument selects the source layer by
# name; confirm the installed GDAL version still names GeoJSON layers that way.
ogr2ogr -f "ESRI Shapefile" sharedpaths.shp ../sharedpaths.geojson OGRGeoJSON
View index.txt
379490 referenceworks.brillonline.com | |
377682 doi.apa.org | |
244045 primarysources.brillonline.com | |
189587 f1000.com | |
106769 www.iucnredlist.org | |
78961 www.e-enlightenment.com | |
67194 doi.namesforlife.com | |
20335 www.degruyter.com | |
17940 www.icpsr.umich.edu | |
17044 www.scivee.tv |
View index.txt
672055 www.ccdc.cam.ac.uk | |
618996 figshare.com | |
493410 rgdoi.net | |
487454 plutof.ut.ee | |
378396 ba.e-pics.ethz.ch | |
376822 retro.seals.ch | |
373193 www.die-bonn.de | |
358476 doi.pangaea.de | |
313951 www.gbif.org | |
237629 www.hepdata.net |
View index.sh
# Collect the "works" URLs listed in the search.datacite.org sitemaps into urls.txt.

# Install the command-line tools used below.
npm install gunzip-maybe xml-json jsonfilter nugget -g

# Fetch the sitemap index, extract each sitemap's loc value, and download every
# sitemap into ./datacite. The jsonfilter pattern is quoted so the shell cannot
# glob-expand it.
curl "https://search.datacite.org/sitemaps/sitemap.xml.gz" | gunzip-maybe | xml-json sitemapindex | jsonfilter 'sitemap.*.loc' | xargs nugget -d datacite

# Decompress each downloaded sitemap and append its URLs containing "works" to
# urls.txt. Iterating with a glob avoids parsing `ls` output and avoids
# splicing file names into an `sh -c` command string.
for sitemap in datacite/*; do
  # With no downloads the glob stays literal; skip that non-existent path.
  [ -e "$sitemap" ] || continue
  gunzip-maybe < "$sitemap" | xml-json urlset | jsonfilter 'url.*.loc' | grep works
done >> urls.txt
View index.js
var request = require('request') | |
var base = 'https://api.crossref.org/works?filter=type:dataset&rows=1000' | |
doNext() | |
function doNext (cursor) { | |
if (!cursor) cursor = '*' | |
var url = base + '&cursor=' + cursor | |
console.error('GET', url) |
View data.csv
We can make this file beautiful and searchable if this error is corrected: It looks like row 10 should actually have 4 columns, instead of 3. in line 9.
doi,id,type,url | |
10.7554/eLife.00007,dataro1,generated-dataset,http://dx.doi.org/10.5061/dryad.gs45f | |
10.7554/eLife.00048,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE40298 | |
10.7554/eLife.00049,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/genbank/ | |
10.7554/eLife.00065,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE39313http://www.ncbi.nlm.nih.gov/geo/ | |
10.7554/eLife.00170,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE41937http://eisenlab.org/data/TAF7Lhttp://www.ncbi.nlm.nih.gov/geo/ | |
10.7554/eLife.00170,dataro2,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21365http://www.ncbi.nlm.nih.gov/geo/ | |
10.7554/eLife.00170,dataro3,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE27450http://www.ncbi.nlm.nih.gov/geo/ | |
10.7554/eLife.00170,dataro4,generated-dataset,http://trace.ddbj.nig.ac.jp/DRASearch/study?acc=DRP000383http://www.ddbj.nig.ac.jp/ | |
10.7554/eLife.00184,dataro1, |
View urls.txt
15030476 linkinghub.elsevier.com | |
9053559 link.springer.com | |
7949710 doi.wiley.com | |
3749242 ieeexplore.ieee.org | |
3468507 www.tandfonline.com | |
2005530 academic.oup.com | |
2000344 www.jstor.org | |
1662232 content.wkhealth.com | |
1498775 www.degruyter.com | |
1438236 pubs.acs.org |
View csv.js
// output of above script pipe into here, converts it to smaller csv | |
var split = require('split2') | |
var through = require('through2') | |
console.log('doi,url') | |
var splitter = split() | |
var each = through(function (buf, enc, next) { | |
var _ | |
try { |
View download.js
var request = require('request') | |
// Command-line entry point: node download.js <link> [start]
// Both values are forwarded unchanged to dl(), which is defined elsewhere in
// this file.
var link = process.argv[2]
var rawStart = process.argv[3]

// Fail with a usage hint instead of handing an undefined link to dl().
if (!link) throw new Error('Usage: node download.js <link> [start]')

// start is optional and defaults to 0; a supplied value is coerced to a number.
var start = rawStart ? +rawStart : 0

// Reject non-numeric input rather than silently passing NaN downstream.
if (Number.isNaN(start)) throw new Error('start must be a number, got: ' + rawStart)

dl(link, start, function (err) {
  if (err) throw err
  console.error('All done')
})
View links.json
"http://download.kiwix.org/zim/wikipedia_ab_all.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ab_all_nopic.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ace_all.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ace_all_nopic.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ady_all.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ady_all_nopic.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_af_all.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_af_all_nopic.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ak_all.zim.torrent" | |
"http://download.kiwix.org/zim/wikipedia_ak_all_nopic.zim.torrent" |