View index.js
var $ = require('cheerio')
var fs = require('fs')
var walker = require('folder-walker')
var transform = require('parallel-transform')
var ndjson = require('ndjson')
// Stream setup for the scraper; nothing in the lines shown here connects the
// three streams to each other.
// Directory walker rooted at ./pageblobs.
var walk = walker('./pageblobs') // generated by abstract-blob-store
// Wraps `scrape` (not defined in the lines shown here) in a parallel-transform
// stream. NOTE(review): the 10 is presumably the max number of concurrent
// `scrape` calls — confirm against the parallel-transform README.
var scraper = transform(10, scrape)
// ndjson serializer used for output.
var out = ndjson.serialize()
View index.js
var hyperdb = require('hyperdb')
var hyperdiscovery = require('hyperdiscovery')
// Key string handed to hyperdb below to identify the database to open.
var npmkey = '0f8a60595af5387d52b053af4a8a4aecd5d6d3799741c3993916798e71ea0730'
// Open ./npm.db for that key with sparse mode enabled and JSON value encoding.
var db = hyperdb('./npm.db', npmkey, {sparse: true, valueEncoding: 'json'})
db.on('ready', function () {
var swarm = hyperdiscovery(db, {live: true})
db.once('remote-update', function () {
db.get('/modules/aws.js', function (err, data) {
View index.js
var fs = require('fs')
var request = require('request')
var through = require('through2')
var ndjson = require('ndjson')
var once = require('once')
var pump = require('pump')
var concat = require('concat-stream')
var parallel = require('parallel-transform')
var hyperdb = require('hyperdb')
// Open the hyperdb stored at ./npm.db with JSON value encoding. No key is
// passed here. NOTE(review): presumably this creates/opens a local writable
// db — confirm against the hyperdb docs.
var db = hyperdb('./npm.db', {valueEncoding: 'json'})
View index.sh
# Build a shapefile of bike-accessible paths from the Portland, Oregon OSM line extract.
# Input data: geojson lines file from https://mapzen.com/data/metro-extracts/
# Uses the jsonfilter, ndjson-reduce, ndjson-map and ogr2ogr commands.

# Emit each entry of `features` on its own line. The pattern is quoted so the
# shell passes it through verbatim instead of glob-expanding it against files
# in the current directory.
cat portland_oregon_osm_line.geojson | jsonfilter 'features.*' > lines.ndjson
# Keep features whose `bicycle` property is set and is not 'no'.
cat lines.ndjson | jsonfilter --match="this.properties.bicycle && this.properties.bicycle !== 'no'" > sharedpaths.json
# Also keep every feature line that mentions "cycleway".
cat lines.ndjson | grep "cycleway" >> sharedpaths.json
# A feature can match both filters above, so drop duplicate lines.
cat sharedpaths.json | sort | uniq > dedupe.json
mv dedupe.json sharedpaths.json
# Re-assemble the surviving lines into a single FeatureCollection.
cat sharedpaths.json | ndjson-reduce | ndjson-map '{type: "FeatureCollection", features: d}' > sharedpaths.geojson
# -p keeps reruns quiet when shp/ already exists; bail out if we cannot enter
# it, otherwise ogr2ogr would run in the wrong directory.
mkdir -p shp
cd shp || exit 1
ogr2ogr -f "ESRI Shapefile" sharedpaths.shp ../sharedpaths.geojson OGRGeoJSON
View index.txt
379490 referenceworks.brillonline.com
377682 doi.apa.org
244045 primarysources.brillonline.com
189587 f1000.com
106769 www.iucnredlist.org
78961 www.e-enlightenment.com
67194 doi.namesforlife.com
20335 www.degruyter.com
17940 www.icpsr.umich.edu
17044 www.scivee.tv
View index.txt
672055 www.ccdc.cam.ac.uk
618996 figshare.com
493410 rgdoi.net
487454 plutof.ut.ee
378396 ba.e-pics.ethz.ch
376822 retro.seals.ch
373193 www.die-bonn.de
358476 doi.pangaea.de
313951 www.gbif.org
237629 www.hepdata.net
View index.sh
# Collect the "works" URLs listed in the search.datacite.org sitemaps into urls.txt.

# Install the CLI tools used by the pipelines below.
npm install gunzip-maybe xml-json jsonfilter nugget -g
# Fetch the sitemap index, extract each sitemap's <loc>, and download them all
# into ./datacite. The jsonfilter pattern is quoted so the shell cannot
# glob-expand it against files in the current directory.
curl "https://search.datacite.org/sitemaps/sitemap.xml.gz" | gunzip-maybe | xml-json sitemapindex | jsonfilter 'sitemap.*.loc' | xargs nugget -d datacite
# Decompress every downloaded sitemap, extract its URL entries and keep the
# ones containing "works". A glob loop with a quoted variable replaces
# `ls | xargs sh -c "...{}..."`, which spliced downloaded file names into a
# shell command string and broke (or ran arbitrary code) on unusual names.
for sitemap in datacite/*; do
  gunzip-maybe < "$sitemap" | xml-json urlset | jsonfilter 'url.*.loc' | grep works
done >> urls.txt
View index.js
var request = require('request')
// Base Crossref works query: filter to type `dataset`, 1000 rows per request.
// doNext appends the `cursor` parameter to this string.
var base = 'https://api.crossref.org/works?filter=type:dataset&rows=1000'
// Kick off the first request without a cursor; doNext falls back to '*'.
doNext()
function doNext (cursor) {
if (!cursor) cursor = '*'
var url = base + '&cursor=' + cursor
console.error('GET', url)
View data.csv
We can make this file beautiful and searchable if this error is corrected: It looks like row 10 should actually have 4 columns, instead of 3.
doi,id,type,url
10.7554/eLife.00007,dataro1,generated-dataset,http://dx.doi.org/10.5061/dryad.gs45f
10.7554/eLife.00048,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE40298
10.7554/eLife.00049,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/genbank/
10.7554/eLife.00065,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE39313http://www.ncbi.nlm.nih.gov/geo/
10.7554/eLife.00170,dataro1,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE41937http://eisenlab.org/data/TAF7Lhttp://www.ncbi.nlm.nih.gov/geo/
10.7554/eLife.00170,dataro2,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE21365http://www.ncbi.nlm.nih.gov/geo/
10.7554/eLife.00170,dataro3,generated-dataset,http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE27450http://www.ncbi.nlm.nih.gov/geo/
10.7554/eLife.00170,dataro4,generated-dataset,http://trace.ddbj.nig.ac.jp/DRASearch/study?acc=DRP000383http://www.ddbj.nig.ac.jp/
10.7554/eLife.00184,dataro1,
View urls.txt
15030476 linkinghub.elsevier.com
9053559 link.springer.com
7949710 doi.wiley.com
3749242 ieeexplore.ieee.org
3468507 www.tandfonline.com
2005530 academic.oup.com
2000344 www.jstor.org
1662232 content.wkhealth.com
1498775 www.degruyter.com
1438236 pubs.acs.org