Skip to content

Instantly share code, notes, and snippets.

@johnrichardrinehart
Last active April 5, 2017 22:38
Show Gist options
  • Save johnrichardrinehart/43cee6f817afa16da3c48da378e8cf00 to your computer and use it in GitHub Desktop.
Save johnrichardrinehart/43cee6f817afa16da3c48da378e8cf00 to your computer and use it in GitHub Desktop.
var Promise = require("bluebird");
var bhttp = require("bhttp");
var taskQueue = require("promise-task-queue");
var MongoClient = require('mongodb');
// Shared task queue instance; the "getRequest" task is registered on it below.
var queue = taskQueue();
// Counter initialised to zero. Nothing in the visible code reads or updates it.
var failedRequests = 0;
//const alphabet = "abcdefghijklmnopqrstuvwxyz"
//const urls =[]
//for (let i=0 ; i < 4; i++) {
//urls[i] = "http://echo.jsontest.com/" + alphabet.charAt(i) + "/" + alphabet.charAt(i+1)
//}
/**
 * Upsert a document into a MongoDB collection.
 *
 * The document itself is used as the match filter, so an identical document
 * that is already stored is matched (and rewritten with the same content),
 * while a document that is not yet stored is inserted.
 *
 * @param {Object} opts
 * @param {Promise} opts.mc - Promise (or thenable) resolving to an object
 *   exposing `collection(name)`.
 * @param {string} opts.collection_name - Name of the target collection.
 * @param {Object} opts.obj - Document used both as filter and as replacement.
 * @returns {Promise} Resolves with whatever the collection's `replaceOne`
 *   resolves with.
 */
const dataStore = function(opts) {
  return opts.mc
    .then(db => db.collection(opts.collection_name))
    // A whole-document write has to go through replaceOne: updateOne expects
    // update operators ($set, ...) and rejects/misbehaves on a bare document.
    .then(coll => coll.replaceOne(opts.obj, opts.obj, {upsert: true}));
};
/**
 * Look up `opts.obj` in the collection and report whether it is already stored.
 *
 * When a matching document is found, a message is logged and `opts.obj` is set
 * to null on the passed-in object. When nothing is found and the document has
 * no truthy `Response` property, a "New data" message is logged. In every case
 * the same `opts` object is what the returned promise resolves with.
 *
 * @param {Object} opts
 * @param {Promise} opts.mc - Promise (or thenable) resolving to an object
 *   exposing `collection(name)`.
 * @param {string} opts.collection_name - Name of the collection to query.
 * @param {Object} opts.obj - Document passed to `findOne` as the query.
 * @returns {Promise<Object>} Resolves with `opts` (mutated as described).
 */
const dbState = function(opts) {
  const pickCollection = (db) => db.collection(opts.collection_name);
  const lookUp = (collection) => collection.findOne(opts.obj);

  const classify = (existing) => {
    // Already stored: flag it by nulling the payload so later steps skip it.
    if (existing !== null) {
      console.log("Data already obtained: " + JSON.stringify(opts.obj));
      opts.obj = null;
      return opts;
    }
    // Not stored yet; only announce documents without a truthy `Response`.
    if (!opts.obj.Response) {
      console.log("New data: " + JSON.stringify(opts.obj) );
    }
    return opts;
  };

  return opts.mc
    .then(pickCollection)
    .then(lookUp)
    .then(classify);
};
// Register the "getRequest" task: run one deferred HTTP request, check whether
// its body is already stored, and store it when it is new and not an error.
// Queue options: concurrency 1, interval 2 (see promise-task-queue for the
// exact meaning/units of these options).
queue.define("getRequest", function(opts) {
  // Storing happens per task, right here, rather than in scrape(): that way
  // the responses for *all* URLs never have to sit in memory at once before
  // being written to the database. There may be a very large number of URLs.

  // Hand the response body to dbState together with the connection details.
  const checkStored = (res) => dbState({
    obj: res.body,
    mc: opts.mc,
    collection_name: opts.collection_name,
    url: res.request.url,
  });

  // Drop error responses. The null check comes first so that documents
  // dbState already nulled out are passed through untouched.
  const rejectErrors = (state) => {
    if (state.obj !== null && state.obj.Response === "Error") {
      console.log("Bad request. Data doesn't exist: " + state.url);
      state.obj = null;
    }
    return state;
  };

  // Persist only what survived the two checks above.
  const persist = (state) => {
    if (state.obj === null) {
      return undefined;
    }
    return dataStore(state);
  };

  return Promise.try(() => opts.request())
    .then(checkStored)
    .then(rejectErrors)
    .then(persist);
}, { concurrency: 1, interval: 2 });
/**
 * Queue one "getRequest" task per URL.
 *
 * Expected shape: opts = {urls: [], mc: MongoClient, collection_name: ''}
 *
 * Any error raised along the way is logged with console.log and not rethrown,
 * so the returned promise resolves with undefined in that case.
 *
 * @param {Object} opts
 * @param {string[]} opts.urls - URLs to fetch.
 * @param {*} opts.mc - Passed through untouched to each queued task.
 * @param {string} opts.collection_name - Passed through to each queued task.
 * @returns {Promise} Resolves with the mapped results of the queue pushes.
 */
function scrape(opts) {
  // Each URL becomes a zero-argument function instead of a live request, so
  // that no HTTP call is fired here; the queued task invokes it later.
  const buildRequests = () => opts.urls.map(
    (url) => () => bhttp.get(url, {encodeJSON: true})
  );

  const enqueue = (request) => queue.push("getRequest", {
    request: request,
    mc: opts.mc,
    collection_name: opts.collection_name,
  });

  return Promise.try(buildRequests)
    .map(enqueue)
    .catch((err) => console.log(err));
}
// Public API of this module: only scrape() is exported.
module.exports = {scrape};
//start(urls);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment