Last active
September 11, 2017 16:42
-
-
Save nemo/21e125eeae8a24a671fb486ef7a20f60 to your computer and use it in GitHub Desktop.
StdLib Distributed Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const request = require('request-promise-native'); | |
const cheerio = require('cheerio'); | |
const parseAll = require('html-metadata').parseAll; | |
/** | |
* A simple and powerful scraper | |
* @param {string} url Url to fetch | |
* @param {string} userAgent Request's User Agent | |
* @param {array} queries Queries to apply (using cheerio) | |
* @returns {object} | |
**/ | |
module.exports = async function (url, userAgent = 'stdlib-example/scraper v0.1', queries = [], context) { | |
let opts = { | |
url: url, | |
headers: { | |
'User-Agent': userAgent | |
}, | |
transform: function (body) { | |
return cheerio.load(body); | |
} | |
}; | |
let $; | |
let metadata; | |
try { | |
$ = await request(opts); | |
metadata = await parseAll($); | |
} catch (err) { | |
return Promise.reject(err); | |
} | |
let result = { | |
url: url, | |
meta: metadata | |
} | |
/** | |
* Queries Syntax | |
* -------------- | |
* queries is an array of arrays that are formatted in the following way: | |
* ['CSS_SELECTOR', 'RESOLVER_FUNC', [...RESOLVER_FUNC_ARGS]] | |
* CSS_SELECTOR: this is the css-selector query you'd like to make | |
* RESOLVER_FUNC: this is the function that we'll use on the selection in cheerio to resolve the data (e.g. text, or attr) | |
* RESOLVER_FUNC_ARGS: these are the arguments that are passed to the resolver. | |
* | |
* Example: | |
* [ | |
* ['.repo-list-item h3 a', 'text'], // This will run $('.repo-list-item h3 a').text() on the content | |
* ['.repo-list-item h3 a', 'attr', ['ref']] // This will run $('.repo-list-item h3 a').attr('href') on the content | |
* ] | |
* | |
*/ | |
result.queries = queries.map((queryObj, index) => { | |
if (!queryObj) { | |
return new Error(`Query at index ${index} doesn't exist`); | |
} | |
if (!Array.isArray(queryObj)) { | |
return new Error(`Query at index ${index} isn't an array`); | |
} | |
if (queryObj.length > 3 || queryObj.length < 2) { | |
return new Error(`Query at index ${index} has too few or too many arguments`); | |
} | |
if (queryObj.length === 3 && !Array.isArray(queryObj[2])) { | |
return new Error(`Query at index ${index} has resolver arguments but it's not an array.`); | |
} | |
// Actual query | |
let query = queryObj[0]; | |
// How to resolve from dom (text, attr, etc.) | |
let resolver = queryObj[1]; | |
// Arguments to pass to resolver | |
let resolverArgs = queryObj[2] || []; | |
let matches = $(query); | |
return (matches.length ? matches.toArray() : [matches]).map((el) => { | |
let $el = $(el); | |
return $el[resolver].apply($el, resolverArgs); | |
}); | |
}); | |
return result; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment