Skip to content

Instantly share code, notes, and snippets.

@st8998
Created September 16, 2016 11:38
Show Gist options
  • Save st8998/028d12c71c769e2623ecb25d6cf34b73 to your computer and use it in GitHub Desktop.
Save st8998/028d12c71c769e2623ecb25d6cf34b73 to your computer and use it in GitHub Desktop.
const cheerio = require('cheerio')
const request = require('request')
const fs = require('fs')
const { map, join, stringify, readable } = require('event-stream')
// ---- Scrape target -------------------------------------------------------
// Page that renders a single article review; it is requested once per id with
// `article_id` and `subid` query parameters.
const BASE_URL = 'http://www.profjournal.com/update2016/article_page_new.cfm'
const SUBID = 110890

// ---- Id range and limits -------------------------------------------------
// Ids are visited in descending order, starting at START_ID. The walk stops
// once the counter reaches END_ID, so END_ID itself is not requested.
// (Other values previously noted by the author: 81209, 468.)
const START_ID = 74371
const END_ID = 468
// The id walk also stops once this many reviews have been extracted.
const FETCH_LIMIT = 1000

// ---- Mutable run state ---------------------------------------------------
// Next article id to hand out.
let currId = START_ID
// Number of reviews successfully extracted so far.
let fetched = 0

// ---- Output --------------------------------------------------------------
// Destination file for the resulting JSON array.
// NOTE(review): presumably the ./out directory must already exist - confirm.
const out = fs.createWriteStream('./out/reviews.json')
/**
 * Source stream of article ids.
 *
 * Each read emits the current id as a `data` event and then decrements it, so
 * ids are produced in descending order starting from START_ID. The stream
 * ends when the counter has reached END_ID (END_ID itself is never emitted)
 * or when FETCH_LIMIT reviews have been extracted.
 */
const stream = readable(function (count, callback) {
  // `<=` rather than `===`: if the counter ever starts at or below END_ID
  // (e.g. the constants are edited), an equality test would never match and
  // the walk would continue into negative ids.
  if (currId <= END_ID || fetched >= FETCH_LIMIT) return this.emit('end')
  this.emit('data', currId--)
  // Signal that this read is finished so the next one can be scheduled.
  callback()
})
// Maximum number of HTTP requests allowed in flight at the same time.
const MAX_REQ_COUNT = 4
// Number of requests that have been sent and are still awaiting a response.
let currReqCount = 0

/**
 * Map stream that downloads the article page for each incoming id.
 *
 * Input:  an article id.
 * Output: `{ id, html }`, where `html` is the response body.
 *
 * The id source (`stream`) is paused while MAX_REQ_COUNT requests are in
 * flight and resumed when a response brings the count back under the limit.
 * A failed request is logged and dropped rather than passed downstream with
 * an undefined body.
 */
const fetchArticle = map(function (id, callback) {
  console.log('TRY:', id, '|', 'FETCHED:', fetched)
  currReqCount += 1
  // `>=` instead of `===`: if the source is ever resumed by something else
  // while we are at the limit, the counter can overshoot, and an equality
  // test would then never pause again.
  if (currReqCount >= MAX_REQ_COUNT) stream.pause()
  request(`${BASE_URL}?article_id=${id}&subid=${SUBID}`, function (err, res, html) {
    currReqCount -= 1
    // Only let new ids through once there is actually a free slot.
    if (currReqCount < MAX_REQ_COUNT) stream.resume()
    if (err) {
      console.error('FAILED:', id, '|', err)
      // Calling back with no data drops this id from the output.
      return callback()
    }
    callback(null, { id, html })
  })
})
/**
 * Map stream that turns a downloaded article page into a review record.
 *
 * Input:  `{ id, html }`.
 * Output: `{ id, link, topics, keywords, summary, classroom_application,
 *           questions }`. Each field is read from the cell that follows the
 *           `td` containing the matching label.
 *
 * Pages without markup or without a "Link" value are dropped (nothing is
 * emitted for them). Every emitted record increments the shared `fetched`
 * counter.
 */
const extractData = map(function ({ id, html }, callback) {
  // Nothing to parse (e.g. the request failed upstream): drop the item
  // instead of handing a non-string to cheerio.
  if (!html) return callback()

  const $ = cheerio.load(html)

  // Text of the cell(s) immediately following the `td` that contains `label`.
  const cellAfter = label => $(`td:contains('${label}')`).next().text()

  // Comma-separated cell -> trimmed entries. Empty entries are removed so a
  // missing or blank cell yields [] rather than [''].
  const splitList = text => text.split(',').map(t => t.trim()).filter(Boolean)

  // Named `review` (not `out`) to avoid shadowing the module-level write stream.
  const review = {
    id,
    link: cellAfter('Link').trim(),
    topics: splitList(cellAfter('Topics')),
    keywords: splitList(cellAfter('Keywords')),
    summary: cellAfter('Summary').trim(),
    classroom_application: cellAfter('Classroom Application').trim(),
    // One entry per "Question" cell, unlike the fields above which are
    // collapsed into a single string.
    questions: $("td:contains('Question')").map((idx, node) => $(node).next().text().trim()).get()
  }

  // A page without a link is treated as "no review here" and skipped.
  if (!review.link) return callback()

  fetched += 1
  callback(null, review)
})
// Assemble the pipeline: ids -> downloaded pages -> review records ->
// JSON text -> comma-separated -> output file, wrapped in `[` ... `]` so the
// file holds a single JSON array.
out.write('[\n')
stream
  .pipe(fetchArticle)
  .pipe(extractData)
  .pipe(stringify())
  .pipe(join(','))
  // Close the array and the file ourselves. With `end: false` below, pipe()
  // will not end `out` on its own, so the closing bracket no longer depends
  // on this listener running before pipe's internal end handler.
  .on('end', () => out.end(']'))
  .pipe(out, { end: false })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment