Skip to content

Instantly share code, notes, and snippets.

@lathropd
Created April 4, 2018 17:25
Show Gist options
  • Save lathropd/cb57cbfe08c83708239083705ab3f3ef to your computer and use it in GitHub Desktop.
Save lathropd/cb57cbfe08c83708239083705ab3f3ef to your computer and use it in GitHub Desktop.
Paged-results single-page scraper, created by lathropd — https://repl.it/@lathropd/paged-results-single-page-scraper
let request = require('request')
let rp = require('request-promise-native');
let cheerio = require('cheerio');
let d3 = require('d3');
let fs = require('fs');
let sleep = require('thread-sleep');
// First listings page; the crawl starts here and follows "next" links.
const mainUrl = "http://www.journalismjobs.com/job-listings";
// Shared accumulator: scrape() pushes one record per job found on each page.
const data = [];
// Kick off the crawl. A bare `.catch()` registers no handler at all, so a
// failed first request (or an exception thrown while parsing the first page)
// would surface as an unhandled rejection; log it the same way scrape() logs
// failures on later pages.
rp(mainUrl)
  .then(scrape)
  .catch(function (err) {
    console.log(err);
  });
/**
 * Resolve after `ms` milliseconds without blocking the event loop.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function pause(ms) {
  return new Promise(function (resolve) {
    setTimeout(resolve, ms);
  });
}

/**
 * Parse one page of job listings and append each job to the module-level
 * `data` array. If the page has a "next" link, wait one second, fetch that
 * page and process it the same way; otherwise print everything collected so
 * far as CSV.
 *
 * @param {string} html - Markup of a listings page.
 * @returns {Promise<void>} Settles once this page and every page after it
 *   has been processed. Errors raised while fetching or parsing later pages
 *   are logged rather than propagated.
 */
function scrape(html) {
  const $ = cheerio.load(html);
  // One timestamp per page is enough; it is identical for every job on it.
  const scrapedAt = new Date().toString();

  $("div.result").each(function (i, element) {
    const job = $(element);
    const titleLink = job.find("div.title a");
    const href = titleLink.attr('href');
    // Results without a title link are not recorded.
    if (!href) {
      return;
    }
    data.push({
      link: href,
      title: titleLink.text(),
      location: job.find("li.location").text(),
      status: job.find("li.status").text(),
      company: job.find("div.company").text(),
      posted: job.find("li.posted").text(),
      scrapedAt: scrapedAt
    });
  });

  const nextPage = $("li.next a").attr('href');
  if (!nextPage) {
    // Last page: emit the accumulated records as CSV.
    console.log(d3.csvFormat(data));
    return Promise.resolve();
  }

  console.log(data.length);
  // Resolve against the site origin so root-relative, relative and absolute
  // hrefs all yield a valid URL (plain concatenation only handled "/path").
  const nextUrl = new URL(nextPage, 'http://www.journalismjobs.com').href;
  console.log(nextUrl);

  // Throttle requests, then recurse. Returning the chain lets the caller
  // observe when the whole crawl has finished.
  return pause(1000)
    .then(function () {
      return rp(nextUrl);
    })
    .then(scrape)
    .catch(function (err) {
      console.log(err);
    });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment