Skip to content

Instantly share code, notes, and snippets.

@lathropd
Created April 2, 2018 23:32
Show Gist options
  • Save lathropd/fe983922b8c5abe3d42dfafaa7bff9ac to your computer and use it in GitHub Desktop.
Save lathropd/fe983922b8c5abe3d42dfafaa7bff9ac to your computer and use it in GitHub Desktop.
multipage_scraper created by lathropd - https://repl.it/@lathropd/multipagescraper
let request = require('request')
let rp = require('request-promise-native');
let cheerio = require('cheerio');
let d3 = require('d3');
let fs = require('fs');
// Page fetched first; scrape() reads its first table and follows the links in it.
const mainUrl = "http://www.spotrac.com/mlb/payroll";
// NOTE(review): teamList and playerSalaries are not referenced by the code
// visible in this file — confirm before removing.
const teamList = [];
const playerSalaries = [];
// Fetch the page and hand its HTML to scrape(). A bare `.catch()` with no
// handler does not handle anything, so log failures explicitly here.
rp(mainUrl)
  .then(scrape)
  .catch(function (err) { console.log(err); });
/**
 * Parse the page fetched from `mainUrl`, print its first table as CSV, and
 * start scraping the pages linked from that table.
 *
 * The first row of the table is skipped. Of the remaining rows, only those
 * containing a `.player a` element with an href are kept; that href becomes
 * the row's `link` and is later passed to `scrapeTeams`.
 *
 * @param {string} html - Markup to load with cheerio.
 * @returns {*} Whatever `scrapeTeams` returns, so the caller's promise chain
 *   can wait on (and catch errors from) the follow-up scraping.
 */
function scrape(html) {
  const $ = cheerio.load(html);
  // Drop the first <tr> of the first table before reading data rows.
  const rows = $("table").first().find("tr").slice(1);
  const data = [];
  rows.each(function (i, element) {
    const row = $(element);
    const href = row.find(".player a").attr('href');
    // Rows without a link are not kept. A bare `return` (undefined) lets
    // cheerio's each() continue; only `false` would stop the loop.
    if (!href) {
      return;
    }
    const cells = row.find("td");
    data.push({
      link: href,
      rank: $(cells[0]).text(),
      team: $(cells[1]).text(),
      roster: $(cells[2]).text(),
      twentyFiveMan: $(cells[3]).text(),
      disabledList: $(cells[4]).text(),
      retained: $(cells[5]).text(),
      buried: $(cells[6]).text(),
      suspended: $(cells[7]).text(),
    });
  });
  console.log(d3.csvFormat(data));
  const teamUrls = data.map((team) => team.link);
  // Fresh accumulator for the rows collected from the linked pages.
  return scrapeTeams(teamUrls, []);
}
/**
 * Fetch each URL in `teamUrls` one at a time, collect one record per row of
 * the page's first table, and print everything as CSV once no URLs remain.
 *
 * URLs are taken from the END of `teamUrls` with `pop()`, so the array is
 * consumed by this call and pages are visited in reverse order.
 *
 * @param {string[]} teamUrls - URLs still to fetch; emptied as they are used.
 * @param {Object[]} data - Accumulator; scraped rows are appended in place.
 * @returns {Promise<Object[]|undefined>} Resolves with `data` after the CSV
 *   has been printed. If a request or parse step fails, the error is logged
 *   (not rethrown), scraping stops, and the promise resolves with undefined.
 */
function scrapeTeams(teamUrls, data) {
  // Base case: nothing left to fetch, so print what has been collected.
  // This also covers being called with an empty list, which previously
  // turned into a request for "" and failed.
  if (teamUrls.length === 0) {
    const csv = d3.csvFormat(data);
    console.log("-".repeat(80));
    console.log(csv);
    return Promise.resolve(data);
  }

  const url = teamUrls.pop();
  return rp(url)
    .then(function (html) {
      const $ = cheerio.load(html);
      // Strip <span> children from the heading so only its own text remains.
      const heading = $("div.team-name h1");
      heading.find("span").remove();
      const team = heading.text();
      // Skip the first <tr> of the first table.
      const rows = $("table").first().find("tr").slice(1);
      rows.each(function (i, element) {
        const row = $(element);
        const playerLink = row.find(".player a");
        const cells = row.find("td");
        data.push({
          player: playerLink.text(),
          link: playerLink.attr('href'),
          team: team,
          teamUrl: url,
          age: $(cells[1]).text(),
          position: $(cells[2]).text(),
          status: $(cells[3]).text(),
          baseSalary: $(cells[4]).text(),
          signingBonus: $(cells[5]).text(),
          incentives: $(cells[6]).text(),
          totalSalary: $(cells[7]).text(),
          adjSalary: $(cells[8]).text(),
          payrollPercent: $(cells[9]).text(),
        });
      });
      // Continue with the next URL; the base case above prints the CSV.
      return scrapeTeams(teamUrls, data);
    })
    .catch(function (err) { console.log(err); });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment