Last active
April 4, 2018 17:57
-
-
Save lathropd/d7955f0b1ed2d488111fc868b69504b3 to your computer and use it in GitHub Desktop.
multipage_scraper created by lathropd - https://repl.it/@lathropd/multipagescraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let request = require('request') | |
let rp = require('request-promise-native'); | |
let cheerio = require('cheerio'); | |
let d3 = require('d3'); | |
// Payroll overview page; scrape() reads one row per team from its first table.
var mainUrl = "http://www.spotrac.com/mlb/payroll";
// NOTE(review): not referenced anywhere in the visible code; kept in case code
// outside this view relies on it.
var playerSalaries = [];

// Fetch the overview page and hand its HTML to scrape(). A bare `.catch()`
// registers no handler at all, so a failed request or an exception thrown
// while parsing would only surface as an unhandled rejection. Report it
// explicitly on stderr so it does not mix with the CSV written to stdout.
rp(mainUrl)
  .then(scrape)
  .catch(function (err) {
    console.error(err);
  });
/**
 * Parses the payroll overview page and starts scraping the linked team pages.
 *
 * Takes the first <table> in the document, skips its first row, and builds
 * one record for every remaining row that contains a `.player a` link. The
 * records are printed to stdout as CSV, and their links are then passed to
 * scrapeTeams() together with a fresh, empty accumulator.
 *
 * @param {string} html - Markup of the overview page.
 * @returns {*} Whatever scrapeTeams() returns, so that a caller chaining on
 *   this function can observe the follow-up work instead of losing track of it.
 */
function scrape(html) {
  const $ = cheerio.load(html);
  // slice(1) drops the first row of the table.
  const rows = $("table").first().find("tr").slice(1);
  const data = [];

  rows.each(function (i, row) {
    const $row = $(row);
    const href = $row.find(".player a").attr("href");
    // Rows without a link cannot be followed to a team page, so they are
    // left out. A bare `return` (undefined) keeps the cheerio loop going;
    // only an explicit `false` would stop it.
    if (!href) {
      return;
    }

    const cells = $row.find("td");
    // Missing cells yield an empty string rather than throwing.
    const cellText = function (index) {
      return $(cells[index]).text();
    };

    data.push({
      link: href,
      rank: cellText(0),
      team: cellText(1),
      roster: cellText(2),
      twentyFiveMan: cellText(3),
      disabledList: cellText(4),
      retained: cellText(5),
      buried: cellText(6),
      suspended: cellText(7),
    });
  });

  console.log(d3.csvFormat(data));

  const teamUrls = data.map(function (team) {
    return team.link;
  });
  return scrapeTeams(teamUrls, []);
}
/**
 * Scrapes the team pages one after another and prints every collected row
 * as CSV once the list is exhausted.
 *
 * URLs are taken from the END of `teamUrls` with pop(), so the array is
 * consumed by this function and pages are visited in reverse list order.
 * For each page the first <table> is read (its first row is skipped) and one
 * record per remaining row is appended to `data`.
 *
 * A page that fails to download or parse is logged to stderr and skipped, so
 * a single bad page no longer discards the rows already collected.
 *
 * @param {string[]} teamUrls - Team page URLs still to visit (consumed).
 * @param {Object[]} data - Accumulator the row records are appended to.
 * @returns {Promise<Object[]>} Resolves with `data` once every URL has been
 *   processed. Resolves immediately, without requesting or printing
 *   anything, when there is nothing to visit.
 */
function scrapeTeams(teamUrls, data) {
  // Guard: previously an empty list turned into a request for "" and failed.
  if (!Array.isArray(teamUrls) || teamUrls.length === 0) {
    return Promise.resolve(data);
  }

  const url = teamUrls.pop();

  // Starting from a resolved promise turns a synchronous throw from rp()
  // (e.g. for an invalid URL entry) into a rejection handled below.
  return Promise.resolve()
    .then(function () {
      return rp(url);
    })
    .then(function (html) {
      const $ = cheerio.load(html);

      // Remove nested <span> elements from the heading before reading its text.
      const heading = $("div.team-name h1");
      heading.find("span").remove();
      const team = heading.text();

      // slice(1) drops the first row of the table.
      const rows = $("table").first().find("tr").slice(1);
      rows.each(function (i, row) {
        const $row = $(row);
        const playerLink = $row.find(".player a");
        const cells = $row.find("td");
        // Missing cells yield an empty string rather than throwing.
        const cellText = function (index) {
          return $(cells[index]).text();
        };

        data.push({
          player: playerLink.text(),
          link: playerLink.attr("href"),
          team: team,
          teamUrl: url,
          age: cellText(1),
          position: cellText(2),
          status: cellText(3),
          baseSalary: cellText(4),
          signingBonus: cellText(5),
          incentives: cellText(6),
          totalSalary: cellText(7),
          adjSalary: cellText(8),
          payrollPercent: cellText(9),
        });
      });
    })
    .catch(function (err) {
      // Report the failure on stderr (keeps stdout clean for the CSV) and
      // fall through so the remaining teams are still processed.
      console.error(err);
    })
    .then(function () {
      // Scrape the next team if there is one, otherwise print the list.
      if (teamUrls.length > 0) {
        return scrapeTeams(teamUrls, data);
      }
      console.log("-".repeat(80));
      console.log(d3.csvFormat(data));
      return data;
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment