Created
April 2, 2018 23:32
-
-
Save lathropd/fe983922b8c5abe3d42dfafaa7bff9ac to your computer and use it in GitHub Desktop.
multipage_scraper created by lathropd - https://repl.it/@lathropd/multipagescraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let request = require('request') | |
let rp = require('request-promise-native'); | |
let cheerio = require('cheerio'); | |
let d3 = require('d3'); | |
let fs = require('fs'); | |
// Entry point of the scrape: the league-wide payroll page.
var mainUrl = "http://www.spotrac.com/mlb/payroll";
var teamList = [];
var playerSalaries = [];

// Fetch the payroll page and pass its HTML to scrape().
// A bare `.catch()` registers no handler, so a failed request or an exception
// thrown inside scrape() would surface as an unhandled rejection; log it
// instead.
rp(mainUrl)
  .then(scrape)
  .catch(function (err) {
    console.log(err);
  });
/**
 * Parse the payroll index page, print one CSV record per linked team row,
 * then start scraping the linked team pages.
 *
 * @param {string} html - Markup of the payroll index page.
 * @returns {*} Whatever scrapeTeams() returns, so that a caller chaining on
 *   this function can wait for the team scrape when a promise is returned.
 */
function scrape(html) {
  const $ = cheerio.load(html);

  // Only the first table on the page is read; its first row is skipped.
  const rows = $("table").first().find("tr").slice(1);
  const data = [];

  rows.each(function (i, element) {
    const row = $(element);
    const href = row.find(".player a").attr('href');

    // Rows with no `.player a` link are skipped.
    if (!href) {
      return;
    }

    const cells = row.find("td");
    data.push({
      link: href,
      rank: $(cells[0]).text(),
      team: $(cells[1]).text(),
      roster: $(cells[2]).text(),
      twentyFiveMan: $(cells[3]).text(),
      disabledList: $(cells[4]).text(),
      retained: $(cells[5]).text(),
      buried: $(cells[6]).text(),
      suspended: $(cells[7]).text()
    });
  });

  const csv = d3.csvFormat(data);
  console.log(csv);

  const teamUrls = data.map(function (team) {
    return team.link;
  });

  return scrapeTeams(teamUrls, []);
}
/**
 * Fetch team pages one at a time, collect one record per table row, and print
 * all collected records as CSV once no URLs remain.
 *
 * URLs are taken from the END of `teamUrls` with pop(), so the array is
 * consumed in place and pages are visited in reverse order. Records are
 * appended to `data` in place.
 *
 * @param {string[]} teamUrls - Team page URLs still to be fetched.
 * @param {Object[]} data - Records collected so far.
 * @returns {Promise<void>} Settles after the CSV has been printed, or after a
 *   failure has been logged (a failure stops the remaining fetches).
 */
function scrapeTeams(teamUrls, data) {
  // Terminal case: nothing left to fetch, so emit everything collected.
  // This also covers being called with an empty list, which previously
  // issued a request for "".
  if (teamUrls.length === 0) {
    const csv = d3.csvFormat(data);
    console.log("-".repeat(80));
    console.log(csv);
    return Promise.resolve();
  }

  const url = teamUrls.pop();

  return rp(url)
    .then(function (html) {
      const $ = cheerio.load(html);

      // Strip any <span> inside the heading before reading its text.
      const heading = $("div.team-name h1");
      heading.find("span").remove();
      const team = heading.text();

      // Only the first table is read; its first row is skipped.
      const rows = $("table").first().find("tr").slice(1);

      rows.each(function (i, element) {
        const row = $(element);
        const playerLink = row.find(".player a");
        const cells = row.find("td");

        data.push({
          player: playerLink.text(),
          link: playerLink.attr('href'),
          team: team,
          teamUrl: url,
          age: $(cells[1]).text(),
          position: $(cells[2]).text(),
          status: $(cells[3]).text(),
          baseSalary: $(cells[4]).text(),
          signingBonus: $(cells[5]).text(),
          incentives: $(cells[6]).text(),
          totalSalary: $(cells[7]).text(),
          adjSalary: $(cells[8]).text(),
          payrollPercent: $(cells[9]).text()
        });
      });

      // Move on to the next team; the call prints the CSV itself once the
      // list is exhausted. (The original called the undefined `scrapeTeam`,
      // which threw after the first page and prevented any output.)
      return scrapeTeams(teamUrls, data);
    })
    .catch(function (err) {
      console.log(err);
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment