Created
August 13, 2020 03:07
-
-
Save tanho63/cf5a7d18ca62548eba32c08700f1d24f to your computer and use it in GitHub Desktop.
PFR while loop
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(tidyverse) | |
library(glue) | |
library(ratelimitr) | |
library(DBI) | |
# Ratelimit read_html to 100 pages per minute | |
lim_readhtml<-limit_rate(read_html,rate(n = 100,period = 60)) | |
# Set parameters of loop | |
yearmin<-2000 | |
yearmax<-2020 | |
offset<-0 | |
aws_db <- dbConnect(odbc::odbc(),"dynastyprocess_db") | |
team_ids <- dbGetQuery(aws_db,"SELECT mfl,pfr FROM dp_teamids") | |
dbDisconnect(aws_db) | |
pfr_drafted<-data.frame() | |
while (!is.na(offset)) { | |
# Construct scraper loop | |
pfr_url<-glue('https://www.pro-football-reference.com/play-index/draft-finder.cgi?request=1&year_min={ | |
yearmin}&year_max={yearmax}&show=all&order_by=default&offset={offset}') %>% | |
lim_readhtml() | |
# extract pfr ids from html attribute | |
pfrselect_names<-html_nodes(pfr_url,'.left:nth-child(5)') | |
pfr_ids<-tibble(id_pfr = html_attr(pfrselect_names,'data-append-csv'), | |
names = html_attr(pfrselect_names,'csk') | |
) %>% | |
filter(!is.na(names)) | |
pfr_table<-pfr_url %>% | |
html_node('table') %>% | |
html_table(fill=TRUE) %>% | |
set_names(.[1,]) %>% | |
rename(CollegeStats=18) %>% | |
slice(-1) %>% | |
filter(Rk!='Rk') %>% | |
mutate_at(vars(Rk,Year,Rnd,Pick,DrAge,From,To,AP1,PB,St,CarAV,G,GS),as.numeric) | |
# extract CFB reference ids | |
pfrselect_cfbid<-html_nodes(pfr_url,'.right:nth-child(18)') %>% | |
html_nodes('a') %>% | |
html_attr('href') %>% | |
str_remove('https://www.sports-reference.com/cfb/players/') %>% | |
str_remove('.html') | |
# Combine and clean | |
pfr_output<-pfr_table %>% | |
filter(CollegeStats =='College Stats') %>% | |
bind_cols(id_cfbref = pfrselect_cfbid) %>% | |
bind_rows({pfr_table %>% filter(CollegeStats!='College Stats')}) %>% | |
arrange(Rk) %>% | |
bind_cols(pfr_ids) %>% | |
left_join(team_ids,by = c('Tm'='pfr')) %>% | |
select(draft_year = Year,draft_round = Rnd,draft_pick = Pick,player = Player,team=mfl,pos = Pos, | |
draft_age = DrAge,career_from = From, career_end = To, allpro = AP1, probowl = PB, | |
years_starter = St, career_av = CarAV, games = G, games_started = GS, college = `College/Univ`, | |
id_cfbref,id_pfr) %>% | |
mutate(merge_name = str_remove_all(player,"( Jr.)|( Sr.)|( III)|( II)|( IV)|(\\')|(\\.)")) %>% | |
mutate_at('merge_name',str_squish) %>% | |
mutate_at('merge_name',tolower) | |
pfr_drafted<-pfr_drafted %>% | |
bind_rows(pfr_output) | |
# get next if available | |
offset<-html_node(pfr_url,'.next') %>% | |
html_attr('href') %>% | |
str_extract('[0-9]+$') | |
} | |
# conn_srv<-dbConnect(odbc::odbc(),'dynastyprocess_db') | |
# dbExecute(conn_srv,"TRUNCATE TABLE pfr_drafted") | |
# dbAppendTable(conn_srv,pfr_drafted, name = 'pfr_drafted') | |
# dbDisconnect(conn_srv) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment