#######################################################################################
# Function to scrape season skater statistics from Hockey-reference.com
#######################################################################################
GrabSkaters <- function(S) {
  # Downloads the season skater table from Hockey-reference.com and returns it
  # as a cleaned data frame.
  #
  # Args:
  #   S: the season; a single value that is spliced into the page URL
  #      ("http://www.hockey-reference.com/leagues/NHL_<S>_skaters.html").
  #
  # Returns:
  #   A data frame with lower-case column names, where column 11 is renamed
  #   "plusmin" and column 18 is renamed "spct", columns 1, 3 and 6-18 are
  #   numeric, every other column is character, rows whose `rk` is NA are
  #   dropped, and a `season` column holding S is appended.
  #
  # Raises:
  #   An error if S is not a single non-missing value, if the XML package is
  #   not installed, or if the downloaded page does not contain a usable
  #   `stats` table.

  if (length(S) != 1L || is.na(S)) {
    stop("S must be a single, non-missing season value", call. = FALSE)
  }

  # Fail loudly when the dependency is missing; require() would only return
  # FALSE and let the function die later with "could not find function".
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop("Package 'XML' is required by GrabSkaters()", call. = FALSE)
  }

  ## create the URL
  # NOTE(review): the site may now redirect to https, which readHTMLTable()
  # presumably cannot fetch directly -- confirm before relying on this URL.
  URL <- paste0("http://www.hockey-reference.com/leagues/NHL_", S, "_skaters.html")

  ## grab the page -- the table with id "stats" holds the skater data
  tables <- XML::readHTMLTable(URL)
  ds.skaters <- tables$stats
  if (is.null(ds.skaters)) {
    stop("No table named 'stats' found at ", URL, call. = FALSE)
  }

  # The clean-up below addresses columns by position (up to column 18).
  min_cols <- 18L
  if (ncol(ds.skaters) < min_cols) {
    stop(
      "Expected at least ", min_cols, " columns in the skater table, found ",
      ncol(ds.skaters),
      call. = FALSE
    )
  }

  ## avoid factors: hold every column as character first,
  ## and use lower-case column names
  ds.skaters[] <- lapply(ds.skaters, as.character)
  names(ds.skaters) <- tolower(names(ds.skaters))

  ## fix a couple of the column names
  ## (an earlier page layout had the plus/minus column at position 10)
  names(ds.skaters)[11] <- "plusmin"
  names(ds.skaters)[18] <- "spct"

  # Rows are filtered on `rk` below, so make sure the column is really there.
  if (!"rk" %in% names(ds.skaters)) {
    stop("Expected a 'rk' column in the skater table", call. = FALSE)
  }

  ## finally fix the column types -- "NAs introduced by coercion" warnings are
  ## expected here: non-numeric cells (e.g. repeated header rows) become NA
  numeric_cols <- c(1, 3, 6:18)
  ds.skaters[numeric_cols] <- lapply(ds.skaters[numeric_cols], as.numeric)

  ## convert toi to seconds, and seconds/game (currently disabled)
  ## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp

  ## remove the repeated header rows: their rk could not be parsed as a number
  ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
  ## dropping the per-player totals rows is currently disabled
  ## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]

  ## add the year
  ds.skaters$season <- S

  ## return the dataframe
  ds.skaters
}