# Parse the per-episode ratings of a series from IMDb (see http://www.imdb.com/title/tt0898266/epdate?ref_=ttep_ql_4)
# NOTE: This code works with the HTML served by imdb.com as of today (06-11-2013).
# The URL with the user ratings per episode always ends with "/epdate?ref_=ttep_ql_4",
# so if the URL for Breaking Bad in IMDb is http://www.imdb.com/title/tt0903747/, the URL with
# the ratings is http://www.imdb.com/title/tt0903747/epdate?ref_=ttep_ql_4
# Download and tidy the per-episode user ratings table of a series.
#
# Args:
#   url: Address of the ratings page; handed as-is to XML::readHTMLTable().
#
# Returns:
#   The first table found on the page as a data frame, without its fifth
#   column, with the first column renamed to "Season.Episode" (character)
#   and two extra numeric columns, `season` and `episode`.
#
# Stops with an error if the XML package is not installed or if no table
# could be read from the page.
parseSeriesRatings <- function(url) {
  # Fail loudly when the dependency is absent: require() would only return
  # FALSE and let the next call die with a "could not find function" error.
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop("Package 'XML' is required to parse the ratings page.", call. = FALSE)
  }

  # download the data
  rawData <- XML::readHTMLTable(url, colClasses = rep("character", 4))
  if (length(rawData) == 0L || !is.data.frame(rawData[[1]])) {
    stop("No HTML table could be read from ", url, call. = FALSE)
  }

  # get the first value of the list returned by readHTMLTable
  series <- rawData[[1]]

  # remove the fifth column, which is noise generated by the stars available
  # in the page; drop = FALSE keeps a data frame whatever the column count
  if (ncol(series) >= 5L) {
    series <- series[, -5, drop = FALSE]
  }

  # convert the Season.Episode column (first column) from factor to character
  codes <- as.character(series[[1]])
  # remove the weird character that appears at the end of each
  # Season.Episode code; only non-digits are stripped, so a code that comes
  # without the stray character keeps its last digit
  codes <- sub("[^0-9]+$", "", codes)
  series[[1]] <- codes
  # rename the column to facilitate its access
  names(series)[1] <- "Season.Episode"

  # Split "<season>.<episode>" on the dot instead of relying on fixed
  # character positions, so seasons with two or more digits are handled.
  # A code without a dot yields NA for the episode.
  parts <- strsplit(codes, ".", fixed = TRUE)
  # get the number of season
  series$season <- vapply(parts, function(p) as.numeric(p[1]), numeric(1))
  # get the number of episode
  series$episode <- vapply(parts, function(p) as.numeric(p[2]), numeric(1))

  series
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment