Last active
December 27, 2015 15:09
-
-
Save vpascual/7345060 to your computer and use it in GitHub Desktop.
Parse the ratings of a series from IMDB (see http://www.imdb.com/title/tt0898266/epdate?ref_=ttep_ql_4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: This code works with the HTML served by imdb.com as of 06-11-2013.
# The URL with the user ratings per episode always ends with "/epdate?ref_=ttep_ql_4",
# so if the URL for Breaking Bad in IMDB is http://www.imdb.com/title/tt0903747/, the URL with
# the ratings is http://www.imdb.com/title/tt0903747/epdate?ref_=ttep_ql_4
#' Parse the per-episode user ratings table of a series from IMDB
#'
#' Reads the first HTML table found at `url`, drops its fifth column and
#' derives numeric season and episode numbers from the "season.episode" code
#' held in the first column.
#'
#' @param url Location of the ratings page, passed as-is to
#'   `readHTMLTable()` (e.g. a URL ending in "/epdate?ref_=ttep_ql_4").
#' @return A data.frame: the first table of the page without its fifth column,
#'   with the first column renamed `Season.Episode` (character) and two extra
#'   numeric columns, `season` and `episode`. Rows whose first column holds no
#'   "<digits>.<digits>" code keep their original text and get `NA` in
#'   `season` and `episode`.
parseSeriesRatings <- function(url) {
  # library() stops with a clear error when XML is missing, whereas require()
  # only returns FALSE and lets the failure surface later
  library(XML)

  # download the data, reading the columns as text
  rawData <- readHTMLTable(
    url,
    colClasses = c("character", "character", "character", "character")
  )
  if (length(rawData) == 0L || is.null(rawData[[1]])) {
    stop("No HTML table found at ", url, call. = FALSE)
  }

  # get the first value of the list returned by readHTMLTable
  series <- rawData[[1]]

  # remove a column full of noise generated by the stars shown in the page
  series <- series[, -5, drop = FALSE]

  # make sure the Season.Episode column is character (it may be a factor)
  codes <- as.character(series[, 1])

  # Pull out the "<season>.<episode>" digits explicitly. This discards the
  # stray character that follows each code without blindly chopping the last
  # character, and copes with seasons or episodes of any number of digits
  # (a fixed substr(x, 3, 5) turns "10.1" into episode 0.1).
  parts <- regmatches(codes, regexec("([0-9]+)\\.([0-9]+)", codes))
  field <- function(i) {
    vapply(
      parts,
      function(p) if (length(p) == 3L) p[[i]] else NA_character_,
      character(1)
    )
  }
  seasonText <- field(2L)
  episodeText <- field(3L)

  # store the cleaned code; leave rows without a recognisable code untouched
  hasCode <- !is.na(seasonText)
  codes[hasCode] <- paste(seasonText[hasCode], episodeText[hasCode], sep = ".")
  series[, 1] <- codes

  # rename the column to facilitate its access
  names(series)[1] <- "Season.Episode"

  # get the number of season and the number of episode
  series$season <- as.numeric(seasonText)
  series$episode <- as.numeric(episodeText)

  series
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment