jamiew/netflix-ratings.rb

## netflix-ratings.rb
#!/usr/bin/env ruby
require 'iconv'
require 'nokogiri'

# This is a simple script to spider your Netflix paginated "What You've Rated" list.
# It requires an OS X based system with Ruby 1.9+, Safari, and AppleScript
#
# I could not find a way to back up my ratings (for all titles, not just my rental activity)
# without registering for a Netflix API key or handing my Netflix credentials over to someone
# who had an API key, so I decided to take a brute force approach and just parse the HTML for
# every page of my ratings history on Netflix's site.
#
# INSTRUCTIONS:
#  1) Launch Safari, visit Netflix.com, log in if necessary, and visit your "What You've Rated"
#     page. If the URL for page 1 differs from that of the STARTING_URL variable below, then
#     update the variable's value. Leave the browser open on that page.
#     The script prints each rating to standard output; redirect the output to a file
#     if you want to keep it.
#  2) Set the PAGE_LOAD_GRACE variable equal to the number of seconds that you would like to
#     give Safari to fully render each individual ratings history page before grabbing the
#     HTML source for the page.
#  3) Execute this script ($> ruby <scriptname>) and be careful not to interfere with Safari
#     while it visits each page in your ratings history.

# Config
# First page of the ratings list; the main loop below begins fetching here.
STARTING_URL    = 'http://movies.netflix.com/MoviesYouveSeen'
PAGE_LOAD_GRACE = 4 # seconds of grace to allow for Safari to finish rendering a single page of ratings

# Character encoding converter instance used to force all HTML output into UTF-8 format.
# Source and target encodings are both UTF-8; the //IGNORE suffix asks iconv to
# drop input it cannot convert instead of raising (per iconv's documented behavior).
ICONV           = Iconv.new('UTF-8//IGNORE', 'UTF-8')

# Parse one page of Netflix ratings HTML, report every entry on it, and tell
# the caller where the next page lives.
#
# Every <tr> found under a table's <tbody> is treated as one entry. For each
# entry the title, genre, and rating are printed to stdout as
# "title|genre|rating", and a hash with the keys :title, :genre, and :rating
# is appended to ratings_array.
#
# html          - HTML source of a single ratings page.
# ratings_array - Array that collects one Hash per entry (mutated in place).
#
# Returns the href of the first link whose class contains "next", or "" when
# the page has no such link or the link carries no href.
def glean_movie_info(html, ratings_array=[])
  page = Nokogiri::HTML(html)
  page.xpath('//table//tbody//tr').each do |row|
    title = row.xpath('.//td[@class="cell-title"]').text.strip
    genre = row.xpath('.//td[@class="cell-genre"]').text.strip
    # The star bar text is split on ":" and the second piece is the rating.
    # A row with no star bar text (or no colon) has no second piece, so to_s
    # turns that nil into an empty rating instead of raising NoMethodError.
    star_text = row.xpath('.//td[@class="cell-starbar"]//span[contains(@class,"stbrMaskFg")]').text
    rating = star_text.strip.split(":")[1].to_s.strip
    ratings_array << { :title => title, :genre => genre, :rating => rating }
    puts "#{title}|#{genre}|#{rating}"
  end
  next_link = page.xpath('//a[contains(@class, "next")]').first
  # to_s maps a missing href (nil) to "", which is the value callers use to
  # detect the last page.
  next_link.nil? ? "" : next_link["href"].to_s
end

# Obtain the HTML source for the given URL by driving Safari via AppleScript.
#
# Safari is activated, its first document is pointed at the URL, and after
# PAGE_LOAD_GRACE seconds the document source is copied to the clipboard and
# read back with pbpaste. Side effects: Safari comes to the front and the
# clipboard contents are overwritten.
#
# url - Address to load in Safari's first document.
#
# Returns the page source passed through the ICONV converter.
def fetch_html(url)
  # Escape backslashes and double quotes so the URL cannot terminate the
  # AppleScript string literal it is embedded in.
  safe_url = url.to_s.gsub(/["\\]/) { |char| "\\#{char}" }
  applescript = <<-EOF
    tell application "Safari"
      activate
      set url of document 1 to "#{safe_url}"
      delay #{PAGE_LOAD_GRACE}
      set htmlSource to source of document 1
      set the clipboard to htmlSource as text
    end tell
  EOF
  # The argument-array form of IO.popen runs osascript without a shell, so
  # quotes inside the script (or the URL) are never interpreted by a shell.
  raw = IO.popen(['osascript', '-e', applescript]) { |io| io.read }
  # Read the clipboard only when osascript succeeded, like the former
  # "osascript ... && pbpaste" chain did.
  raw += IO.popen('pbpaste') { |io| io.read } if $?.success?
  # A trailing space is appended before conversion and removed afterwards.
  ICONV.iconv(raw + ' ')[0..-2]
end

# Walk the ratings pages in order, beginning at STARTING_URL: fetch a page,
# glean its entries, then continue with the URL that glean_movie_info
# returns. An empty string means there is no further page and ends the walk.
# The same ratings array is handed to every glean_movie_info call.
ratings = []
url_to_fetch = STARTING_URL
while url_to_fetch != ""
  page_html = fetch_html(url_to_fetch)
  url_to_fetch = glean_movie_info(page_html, ratings)
end
	#!/usr/bin/env ruby
	require 'iconv'
	require 'nokogiri'

	# This is a simple script to spider your Netflix paginated "What You've Rated" list.
	# It requires an OS X based system with Ruby 1.9+, Safari, and AppleScript
	#
	# I could not find a way to back up my ratings (for all titles, not just my rental activity)
	# without registering for a Netflix API key or handing my Netflix credentials over to someone
	# who had an API key, so I decided to take a brute force approach and just parse the HTML for
	# every page of my ratings history on Netflix's site.
	#
	# INSTRUCTIONS:
	# 1) Launch Safari, visit Netflix.com, log in if necessary, and visit your "What You've Rated"
	# page. If the URL for page 1 differs from that of the STARTING_URL variable below, then
	# update the variable's value. Leave the browser open on that page.
	# The script prints each rating to standard output; redirect the output to a file
	# if you want to keep it.
	# 2) Set the PAGE_LOAD_GRACE variable equal to the number of seconds that you would like to
	# give Safari to fully render each individual ratings history page before grabbing the
	# HTML source for the page.
	# 3) Execute this script ($> ruby <scriptname>) and be careful not to interfere with Safari
	# while it visits each page in your ratings history.

	# Config
	# First page of the ratings list; the main loop below begins fetching here.
	STARTING_URL = 'http://movies.netflix.com/MoviesYouveSeen'
	PAGE_LOAD_GRACE = 4 # seconds of grace to allow for Safari to finish rendering a single page of ratings

	# Character encoding converter instance used to force all HTML output into UTF-8 format.
	# Source and target encodings are both UTF-8; the //IGNORE suffix asks iconv to
	# drop input it cannot convert instead of raising (per iconv's documented behavior).
	ICONV = Iconv.new('UTF-8//IGNORE', 'UTF-8')

	# Parse one page of Netflix ratings HTML, report every entry on it, and tell
	# the caller where the next page lives.
	#
	# Every <tr> found under a table's <tbody> is treated as one entry. For each
	# entry the title, genre, and rating are printed to stdout as
	# "title|genre|rating", and a hash with the keys :title, :genre, and :rating
	# is appended to ratings_array.
	#
	# html          - HTML source of a single ratings page.
	# ratings_array - Array that collects one Hash per entry (mutated in place).
	#
	# Returns the href of the first link whose class contains "next", or "" when
	# the page has no such link or the link carries no href.
	def glean_movie_info(html, ratings_array=[])
	  page = Nokogiri::HTML(html)
	  page.xpath('//table//tbody//tr').each do |row|
	    title = row.xpath('.//td[@class="cell-title"]').text.strip
	    genre = row.xpath('.//td[@class="cell-genre"]').text.strip
	    # The star bar text is split on ":" and the second piece is the rating.
	    # A row with no star bar text (or no colon) has no second piece, so to_s
	    # turns that nil into an empty rating instead of raising NoMethodError.
	    star_text = row.xpath('.//td[@class="cell-starbar"]//span[contains(@class,"stbrMaskFg")]').text
	    rating = star_text.strip.split(":")[1].to_s.strip
	    ratings_array << { :title => title, :genre => genre, :rating => rating }
	    puts "#{title}|#{genre}|#{rating}"
	  end
	  next_link = page.xpath('//a[contains(@class, "next")]').first
	  # to_s maps a missing href (nil) to "", which is the value callers use to
	  # detect the last page.
	  next_link.nil? ? "" : next_link["href"].to_s
	end

	# Obtain the HTML source for the given URL by driving Safari via AppleScript.
	#
	# Safari is activated, its first document is pointed at the URL, and after
	# PAGE_LOAD_GRACE seconds the document source is copied to the clipboard and
	# read back with pbpaste. Side effects: Safari comes to the front and the
	# clipboard contents are overwritten.
	#
	# url - Address to load in Safari's first document.
	#
	# Returns the page source passed through the ICONV converter.
	def fetch_html(url)
	  # Escape backslashes and double quotes so the URL cannot terminate the
	  # AppleScript string literal it is embedded in.
	  safe_url = url.to_s.gsub(/["\\]/) { |char| "\\#{char}" }
	  applescript = <<-EOF
	    tell application "Safari"
	      activate
	      set url of document 1 to "#{safe_url}"
	      delay #{PAGE_LOAD_GRACE}
	      set htmlSource to source of document 1
	      set the clipboard to htmlSource as text
	    end tell
	  EOF
	  # The argument-array form of IO.popen runs osascript without a shell, so
	  # quotes inside the script (or the URL) are never interpreted by a shell.
	  raw = IO.popen(['osascript', '-e', applescript]) { |io| io.read }
	  # Read the clipboard only when osascript succeeded, like the former
	  # "osascript ... && pbpaste" chain did.
	  raw += IO.popen('pbpaste') { |io| io.read } if $?.success?
	  # A trailing space is appended before conversion and removed afterwards.
	  ICONV.iconv(raw + ' ')[0..-2]
	end

	# Walk the ratings pages in order, beginning at STARTING_URL: fetch a page,
	# glean its entries, then continue with the URL that glean_movie_info
	# returns. An empty string means there is no further page and ends the walk.
	# The same ratings array is handed to every glean_movie_info call.
	ratings = []
	url_to_fetch = STARTING_URL
	while url_to_fetch != ""
	  page_html = fetch_html(url_to_fetch)
	  url_to_fetch = glean_movie_info(page_html, ratings)
	end