marcoranieri/interface.rb

## interface.rb
require "yaml"
require_relative "scraper"

# Collect the detail-page URLs of the movies to scrape.
puts "Fetching URLs"
movie_urls = fetch_movie_urls

# Scrape each page in turn, announcing progress; the result is one hash of
# details per movie.
scraped = movie_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

# Serialize every scraped movie to YAML and store it in movies.yml.
puts "Writing movies.yml"
File.write("movies.yml", scraped.to_yaml)

puts "Done."

## scraper.rb
require "open-uri"
require "nokogiri"
require 'pry'

# Fetches the IMDb top chart and returns the page URLs of its first five
# entries.
#
# @return [Array<String>] https://www.imdb.com/... URLs with the query string
#   removed
def fetch_movie_urls
  # page the movie links are scraped from
  top_url = "https://www.imdb.com/chart/top"
  # URI.open instead of Kernel#open: open-uri stopped patching Kernel#open in
  # Ruby 3.0, so a bare open(url) would look for a local file. The block form
  # closes the response as soon as its body has been read.
  doc = Nokogiri::HTML(URI.open(top_url, &:read))
  # every link inside an element carrying the '.titleColumn' class
  links = doc.search(".titleColumn a")
  # keep the first 5 links and turn each one into a canonical absolute URL
  links.take(5).map do |link|
    uri = URI.parse(link["href"])
    # pin the scheme and host explicitly so the result is an absolute URL
    uri.scheme = "https"
    uri.host = "www.imdb.com"
    # drop the query string so only the short form of the URL remains
    uri.query = nil
    uri.to_s
  end
end

# Scrapes a single IMDb movie page into a hash of its key details.
#
# @param url [String] address of the movie page
# @return [Hash] with keys :title (String), :cast (Array<String>, at most
#   three names), :director (String), :storyline (String), :year (Integer)
def scrape_movie(url)
  # Ask for the English page so the scraped text does not depend on locale.
  # URI.open instead of Kernel#open: open-uri stopped patching Kernel#open in
  # Ruby 3.0. The block form closes the response once its body has been read.
  html = URI.open(url, "Accept-Language" => "en", &:read)
  doc = Nokogiri::HTML(html)
  # The h1 is expected to look like "Title (Year)": drop every ')' and split
  # on '(' so the title is the first piece and the year the last.
  whole_title = doc.search("h1").text.delete(")").split("(")
  # String#strip leaves a trailing non-breaking space in place, so trim with
  # the Unicode-aware [[:space:]] class instead of cutting the last character
  # off unconditionally (which would eat a letter when no such space exists).
  title = whole_title.first.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
  year = whole_title.last.strip.to_i
  # links in the cell right after each '.primary_photo' cell; the first three
  # are kept as the cast
  cast = doc.search(".primary_photo + td a").take(3).map { |element| element.text.strip }
  # text of the '.summary_text' element is used as the storyline
  storyline = doc.search(".summary_text").text.strip
  # the link immediately following the h4 that contains 'Director:'
  director = doc.search("h4:contains('Director:') + a").text
  {
    title: title,
    cast: cast,
    director: director,
    storyline: storyline,
    year: year
  }
end

## scraper_specs.rb
require_relative "../scraper"

describe "#fetch_movie_urls" do
  # NOTE: this example hits the live IMDB top chart, so it may break whenever
  # the ranking changes.
  it "returns an array of movies" do
    top_five = %w[
      https://www.imdb.com/title/tt0111161/
      https://www.imdb.com/title/tt0068646/
      https://www.imdb.com/title/tt0071562/
      https://www.imdb.com/title/tt0468569/
      https://www.imdb.com/title/tt0050083/
    ]

    expect(fetch_movie_urls).to eq(top_five)
  end
end

describe "#scrape_movie" do
  it "returns a Hash describing a movie" do
    # Details expected for The Dark Knight's page.
    dark_knight = {
      title: "The Dark Knight",
      year: 2008,
      director: "Christopher Nolan",
      cast: ["Christian Bale", "Heath Ledger", "Aaron Eckhart"],
      storyline: "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice."
    }

    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

    expect(scraped).to eq(dark_knight)
  end
end
	require "yaml"
	require_relative "scraper"

	# Fetch the list of movie page URLs to scrape.
	puts "Fetching URLs"
	urls = fetch_movie_urls

	# Scrape every URL, logging progress; the result is an array holding one
	# hash of details per movie.
	movies = urls.map do |url|
	  puts "Scraping #{url}"
	  scrape_movie(url)
	end

	# Write all scraped movies to movies.yml as YAML.
	puts "Writing movies.yml"
	File.open("movies.yml", "w") do |f|
	  f.write(movies.to_yaml)
	end

	puts "Done."
	require "open-uri"
	require "nokogiri"
	require 'pry'

	# Fetches the IMDb top chart and returns the page URLs of its first five
	# entries.
	#
	# @return [Array<String>] https://www.imdb.com/... URLs with the query string
	#   removed
	def fetch_movie_urls
	  # page the movie links are scraped from
	  top_url = "https://www.imdb.com/chart/top"
	  # URI.open instead of Kernel#open: open-uri stopped patching Kernel#open in
	  # Ruby 3.0, so a bare open(url) would look for a local file. The block form
	  # closes the response as soon as its body has been read.
	  doc = Nokogiri::HTML(URI.open(top_url, &:read))
	  # every link inside an element carrying the '.titleColumn' class
	  movies = doc.search(".titleColumn a")
	  # keep the first 5 links and turn each one into a canonical absolute URL
	  movies.take(5).map do |movie|
	    uri = URI.parse(movie["href"])
	    # pin the scheme and host explicitly so the result is an absolute URL
	    uri.scheme = "https"
	    uri.host = "www.imdb.com"
	    # drop the query string so only the short form of the URL remains
	    uri.query = nil
	    uri.to_s
	  end
	end

	# Scrapes a single IMDb movie page into a hash of its key details.
	#
	# @param url [String] address of the movie page
	# @return [Hash] with keys :title (String), :cast (Array<String>, at most
	#   three names), :director (String), :storyline (String), :year (Integer)
	def scrape_movie(url)
	  # Ask for the English page so the scraped text does not depend on locale.
	  # URI.open instead of Kernel#open: open-uri stopped patching Kernel#open in
	  # Ruby 3.0. The block form closes the response once its body has been read.
	  doc = Nokogiri::HTML(URI.open(url, "Accept-Language" => "en", &:read))
	  # The h1 is expected to look like "Title (Year)": drop every ')' and split
	  # on '(' so the title is the first piece and the year the last.
	  whole_title = doc.search("h1").text.gsub(")", "").split("(")
	  # String#strip leaves a trailing non-breaking space in place, so trim with
	  # the Unicode-aware [[:space:]] class instead of cutting the last character
	  # off unconditionally (which would eat a letter when no such space exists).
	  title = whole_title.first.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
	  year = whole_title.last.strip.to_i
	  # links in the cell right after each '.primary_photo' cell; the first
	  # three are kept as the cast
	  cast = doc.search(".primary_photo + td a").take(3).map do |element|
	    element.text.strip
	  end
	  # text of the '.summary_text' element is used as the storyline
	  storyline = doc.search(".summary_text").text.strip
	  # the link immediately following the h4 that contains 'Director:'
	  director = doc.search("h4:contains('Director:') + a").text
	  {
	    title: title,
	    cast: cast,
	    director: director,
	    storyline: storyline,
	    year: year
	  }
	end
	require_relative "../scraper"

	describe "#fetch_movie_urls" do
	  # NOTE: this example hits the live IMDB top chart, so it may break whenever
	  # the ranking changes.
	  it "returns an array of movies" do
	    top_five = %w[
	      https://www.imdb.com/title/tt0111161/
	      https://www.imdb.com/title/tt0068646/
	      https://www.imdb.com/title/tt0071562/
	      https://www.imdb.com/title/tt0468569/
	      https://www.imdb.com/title/tt0050083/
	    ]

	    expect(fetch_movie_urls).to eq(top_five)
	  end
	end

	describe "#scrape_movie" do
	  it "returns a Hash describing a movie" do
	    # Details expected for The Dark Knight's page.
	    dark_knight = {
	      title: "The Dark Knight",
	      year: 2008,
	      director: "Christopher Nolan",
	      cast: ["Christian Bale", "Heath Ledger", "Aaron Eckhart"],
	      storyline: "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice."
	    }

	    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

	    expect(scraped).to eq(dark_knight)
	  end
	end