# marcoranieri/interface.rb

## interface.rb
require "yaml"
require_relative "scraper"

# Collect the list of movie page URLs to process.
puts "Fetching URLs"
urls = fetch_movie_urls

# Scrape every URL in turn; the result is an array of movie-detail hashes.
movies = urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

# Serialize the scraped movies as YAML and store them in movies.yml.
puts "Writing movies.yml"
File.write("movies.yml", movies.to_yaml)

puts "Done."

## scraper.rb
require "open-uri"
require "nokogiri"

# Fetch the URLs of the first movies listed on the IMDb top chart page.
#
# Downloads the chart page, collects the links found inside the
# '.titleColumn' elements and rewrites each href into the form
# "http://www.imdb.com/<path>" (scheme and host forced, query string dropped)
# so the result matches the URLs expected by the spec.
#
# @param limit [Integer] how many links to keep from the top of the page
# @return [Array<String>] normalized movie URLs, in page order
def fetch_movie_urls(limit: 5)
  # url where we want to scrape the links from
  top_url = "https://www.imdb.com/chart/top"

  # URI.open is called explicitly: the Kernel#open override that accepted URLs
  # is no longer provided by open-uri on Ruby 3.0+.
  html_doc = URI.open(top_url).read
  html_nodes = Nokogiri::HTML(html_doc)

  # keep only the first `limit` links found under '.titleColumn'
  html_nodes.search(".titleColumn a").first(limit).map do |link|
    uri = URI.parse(link.attributes["href"].value)
    uri.scheme = "http"
    uri.host = "www.imdb.com"
    # dropping the query leaves only the short '/title/<id>/' form of the url
    uri.query = nil
    uri.to_s
  end
end

# Debug output only when this file is executed directly; without the guard the
# scrape would run (and print) every time the file is required.
p fetch_movie_urls if __FILE__ == $PROGRAM_NAME


# Scrape the title and release year of one movie from its page.
#
# The first <h1> of the page is expected to read like "Title (Year)".
#
# @param url [String] URL of the movie page
# @return [Hash] +:title+ (String) and +:year+ (Integer); +:year+ is 0 when
#   the heading has no parenthesized part
# @raise [RuntimeError] if the page contains no <h1> element
def scrape_movie(url)
  # "Accept-Language" => "en" is sent to avoid language conflicts.
  # URI.open is called explicitly: the Kernel#open override that accepted URLs
  # is no longer provided by open-uri on Ruby 3.0+.
  doc = Nokogiri::HTML(URI.open(url, "Accept-Language" => "en").read)

  heading = doc.search("h1").first
  raise "No <h1> heading found at #{url}" unless heading

  segments = heading.text.split("(")

  # Trim surrounding whitespace rather than blindly dropping the last
  # character; [[:space:]] also matches non-ASCII spaces (e.g. a non-breaking
  # space) that String#strip leaves in place.
  title = segments.first.to_s.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")

  # The year is read from the last "(" segment, and only if there is one.
  year = segments.size > 1 ? segments.last.to_i : 0

  # Return a hash with all the scraped elements
  {
    title: title,
    year: year
  }
end

# Debug output only when this file is executed directly; without the guard the
# scrape would run (and print) every time the file is required.
p scrape_movie("https://www.imdb.com/title/tt0111161/") if __FILE__ == $PROGRAM_NAME

## scraper_spec.rb
require_relative "../scraper"

describe "#fetch_movie_urls" do
  # NOTE: the expected list mirrors the IMDB top order at the time of writing;
  # this example breaks whenever that order changes.
  it "returns an array of movies" do
    top_five = [
      "http://www.imdb.com/title/tt0111161/",
      "http://www.imdb.com/title/tt0068646/",
      "http://www.imdb.com/title/tt0071562/",
      "http://www.imdb.com/title/tt0468569/",
      "http://www.imdb.com/title/tt0050083/"
    ]

    expect(fetch_movie_urls).to eq(top_five)
  end
end

describe "#scrape_movie" do
  it "returns a Hash describing a movie" do
    # tt0468569 is the page for "The Dark Knight"
    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

    expect(scraped).to eq({ title: "The Dark Knight", year: 2008 })
  end
end
	require "yaml"
	require_relative "scraper"

	# Collect the list of movie page URLs to process.
	puts "Fetching URLs"
	urls = fetch_movie_urls

	# Scrape every URL in turn; the result is an array of movie-detail hashes.
	movies = urls.map do |url|
	  puts "Scraping #{url}"
	  scrape_movie(url)
	end

	# Serialize the scraped movies as YAML and store them in movies.yml.
	puts "Writing movies.yml"
	File.open("movies.yml", "w") do |f|
	  f.write(movies.to_yaml)
	end

	puts "Done."
	require "open-uri"
	require "nokogiri"

	# Fetch the URLs of the first movies listed on the IMDb top chart page.
	#
	# Downloads the chart page, collects the links found inside the
	# '.titleColumn' elements and rewrites each href into the form
	# "http://www.imdb.com/<path>" (scheme and host forced, query string dropped)
	# so the result matches the URLs expected by the spec.
	#
	# @param limit [Integer] how many links to keep from the top of the page
	# @return [Array<String>] normalized movie URLs, in page order
	def fetch_movie_urls(limit: 5)
	  # url where we want to scrape the links from
	  top_url = "https://www.imdb.com/chart/top"

	  # URI.open is called explicitly: the Kernel#open override that accepted URLs
	  # is no longer provided by open-uri on Ruby 3.0+.
	  html_doc = URI.open(top_url).read
	  html_nodes = Nokogiri::HTML(html_doc)

	  # keep only the first `limit` links found under '.titleColumn'
	  html_nodes.search(".titleColumn a").first(limit).map do |link|
	    uri = URI.parse(link.attributes["href"].value)
	    uri.scheme = "http"
	    uri.host = "www.imdb.com"
	    # dropping the query leaves only the short '/title/<id>/' form of the url
	    uri.query = nil
	    uri.to_s
	  end
	end

	# Debug output only when this file is executed directly; without the guard the
	# scrape would run (and print) every time the file is required.
	p fetch_movie_urls if __FILE__ == $PROGRAM_NAME




	# Scrape the title and release year of one movie from its page.
	#
	# The first <h1> of the page is expected to read like "Title (Year)".
	#
	# @param url [String] URL of the movie page
	# @return [Hash] +:title+ (String) and +:year+ (Integer); +:year+ is 0 when
	#   the heading has no parenthesized part
	# @raise [RuntimeError] if the page contains no <h1> element
	def scrape_movie(url)
	  # "Accept-Language" => "en" is sent to avoid language conflicts.
	  # URI.open is called explicitly: the Kernel#open override that accepted URLs
	  # is no longer provided by open-uri on Ruby 3.0+.
	  doc = Nokogiri::HTML(URI.open(url, "Accept-Language" => "en").read)

	  heading = doc.search("h1").first
	  raise "No <h1> heading found at #{url}" unless heading

	  segments = heading.text.split("(")

	  # Trim surrounding whitespace rather than blindly dropping the last
	  # character; [[:space:]] also matches non-ASCII spaces (e.g. a non-breaking
	  # space) that String#strip leaves in place.
	  title = segments.first.to_s.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")

	  # The year is read from the last "(" segment, and only if there is one.
	  year = segments.size > 1 ? segments.last.to_i : 0

	  # Return a hash with all the scraped elements
	  {
	    title: title,
	    year: year
	  }
	end

	# Debug output only when this file is executed directly; without the guard the
	# scrape would run (and print) every time the file is required.
	p scrape_movie("https://www.imdb.com/title/tt0111161/") if __FILE__ == $PROGRAM_NAME
	require_relative "../scraper"

	describe "#fetch_movie_urls" do
	  # NOTE: the expected list mirrors the IMDB top order at the time of writing;
	  # this example breaks whenever that order changes.
	  it "returns an array of movies" do
	    top_five = [
	      "http://www.imdb.com/title/tt0111161/",
	      "http://www.imdb.com/title/tt0068646/",
	      "http://www.imdb.com/title/tt0071562/",
	      "http://www.imdb.com/title/tt0468569/",
	      "http://www.imdb.com/title/tt0050083/"
	    ]

	    expect(fetch_movie_urls).to eq(top_five)
	  end
	end

	describe "#scrape_movie" do
	  it "returns a Hash describing a movie" do
	    # tt0468569 is the page for "The Dark Knight"
	    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

	    expect(scraped).to eq({ title: "The Dark Knight", year: 2008 })
	  end
	end