Skip to content

Instantly share code, notes, and snippets.

@marcoranieri
Created July 16, 2019 15:10
Show Gist options
  • Save marcoranieri/6cbcf1ec1de5d5a3a095f9f01ef25336 to your computer and use it in GitHub Desktop.
Save marcoranieri/6cbcf1ec1de5d5a3a095f9f01ef25336 to your computer and use it in GitHub Desktop.
scraper
require "yaml"
require_relative "scraper"
# Step 1: collect the movie page URLs to visit
puts "Fetching URLs"
movie_urls = fetch_movie_urls

# Step 2: scrape every page; each call yields one hash of movie details,
# so the result is an array of hashes
scraped_movies = movie_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

# Step 3: serialize the whole collection to a YAML file
puts "Writing movies.yml"
File.write("movies.yml", scraped_movies.to_yaml)

puts "Done."
require "open-uri"
require "nokogiri"
require 'pry'
# Create 'fetch_movie_urls' method
# Collects the movie page URLs for the first five links found on the IMDb
# top chart page.
#
# @return [Array<String>] absolute URLs ("https://www.imdb.com/...") with the
#   query string removed, in the order the links appear in the document
def fetch_movie_urls
  # url where we want to scrape the links from
  top_url = "https://www.imdb.com/chart/top"
  # Download the page. URI.open is called explicitly because open-uri's
  # patching of Kernel#open was deprecated in Ruby 2.7 and removed in 3.0,
  # where a bare `open(url)` looks for a local file instead. The block form
  # also closes the underlying IO once the body has been read.
  html = URI.open(top_url, &:read)
  doc = Nokogiri::HTML(html)
  # search the document for the '.titleColumn' class and its 'a'-tag (link),
  # then take the first 5 links we find and iterate with .map
  doc.search(".titleColumn a").take(5).map do |link|
    # 'link.attributes["href"].value' returns the link for each movie;
    # parse it with URI so the individual components can be overwritten
    uri = URI.parse(link.attributes["href"].value)
    # force the scheme and the host so the result is always an absolute URL
    uri.scheme = "https"
    uri.host = "www.imdb.com"
    # drop the query string (removes the long version of the url)
    uri.query = nil
    # return url as a string
    uri.to_s
  end
end
# Scrapes a single movie page and extracts its main details.
#
# @param url [String] address of the movie page to download
# @return [Hash] with the keys :title (String), :cast (Array<String>, at most
#   three names), :director (String), :storyline (String) and :year (Integer;
#   0 when no number can be read from the heading)
def scrape_movie(url)
  # Download the page with "Accept-Language" => "en" to avoid language
  # conflicts. URI.open is called explicitly because open-uri's patching of
  # Kernel#open was deprecated in Ruby 2.7 and removed in 3.0; the block form
  # closes the underlying IO once the body has been read.
  html = URI.open(url, "Accept-Language" => "en", &:read)
  doc = Nokogiri::HTML(html)

  # The 'h1' is expected to read "Title (Year)": drop the ')' and split on '('
  # so the first piece is the title and the last piece is the year.
  heading_parts = doc.search("h1").text.delete(")").split("(")
  # Trim with [[:space:]] rather than String#strip: the heading may end in a
  # non-breaking space, which #strip leaves in place. Trimming by character
  # class (instead of blindly cutting the last character) keeps the title
  # intact when that trailing character is absent. `to_s` covers a page
  # without any 'h1' text, where `split` returns an empty array.
  title = heading_parts.first.to_s.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
  year = heading_parts.last.to_s.strip.to_i

  # search the doc for '.primary_photo' and take the next 'td' and its 'a';
  # cast is an array with the text of the first 3 matches
  cast = doc.search(".primary_photo + td a").take(3).map do |element|
    element.text.strip
  end

  # search the doc for '.summary_text' to return the storyline
  storyline = doc.search(".summary_text").text.strip

  # search the doc for an 'h4' that contains the text 'Director:' and take the
  # next 'a'-tag
  director = doc.search("h4:contains('Director:') + a").text

  # Return a hash with all the scraped elements
  {
    title: title,
    cast: cast,
    director: director,
    storyline: storyline,
    year: year
  }
end
require_relative "../scraper"
describe "#fetch_movie_urls" do
  # NOTE: this spec compares against the live chart, so it may break whenever
  # the IMDB top order changes!
  it "returns an array of movies" do
    top_five_urls = [
      "https://www.imdb.com/title/tt0111161/",
      "https://www.imdb.com/title/tt0068646/",
      "https://www.imdb.com/title/tt0071562/",
      "https://www.imdb.com/title/tt0468569/",
      "https://www.imdb.com/title/tt0050083/"
    ]

    expect(fetch_movie_urls).to eq(top_five_urls)
  end
end
describe "#scrape_movie" do
  it "returns a Hash describing a movie" do
    # Details the scraper is expected to extract from The Dark Knight's page
    dark_knight_details = {
      cast: ["Christian Bale", "Heath Ledger", "Aaron Eckhart"],
      director: "Christopher Nolan",
      storyline: "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.",
      title: "The Dark Knight",
      year: 2008
    }

    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

    expect(scraped).to eq(dark_knight_details)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment