Created
July 16, 2019 15:10
-
-
Save marcoranieri/6cbcf1ec1de5d5a3a095f9f01ef25336 to your computer and use it in GitHub Desktop.
scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "yaml" | |
require_relative "scraper" | |
# Step 1: collect the list of movie page URLs.
puts "Fetching URLs"
movie_urls = fetch_movie_urls

# Step 2: scrape each page in turn, building an array of movie-detail hashes.
movies = movie_urls.map { |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
}

# Step 3: serialize the whole array as YAML into movies.yml
# (the file is created, or truncated if it already exists).
puts "Writing movies.yml"
File.write("movies.yml", movies.to_yaml)

puts "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "open-uri" | |
require "nokogiri" | |
require 'pry' | |
# Create 'fetch_movie_urls' method | |
# Fetches the IMDb top chart page and returns absolute URLs for the first
# movie links found on it.
#
# @param limit [Integer] how many leading links to return (defaults to 5)
# @return [Array<String>] "https://www.imdb.com/..." URLs without a query string
def fetch_movie_urls(limit: 5)
  # url where we want to scrape the links from
  top_url = "https://www.imdb.com/chart/top"
  # URI.open is open-uri's explicit entry point: Kernel#open on a URL is
  # deprecated since Ruby 2.7 and no longer works on Ruby 3.0+. The block form
  # reads the body and closes the underlying IO instead of leaking it.
  html = URI.open(top_url, &:read)
  # parse the html through Nokogiri
  doc = Nokogiri::HTML(html)
  # search the document for the '.titleColumn' class and its 'a'-tag (link)
  movies = doc.search(".titleColumn a")
  # take the first `limit` links we find and iterate with .map
  movies.take(limit).map do |movie|
    # binding.pry
    # 'movie.attributes["href"].value' returns the link for each movie;
    # parse it with URI so the individual URL components can be overwritten
    uri = URI.parse(movie.attributes["href"].value)
    # force the scheme and the host so the result is an absolute https URL
    uri.scheme = "https"
    uri.host = "www.imdb.com"
    # drop the query string (removes the long version of the url)
    uri.query = nil
    # return url as a string
    uri.to_s
  end
end
# Scrapes one IMDb movie page and returns its details as a hash.
#
# @param url [String] URL of the movie page to scrape
# @return [Hash] with the keys :title (String), :cast (Array<String>, at most
#   3 names), :director (String), :storyline (String) and :year (Integer)
def scrape_movie(url)
  # URI.open replaces Kernel#open, which no longer accepts URLs on Ruby 3.0+;
  # the block form reads the body and closes the underlying IO.
  # "Accept-Language" => "en" is sent to avoid language conflicts.
  html = URI.open(url, "Accept-Language" => "en", &:read)
  # parse the html through Nokogiri
  doc = Nokogiri::HTML(html)

  # take the text of the 'h1', remove the ')' and split on '('
  # (assumes the heading looks like "Title (Year)" - verify against the page)
  whole_title = doc.search("h1").text.delete(")").split("(")
  # The first part is the title. It may end in whitespace that String#strip
  # does not remove (presumably a non-breaking space), so trim with the
  # Unicode-aware [[:space:]] class. Trimming whitespace only, rather than
  # slicing off the last character, keeps the title intact when no such
  # trailing character is present.
  title = whole_title.first.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "")
  # the last part is the year; turn it into an integer
  year = whole_title.last.strip.to_i

  # search the doc for '.primary_photo', take the next 'td' and its 'a';
  # cast is an array with the names of the first 3 matches
  cast = doc.search(".primary_photo + td a").take(3).map do |element|
    element.text.strip
  end

  # search the doc for '.summary_text' to return the storyline
  storyline = doc.search(".summary_text").text.strip
  # search the doc for an 'h4' that contains the text 'Director:' and take the
  # 'a'-tag right after it
  director = doc.search("h4:contains('Director:') + a").text

  # return a hash with all the scraped elements
  {
    title: title,
    cast: cast,
    director: director,
    storyline: storyline,
    year: year
  }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative "../scraper" | |
describe "#fetch_movie_urls" do
  # NOTE: the expected list mirrors the live IMDB top chart, so this example
  # may break whenever the chart order changes!
  it "returns an array of movies" do
    top_five = %w[
      https://www.imdb.com/title/tt0111161/
      https://www.imdb.com/title/tt0068646/
      https://www.imdb.com/title/tt0071562/
      https://www.imdb.com/title/tt0468569/
      https://www.imdb.com/title/tt0050083/
    ]

    expect(fetch_movie_urls).to eq(top_five)
  end
end
describe "#scrape_movie" do
  it "returns a Hash describing a movie" do
    # Details expected for the page of "The Dark Knight".
    dark_knight = {
      title: "The Dark Knight",
      year: 2008,
      director: "Christopher Nolan",
      cast: ["Christian Bale", "Heath Ledger", "Aaron Eckhart"],
      storyline: "When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice."
    }

    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

    expect(scraped).to eq(dark_knight)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment