Created
October 15, 2019 16:46
-
-
Save marcoranieri/90161d434b0bea5930c0c4cc2b9dc087 to your computer and use it in GitHub Desktop.
IMDB_Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "yaml"
require_relative "scraper"

# Collect the movie page URLs to visit.
puts "Fetching URLs"
movie_urls = fetch_movie_urls

# Scrape each page in turn; every result is a hash of movie details.
scraped = movie_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

# Serialize all scraped movies into movies.yml.
puts "Writing movies.yml"
File.write("movies.yml", scraped.to_yaml)

puts "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "open-uri" | |
require "nokogiri" | |
# Fetches the IMDb top chart page and returns the URLs of its first movies.
#
# Every link found under the '.titleColumn' class is rewritten to an
# "http://www.imdb.com/..." URL with its query string removed, which is the
# form the spec compares against.
#
# NOTE(review): the '.titleColumn a' selector depends on IMDb's page markup;
# confirm it still matches the live page.
#
# @param limit [Integer] maximum number of movie URLs to return
# @return [Array<String>] normalized movie URLs, in page order
def fetch_movie_urls(limit: 5)
  top_url = "https://www.imdb.com/chart/top"

  # URI.open (from open-uri) replaces the bare `open` call, which no longer
  # accepts URLs on Ruby 3.0+. The block form closes the response once read.
  html_doc = URI.open(top_url, &:read)
  links = Nokogiri::HTML(html_doc).search(".titleColumn a").first(limit)

  links.map do |link|
    # The href carries a long tracking query string; keep only
    # scheme + host + path.
    uri = URI.parse(link.attributes["href"].value)
    uri.scheme = "http"
    uri.host = "www.imdb.com"
    uri.query = nil
    uri.to_s
  end
end
# Debug aid: print the scraped URLs only when this file is executed directly,
# so that requiring it from another file does not trigger a network request.
p fetch_movie_urls if __FILE__ == $PROGRAM_NAME
# Downloads a movie page and extracts its title and release year from the
# first <h1> element.
#
# The request sends "Accept-Language: en" to avoid language conflicts in the
# returned page.
#
# @param url [String] address of the movie page
# @return [Hash] `{ title: String, year: Integer }`; `year` is nil when the
#   heading contains no "(YYYY)" group
def scrape_movie(url)
  # URI.open (from open-uri) replaces the bare `open` call, which no longer
  # accepts URLs on Ruby 3.0+. The block form closes the response once read.
  html = URI.open(url, "Accept-Language" => "en", &:read)
  heading = Nokogiri::HTML(html).search("h1").first.text

  title, year = split_title_and_year(heading)
  {
    title: title,
    year: year
  }
end

# Splits a heading such as "The Dark Knight (2008)" into ["The Dark Knight", 2008].
#
# The greedy title group makes the LAST "(YYYY...)" group count as the year,
# so titles that themselves contain parentheses stay intact.
#
# @param heading [String] raw heading text
# @return [Array(String, Integer), Array(String, nil)] trimmed title and year
def split_title_and_year(heading)
  # [[:space:]] also covers non-breaking spaces, which String#strip leaves alone.
  edges = /\A[[:space:]]+|[[:space:]]+\z/
  match = heading.match(/\A(?<title>.*)\((?<year>\d{4})[^()]*\)/m)
  return [heading.gsub(edges, ""), nil] unless match

  [match[:title].gsub(edges, ""), match[:year].to_i]
end
# Debug aid: scrape one sample page only when this file is executed directly,
# so that requiring it from another file does not trigger a network request.
p scrape_movie("https://www.imdb.com/title/tt0111161/") if __FILE__ == $PROGRAM_NAME
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative "../scraper" | |
describe "#fetch_movie_urls" do
  # NOTE: the expected list mirrors the chart order at the time of writing;
  # this example fails whenever IMDb's top ranking changes.
  it "returns an array of movies" do
    top_five = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]

    expect(fetch_movie_urls).to eq(top_five)
  end
end
describe "#scrape_movie" do
  it "returns a Hash describing a movie" do
    # Scrapes the page for The Dark Knight and checks the extracted fields.
    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")
    dark_knight = { title: "The Dark Knight", year: 2008 }

    expect(scraped).to eq(dark_knight)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment