Skip to content

Instantly share code, notes, and snippets.

@marcoranieri
Created October 15, 2019 16:46
Show Gist options
  • Save marcoranieri/90161d434b0bea5930c0c4cc2b9dc087 to your computer and use it in GitHub Desktop.
Save marcoranieri/90161d434b0bea5930c0c4cc2b9dc087 to your computer and use it in GitHub Desktop.
IMDB_Scraper
require "yaml"
require_relative "scraper"
# Collect the list of movie URLs to scrape.
puts "Fetching URLs"
movie_urls = fetch_movie_urls

# Turn each URL into a hash of movie details, logging progress along the way.
scraped = movie_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

# Dump every scraped movie into a YAML file.
puts "Writing movies.yml"
File.write("movies.yml", scraped.to_yaml)
puts "Done."
require "open-uri"
require "nokogiri"
# Downloads the IMDb top chart page and returns the URLs of the first five
# movie links found on it.
#
# Every link's href is rewritten to use the "http" scheme and the
# "www.imdb.com" host, and its query string is removed so only the short
# form of the URL remains.
#
# @return [Array<String>] at most five movie URLs, in the order the links
#   are found on the page
def fetch_movie_urls
  top_url = "https://www.imdb.com/chart/top"
  # URI.open is open-uri's explicit entry point: passing a URL to Kernel#open
  # is deprecated since Ruby 2.7 and no longer works on Ruby 3.0+. The block
  # form also closes the response stream once it has been read.
  html_doc = URI.open(top_url, &:read)
  html_nodes = Nokogiri::HTML(html_doc)
  # Search for the 'a' tags under the '.titleColumn' class, keeping the first 5.
  links = html_nodes.search(".titleColumn a").first(5)
  links.map do |link|
    # The href is parsed with URI so its parts can be replaced individually.
    uri = URI.parse(link.attributes["href"].value)
    uri.scheme = "http"
    uri.host = "www.imdb.com"
    # Clearing the query removes the long (tracking) version of the URL.
    uri.query = nil
    uri.to_s
  end
end
# Debug output: fetches the URL list and prints it every time this file is loaded.
# NOTE(review): consider removing or guarding this before requiring the file elsewhere.
puts fetch_movie_urls.inspect
# Scrapes the title and year of a single movie from its page.
#
# The text of the page's first <h1> is split on "(": the part before it
# (minus its last character) becomes the title, and the leading number of
# the part after the last "(" becomes the year.
# NOTE(review): this assumes the heading reads like "Title (Year)" with one
# separator character before "(" — confirm against the live page markup.
#
# @param url [String] address of the movie page to scrape
# @return [Hash] a hash with :title (String) and :year (Integer)
def scrape_movie(url)
  # "Accept-Language" => "en" is sent to avoid language conflicts.
  # URI.open replaces the URL form of Kernel#open (deprecated in Ruby 2.7,
  # gone in 3.0); the block form closes the response stream after reading.
  html = URI.open(url, "Accept-Language" => "en", &:read)
  doc = Nokogiri::HTML(html)
  sentence = doc.search("h1").first.text.split("(")
  # Drop the single trailing character between the title and "(".
  title = sentence.first[0..-2]
  year = sentence.last.split(")").first.to_i
  {
    title: title,
    year: year
  }
end
# Debug output: scrapes one fixed movie page and prints the result every time this file is loaded.
# NOTE(review): consider removing or guarding this before requiring the file elsewhere.
puts scrape_movie("https://www.imdb.com/title/tt0111161/").inspect
require_relative "../scraper"
describe "#fetch_movie_urls" do
  # NOTE: the expectation mirrors the IMDB top order, so this test may break
  # whenever that order changes.
  it "returns an array of movies" do
    top_five = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]

    expect(fetch_movie_urls).to eq(top_five)
  end
end
describe "#scrape_movie" do
  # Scrapes the page for The Dark Knight and checks the extracted details.
  it "returns a Hash describing a movie" do
    scraped = scrape_movie("http://www.imdb.com/title/tt0468569/")

    expect(scraped).to eq({ title: "The Dark Knight", year: 2008 })
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment