Skip to content

Instantly share code, notes, and snippets.

@caioertai
Created April 19, 2022 23:04
Show Gist options
  • Save caioertai/bfd17af8ffc67549d8e89dc1d81fd38b to your computer and use it in GitHub Desktop.
Save caioertai/bfd17af8ffc67549d8e89dc1d81fd38b to your computer and use it in GitHub Desktop.
require "yaml"
require_relative "scraper"
# Driver script: collect the top movie URLs, scrape each movie page, and
# save all of the results to movies.yml (relative to the working directory).

puts "Getting top movies..."
movie_urls = top_movies

# Scrape the pages one after another, logging each URL before it is fetched.
movies = movie_urls.map do |movie_url|
  puts "Scraping... #{movie_url}"
  movie_info(movie_url)
end

puts "Writing YAML file..."
# Serialize the whole list at once and write it in binary mode.
File.open("movies.yml", "wb") { |yaml_file| yaml_file.write(YAML.dump(movies)) }
require "open-uri"
require "nokogiri"
# Root of the site; the hrefs scraped from the chart page are appended to it.
BASE_URL = "https://www.imdb.com".freeze

# Fetch the IMDb top chart and return the URLs of its first movies.
#
# @param limit [Integer] maximum number of movie URLs to return (default 50)
# @return [Array<String>] movie page URLs, each built as BASE_URL + the link's href
def top_movies(limit: 50)
  url = "#{BASE_URL}/chart/top"
  # Block form of URI.open closes the underlying handle once the body is read.
  html_string = URI.open(url, "Accept-Language" => "en-US", &:read)
  doc = Nokogiri::HTML.parse(html_string)

  # Each chart entry exposes its movie link inside a ".titleColumn" element.
  doc.search(".titleColumn a").first(limit).map do |movie_link|
    BASE_URL + movie_link.attr(:href)
  end
end
# Scrape a single movie page and return the movie info as a hash.
#
# The page is requested with the same "Accept-Language: en-US" header used for
# the chart page, so the language of the scraped text is not left to the server.
#
# @param movie_url [String] URL of the movie page to scrape
# @return [Hash] with keys :title, :year, :director, :storyline (element text)
#   and :cast (Array of at most 3 element texts)
# @raise [NoMethodError] presumably when a selector matches nothing, since
#   `doc.at` would then yield nil and `.text` is called on it — TODO confirm
def movie_info(movie_url)
  # Block form of URI.open reads the body and closes the handle afterwards.
  html_string = URI.open(movie_url, "Accept-Language" => "en-US", &:read)
  doc = Nokogiri::HTML.parse(html_string)

  # Get title
  title = doc.at("h1").text
  # Get year (text of the first ".ipc-link" element on the page)
  year = doc.at(".ipc-link").text
  # Get director (text of the first metadata list item)
  director = doc.at(".ipc-metadata-list-item__list-content-item").text
  # Get storyline
  # NOTE(review): ".kgphFu" looks like a generated class name and may be
  # brittle across site redesigns — confirm against the live markup.
  storyline = doc.at(".kgphFu").text
  # Get cast (first three actors only)
  cast_names = doc.search(".ipc-sub-grid [data-testid='title-cast-item__actor']")
                  .first(3)
                  .map(&:text)

  {
    title: title,
    year: year,
    director: director,
    storyline: storyline,
    cast: cast_names
  }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment