Skip to content

Instantly share code, notes, and snippets.

@progapandist
Created January 19, 2021 17:57
Show Gist options
  • Save progapandist/f79485039a1e8a2cb016897f6fa19b2a to your computer and use it in GitHub Desktop.
Save progapandist/f79485039a1e8a2cb016897f6fa19b2a to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
require 'pry' # gem install pry (for debugging)
# gem 'httplog'
# - cast:
# - "Tim Robbins"
# - "Morgan Freeman"
# - "Bob Gunton"
# # Only the first 3 actresses/actors
# director: "Frank Darabont"
# storyline: |
# Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red.
# title: "The Shawshank Redemption"
# year: 1994
TOP_IMDB_URL = "https://www.imdb.com/chart/top"

# Site root that the chart's movie links are resolved against.
IMDB_BASE_URL = "https://www.imdb.com/"

# Headers sent with every request: English pages, browser-like User-Agent.
IMDB_REQUEST_HEADERS = {
  "Accept-Language" => "en",
  "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:84.0) Gecko/20100101 Firefox/84.0"
}.freeze

# Scrapes the first +n+ movies listed on the IMDb Top chart, opening each
# movie's own page to collect its details.
#
# NOTE(review): the CSS selectors used here (.titleColumn, .credit_summary_item,
# .summary_text, .cast_list) are the ones this script was written against;
# confirm they still match the live site.
#
# @param n [Integer] how many chart entries to scrape
# @return [Array<Hash>] one Hash per movie with the keys :director,
#   :storyline, :title, :year and :actors (up to three names). When +n+ is
#   zero an empty Hash ({}) is returned instead of an empty Array; the spec
#   for this method expects exactly that, so it is kept as is.
def top_n_imdb(n)
  # Bail out before any network request is made.
  return {} if n.zero?

  chart = fetch_imdb_document(TOP_IMDB_URL)
  chart.search(".titleColumn").first(n).map { |node| scrape_imdb_movie(node) }
end

# Downloads +url+ with the shared request headers and parses it as HTML.
def fetch_imdb_document(url)
  # Block form so open-uri closes the underlying IO once the body is read.
  html = URI.open(url, IMDB_REQUEST_HEADERS, &:read)
  Nokogiri::HTML(html)
end

# Builds the result Hash for one chart row by following its link to the movie page.
def scrape_imdb_movie(node)
  anchors = node.css("a")
  # URI.join resolves rooted ("/title/...") and relative hrefs alike, without
  # the doubled slash that plain string concatenation produced.
  movie_url = URI.join(IMDB_BASE_URL, anchors.first["href"]).to_s
  movie = fetch_imdb_document(movie_url)

  {
    # nil (instead of a NoMethodError) when the page has no credit link.
    director: movie.search(".credit_summary_item").css("a").first&.text,
    storyline: movie.search(".summary_text").text.strip,
    title: anchors.text,
    # The year is rendered as "(1994)"; drop the parentheses before converting.
    year: node.css("span.secondaryInfo").text.delete("()").to_i,
    actors: imdb_lead_actors(movie)
  }
end

# Returns the first line of text of up to three cast rows of a movie page.
def imdb_lead_actors(movie)
  odd_rows = movie.search(".cast_list .odd")
  even_rows = movie.search(".cast_list .even")

  # Same picks as before (1st odd, 1st even, 2nd odd row); rows that do not
  # exist are skipped rather than raising.
  [odd_rows[0], even_rows[0], odd_rows[1]].compact.map do |row|
    row.text.strip.split("\n").first
  end
end
# Scrape the top 20 movies and dump them to storage.yml, but only when this
# file is executed directly. Loading it with require/require_relative (as a
# spec would, to reach top_n_imdb) then neither hits the network nor rewrites
# storage.yml.
if $PROGRAM_NAME == __FILE__
  # YAML.dump needs the yaml library; it is not among the requires at the top
  # of this file.
  require 'yaml'

  results = top_n_imdb(20)
  File.write("storage.yml", YAML.dump(results))
end
require "rspec"
require_relative "../scraper.rb"
# Specs for top_n_imdb, which is loaded from ../scraper.rb.
# RSpec.describe is used so the file does not depend on the global `describe`
# being exposed.
RSpec.describe "#top_n_imdb" do
  # top_n_imdb returns before fetching anything when n is zero, and the value
  # it returns in that case is an empty Hash.
  it "returns an empty hash when called with 0" do
    expect(top_n_imdb(0)).to eq({})
  end

  # Placeholder example; xit keeps it skipped until it is implemented.
  xit "should process saved HTML" do
    # TODO: Implement with a mocked HTTP request serving saved HTML if you want to have
    # LOTS OF FUN
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment