Last active
June 16, 2020 06:02
-
-
Save sergioro9/9c4bd5fba92379a1f4c9ff5d3fea0b9e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'open-uri'
require 'nokogiri'

# Starting from an English Wikipedia article, repeatedly follows the first
# usable link found in the article's paragraphs and prints the text of every
# link followed. The run ends when a link whose text is "philosophy"
# (case-insensitive) is reached, when a link text repeats, or when no usable
# link can be found on a page.
#
# Usage: pass the article title (as it appears after /wiki/ in the URL) as
# the first command-line argument; defaults to 'Film_adaptation'.

base_url = 'https://en.wikipedia.org'
site = ARGV.empty? ? 'Film_adaptation' : ARGV.shift
url = base_url + "/wiki/#{site}"

# Exit quietly on Ctrl-C instead of printing an Interrupt backtrace.
Signal.trap('INT') { exit }

# Selects every anchor inside a paragraph of the article body; the links are
# tried in document order, starting with the first one.
CSS_SELECTOR = '#mw-content-text p a'

# Index (into the links selected by CSS_SELECTOR) of the link to try next on
# the current page.
link_num = 0

# Maximum number of links to skip on a single page before giving up.
MAX_SKIPS = 10

# Link texts visited so far, seeded with the starting article title.
sites = [site]

# Links of the current page, cached so that skipping a link does not download
# and parse the same page again. Reset to nil when moving to a new page.
links = nil

# Raised to skip pronunciation links.
class PronunciationLink < StandardError; end

# Raised to skip links to files like images or audio.
class FileLink < StandardError; end

# Raised when the walk cannot continue (no usable link, too many skips, or a
# repeated link text).
class NoPhilosophy < StandardError; end

puts site

loop do
  # Check the skip budget BEFORE touching the network: if the fetch itself
  # keeps failing with an HTTP error, this is what bounds the retries.
  raise NoPhilosophy if link_num > MAX_SKIPS

  links ||= Nokogiri::HTML(URI.open(url)).css(CSS_SELECTOR)
  element = links[link_num]
  # Give up if the page has no (further) link in its paragraphs.
  raise NoPhilosophy if element.nil?

  site = element.text
  site_href = element['href']

  # Skip links whose text is wrapped in brackets or slashes (e.g. "[1]" or
  # "/.../") and links pointing at a pronunciation help page. An anchor
  # without an href makes `site_href` nil; the resulting NoMethodError is
  # rescued below and the link is skipped as well.
  raise PronunciationLink if site.match?(%r{^[\[/].*[\]/]$}) || site_href.downcase.match?(/help:pronun/)
  raise FileLink if site_href.match?(/\.(ogg|png|jpg)/)

  sites << site
  puts site
  url = base_url + site_href

  if site.downcase == 'philosophy'
    puts "Reached philosophy!"
    exit
  end

  # Stop if this link text was already visited: the walk is going in circles.
  raise NoPhilosophy if sites.count(site) > 1

  # Moving on to a new page: start again from its first link and drop the
  # cached links of the previous page.
  link_num = 0
  links = nil
rescue NoMethodError, TypeError, PronunciationLink, FileLink, OpenURI::HTTPError
  # The current link (or page fetch) is unusable; try the next link.
  link_num += 1
  puts "Skipped #{link_num} link#{link_num > 1 ? 's' : ''} in #{sites.last}"
rescue NoPhilosophy
  puts "Did not reached philosophy :("
  exit
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment