Skip to content

Instantly share code, notes, and snippets.

@DataKinds
Last active May 20, 2019 09:48
Show Gist options
  • Save DataKinds/ffe12eeeed6d2e61e856e0dfbf513f13 to your computer and use it in GitHub Desktop.
Save DataKinds/ffe12eeeed6d2e61e856e0dfbf513f13 to your computer and use it in GitHub Desktop.
Directed graph of TVTropes links
# frozen_string_literal: true
# Bundler Gemfile section: declares where gems come from and which gems to install.
# NOTE(review): the crawler script requires "pry" as well, but no `gem "pry"` line
# appears here — confirm whether it is expected to be installed separately.
source "https://rubygems.org"
# Expands the `github: "owner/repo"` shorthand into a full HTTPS repository URL.
git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
# Nokogiri is the HTML parser the crawler uses to pull <a> elements out of each page.
gem "nokogiri"
require "open-uri"
require "set"
require "thread"
require "timeout"

require "nokogiri"
require "pry"
# Shared crawler state, read and written by every worker thread.
#
# $OUTPUT_FILE         - edge list sink; one "source->target" line per link.
# $URL_QUEUE           - page names waiting to be crawled, seeded with the start page.
# $COMPLETED_URLS      - page names that have already been handled.
# $COMPLETED_URLS_LOCK - mutex guarding $COMPLETED_URLS.
$OUTPUT_FILE = File.new("output", "w")
$URL_QUEUE = Queue.new.tap { |queue| queue << "PrincessClassic" }
$COMPLETED_URLS = Set[]
$COMPLETED_URLS_LOCK = Mutex.new
# Tells whether +href+ points at a TVTropes page in the "Main" namespace.
#
# The pattern is anchored at the end of the string only, so both site-relative
# paths and absolute URLs are accepted, while anything after the page name
# (query string, fragment, trailing newline) makes the href invalid.
#
# @param href [String, nil] value of an anchor's href attribute; nil when the
#   anchor has no href
# @return [Boolean] true when the href ends in /pmwiki/pmwiki.php/Main/<word chars>
def is_valid_tvtropes_href?(href)
  # Regexp#match? returns a strict boolean, answers false for nil, and does not
  # allocate a MatchData the way =~ does.
  %r{/pmwiki/pmwiki\.php/Main/\w+\z}.match?(href)
end
# Downloads one TVTropes "Main" page and lists the Main pages it links to.
#
# @param url [String] page name as it appears after /Main/ in the page URL,
#   e.g. "PrincessClassic" (despite the parameter name, not a full URL)
# @return [Array<String>] one page name per matching <a> element, so a page
#   linked several times appears several times
# @raise [OpenURI::HTTPError] when the server answers with an error status;
#   other network errors raised by open-uri propagate as well
def get_tvtropes_linklist(url)
  page_url = "https://tvtropes.org/pmwiki/pmwiki.php/Main/#{url}"

  # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # handles URLs, and it would also treat a leading "|" as a shell command.
  # The block form closes the connection/tempfile once parsing is done.
  document = URI.open(page_url) { |page| Nokogiri::HTML(page) }

  document
    .css("a")
    .select { |link| is_valid_tvtropes_href? link["href"] }
    .map { |link| link["href"][%r{/pmwiki/pmwiki\.php/Main/(\w+)\z}, 1] }
end
# NOTE(review): presumably opens an interactive Pry session here, which would mean
# the code after this line only runs once that session is exited. Looks like a
# leftover debugging hook — confirm whether it should stay in the script.
pry
# Errors that mean "this page could not be fetched" rather than a bug in the
# crawler itself. Only these are swallowed by the workers; anything else still
# terminates the worker thread and surfaces through Thread#join.
FETCH_ERRORS = [OpenURI::HTTPError, SocketError, SystemCallError, Timeout::Error, IOError].freeze

# Takes one page name off $URL_QUEUE without blocking.
#
# @return [String, nil] the next page name, or nil when the queue is empty
def dequeue_url
  # Non-blocking pop avoids the check-then-pop race of `empty?` + `pop`, where
  # another worker could drain the queue between the two calls.
  $URL_QUEUE.pop(true)
rescue ThreadError
  nil
end

# Atomically records +url+ in $COMPLETED_URLS.
#
# @param url [String] page name to claim
# @return [Boolean] true only for the first caller that claims +url+
def claim_url?(url)
  # Set#add? returns nil when the element was already present, so the
  # membership test and the insertion happen under one lock acquisition.
  $COMPLETED_URLS_LOCK.synchronize { !$COMPLETED_URLS.add?(url).nil? }
end

# Pushes the not-yet-claimed entries of +linklist+ onto $URL_QUEUE, once each.
#
# @param linklist [Array<String>] page names found on a crawled page
# @return [void]
def enqueue_unvisited(linklist)
  pending = $COMPLETED_URLS_LOCK.synchronize do
    linklist.uniq.reject { |point| $COMPLETED_URLS.include?(point) }
  end
  pending.each { |point| $URL_QUEUE << point }
end

# Crawls at most one page: takes a page name from the queue, claims it, fetches
# its links, writes one "source->target" line per link to $OUTPUT_FILE and
# queues the targets. Does nothing when the queue is empty or the page was
# already claimed by another worker.
#
# A page whose fetch fails with one of FETCH_ERRORS is reported on stderr and
# stays claimed, so it is not retried.
#
# @return [void]
def crawl_next_url
  url = dequeue_url
  return if url.nil? || !claim_url?(url)

  begin
    linklist = get_tvtropes_linklist(url)
  rescue *FETCH_ERRORS => e
    warn "failed #{url}: #{e.class}: #{e.message}"
    return
  end

  # output the rips
  linklist.each { |point| $OUTPUT_FILE.puts "#{url}->#{point}" }
  puts "completed #{url}"

  # update the URL queue
  enqueue_unvisited(linklist)
end

# Worker pool: 24 threads, each handling at most one page per second. The
# workers loop forever (there is no stop condition), so the join below keeps
# the main thread alive until the process is interrupted or a worker dies
# from an unexpected error.
$THREAD_POOL = Array.new(24) do
  Thread.new do
    loop do
      sleep 1
      crawl_next_url
    end
  end
end
$THREAD_POOL.each(&:join)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment