Skip to content

Instantly share code, notes, and snippets.

@hannahwhy
Created January 8, 2012 08:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hannahwhy/1577729 to your computer and use it in GitHub Desktop.
Save hannahwhy/1577729 to your computer and use it in GitHub Desktop.
require 'escape'
require 'tempfile'
require 'fileutils'

# Shell-escape helper: wraps a string so it is safe to splice into a command line.
E = lambda { |str| Escape.shell_single_word(str) }

DOWNLOAD_TO = File.expand_path('../data', __FILE__)
WGET_WARC = File.expand_path('../wget-warc', __FILE__)
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2'
VERSION = '20120108.01'

include FileUtils

sid = ARGV[0]

# Guard: without a story id every path below would crash on nil slicing.
abort "Usage: #{$0} STORY_ID" if sid.nil? || sid.empty?

# Shard storage by the first 1/2/3 characters of the story id, e.g.
# data/1/15/157/1577729/.
base_path = "#{DOWNLOAD_TO}/#{sid[0..0]}/#{sid[0..1]}/#{sid[0..2]}/#{sid}"
warc_file = base_path + "/#{sid}"
log_file = base_path + "/#{sid}.log"

# Escape sid before interpolating it into a shell command.  (The original
# passed it through raw, so a crafted ARGV[0] could inject arbitrary shell,
# and it was inconsistent with the E[] escaping used for the wget command.)
urls = `./make_story_urls.rb #{E[sid]}`.split("\n")
abort "make_story_urls.rb failed for #{sid}" unless $?.success?

url_file = base_path + "/#{sid}_urls"
fetch_path = base_path + "/fetch"

# url_file is written directly into base_path, so create base_path itself
# (the original only made its parent and relied on the fetch_path mkdir below).
mkdir_p base_path
mkdir_p fetch_path

File.open(url_file, "w") do |f|
  urls.each { |u| f.puts u }
end

# Assemble the wget-warc invocation; every variable part goes through E[].
cmd = [
  WGET_WARC,
  "-U " + E[USER_AGENT],
  "-o " + E[log_file],
  "-e 'robots=off'",
  "--warc-file=" + E[warc_file],
  "--warc-max-size=inf",
  "--warc-header=" + E["operator: Archive Team"],
  "--warc-header=" + E["ff-download-script-version: #{VERSION}"],
  "-nd",
  "-nv",
  "--no-timestamping",
  "--page-requisites",
  "-i " + E[url_file]
].join(' ')

# Fetch from inside fetch_path so page requisites land there.
Dir.chdir(fetch_path) do
  puts cmd
  system(cmd)
end
#!/usr/bin/env ruby
# Prints, one per line, every URL needed to archive a story: the author's
# profile, each chapter, and each page of reviews.
require 'mechanize'
require File.expand_path('../url_generators', __FILE__)

include UrlGenerators

story_id = ARGV[0]

browser = Mechanize.new do |b|
  b.user_agent = 'Linux Firefox'
  b.max_history = 0
end

first_chapter = browser.get(UrlGenerators::STORY_URL[story_id, ''])

puts profile_url_for(story_id, first_chapter)
puts chapter_urls_for(story_id, first_chapter)

begin
  first_reviews = browser.get(UrlGenerators::REVIEW_URL[story_id, '/'])
  puts review_urls_for(story_id, first_reviews)
rescue Mechanize::ResponseCodeError
  # oh well, not every story has reviews
end
# URL construction and link-scraping helpers for fanfiction.net stories.
# The page arguments are duck-typed Mechanize/Nokogiri-style objects that
# respond to #uri, #links, and the #/ CSS-search operator.
module UrlGenerators
  BASE_URL = "http://www.fanfiction.net".freeze

  # [story id, path suffix] -> absolute URL string.
  STORY_URL = lambda { |sid, rest| "#{BASE_URL}/s/#{sid}#{rest}" }
  REVIEW_URL = lambda { |sid, rest| "#{BASE_URL}/r/#{sid}#{rest}" }

  # Returns the URL of every chapter of story +sid+, beginning with the URI
  # of the already-fetched first chapter page.
  def chapter_urls_for(sid, first_story_page)
    urls = [first_story_page.uri]

    # There's two chapter selection boxes on a page, so just pick one of them.
    chapter_box = (first_story_page/'select[name="chapter"]').first

    if chapter_box
      # Skip the first <option>: we've already got chapter 1.
      (chapter_box/'option')[1..-1].each do |opt|
        urls << STORY_URL[sid, "/" + opt.attribute('value').text]
      end
    end

    urls
  end

  # Returns the URL of every page of reviews for story +sid+, beginning with
  # the URI of the already-fetched first review page.
  def review_urls_for(sid, first_review_page)
    urls = [first_review_page.uri]

    # Search for links of the form /r/#{sid}/0/\d+. The link that has the
    # largest number in the final position is the last review page. If no such
    # links exist, then there's only one page of reviews.
    # (Uses MatchData instead of the original's Perl-style $1 global.)
    highest_page = first_review_page.links.map do |link|
      m = %r{/r/#{sid}/0/(\d+)}.match(link.href)
      m && m[1].to_i
    end.compact.max

    if highest_page
      # We already got page 1.
      (2..highest_page).each { |page| urls << REVIEW_URL[sid, "/0/#{page}"] }
    end

    urls
  end

  # Returns the absolute URL of the author's profile page, or nil when the
  # page carries no /u/<id> link.  (The original crashed with TypeError on
  # `BASE_URL + nil` in that case.)
  def profile_url_for(sid, first_story_page)
    href = first_story_page.links.map { |l| l.href }.detect { |h| h =~ %r{/u/\d+} }
    href && BASE_URL + href
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment