Skip to content

Instantly share code, notes, and snippets.

@meqif
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save meqif/9722515 to your computer and use it in GitHub Desktop.
Save meqif/9722515 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
#
# Downloads the entire 'Worm' story and builds an ebook out of it in HTML form,
# ready to be processed by Calibre into whatever format you need.
#
require 'cgi'
require 'nokogiri'
require 'open-uri'
# URL of the table-of-contents page that fetch_index downloads.
INDEX_URL = "http://parahumans.wordpress.com/table-of-contents/"
# Fetch index
#
# Downloads the table-of-contents page at INDEX_URL and collects the links
# found inside its ".entry-content" element (sharing widgets excluded).
#
# Returns an Array of anchor nodes. Each returned node has whitespace-trimmed
# text and an 'href' that starts with "http://" or "https://". Links without
# visible text or without a target are left out.
def fetch_index
  # URI.open is open-uri's explicit entry point; a bare Kernel#open no longer
  # accepts URLs on Ruby 3.0 and later.
  doc = Nokogiri::HTML(URI.open(INDEX_URL))

  # Get all links to chapters
  entry = doc.css(".entry-content")
  # Remove sharing links
  entry.search(".sharedaddy").remove
  links = entry.css("a")

  # Clean up link text and remove incorrect links
  links.search("br").remove
  links.search("strong").each { |node| node.replace(node.content) }

  # A link with no visible text or no target cannot become a chapter.
  usable = links.reject do |link|
    link.text.strip.empty? || link['href'].to_s.strip.empty?
  end

  usable.each do |link|
    # `text` returns a fresh String, so the trimmed value must be assigned
    # back to the node for the clean-up to take effect.
    link.content = link.text.strip
    link['href'] = normalize_chapter_href(link['href'])
  end
end

# Returns +href+ with surrounding whitespace removed and a scheme in front.
# Hrefs that already start with http:// or https:// are otherwise untouched.
def normalize_chapter_href(href)
  href = href.to_s.strip
  return href if href.match?(%r{\Ahttps?://}i)
  # Protocol-relative URL: only the scheme name itself is missing.
  return "http:#{href}" if href.start_with?("//")

  # Some urls lack the scheme
  "http://#{href}"
end
# Fetch single page/chapter
#
# url - address of the page to download, as a String.
#
# Returns the page's ".entry-content" node set after removing sharing
# widgets, "Next Chapter"/"Last Chapter" links and blank paragraphs.
def fetch_chapter(url)
  # URI.open is open-uri's explicit entry point; a bare Kernel#open no longer
  # accepts URLs on Ruby 3.0 and later.
  doc = Nokogiri::HTML(URI.open(url))

  # Get content
  content = doc.css(".entry-content")
  # Remove sharing links
  content.search(".sharedaddy").remove

  # Remove "Last Chapter" and "Next Chapter" links
  content.search("a").each do |link|
    link.remove if link.text.match?(/(Next|Last) Chapter/)
  end

  # Remove empty paragraphs. Non-breaking spaces (U+00A0) are dropped first
  # because String#strip does not treat them as whitespace.
  content.search("p").each do |par|
    par.remove if par.text.delete("\u00A0").strip.empty?
  end

  content
end
# Build complete HTML
#
# Fetches the table of contents with fetch_index, then writes "output.html"
# in the current directory: a page header, a linked list of chapter titles,
# and one <section> per chapter (downloaded with fetch_chapter) in index order.
def build
  # Download the index before opening the output file, so a failed index
  # fetch does not truncate an existing output.html.
  chapters = fetch_index

  # URI.encode was removed in Ruby 3.0; the RFC 2396 parser's #escape performs
  # the same percent-encoding of characters that are not URL-safe.
  escaper = URI::RFC2396_Parser.new

  File.open("output.html", "w") do |f|
    f.write(<<~END)
      <!doctype html>
      <html lang="en">
      <head>
      <meta charset="UTF-8">
      <title>Worm</title>
      </head>
      <body>
      <h1>Worm</h1>
    END

    # Output index
    f.write("<section class=\"toc\">\n")
    f.write("<ul>\n")
    chapters.each_with_index do |link, position|
      # Titles are plain text, so escape them before embedding in markup.
      f.write("<li><a href=\"#chapter_#{position}\">#{CGI.escapeHTML(link.text)}</a></li>\n")
    end
    f.write("</ul>\n</section>\n")

    # Output chapters
    chapters.each_with_index do |link, position|
      title = CGI.escapeHTML(link.text)
      chapter = fetch_chapter(escaper.escape(link['href']))
      f.write("\n<section id=\"chapter_#{position}\" class=\"chapter\">\n<h1>#{title}</h1>#{chapter}\n</section>")
    end
    f.write("\n</body>\n</html>")
  end
end
# Main
# Generate the ebook only when this file is run directly as a script.
build if $PROGRAM_NAME == __FILE__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment