Skip to content

Instantly share code, notes, and snippets.

@znz
Created Feb 25, 2010
Embed
What would you like to do?
Plone site to static files
#!/usr/bin/ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'fileutils'
require 'logger'
require 'nokogiri'
require 'open-uri'
require 'pathname'
require 'set'
require 'uri'
# Normalize +uri+ in place: drop any fragment and strip a trailing
# "/" or "/index_html" from the path. Returns the (mutated) URI.
def canonical(uri)
  uri.fragment = nil unless uri.fragment.nil?
  uri.path = uri.path.sub(%r{(/(?:index_html)?)?\z}, '')
  uri
end
# Map +uri+ to a local relative Pathname of the form
# "scheme,host,port/<unescaped path>". HTML resources whose path does
# not already end in ".html" get "/index.html" appended so they are
# saved as a directory index.
def uri_to_path(uri, type)
  path = uri.path.sub(/(\/(?:index_html)?)?\z/, '')
  # URI.unescape was deprecated and removed in Ruby 3.0; the parser
  # method is the drop-in replacement.
  path = URI::DEFAULT_PARSER.unescape(path)
  if type == "text/html"
    if /\.html\z/ !~ uri.path
      path += "/index.html"
    end
  end
  Pathname([uri.scheme, uri.host, uri.port].join(",") + path)
end
# Write +body+ to +path+, creating parent directories as needed.
#
# If a parent component already exists as a regular file (Plone can
# serve both "/foo" and "/foo/bar"), that file is moved aside to
# "<dir>/index<ext>" so the directory can be created in its place.
#
# HTML/CSS files additionally get a "<path>.orig" backup, absolute
# links to +top_uri+ rewritten to relative paths, and any
# <base href="..." /> tag stripped. The file mtime is set to +time+
# when given.
def write_file(path, body, time, top_uri, logger)
  unless path.dirname.directory?
    begin
      path.dirname.mkpath
    rescue Errno::EEXIST
      # Find the deepest existing (non-directory) component that
      # blocked mkpath, move it into the new directory as an index file.
      t = path.dirname
      until t.exist?
        t = t.dirname
      end
      t1 = "#{t}~"
      t2 = t + "index#{t.extname}"
      logger.info("Move") { "#{t.to_s.dump} -> #{t2.to_s.dump}" }
      FileUtils.mv(t, t1)
      t.mkpath
      FileUtils.mv(t1, t2)
      path.dirname.mkpath
    end
  end
  if path.directory?
    path += "index" + File.extname(path)
  end
  # BUG FIX: match against the string form; Regexp#=~ no longer accepts
  # a Pathname (Pathname#to_str was removed in Ruby 1.9).
  if /\.(?:html|css)\z/ =~ path.to_s
    orig = Pathname("#{path}.orig")
    logger.info("Backup") { orig.to_s.dump }
    orig.open("wb") do |f|
      f.write body
    end
    orig.utime(time, time) if time
    top_path = Pathname([top_uri.scheme, top_uri.host, top_uri.port].join(","))
    rel_path_to_top = "#{top_path.relative_path_from(path)}".sub(/\.\.\z/, '')
    body = body.gsub(Regexp.new(Regexp.quote(top_uri.to_s))) do
      rel_path_to_top
    end
    body = body.sub(/<base href=".+?" \/>/, '')
  end
  logger.info("Save") { path.to_s.dump }
  path.open("wb") do |f|
    f.write body
  end
  path.utime(time, time) if time
end
# Fetch +uri+ via open-uri and return the interesting response
# attributes as a Hash: :base_uri (after redirects), :body, :type
# (Content-Type), and :time (Last-Modified, may be nil).
def download(uri)
  result = nil
  uri.open do |io|
    result = {
      :base_uri => io.base_uri,
      :body => io.read,
      :type => io.content_type,
      :time => io.last_modified,
    }
  end
  result
end
# Collect url(...) references from CSS text (or inline CSS in HTML)
# as absolute URIs resolved against +base_uri+.
#
# BUG FIX: the original stripped only single quotes, so a
# double-quoted url("...") kept its quotes and produced an invalid
# URI; both quote styles are handled now.
def extract_css_links(css, base_uri)
  links = Set.new
  css.scan(/ url\((.+?)\)/).each do |url,|
    # Strip one pair of matching surrounding quotes, if present.
    if /\A(['"])(.+)\1\z/ =~ url
      url = $2
    end
    links << base_uri + url
  end
  links
end
# Extract candidate link URIs from an HTML document: inline-CSS url()
# references plus img/script/link/a targets, resolved against
# +base_uri+ (or the document's <base href>, when present).
# javascript: pseudo-links are skipped. Returns a Set of URIs.
def extract_links(html, base_uri)
  doc = Nokogiri::HTML(html)
  base_href = doc.at("//base/@href")
  if base_href
    base_uri = URI(base_href.value)
  end
  # Scan the raw HTML too, so url(...) inside style attributes and
  # <style> blocks is picked up.
  links = extract_css_links(html, base_uri)
  [
    "//img/@src",
    "//script/@src",
    "//link/@href",
    "//a/@href",
  ].each do |xpath|
    doc.xpath(xpath).each do |href|
      # BUG FIX: href is a Nokogiri attribute node; match on its string
      # value — Regexp#=~ raises TypeError for arbitrary objects.
      next if /\Ajavascript:/ =~ href.to_s
      links << base_uri + href.to_s
    end
  end
  links
end
# Filter +links+ down to those under +top_uri+ and return them as a
# Set of canonicalized URIs (fragment and trailing /index_html removed).
def uniq_links(links, top_uri)
  prefix = top_uri.to_s
  links.each_with_object(Set.new) do |link, kept|
    kept << canonical(link) if link.to_s.start_with?(prefix)
  end
end
# Breadth-first crawl starting at +uri+, saving every resource under
# the current directory (see uri_to_path / write_file). Only URIs that
# start with the start URI are followed. 404 responses are logged and
# skipped; any other HTTP error aborts the crawl.
def download_all(uri, logger=Logger.new(STDERR))
  links_from = {}  # link URI => Set of referring URIs (kept for debugging)
  top_uri = uri
  q = Set[uri]
  done = Set.new
  while 0 < q.size
    uri = q.each{|v|break v}  # take an arbitrary element of the Set
    logger.info("URI") { uri.to_s.dump }
    logger.info("Progress") { "q=#{q.size} done=#{done.size}" }
    q.delete(uri)
    done.add(uri)
    begin
      h = download(uri)
    rescue OpenURI::HTTPError => e
      logger.error("HTTPError") { "#{e} #{uri.to_s.dump}" }
      if /\A404 / =~ e.to_s
        next
      else
        raise
      end
    end
    if h[:base_uri] != uri
      # Redirected: leave a client-side refresh page at the requested
      # location, then save the real content under its final URI below.
      logger.warn("Redirect") { "#{uri.to_s.dump} -> #{h[:base_uri].to_s.dump}" }
      path = uri_to_path(uri, "text/html")
      write_file(path, <<-HTML, nil, top_uri, logger)
<html><head>
<meta http-equiv="refresh" content="0; URL=#{h[:base_uri]}">
</head></html>
      HTML
    end
    path = uri_to_path(h[:base_uri], h[:type])
    # BUG FIX: download() returns the mtime under :time, not
    # :last_modified — the old code always passed nil, so saved files
    # never got their timestamps.
    write_file(path, h[:body], h[:time], top_uri, logger)
    case h[:type]
    when "text/html"
      links = extract_links(h[:body], h[:base_uri])
    when "text/css"
      links = extract_css_links(h[:body], h[:base_uri])
    else
      links = nil
    end
    if links
      links.each do |link|
        (links_from[link] ||= Set.new) << uri
      end
      q.merge(uniq_links(links, top_uri) - done)
    end
  end
end
# Run as a command-line tool: mirror the site given as the first
# argument. Does nothing when loaded as a library.
download_all(URI(ARGV.shift)) if __FILE__ == $0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment