Created
February 25, 2010 09:24
-
-
Save znz/314396 to your computer and use it in GitHub Desktop.
Plone site to static files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'fileutils'
require 'logger'
require 'nokogiri'
require 'open-uri'
require 'pathname'
require 'set'
require 'uri'
# Normalize a URI so equivalent pages dedupe to one key: drop any fragment
# and strip a trailing "/" or "/index_html" from the path.
# Mutates uri in place and returns it.
def canonical(uri)
  uri.fragment = nil unless uri.fragment.nil?
  uri.path = uri.path.sub(%r{(/(?:index_html)?)?\z}, '')
  uri
end
# Map a remote URI to a local Pathname under "<scheme>,<host>,<port>/…".
# HTML responses whose URI does not already end in ".html" are stored as
# "<path>/index.html" so directories stay browsable.
def uri_to_path(uri, type)
  path = uri.path.sub(/(\/(?:index_html)?)?\z/, '')
  # BUG FIX: URI.unescape was deprecated and removed in Ruby 3.0;
  # URI::DEFAULT_PARSER.unescape is the drop-in equivalent.
  path = URI::DEFAULT_PARSER.unescape(path)
  if type == "text/html" && /\.html\z/ !~ uri.path
    path += "/index.html"
  end
  Pathname([uri.scheme, uri.host, uri.port].join(",") + path)
end
# Write a downloaded resource to path, creating parent directories.
# For HTML/CSS files: keeps a "<path>.orig" backup, rewrites absolute
# top_uri references into relative ones, and removes any <base href> tag.
# time (may be nil) is applied as the file's mtime/atime.
def write_file(path, body, time, top_uri, logger)
  unless path.dirname.directory?
    begin
      path.dirname.mkpath
    rescue Errno::EEXIST
      # A previously saved *file* occupies a path that must now be a
      # directory: move the file aside, create the directory, and put the
      # file back inside it as "index<ext>".
      t = path.dirname
      t = t.dirname until t.exist?
      t1 = "#{t}~"
      t2 = t + "index#{t.extname}"
      logger.info("Move") { "#{t.to_s.dump} -> #{t2.to_s.dump}" }
      FileUtils.mv(t, t1)
      t.mkpath
      FileUtils.mv(t1, t2)
      path.dirname.mkpath
    end
  end
  # The reverse collision: the target path is already a directory.
  if path.directory?
    path += "index" + File.extname(path)
  end
  # BUG FIX: path is a Pathname; Regexp#=~ against a non-String raises
  # TypeError on Ruby >= 1.9 (1.8's Pathname#to_str masked this).
  if /\.(?:html|css)\z/ =~ path.to_s
    orig = Pathname("#{path}.orig")
    logger.info("Backup") { orig.to_s.dump }
    orig.open("wb") do |f|
      f.write body
    end
    orig.utime(time, time) if time
    top_path = Pathname([top_uri.scheme, top_uri.host, top_uri.port].join(","))
    # Relative prefix from this file back to the site root; the trailing
    # ".." is dropped so it can replace "http://host/" style prefixes.
    rel_path_to_top = "#{top_path.relative_path_from(path)}".sub(/\.\.\z/, '')
    body = body.gsub(Regexp.new(Regexp.quote(top_uri.to_s))) do
      rel_path_to_top
    end
    # <base href> would defeat the relative links just rewritten.
    body = body.sub(/<base href=".+?" \/>/, '')
  end
  logger.info("Save") { path.to_s.dump }
  path.open("wb") do |f|
    f.write body
  end
  path.utime(time, time) if time
end
# Fetch uri with open-uri (preferring Japanese content) and return a Hash:
# :base_uri (final URI after redirects), :body, :type (Content-Type),
# :time (Last-Modified, may be nil).
def download(uri)
  uri.open("Accept-Language" => "ja;q=1.0, en;q=0.1") do |resource|
    {
      :base_uri => resource.base_uri,
      :body     => resource.read,
      :type     => resource.content_type,
      :time     => resource.last_modified,
    }
  end
end
def extract_css_links(css, base_uri) | |
links = Set.new | |
css.scan(/ url\((.+?)\)/).each do |url,| | |
if /\A\'(.+)\'\z/ =~ url | |
url = $1 | |
end | |
links << base_uri + url | |
end | |
links | |
end | |
# Parse an HTML document and return a Set of absolute URIs referenced by
# img/script/link/a attributes, plus any " url(...)" tokens found in the
# raw markup (inline CSS). A <base href> tag, if present, overrides
# base_uri for resolution.
def extract_links(html, base_uri)
  doc = Nokogiri::HTML(html)
  base_href = doc.at("//base/@href")
  if base_href
    base_uri = URI(base_href.value)
  end
  links = extract_css_links(html, base_uri)
  [
    "//img/@src",
    "//script/@src",
    "//link/@href",
    "//a/@href",
  ].each do |xpath|
    doc.xpath(xpath).each do |href|
      # BUG FIX: href is a Nokogiri attribute node, not a String, so
      # Regexp#=~ against it raises TypeError on modern Ruby — stringify
      # before matching.
      next if /\Ajavascript:/ =~ href.to_s
      links << base_uri + href.to_s
    end
  end
  links
end
# Filter links down to those under top_uri, canonicalized (fragment and
# trailing "/index_html" stripped) for deduplication. Returns a Set.
def uniq_links(links, top_uri)
  prefix = top_uri.to_s
  links.each_with_object(Set.new) do |link, kept|
    kept << canonical(link) if link.to_s.start_with?(prefix)
  end
end
# Crawl the site rooted at uri, mirroring every reachable page/stylesheet
# under it to local files (see uri_to_path / write_file). Redirected URIs
# get an HTML refresh stub at their original location. 404s are skipped;
# any other HTTP error aborts the crawl.
def download_all(uri, logger=Logger.new(STDERR))
  links_from = {}  # link URI => Set of pages that referenced it
  top_uri = uri
  q = Set[uri]     # frontier
  done = Set.new   # already fetched (canonical URIs)
  until q.empty?
    uri = q.first
    logger.info("URI") { uri.to_s.dump }
    logger.info("Progress") { "q=#{q.size} done=#{done.size}" }
    q.delete(uri)
    done.add(uri)
    begin
      h = download(uri)
    rescue OpenURI::HTTPError => e
      logger.error("HTTPError") { "#{e} #{uri.to_s.dump}" }
      if /\A404 / =~ e.to_s
        next
      else
        raise
      end
    end
    if h[:base_uri] != uri
      # Redirected: leave a client-side redirect stub at the old location.
      logger.warn("Redirect") { "#{uri.to_s.dump} -> #{h[:base_uri].to_s.dump}" }
      path = uri_to_path(uri, "text/html")
      write_file(path, <<-HTML, nil, top_uri, logger)
<html><head>
<meta http-equiv="refresh" content="0; URL=#{h[:base_uri]}">
</head></html>
      HTML
    end
    path = uri_to_path(h[:base_uri], h[:type])
    # BUG FIX: download() returns the timestamp under :time, not
    # :last_modified — the old code always passed nil and never set mtimes.
    write_file(path, h[:body], h[:time], top_uri, logger)
    case h[:type]
    when "text/html"
      links = extract_links(h[:body], h[:base_uri])
    when "text/css"
      links = extract_css_links(h[:body], h[:base_uri])
    else
      links = nil
    end
    if links
      links.each do |link|
        (links_from[link] ||= Set.new) << uri
      end
      # Only follow in-site links that have not been fetched yet.
      q.merge(uniq_links(links, top_uri) - done)
    end
  end
end
# Entry point: mirror the site whose root URI is the first CLI argument.
download_all(URI(ARGV.shift)) if __FILE__ == $0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment