Created
April 13, 2015 05:15
-
-
Save mwlang/a4dbf5c098fd8f9502b2 to your computer and use it in GitHub Desktop.
I used the following rake tasks to completely clone my two separate blog websites, one built in Python/Zope and the other in WordPress. The tasks generate an HTML file for every page by following every link on every page (spidering, anyone?). I then used a few more tasks to clean up the output so I could easily serve the pages with Rails.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rewrites absolute links to the old ramblings host as site-relative links
# ("/...") in the post_content of every WpPost's latest revision.
task "website:fix_urls" => :environment do
  old_root = "http://ramblings.gibberishcode.net/"

  WpPost.all.each do |wp|
    lr = wp.latest_revision
    # NOTE(review): assumes post_content is a String or nil — confirm against the model.
    body = lr && lr.post_content

    # A post without a revision or without content used to abort the whole
    # run with NoMethodError; posts that never mention the old host need no write.
    next unless body && body.include?(old_root)

    # update_attribute is kept from the original call.
    lr.update_attribute(:post_content, body.gsub(old_root, "/"))
  end
end
# Replaces the trailing footer markup of every file under app/views/static
# with a "{{ footer }}" placeholder line.
task "website:remove_footer" => :environment do
  # Inspects the 26th line from the end of +fn+. When it is exactly "<hr>",
  # the file is rewritten without its last 26 lines and "{{ footer }}" is
  # appended; otherwise the file is left alone and the line found at that
  # position is printed for manual inspection.
  def scrub(fn)
    lines = File.read(fn).split("\n")
    marker = lines[-26]

    unless marker == "<hr>"
      puts "#{fn}: #{marker.inspect}"
      return
    end

    File.open(fn, 'w') do |out|
      out.puts lines[0..-27]
      out.puts "{{ footer }}"
    end
  end

  # Walks +path+ depth-first, descending into directories and handing every
  # other entry to scrub.
  def scan_and_scrub(path)
    Dir.glob(File.join(path, '*')) do |entry|
      next scan_and_scrub(entry) if File.directory?(entry)

      scrub(entry)
    end
  end

  static_root = File.join(Rails.root, 'app', 'views', 'static')
  scan_and_scrub(static_root)
end
# Cleans up the cloned pages under app/views/static: points the stylesheet
# link at /stylesheets, and strips the login link and the "Guest" table cell.
task "website:scrub" => :environment do
  # Applies the fixed list of literal replacements to the file +fn+.
  # The file is only rewritten when at least one replacement matched, so
  # untouched files keep their exact bytes (the original rewrote every file).
  def scrub(fn)
    content = File.read(fn)
    replacements = [
      ['href="default.css"', 'href="/stylesheets/default.css"'],
      ['<a href="/login">Login</a> | ', ''],
      ["<td align=center>\nGuest</td>\n", '']
    ]

    # gsub! returns nil when nothing matched, so count tallies the rules
    # that actually changed the content. count visits every rule (no
    # short-circuit), so all replacements are always attempted.
    applied = replacements.count { |from, to| content.gsub!(from, to) }
    return if applied.zero?

    File.open(fn, 'w') { |f| f.puts content }
  end

  # Recursively visits +path+, scrubbing every non-directory entry.
  def scan_and_scrub(path)
    Dir.glob(File.join(path, '*')) do |fn|
      if File.directory?(fn)
        scan_and_scrub(fn)
      else
        scrub(fn)
      end
    end
  end

  scan_and_scrub(File.join(Rails.root, 'app', 'views', 'static'))
end
# Mirrors http://www.codeconnoisseur.org/ into the Rails app by crawling it
# with Anemone. HTML pages are written (with a small front-matter header) to
# app/views/static/<path>/index.html, every other response is written under
# public/, and the URL of each saved HTML page is appended to routes.txt.
task "website:clone" => :environment do
  require 'anemone'
  require 'fileutils'

  # Root URL of the site being mirrored (trailing slash included).
  def site_url
    "http://www.codeconnoisseur.org/"
  end

  # Splits a crawled URL into its path segments relative to site_url.
  def file_path(uri)
    uri.to_s.gsub(site_url, '').split("/")
  end

  # Destination for HTML pages. Deliberately not memoized in an instance
  # variable: these helpers are defined on the top-level object, so a cached
  # value would leak into any other task run in the same rake process.
  def site_folder
    File.join(Rails.root, 'app', 'views', 'static')
  end

  # Destination for images and other non-HTML resources.
  def site_public_folder
    File.join(Rails.root, 'public')
  end

  # Creates +full_path+ (and parents) without going through a shell, so path
  # segments taken from crawled URLs cannot be interpreted as shell syntax.
  # Failures are reported but do not abort the crawl; returns +full_path+.
  def ensure_directory(full_path)
    FileUtils.mkdir_p(full_path)
    full_path
  rescue SystemCallError => e
    warn "mkdir -p #{full_path}: #{e.message}"
    full_path
  end

  # Ensures the directory for +path+ (String or array of segments) exists
  # under site_folder and returns its full path.
  def ensure_path(path)
    ensure_directory(File.join(site_folder, path))
  end

  # Same as ensure_path, but rooted at site_public_folder.
  def ensure_public_path(path)
    ensure_directory(File.join(site_public_folder, path))
  end

  # Downloads the image referenced by +src+ into directory +path+ using wget.
  # Absolute http:// URLs that do not mention this site are skipped.
  # NOTE(review): relative srcs are appended to the page URL ("url/src")
  # rather than resolved as a browser would — confirm this matches the site.
  def save_image(url, path, src)
    return if src.nil? || src.empty? # <img> without a usable src attribute
    return if src =~ /http\:\/\// && src !~ /codeconnoisseur/

    img_path = src =~ /http\:\/\// ? src : "#{url}/#{src}"
    puts "cd #{path} && wget #{img_path}"
    # Argument-vector form: the crawled src is never parsed by a shell.
    system('wget', '--', img_path, chdir: path)
  end

  # Writes an HTML page to <site_folder>/<path>/index.html with a title
  # front-matter header, then downloads the images it references.
  def save_html_page(page)
    path = file_path(page.url)
    path.pop if path.last == 'index.html'
    full_path = ensure_path(path)

    doc = page.doc
    title_node = doc && doc.at('title')
    title = title_node ? title_node.inner_html : 'Michael Lang'

    # NOTE(review): the body is assumed to be ISO-8859-15 — confirm for this site.
    # dup keeps page.body itself untouched by force_encoding.
    html = page.body.to_s.dup.force_encoding('iso-8859-15').encode('utf-8')

    File.open(File.join(full_path, 'index.html'), 'w') do |f|
      f.puts "---"
      f.puts "title: #{title}"
      f.puts "---"
      f.puts html.gsub(site_url, '/')
    end

    return unless doc

    doc.search('img').each do |img|
      save_image(page.url, ensure_public_path(path), img['src'])
    end
  end

  # Writes a non-HTML response verbatim under site_public_folder.
  def save_resource(page)
    path = file_path(page.url)
    filename = path.pop
    if filename.nil?
      warn "skipping #{page.url}: no file name in URL"
      return
    end

    full_path = ensure_public_path(path)
    # write, not puts: puts would append a newline to binary content.
    File.open(File.join(full_path, filename), 'wb') { |f| f.write page.body.to_s }
  end

  File.open("routes.txt", 'w') do |routes|
    Anemone.crawl(site_url) do |anemone|
      anemone.on_every_page do |page|
        puts "#{page.headers["content-type"].inspect}: #{page.url}"

        # Compare only the media type; tolerate "text/html;charset=..."
        # written without a space after the semicolon.
        content_type = Array(page.headers["content-type"]).first.to_s
        media_type = content_type.split(";").first.to_s.strip.downcase

        if media_type == "text/html"
          save_html_page(page)
          routes.puts page.url
        else
          save_resource(page)
        end
      end
    end
  end
end
# Mirrors http://ramblings.gibberishcode.net/ into the Rails app by crawling
# it with Anemone. HTML pages are written (with a small front-matter header)
# to app/views/ramblings/<path>/index.html, every other response is written
# under public/, and the URL of each saved HTML page is appended to
# ramblings.txt.
task "ramblings:clone" => :environment do
  require 'anemone'
  require 'fileutils'

  # Root URL of the site being mirrored (trailing slash included).
  def site_url
    "http://ramblings.gibberishcode.net/"
  end

  # Splits a crawled URL into its path segments relative to site_url.
  def file_path(uri)
    uri.to_s.gsub(site_url, '').split("/")
  end

  # Destination for HTML pages. Deliberately not memoized in an instance
  # variable: these helpers are defined on the top-level object, so a cached
  # value would leak into any other task run in the same rake process.
  def site_folder
    File.join(Rails.root, 'app', 'views', 'ramblings')
  end

  # Destination for images and other non-HTML resources.
  def site_public_folder
    File.join(Rails.root, 'public')
  end

  # Creates +full_path+ (and parents) without going through a shell, so path
  # segments taken from crawled URLs cannot be interpreted as shell syntax.
  # Failures are reported but do not abort the crawl; returns +full_path+.
  def ensure_directory(full_path)
    FileUtils.mkdir_p(full_path)
    full_path
  rescue SystemCallError => e
    warn "mkdir -p #{full_path}: #{e.message}"
    full_path
  end

  # Ensures the directory for +path+ (String or array of segments) exists
  # under site_folder and returns its full path.
  def ensure_path(path)
    ensure_directory(File.join(site_folder, path))
  end

  # Same as ensure_path, but rooted at site_public_folder.
  def ensure_public_path(path)
    ensure_directory(File.join(site_public_folder, path))
  end

  # Downloads the image referenced by +src+ into directory +path+ using wget.
  # Absolute http:// URLs that do not mention this site are skipped.
  # NOTE(review): relative srcs are appended to the page URL ("url/src")
  # rather than resolved as a browser would — confirm this matches the site.
  def save_image(url, path, src)
    return if src.nil? || src.empty? # <img> without a usable src attribute
    return if src =~ /http\:\/\// && src !~ /ramblings/

    img_path = src =~ /http\:\/\// ? src : "#{url}/#{src}"
    puts "cd #{path} && wget #{img_path}"
    # Argument-vector form: the crawled src is never parsed by a shell.
    system('wget', '--', img_path, chdir: path)
  end

  # Writes an HTML page to <site_folder>/<path>/index.html with a title
  # front-matter header, then downloads the images it references.
  def save_html_page(page)
    path = file_path(page.url)
    path.pop if path.last == 'index.html'
    full_path = ensure_path(path)

    doc = page.doc
    title_node = doc && doc.at('title')
    title = title_node ? title_node.inner_html : 'Michael Lang'

    # NOTE(review): the body is assumed to be ISO-8859-15 — confirm for this site.
    # dup keeps page.body itself untouched by force_encoding.
    html = page.body.to_s.dup.force_encoding('iso-8859-15').encode('utf-8')

    File.open(File.join(full_path, 'index.html'), 'w') do |f|
      f.puts "---"
      f.puts "title: #{title}"
      f.puts "---"
      f.puts html.gsub(site_url, '/')
    end

    return unless doc

    doc.search('img').each do |img|
      save_image(page.url, ensure_public_path(path), img['src'])
    end
  end

  # Writes a non-HTML response verbatim under site_public_folder.
  def save_resource(page)
    path = file_path(page.url)
    filename = path.pop
    if filename.nil?
      warn "skipping #{page.url}: no file name in URL"
      return
    end

    full_path = ensure_public_path(path)
    # write, not puts: puts would append a newline to binary content.
    File.open(File.join(full_path, filename), 'wb') { |f| f.write page.body.to_s }
  end

  File.open("ramblings.txt", 'w') do |routes|
    Anemone.crawl(site_url) do |anemone|
      anemone.on_every_page do |page|
        puts "#{page.headers["content-type"].inspect}: #{page.url}"

        # Compare only the media type; tolerate "text/html;charset=..."
        # written without a space after the semicolon.
        content_type = Array(page.headers["content-type"]).first.to_s
        media_type = content_type.split(";").first.to_s.strip.downcase

        if media_type == "text/html"
          save_html_page(page)
          routes.puts page.url
        else
          save_resource(page)
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment