Skip to content

Instantly share code, notes, and snippets.

@mwlang
Created April 13, 2015 05:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mwlang/a4dbf5c098fd8f9502b2 to your computer and use it in GitHub Desktop.
Save mwlang/a4dbf5c098fd8f9502b2 to your computer and use it in GitHub Desktop.
I used the following rake tasks to completely clone my two separate blog websites, one built with Python/Zope and the other with WordPress. The tasks generate an HTML file for every page by following every link on every page (spidering, anyone?). I then used a few more tasks to clean up the results so I could easily serve the pages with Rails.
# Rewrites absolute links to the old ramblings host as root-relative links
# in the latest revision of every WpPost.
task "website:fix_urls" => :environment do
  old_root = "http://ramblings.gibberishcode.net/"

  WpPost.all.each do |post|
    revision = post.latest_revision
    relinked = revision.post_content.gsub(old_root, "/")
    revision.update_attribute(:post_content, relinked)
  end
end
task "website:remove_footer" => :environment do
# Replaces the trailing footer of the file at +fn+ with a "{{ footer }}"
# placeholder.
#
# The footer is recognised by an "<hr>" line sitting exactly 26 lines from
# the end of the file. When that marker is found, everything from the marker
# onwards is dropped and the placeholder is appended. Otherwise the file is
# left alone and the line found at that position is printed for inspection.
def scrub(fn)
  lines = File.read(fn).split("\n")
  marker = lines[-26]

  unless marker == "<hr>"
    puts "#{fn}: #{marker.inspect}"
    return
  end

  File.open(fn, 'w') do |out|
    out.puts lines[0...-26]
    out.puts "{{ footer }}"
  end
end
# Walks the directory +path+ recursively and calls scrub on every
# non-directory entry matched by the "*" glob (so dotfiles are skipped).
def scan_and_scrub(path)
  pattern = File.join(path, '*')

  Dir.glob(pattern) do |entry|
    next scan_and_scrub(entry) if File.directory?(entry)

    scrub(entry)
  end
end
# Run the footer scrub over everything under app/views/static.
scan_and_scrub(File.join(Rails.root, 'app', 'views', 'static'))
end
task "website:scrub" => :environment do
# Cleans up the saved page at +fn+ in place:
# * points the stylesheet link at /stylesheets/default.css,
# * removes the "Login" link and its separator,
# * removes the "Guest" table cell.
# The file is always rewritten, even when nothing matched.
def scrub(fn)
  substitutions = [
    ['href="default.css"', 'href="/stylesheets/default.css"'],
    ['<a href="/login">Login</a> &nbsp; | &nbsp;', ''],
    ["<td align=center>\nGuest</td>\n", '']
  ]

  cleaned = substitutions.reduce(File.read(fn)) do |html, (from, to)|
    html.gsub(from, to)
  end

  File.open(fn, 'w') { |out| out.puts cleaned }
end
# Recursively visits every entry under +path+ (as matched by "*"), handing
# each non-directory entry to scrub and descending into each directory.
def scan_and_scrub(path)
  Dir.glob(File.join(path, '*')) do |child|
    File.directory?(child) ? scan_and_scrub(child) : scrub(child)
  end
end
# Run the markup clean-up over everything under app/views/static.
scan_and_scrub(File.join(Rails.root, 'app', 'views', 'static'))
end
task "website:clone" => :environment do
require 'anemone'
# Root URL of the site being cloned. It is both the crawl start point and
# the prefix stripped from page URLs and page bodies.
def site_url
  'http://www.codeconnoisseur.org/'
end
# Converts +uri+ into an array of path segments relative to site_url,
# e.g. "<site_url>blog/post" becomes ["blog", "post"].
def file_path(uri)
  relative = uri.to_s.gsub(site_url, '')
  relative.split("/")
end
# Directory that receives the cloned HTML pages: <Rails.root>/app/views/static.
#
# Deliberately not cached in @site_folder. Methods defined inside task blocks
# keep their instance variables on the same top-level object, so a value
# cached under that name by another task run in the same rake process would
# be returned here and pages would land in the wrong directory.
def site_folder
  File.join(Rails.root, 'app', 'views', 'static')
end
# Directory that receives non-HTML resources and images: <Rails.root>/public.
# The path is computed once and cached in @site_public_folder.
def site_public_folder
  return @site_public_folder if @site_public_folder

  @site_public_folder = File.join(Rails.root, 'public')
end
def ensure_path(path)
full_path = File.join(site_folder, path)
`mkdir -p #{full_path}`
full_path
end
def ensure_public_path(path)
full_path = File.join(site_public_folder, path)
`mkdir -p #{full_path}`
full_path
end
# Downloads the image referenced by +src+ into the directory +path+ by
# running wget there.
#
# url  - URL of the page that contains the image; prefixed to relative srcs.
# path - existing directory to download into.
# src  - value of the img tag's src attribute (may be nil).
#
# Images with an http:// src that does not mention "codeconnoisseur" are
# skipped, as are img tags without a src. Relative srcs are fetched from
# "<url>/<src>".
#
# wget is started with an argument vector rather than through a shell
# command string, because +src+ comes from crawled HTML and must not be able
# to inject shell commands.
#
# Returns wget's standard output, or nil when the image was skipped or wget
# could not be started.
def save_image(url, path, src)
  return if src.nil? || src.empty?

  absolute = src =~ /http\:\/\//
  return if absolute && src !~ /codeconnoisseur/

  img_path = absolute ? src : "#{url}/#{src}"
  puts "cd #{path} && wget #{img_path}"
  IO.popen(['wget', img_path], chdir: path, &:read)
rescue SystemCallError => e
  # Keep the crawl going when wget is missing or the directory is unusable.
  warn "wget failed for #{img_path}: #{e.message}"
  nil
end
# Saves a crawled HTML page as <site_folder>/<page path>/index.html and
# downloads the images it references.
#
# The file starts with a front-matter header carrying the page title
# ('Michael Lang' when the document has no <title>). The body is
# reinterpreted as ISO-8859-15, converted to UTF-8, and every occurrence of
# site_url in it is replaced with "/".
#
# page - object responding to url, doc and body. doc is expected to offer
#        at('title') and search('img'), as the calls below require.
#
# Returns whatever iterating the image list returns; callers ignore it.
def save_html_page(page)
  path = file_path(page.url)
  path.pop if path.last == 'index.html'
  full_path = ensure_path(path)

  doc = page.doc
  title_node = doc && doc.at('title')
  title = title_node ? title_node.inner_html : 'Michael Lang'

  # Work on a copy: force_encoding changes its receiver, and the page's own
  # body must not be relabelled behind the crawler's back.
  html = page.body.dup.force_encoding('iso-8859-15').encode('utf-8').gsub(site_url, '/')

  File.open(File.join(full_path, 'index.html'), 'w') do |f|
    f.puts "---"
    f.puts "title: #{title}"
    f.puts "---"
    f.puts html
  end

  # Create the public directory at most once, and only when there is an image.
  public_path = nil
  images = doc ? doc.search('img') : []
  images.each do |img|
    public_path ||= ensure_public_path(path)
    save_image(page.url, public_path, img['src'])
  end
end
# Saves a non-HTML resource under site_public_folder, mirroring the path it
# has on the crawled site.
#
# The body is written with IO#write in binary mode. IO#puts would append a
# newline to any payload that does not already end in one, which corrupts
# images and other binary files.
#
# URLs without a final path segment (for example the site root) have no file
# name to write to, so they are reported and skipped.
#
# Returns nil.
def save_resource(page)
  path = file_path(page.url)
  filename = path.pop
  if filename.nil? || filename.empty?
    warn "skipping resource without a file name: #{page.url}"
    return
  end

  full_path = ensure_public_path(path)
  File.open(File.join(full_path, filename), 'wb') { |f| f.write(page.body.to_s) }
  nil
end
# Crawl the whole site starting at site_url. HTML pages are saved with
# save_html_page and their URLs are listed in routes.txt; every other
# response is saved with save_resource.
File.open("routes.txt", 'w') do |routes|
  Anemone.crawl(site_url) do |anemone|
    anemone.on_every_page do |page|
      content_type = page.headers["content-type"]
      puts "#{content_type.inspect}: #{page.url}"

      # Compare only the media type. Parameters such as the charset may follow
      # the semicolon with or without a space, and media types are
      # case-insensitive.
      media_type = Array(content_type).first.to_s.split(";").first.to_s.strip.downcase

      if media_type == "text/html"
        save_html_page(page)
        routes.puts page.url
      else
        save_resource(page)
      end
    end
  end
end
end
task "ramblings:clone" => :environment do
require 'anemone'
# Root URL of the ramblings blog being cloned. It is both the crawl start
# point and the prefix stripped from page URLs and page bodies.
def site_url
  'http://ramblings.gibberishcode.net/'
end
# Splits +uri+ into its path segments relative to site_url,
# e.g. "<site_url>2010/05/post" becomes ["2010", "05", "post"].
def file_path(uri)
  without_root = uri.to_s.gsub(site_url, '')
  without_root.split("/")
end
# Directory that receives the cloned HTML pages:
# <Rails.root>/app/views/ramblings.
#
# Deliberately not cached in @site_folder. Methods defined inside task blocks
# keep their instance variables on the same top-level object, so a value
# cached under that name by another task run in the same rake process would
# be returned here and pages would land in the wrong directory.
def site_folder
  File.join(Rails.root, 'app', 'views', 'ramblings')
end
# Directory that receives non-HTML resources and images: <Rails.root>/public.
# The path is built on first use and cached in @site_public_folder.
def site_public_folder
  @site_public_folder = File.join(Rails.root, 'public') unless @site_public_folder
  @site_public_folder
end
def ensure_path(path)
full_path = File.join(site_folder, path)
`mkdir -p #{full_path}`
full_path
end
def ensure_public_path(path)
full_path = File.join(site_public_folder, path)
`mkdir -p #{full_path}`
full_path
end
# Downloads the image referenced by +src+ into the directory +path+ by
# running wget there.
#
# url  - URL of the page that contains the image; prefixed to relative srcs.
# path - existing directory to download into.
# src  - value of the img tag's src attribute (may be nil).
#
# Images with an http:// src that does not mention "ramblings" are skipped,
# as are img tags without a src. Relative srcs are fetched from
# "<url>/<src>".
#
# wget is started with an argument vector rather than through a shell
# command string, because +src+ comes from crawled HTML and must not be able
# to inject shell commands.
#
# Returns wget's standard output, or nil when the image was skipped or wget
# could not be started.
def save_image(url, path, src)
  return if src.nil? || src.empty?

  absolute = src =~ /http\:\/\//
  return if absolute && src !~ /ramblings/

  img_path = absolute ? src : "#{url}/#{src}"
  puts "cd #{path} && wget #{img_path}"
  IO.popen(['wget', img_path], chdir: path, &:read)
rescue SystemCallError => e
  # Keep the crawl going when wget is missing or the directory is unusable.
  warn "wget failed for #{img_path}: #{e.message}"
  nil
end
# Saves a crawled HTML page as <site_folder>/<page path>/index.html and
# downloads the images it references.
#
# The file starts with a front-matter header carrying the page title
# ('Michael Lang' when the document has no <title>). The body is
# reinterpreted as ISO-8859-15, converted to UTF-8, and every occurrence of
# site_url in it is replaced with "/".
#
# page - object responding to url, doc and body. doc is expected to offer
#        at('title') and search('img'), as the calls below require.
#
# Returns whatever iterating the image list returns; callers ignore it.
def save_html_page(page)
  path = file_path(page.url)
  path.pop if path.last == 'index.html'
  full_path = ensure_path(path)

  doc = page.doc
  title_node = doc && doc.at('title')
  title = title_node ? title_node.inner_html : 'Michael Lang'

  # Work on a copy: force_encoding changes its receiver, and the page's own
  # body must not be relabelled behind the crawler's back.
  html = page.body.dup.force_encoding('iso-8859-15').encode('utf-8').gsub(site_url, '/')

  File.open(File.join(full_path, 'index.html'), 'w') do |f|
    f.puts "---"
    f.puts "title: #{title}"
    f.puts "---"
    f.puts html
  end

  # Create the public directory at most once, and only when there is an image.
  public_path = nil
  images = doc ? doc.search('img') : []
  images.each do |img|
    public_path ||= ensure_public_path(path)
    save_image(page.url, public_path, img['src'])
  end
end
# Saves a non-HTML resource under site_public_folder, mirroring the path it
# has on the crawled site.
#
# The body is written with IO#write in binary mode. IO#puts would append a
# newline to any payload that does not already end in one, which corrupts
# images and other binary files.
#
# URLs without a final path segment (for example the site root) have no file
# name to write to, so they are reported and skipped.
#
# Returns nil.
def save_resource(page)
  path = file_path(page.url)
  filename = path.pop
  if filename.nil? || filename.empty?
    warn "skipping resource without a file name: #{page.url}"
    return
  end

  full_path = ensure_public_path(path)
  File.open(File.join(full_path, filename), 'wb') { |f| f.write(page.body.to_s) }
  nil
end
# Crawl the whole blog starting at site_url. HTML pages are saved with
# save_html_page and their URLs are listed in ramblings.txt; every other
# response is saved with save_resource.
File.open("ramblings.txt", 'w') do |routes|
  Anemone.crawl(site_url) do |anemone|
    anemone.on_every_page do |page|
      content_type = page.headers["content-type"]
      puts "#{content_type.inspect}: #{page.url}"

      # Compare only the media type. Parameters such as the charset may follow
      # the semicolon with or without a space, and media types are
      # case-insensitive.
      media_type = Array(content_type).first.to_s.split(";").first.to_s.strip.downcase

      if media_type == "text/html"
        save_html_page(page)
        routes.puts page.url
      else
        save_resource(page)
      end
    end
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment