-
-
Save spiffytech/e73777e167dc5a8b6a87 to your computer and use it in GitHub Desktop.
Imports a Squarespace dump into Jekyll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'rubygems' | |
require 'hpricot' | |
require 'nokogiri' | |
require 'fileutils' | |
require 'safe_yaml' | |
require 'time' | |
require 'uri' | |
require 'open-uri' | |
module JekyllImport
  # This importer takes a wordpress.xml file, which can be exported from your
  # wordpress.com blog (/wp-admin/export.php). It is used here to import a
  # Squarespace dump: every <item> in the export becomes one Jekyll file, and
  # images hosted by Squarespace are copied into a local folder.
  module WordpressDotCom
    # Instance-level reader for anything that includes this module. The
    # module-level methods below read the module's own @image_folder.
    attr :image_folder

    # Folder (relative to the working directory) that receives image copies.
    @image_folder = 'squarespace_images'

    # Domains whose images are mirrored locally. A host matches when it is one
    # of these domains or a subdomain of one.
    IMAGE_HOSTS = %w[squarespace.com squarespace-cdn.com].freeze

    # Copies +src+ to the local path +dest+.
    #
    # The data is written to "<dest>.part" and renamed only after the whole
    # transfer succeeded, so an interrupted download never leaves a truncated
    # file that the "already exists" shortcut would mistake for a finished one.
    #
    # SECURITY: URI.open falls back to Kernel#open for anything that is not a
    # URL, so +src+ must not come straight from untrusted input.
    #
    # @param src [String] URL (or local path) to read from
    # @param dest [String] local path to write to
    # @return [nil] when +dest+ already exists; otherwise the value is not meaningful
    # @raise [StandardError] whatever the underlying open/read/write raises
    def self.download_image(src, dest)
      return if ::File.exist?(dest) # Speed-up for when importing multiple times

      partial = "#{dest}.part"
      begin
        # URI.open is provided by open-uri; bare Kernel#open no longer fetches
        # URLs on Ruby 3.0 and later.
        URI.open(src, "rb") do |read_file|
          ::File.open(partial, "wb") { |saved_file| saved_file.write(read_file.read) }
        end
        ::File.rename(partial, dest)
      ensure
        ::File.delete(partial) if ::File.exist?(partial)
      end
    end

    # Reads the export file and writes one file per <item> into "_<type>s/".
    #
    # @param filename [Hash] options hash; only the :source key (path of the
    #   XML export) is read
    # @return [Hash] the per-type import counters that were printed
    def self.process(filename = {:source => "_wordpress.xml"})
      FileUtils.mkdir_p(@image_folder)
      import_count = Hash.new(0)
      export = Hpricot::XML(::File.read(filename[:source]))

      (export/:channel/:item).each do |item|
        title = item.at(:title).inner_text.strip
        permalink_title = item.at('wp:post_name').inner_text.gsub("/", "-")
        # Fallback to "prettified" title if post_name is empty (can happen)
        permalink_title = sluggify(title) if permalink_title == ""

        date = post_date(item)
        status = item.at('wp:status').inner_text
        published = (status == "publish")
        type = item.at('wp:post_type').inner_text

        categories = item.search('category[@domain="category"]')
                         .map { |c| c.inner_text }
                         .reject { |c| c == 'Uncategorized' }
                         .uniq
        tags = item.search('category[@domain="post_tag"]').map { |t| t.inner_text }.uniq

        metas = {}
        item.search("wp:postmeta").each do |meta|
          metas[meta.at('wp:meta_key').inner_text] = meta.at('wp:meta_value').inner_text
        end

        body = item.at('content:encoded').inner_text
        # Remove caption blocks which don't render properly
        body = body.gsub(/\[\/?caption[^\]]*\]/, '')
        body = localize_images(body)

        name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.html"
        header = {
          'layout'     => type,
          'title'      => title,
          'categories' => categories,
          'tags'       => tags,
          'status'     => status,
          'type'       => type,
          'published'  => published,
          'meta'       => metas
        }

        begin
          FileUtils.mkdir_p "_#{type}s"
          ::File.open("_#{type}s/#{name}", "w") do |f|
            f.puts header.to_yaml
            f.puts '---'
            f.puts body
          end
        rescue => e
          # One unwritable post must not stop the rest of the import.
          puts "Couldn't import post!"
          puts "Title: #{title}"
          puts "Name/Slug: #{name}\n"
          puts "Error: #{e.message}"
          next
        end

        import_count[type] += 1
      end

      import_count.each do |key, value|
        puts "Imported #{value} #{key}s"
      end
    end

    # Turns a title into a lowercase slug by collapsing every run of
    # non-alphanumeric characters into a single "-".
    #
    # @param title [String]
    # @return [String]
    def self.sluggify(title)
      title.gsub(/[^[:alnum:]]+/, '-').downcase
    end

    # Returns the item's wp:post_date as a Time, or the current time when the
    # element is missing or its text cannot be parsed.
    def self.post_date(item)
      node = item.at('wp:post_date')
      node ? Time.parse(node.inner_text) : Time.now
    rescue StandardError
      Time.now
    end

    # Downloads every Squarespace-hosted <img> found in +html+ and points its
    # src at the local copy. Images that cannot be downloaded keep their
    # original URL. Returns the rewritten markup as a String.
    def self.localize_images(html)
      # NOTE(review): Nokogiri::HTML parses a whole document, so the returned
      # string presumably carries doctype/html/body wrappers - confirm whether
      # a fragment parse would suit the generated posts better.
      page = Nokogiri::HTML(html)
      page.css('img').each do |element|
        puts element
        src = element['src']
        uri = parse_image_uri(src)
        next unless uri && squarespace_host?(uri.host)

        # Flatten the URL path into a single file name; the naming scheme is
        # kept stable so images fetched by earlier runs are reused.
        image_name = uri.path.sub(/\A\//, '').gsub('/', '_') + '_' + (uri.fragment || '')
        dest = ::File.join(@image_folder, image_name)
        begin
          download_image(uri.to_s, dest)
        rescue StandardError => e
          # A dead image link should not abort the whole import.
          puts "Couldn't download image!"
          puts "URL: #{src}"
          puts "Error: #{e.message}"
          next
        end
        element['src'] = '/' + dest
      end
      page.to_s
    end

    # Parses an <img> src into an http(s) URI. Protocol-relative URLs
    # ("//host/path") are resolved against https. Returns nil for a missing,
    # relative, non-http(s) or unparsable src.
    def self.parse_image_uri(src)
      text = src.to_s.strip
      return nil if text.empty?

      uri = URI.parse(text)
      return nil if uri.host.nil?

      uri = URI.parse("https:#{text}") if uri.scheme.nil?
      uri.is_a?(URI::HTTP) ? uri : nil # URI::HTTPS is a subclass of URI::HTTP
    rescue URI::InvalidURIError
      nil
    end

    # True when +host+ is one of IMAGE_HOSTS or a subdomain of one.
    def self.squarespace_host?(host)
      return false if host.nil?

      normalized = host.downcase
      IMAGE_HOSTS.any? { |domain| normalized == domain || normalized.end_with?(".#{domain}") }
    end

    private_class_method :post_date, :localize_images, :parse_image_uri, :squarespace_host?
  end
end
# Run the import. The export path may be passed as the first command-line
# argument; without one, "_wordpress.xml" in the current directory is used.
JekyllImport::WordpressDotCom.process(:source => (ARGV.first || "_wordpress.xml"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'html2markdown' | |
# Matches Jekyll post file names of the form "<year>-<month>-<day>-<title>.html".
# Anchored at both ends, with a literal dot before the extension, so that only
# whole file names in that shape are converted.
POST_REGEX = %r{\A(?<year>[0-9]+)-(?<month>[0-9]+)-(?<day>[0-9]+)-(?<title>.*)\.html\z}

# Convert every matching post in the current directory to Markdown and replace
# the .html file with a .md file of the same base name.
Dir.glob('*.html').each do |post|
  data = post.match(POST_REGEX)
  next unless data

  # Convert first: if the conversion raises, the original post is untouched.
  # NOTE(review): the whole file, including any front matter, is handed to
  # HTMLPage - confirm that html2markdown leaves the front matter intact.
  markdown = HTMLPage.new(contents: File.read(post)).markdown

  target = "#{data[:year]}-#{data[:month]}-#{data[:day]}-#{data[:title]}.md"
  File.open(target, 'w') { |f| f.puts markdown }
  File.delete(post)
end
The images are compressed and are not downloaded at full resolution; you can append ?format=2500w
to the image URL to get the higher-resolution version.
Thanks a lot! I really needed to recover the images.
To make it work I had to replace open with URI.open in line 23 and 'squarespace.com' with 'squarespace-cdn.com' in line 81.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This functions extremely well! Thank you!
(I'm still working on getting my site set up on Jekyll, but after removing a few broken links, I got this beautiful output):