@spiffytech
Forked from evanwalsh/import.rb
Last active October 31, 2023 18:44
Imports a Squarespace dump into Jekyll
# coding: utf-8

require 'rubygems'
require 'hpricot'   # parses the WordPress-format export XML
require 'nokogiri'  # rewrites image URLs in post bodies
require 'fileutils'
require 'safe_yaml'
require 'time'
require 'uri'
require 'open-uri'

module JekyllImport
  # This importer takes a wordpress.xml file, which can be exported from your
  # wordpress.com blog (/wp-admin/export.php).
  module WordpressDotCom
    @image_folder = 'squarespace_images'

    def self.download_image(src, dest)
      return if ::File.exist? dest # Speed-up for when importing multiple times

      File.open(dest, "wb") do |saved_file|
        # The following "open" is provided by open-uri. Note: on Ruby 3.x,
        # Kernel#open no longer fetches URLs; use URI.open instead (see the
        # comments at the bottom of this page).
        open(src, "rb") do |read_file|
          saved_file.write(read_file.read)
        end
      end
    end

    def self.process(options = {:source => "_wordpress.xml"})
      Dir.mkdir @image_folder unless ::File.exist? @image_folder
      import_count = Hash.new(0)
      doc = Hpricot::XML(File.read(options[:source]))

      (doc/:channel/:item).each do |item|
        title = item.at(:title).inner_text.strip
        permalink_title = item.at('wp:post_name').inner_text.gsub("/", "-")
        # Fall back to a "prettified" title if post_name is empty (can happen)
        permalink_title = sluggify(title) if permalink_title == ""

        date = if item.at('wp:post_date')
                 begin
                   Time.parse(item.at('wp:post_date').inner_text)
                 rescue
                   Time.now
                 end
               else
                 Time.now
               end

        status = item.at('wp:status').inner_text
        published = (status == "publish")
        type = item.at('wp:post_type').inner_text
        categories = item.search('category[@domain="category"]').map { |c| c.inner_text }.reject { |c| c == 'Uncategorized' }.uniq
        tags = item.search('category[@domain="post_tag"]').map { |t| t.inner_text }.uniq

        metas = Hash.new
        item.search("wp:postmeta").each do |meta|
          key = meta.at('wp:meta_key').inner_text
          value = meta.at('wp:meta_value').inner_text
          metas[key] = value
        end

        # Identify Squarespace-hosted images, download them, and update the
        # URLs to point to our copies
        body = item.at('content:encoded').inner_text
        body = body.gsub(/\[\/?caption[^\]]*\]/, '') # Remove caption blocks, which don't render properly

        html = Nokogiri::HTML(body)
        html.css('img').each do |element|
          puts element # Progress/debug output: one line per image tag
          src = element['src']
          u = URI src
          # Newer exports may serve images from squarespace-cdn.com instead
          # (see the comments at the bottom of this page)
          if u.host.end_with? 'squarespace.com'
            image_name = u.path.sub(/^\//, '').gsub('/', '_') + '_' + (u.fragment || '')
            dest = ::File.join(@image_folder, image_name)
            download_image(src, dest)
            element['src'] = '/' + dest
          end
        end
        body = html.to_s

        name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.html"
        header = {
          'layout'     => type,
          'title'      => title,
          'categories' => categories,
          'tags'       => tags,
          'status'     => status,
          'type'       => type,
          'published'  => published,
          'meta'       => metas
        }

        begin
          FileUtils.mkdir_p "_#{type}s"
          File.open("_#{type}s/#{name}", "w") do |f|
            f.puts header.to_yaml
            f.puts '---'
            f.puts body
          end
        rescue => e
          puts "Couldn't import post!"
          puts "Title: #{title}"
          puts "Name/Slug: #{name}\n"
          puts "Error: #{e.message}"
          next
        end

        import_count[type] += 1
      end

      import_count.each do |key, value|
        puts "Imported #{value} #{key}s"
      end
    end

    def self.sluggify(title)
      title.gsub(/[^[:alnum:]]+/, '-').downcase
    end
  end
end

JekyllImport::WordpressDotCom.process
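
By default the importer reads _wordpress.xml from the current directory. To point it at a differently named export, adjust the final line; the filename below is only a placeholder:

# Hypothetical filename; substitute the name of your own export
JekyllImport::WordpressDotCom.process(:source => "my_squarespace_export.xml")

A second file in the gist then converts the imported HTML posts to Markdown: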
#!/usr/bin/env ruby
# Converts the .html posts produced by the importer into Markdown, then
# renames each file's extension from .html to .md.

require 'html2markdown'

POST_REGEX = %r{(?<year>[0-9]+)-(?<month>[0-9]+)-(?<day>[0-9]+)-(?<title>.*)\.html}

files = Dir.glob('*.html').select { |f| f.match POST_REGEX }
files.each do |post|
  data = post.match(POST_REGEX)
  page = HTMLPage.new(contents: File.read(post)) # HTMLPage is provided by html2markdown
  File.open(post, 'w') { |f| f.puts page.markdown }
  File.rename(post, "#{data[:year]}-#{data[:month]}-#{data[:day]}-#{data[:title]}.md")
end
@josh-works commented May 12, 2017

This functions extremely well! Thank you!

I'm still working on getting my site set up on Jekyll, but after removing a few broken links, I got this beautiful output:

Imported 14 pages
Imported 171 posts
Imported 132 attachments

@KrauseFx commented Oct 13, 2017

The images are compressed rather than downloaded at full resolution; you can append ?format=2500w to the image URL to get a higher-resolution version.
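
For example, a minimal tweak inside the image loop (a sketch only; that Squarespace honors the parameter on every image URL, and that src carries no existing query string, are assumptions based on the comment above):

# Sketch: fetch the full-resolution rendition instead of the compressed one
download_image(src + '?format=2500w', dest)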

@lpattori

Thanks a lot! I really needed to recover the images.
To make it work I had to replace open with URI.open in line 23 and 'squarespace.com' with 'squarespace-cdn.com' in line 81.
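
Spelled out, those two substitutions against the script above look like this:

# In download_image: Kernel#open no longer fetches URLs on Ruby 3.x
URI.open(src, "rb") do |read_file|
  saved_file.write(read_file.read)
end

# In the image loop: newer exports serve images from Squarespace's CDN domain
if u.host.end_with? 'squarespace-cdn.com'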
