Created
March 12, 2011 18:47
-
-
Save derek-watson/867468 to your computer and use it in GitHub Desktop.
Tumblr to Jekyll migration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Script to import tumblr posts into local HTML posts (with YAML front matter) ready to be consumed by Jekyll.
# Inspired by New Bamboo's post http://blog.new-bamboo.co.uk/2009/2/20/migrating-from-mephisto-to-jekyll | |
# Supports post types: regular, quote, link, photo, video and audio | |
# Saves local copies of images | |
require 'rubygems' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'net/http' | |
require 'mime/types' | |
require 'fileutils' | |
require 'pathname' | |
require 'date' | |
# Configuration | |
TUMBLR_DOMAIN = "http://tumblr-domain.com" | |
WRITE_DIRECTORY = "_posts" | |
IMAGE_DIRECTORY = "../images" | |
LAYOUT = "default" | |
# Issue a GET request for +uri_str+ and follow 3xx redirects until a
# successful response is reached.
#
# @param uri_str [String] absolute URL to request
# @param limit [Integer] number of requests (the first one included) that may
#   be made before giving up
# @return [Net::HTTPResponse] the final 2xx response
# @raise [ArgumentError] when the redirect chain is longer than +limit+
# @raise [Net::HTTPExceptions] raised by Net::HTTPResponse#error! for any
#   response that is neither 2xx nor a followable 3xx
def fetch(uri_str, limit = 10)
  raise ArgumentError, 'HTTP redirect too deep' if limit <= 0

  response = Net::HTTP.get_response(URI.parse(uri_str))
  case response
  when Net::HTTPSuccess then response
  when Net::HTTPRedirection
    location = response['location']
    # A redirect without a target cannot be followed; report it as an HTTP error.
    response.error! if location.nil? || location.empty?
    # Location may be relative (e.g. "/final"); resolve it against the URL we
    # just requested so the next hop is always an absolute URL.
    fetch(URI.join(uri_str, location).to_s, limit - 1)
  else
    response.error!
  end
end
# Download an image and store it under IMAGE_DIRECTORY, mirroring the
# remote "<host><path>" layout.
#
# The file extension is rewritten from the response's Content-Type when the
# MIME type is known ("jpeg" is normalised to "jpg"). When the type is
# missing or unknown, the path taken from the URL is kept unchanged instead
# of failing.
#
# @param uri_str [String] absolute URL of the image
# @return [String] site-relative URI of the saved copy ("/images/<host><path>")
def fetch_img(uri_str)
  uri = URI.parse(uri_str)
  resp = fetch(uri_str)

  # Look the type up by its bare media type: drop parameters such as
  # "; charset=binary" and tolerate a missing header.
  content_type = resp['content-type'].to_s.split(';').first.to_s.strip
  mime_type = content_type.empty? ? nil : MIME::Types[content_type].first

  # build our local image path
  path = "#{uri.host}#{uri.path}"

  # rewrite extension (only when the MIME type tells us what it should be)
  extension = mime_type && mime_type.extensions.first
  extension = 'jpg' if extension == 'jpeg'
  path = "#{path.chomp(File.extname(path))}.#{extension}" if extension

  print "Image: #{uri_str} --> #{path}\n"

  local_path = File.join(IMAGE_DIRECTORY, path)
  FileUtils.mkdir_p(File.dirname(local_path))
  # File.open rather than Kernel#open: this is always a local file.
  File.open(local_path, 'wb') { |file| file.write(resp.body) }

  "/images/#{path}"
end
# Main import loop: pages through the Tumblr read API 50 posts at a time and
# writes one Jekyll post file (YAML front matter + HTML body) per Tumblr post
# into WRITE_DIRECTORY.

# Only needed for the front matter generated below.
require 'yaml'

# Upper bound on the slug part of a filename, so long titles cannot push the
# filename past what the filesystem accepts.
MAX_SLUG_LENGTH = 80

# Return the text content of the first node under +node+ matching the CSS
# +selector+, or nil when there is no such node.
def first_content(node, selector)
  match = node.css(selector).first
  match && match.content
end

# Turn a title into a lowercase slug: every run of non-alphanumeric
# characters becomes a single "-", and a trailing "-" is dropped.
def slugify(title)
  title.gsub(/[^a-zA-Z0-9]+/, '-').sub(/-\z/, '').downcase
end

FileUtils.mkdir_p(WRITE_DIRECTORY)

# Tumblr api only returns 50 posts per call
post_offset = 0
loop do
  api_url = "#{TUMBLR_DOMAIN}/api/read?num=50&filter=none&start=#{post_offset}"

  # Connect to Tumblr and read the API source.
  # URI#open (provided by open-uri) is used because Kernel#open no longer
  # accepts URLs on Ruby 3+.
  doc = URI.parse(api_url).open { |xml| Nokogiri::XML.parse(xml) }
  posts = doc.css('post')
  break if posts.empty?

  post_offset += posts.count

  posts.each do |post_tag|
    # Gather data about each post
    date = Date.parse(post_tag['date'])
    id = post_tag['id']
    type = post_tag['type']
    tags = post_tag.css('tag').map(&:content)
    slug = first_content(post_tag, 'slug')

    title = nil
    text = nil
    body = nil

    case type
    when 'regular'
      title = first_content(post_tag, 'regular-title')
      body = first_content(post_tag, 'regular-body')
    when 'quote'
      text = first_content(post_tag, 'quote-text')
      source = first_content(post_tag, 'quote-source')
      body = "> #{text}"
      body << "\n\n#{source}" if source
    when 'link'
      text = first_content(post_tag, 'link-text')
      link = first_content(post_tag, 'link-url')
      body = "<a href=\"#{link}\">#{text}</a>"
      description = first_content(post_tag, 'link-description')
      body << "\n\n#{description}" if description
    when 'photo'
      # A photoset carries one <photo> per image; a single-photo post has its
      # photo-url elements directly under the post.
      photo_nodes = post_tag.css('photoset').empty? ? [post_tag] : post_tag.css('photo')
      body = photo_nodes.map do |photo_node|
        "<img src=\"#{fetch_img(photo_node.css('photo-url').first.content)}\" />"
      end.join
      text = first_content(post_tag, 'photo-caption')
      body << "\n\n#{text}" if text
    when 'video'
      text = first_content(post_tag, 'video-caption')
      body = first_content(post_tag, 'video-source')
    when 'audio'
      text = first_content(post_tag, 'audio-caption')
      body = first_content(post_tag, 'audio-player')
    else
      print "ERROR: Post type not supported\n"
      next
    end

    if title.nil? && text.nil?
      print "ERROR: Post title and text are nil: #{id}\n"
      next
    end

    # if there's no post, we give up.
    next unless body

    # title defaults to the post text, with html stripped
    title = (title || text).gsub(/<.*?>/, '')

    # create the slug if necessary (an empty <slug> counts as missing), cap
    # its length, and build a _post filename
    slug = slugify(title) if slug.nil? || slug.empty?
    slug = slug[0, MAX_SLUG_LENGTH].sub(/-\z/, '')
    filename = "#{date.strftime('%Y-%m-%d')}-#{slug}.html"

    # Let the YAML library do the quoting: titles and tags may contain ":",
    # quotes or newlines that would corrupt hand-written front matter.
    front_matter = { 'title' => title, 'layout' => LAYOUT }
    front_matter['tags'] = tags unless tags.empty?
    front_matter['type'] = type
    jekyll_post = "#{front_matter.to_yaml}---\n#{body}\n"

    # Write files
    puts filename
    File.open(File.join(WRITE_DIRECTORY, filename), 'w') { |file| file.write(jekyll_post) }
  end
end
I had the same problem as @johdax, so I added this block right below line 144:
# shorten slug if too long
if slug.length >= 80
slug = slug[0, 80]
end
Apologies if I'm missing something, but while the description says this will "import tumblr posts into local markdown posts" my output is a series of HTML files.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
awesome script!!
one small note: sometimes filenames tend to be over the allowed length (depending on the post), so the slug in line 143 may need to be cut to an appropriate length