Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Tumblr to Jekyll migration
#!/usr/bin/env ruby
# Script to import tumblr posts into local markdown posts ready to be consumed by Jekyll.
# Inspired by New Bamboo's post http://blog.new-bamboo.co.uk/2009/2/20/migrating-from-mephisto-to-jekyll
# Supports post types: regular, quote, link, photo, video and audio
# Saves local copies of images
require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'net/http'
require 'mime/types'
require 'fileutils'
require 'pathname'
require 'date'
# Configuration
# Base URL of the Tumblr blog; "/api/read?..." is appended to it to build each API request.
TUMBLR_DOMAIN = "http://tumblr-domain.com"
# Directory the generated post files are written into.
WRITE_DIRECTORY = "_posts"
# Directory downloaded images are saved under. Note that fetch_img always
# returns URLs rooted at "/images/", whatever this is set to.
IMAGE_DIRECTORY = "../images"
# Value written to the "layout:" field of every generated post's front matter.
LAYOUT = "default"
# Perform an HTTP GET on +uri_str+, following 3xx redirects.
#
# @param uri_str [String] absolute URL to request
# @param limit [Integer] number of requests still allowed; each redirect hop uses one
# @return [Net::HTTPResponse] the first 2xx response reached
# @raise [ArgumentError] when +limit+ reaches 0 ('HTTP redirect too deep')
# @raise the error produced by Net::HTTPResponse#error! for any other status,
#   or for a redirect that carries no Location header
def fetch(uri_str, limit = 10)
  raise ArgumentError, 'HTTP redirect too deep' if limit == 0

  response = Net::HTTP.get_response(URI.parse(uri_str))
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    # A redirect status without a target (e.g. 304) cannot be followed.
    response.error! if location.nil?
    # Location may be relative ("/path"); resolve it against the URL just requested.
    fetch(URI.join(uri_str, location).to_s, limit - 1)
  else
    response.error!
  end
end
# Download the image at +uri_str+, save it under IMAGE_DIRECTORY and return
# the site-relative URL to use in the generated post.
#
# The local path is "<host><url path>". When the response's Content-Type maps
# to a known MIME type, the file extension is rewritten to that type's first
# extension ("jpeg" is normalised to "jpg"); otherwise the URL's own extension
# is kept unchanged.
#
# @param uri_str [String] absolute URL of the image
# @return [String] "/images/<host><path>" (always rooted at "/images/")
# @raise whatever fetch raises when the download fails
def fetch_img(uri_str)
  uri = URI.parse(uri_str)
  resp = fetch(uri_str)

  # Content-Type may carry parameters ("image/png; charset=binary");
  # look up the bare media type only.
  media_type = resp["content-type"].to_s.split(";").first.to_s.strip
  mime_type = MIME::Types[media_type].first

  # build our local image path
  path = "#{uri.host}#{uri.path}"

  # rewrite extension, but only when the MIME lookup produced one
  extension = mime_type ? mime_type.extensions.first : nil
  if extension
    extension = "jpg" if extension == "jpeg"
    path = "#{path.chomp(File.extname(path))}.#{extension}"
  end

  print "Image: #{uri_str} --> #{path}\n"

  local_path = "#{IMAGE_DIRECTORY}/#{path}"
  FileUtils.mkdir_p File.dirname(local_path)
  # File.open rather than Kernel#open: this is always a local file.
  File.open(local_path, "wb") { |file| file.write(resp.body) }

  return "/images/#{path}"
end
# Longest slug allowed in a generated filename; longer slugs are truncated so
# the filename stays within filesystem limits.
MAX_SLUG_LENGTH = 80

# Render +value+ as a double-quoted YAML scalar so characters such as ":" or
# "#" cannot break the front matter. Runs of whitespace (including newlines)
# are collapsed to a single space.
def yaml_quote(value)
  escaped = value.to_s.gsub(/\s+/, " ").strip.gsub(/["\\]/) { |char| "\\#{char}" }
  "\"#{escaped}\""
end

# Text content of the first node under +node+ matching +selector+, or nil when
# there is no such node.
def first_content(node, selector)
  found = node.css(selector).first
  found && found.content
end

# Make sure the output directory exists before writing any post.
FileUtils.mkdir_p(WRITE_DIRECTORY)

# Tumblr api only returns 50 posts per call, so page through the results until
# a call comes back empty.
post_offset = 0
loop do
  path = TUMBLR_DOMAIN + "/api/read?num=50&filter=none&start=#{post_offset}"

  # Connect to Tumblr and read the API source.
  # URI.open is required here: plain Kernel#open no longer opens URLs on Ruby 3.0+.
  doc = URI.open(path) { |xml| Nokogiri::XML.parse(xml) }
  posts = doc.css("post")
  break if posts.empty?

  post_offset += posts.count

  posts.each do |post_tag|
    # Gather data about each post
    date = Date.parse(post_tag["date"])
    id = post_tag["id"]
    slug = first_content(post_tag, "slug")
    type = post_tag["type"]
    tags = post_tag.css("tag").map { |tag| tag.content }
    title = nil
    text = nil
    body = nil

    case type
    when "regular"
      title = first_content(post_tag, "regular-title")
      body = first_content(post_tag, "regular-body")
    when "quote"
      text = first_content(post_tag, "quote-text")
      source = first_content(post_tag, "quote-source")
      body = "> #{text}"
      body << "\n\n#{source}" if source
    when "link"
      text = first_content(post_tag, "link-text")
      link = first_content(post_tag, "link-url")
      body = "<a href=\"#{link}\">#{text}</a>"
      description = first_content(post_tag, "link-description")
      body << "\n\n#{description}" if description
    when "photo"
      text = first_content(post_tag, "photo-caption")
      # The caption supplies the post title; without one the post is skipped
      # below, so don't download its images.
      if text
        # A photoset lists each image in its own <photo>; a single photo
        # carries its <photo-url> directly on the post.
        photo_nodes = post_tag.css("photoset").empty? ? [post_tag] : post_tag.css("photo")
        images = photo_nodes.map do |photo_node|
          "<img src=\"#{fetch_img(photo_node.css("photo-url").first.content)}\" />"
        end
        body = "#{images.join}\n\n#{text}"
      end
    when "video"
      text = first_content(post_tag, "video-caption")
      body = first_content(post_tag, "video-source")
    when "audio"
      text = first_content(post_tag, "audio-caption")
      body = first_content(post_tag, "audio-player")
    else
      print "ERROR: Post type not supported\n"
      next
    end

    if !title && !text
      print "ERROR: Post title and text are nil: #{id}\n"
      next
    end

    # if there's no post, we give up.
    next unless body

    # title defaults
    title ||= text
    title = title.gsub(/<.*?>/, '') # strip html

    # create the slug if necessary and build a _post filename
    if slug.nil?
      slug = title.gsub(/[^a-zA-Z0-9]+/, "-").gsub(/\A-|-\z/, "").downcase
    end
    # Keep filenames within filesystem limits, without leaving a dangling "-".
    slug = slug[0, MAX_SLUG_LENGTH].sub(/-\z/, "") if slug.length > MAX_SLUG_LENGTH
    # An empty slug would make same-day posts overwrite each other; use the post id.
    slug = id.to_s if slug.empty?
    filename = "#{date.strftime("%Y-%m-%d")}-#{slug}.html"

    front_matter = ["title: #{yaml_quote(title)}", "layout: #{LAYOUT}"]
    unless tags.empty?
      front_matter << "tags:"
      tags.each { |tag| front_matter << " - #{yaml_quote(tag)}" }
    end
    front_matter << "type: #{type}"

    jekyll_post = "---\n#{front_matter.join("\n")}\n---\n#{body}\n"

    # Write files
    puts filename
    File.open(File.join(WRITE_DIRECTORY, filename), "w") { |file| file.write(jekyll_post) }
  end
end
@johdax

This comment has been minimized.

Copy link

commented Sep 7, 2011

awesome script!!
one small note: sometimes filenames tend to be over the allowed length (depending on the post), so slug in line 143 may need to be cut to an appropriate length

@wesbaker

This comment has been minimized.

Copy link

commented Nov 26, 2011

I had the same problem as @johdax, I added this block right below line 144:

# shorten slug if too long
if slug.length >= 80
  slug = slug[0, 80]
end
@joeross

This comment has been minimized.

Copy link

commented Jul 28, 2014

Apologies if I'm missing something, but while the description says this will "import tumblr posts into local markdown posts" my output is a series of HTML files.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.