Skip to content

Instantly share code, notes, and snippets.

@loop
Last active December 18, 2015 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save loop/5850220 to your computer and use it in GitHub Desktop.
Save loop/5850220 to your computer and use it in GitHub Desktop.
Modified Tumblr to Octopress migration script, more info - http://finitepost.com/2013/06/24/quick-start-guide-to-setting-up-octopress-from-tumblr/
require 'rubygems'
require 'open-uri'
require 'fileutils'
require 'nokogiri'
require 'date'
require 'json'
require 'uri'
require 'jekyll'
module Jekyll
module Tumblr
def self.process(url, format = "html", grab_images = false,
add_highlights = false, rewrite_urls = true)
@grab_images = grab_images
FileUtils.mkdir_p "_posts/tumblr"
url += "/api/read/json/"
per_page = 50
posts = []
# Two passes are required so that we can rewrite URLs.
# First pass builds up an array of each post as a hash.
begin
current_page = (current_page || -1) + 1
feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}")
json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars.
blog = JSON.parse(json)
puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
posts += blog["posts"].map { |post| post_to_hash(post, format) }
end until blog["posts"].size < per_page
# Rewrite URLs and create redirects.
posts = rewrite_urls_and_redirects posts if rewrite_urls
# Second pass for writing post files.
posts.each do |post|
if format == "markdown"
post[:content] = html_to_markdown post[:content]
post[:content] = add_syntax_highlights post[:content] if add_highlights
end
post[:name] = truncate_post_name post[:name] if post[:name].size > 255
File.open("_posts/tumblr/#{post[:name]}", "w") do |f|
f.puts post[:header].to_yaml + "---\n" + post[:content]
end
end
end
private
def self.truncate_post_name name
post = name.match(/^(.+)\.(.+)$/).captures
post[0][0..(-1 - post[1].size)] + post[1].size
end
# Converts each type of Tumblr post to a hash with all required
# data for Jekyll.
def self.post_to_hash(post, format)
case post['type']
when "regular"
title = post["regular-title"]
content = post["regular-body"]
when "link"
title = post["link-text"] || post["link-url"]
content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
unless post["link-description"].nil?
content << "<br/>" + post["link-description"]
end
when "photo"
title = post["photo-caption"]
max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max
url = post["photo-url"] || post["photo-url-#{max_size}"]
ext = "." + post[post.keys.select { |k|
k =~ /^photo-url-/ && post[k].split("/").last =~ /\./
}.first].split(".").last
content = "<img src=\"#{save_file(url, ext)}\"/>"
unless post["photo-link-url"].nil?
content = "<a href=\"#{post["photo-link-url"]}\">#{content}</a>"
end
when "audio"
if !post["id3-title"].nil?
title = post["id3-title"]
content = post.at["audio-player"] + "<br/>" + post["audio-caption"]
else
title = post["audio-caption"]
content = post.at["audio-player"]
end
when "quote"
title = post["quote-text"]
content = "<blockquote>#{post["quote-text"]}</blockquote>"
unless post["quote-source"].nil?
content << "&#8212;" + post["quote-source"]
end
when "conversation"
title = post["conversation-title"]
content = "<section><dialog>"
post["conversation"]["line"].each do |line|
content << "<dt>#{line['label']}</dt><dd>#{line}</dd>"
end
content << "</section></dialog>"
when "video"
title = post["video-title"]
content = post["video-player"]
unless post["video-caption"].nil?
content << "<br/>" + post["video-caption"]
end
end
date = Date.parse(post['date']).to_s
title = Nokogiri::HTML(title).text
slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
{
:name => "#{date}-#{slug}.#{format}",
:header => {
"layout" => "post",
"title" => title,
"tags" => post["tags"],
},
:content => content,
:url => post["url"],
:slug => post["url-with-slug"],
}
end
# Create a Hash of old urls => new urls, for rewriting and
# redirects, and replace urls in each post. Instantiate Jekyll
# site/posts to get the correct permalink format.
def self.rewrite_urls_and_redirects(posts)
site = Jekyll::Site.new(Jekyll.configuration({}))
dir = File.join(File.dirname(__FILE__), "..")
urls = Hash[posts.map { |post|
# Create an initial empty file for the post so that
# we can instantiate a post object.
File.open("_posts/tumblr/#{post[:name]}", "w")
tumblr_url = URI.parse(post[:slug]).path
jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url
redirect_dir = tumblr_url.sub(/\//, "") + "/"
FileUtils.mkdir_p redirect_dir
File.open(redirect_dir + "index.html", "w") do |f|
f.puts "<html><head><meta http-equiv='Refresh' content='0; " +
"url=#{jekyll_url}'></head><body></body></html>"
end
[tumblr_url, jekyll_url]
}]
posts.map { |post|
urls.each do |tumblr_url, jekyll_url|
post[:content].gsub!(/#{tumblr_url}/i, jekyll_url)
end
post
}
end
# Uses Python's html2text to convert a post's content to
# markdown. Preserve HTML tables as per the markdown docs.
def self.html_to_markdown(content)
preserve = ["table", "tr", "th", "td"]
preserve.each do |tag|
content.gsub!(/<#{tag}/i, "$$" + tag)
content.gsub!(/<\/#{tag}/i, "||" + tag)
end
content = %x[echo '#{content.gsub("'", "''")}' | html2text 2>> html2md.log]
preserve.each do |tag|
content.gsub!("$$" + tag, "<" + tag)
content.gsub!("||" + tag, "</" + tag)
end
content
end
# Adds pygments highlight tags to code blocks in posts that use
# markdown format. This doesn't guess the language of the code
# block, so you should modify this to suit your own content.
# For example, my code block only contain Python and JavaScript,
# so I can assume the block is JavaScript if it contains a
# semi-colon.
def self.add_syntax_highlights(content)
lines = content.split("\n")
block, indent, lang, start = false, /^ /, nil, nil
lines.each_with_index do |line, i|
if !block && line =~ indent
block = true
lang = "python"
start = i
elsif block
lang = "javascript" if line =~ /;$/
block = line =~ indent && i < lines.size - 1 # Also handle EOF
if !block
lines[start] = "{% highlight #{lang} %}"
lines[i - 1] = "{% endhighlight %}"
end
lines[i] = lines[i].sub(indent, "")
end
end
lines.join("\n")
end
def self.save_file(url, ext)
if @grab_images
path = "tumblr_files/#{url.split('/').last}"
path += ext unless path =~ /#{ext}$/
FileUtils.mkdir_p "tumblr_files"
File.open(path, "w") { |f| f.write(open(url).read) }
url = "/" + path
end
url
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment