Last active
April 28, 2023 19:40
-
-
Save manton/946ddbb74b5d0948053d7e23793ead77 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mastodon ActivityStreams (.tar.gz) file to Blog Archive (.bar) converter
# by Manton Reece
# MIT license
require 'rubygems'
require 'fileutils'
require 'json'
require 'tmpdir'
require 'zlib'
require 'minitar'
require 'zip'
# you may need to install a few gems like:
# gem install rubyzip (this is the gem that provides require 'zip')
# gem install minitar
# gem install fileutils
# Both arguments are required: the Mastodon export to read and the .bar file to write.
unless ARGV.length == 2
  # usage goes to stderr so it never mixes with normal progress output
  warn ""
  warn "Usage: ruby mastodon_to_blog_archive.rb /path/mastodon.tar.gz /path/export.bar"
  warn ""
  # exit non-zero so calling scripts can detect the usage error; `exit` also works on
  # Ruby versions older than 2.4, which reject a top-level `return`
  exit 1
end
# set up some paths
mastodon_path = ARGV[0]
bar_path = ARGV[1]

# Dir.mktmpdir creates a brand-new, uniquely named folder (mode 0700), so we can never
# pick up -- and later delete during cleanup -- a folder that already existed.
# It is still created inside the current directory, as before.
temp_folder = Dir.mktmpdir("blog_archive_temp_", Dir.pwd)
# "mastodon" holds the extracted outbox.json, "blog" holds everything that goes into the .bar
temp_mastodon_folder = File.join(temp_folder, "mastodon")
temp_blog_folder = File.join(temp_folder, "blog")
FileUtils.mkdir_p(temp_mastodon_folder)
FileUtils.mkdir_p(temp_blog_folder)

puts "Converting from #{File.basename(mastodon_path)} to #{File.basename(bar_path)}..."

# we'll keep an array of posts between formats
mastodon_posts = []   # raw items read from outbox.json
jsonfeed_posts = []   # hashes that become the JSON Feed "items"
mf2_posts = []        # h-entry HTML fragments for index.html
files_to_zip = []     # paths of every file to add to the .bar
# unzip the Mastodon posts
# Reads the .tar.gz once: outbox.json is extracted and parsed into mastodon_posts,
# and every media_attachments file is copied into the blog folder and queued for zipping.
blog_root = File.expand_path(temp_blog_folder)

# the block form closes the gzip stream even if extraction raises
Zlib::GzipReader.open(mastodon_path) do |tar_gz|
  tar = Archive::Tar::Minitar::Reader.new(tar_gz)
  begin
    tar.each do |entry|
      # directory entries carry no data; parent folders are created on demand below
      next if entry.directory?

      if entry.name.include?("outbox.json")
        # extract so we can read posts
        json_path = File.join(temp_mastodon_folder, "outbox.json")
        File.open(json_path, 'wb') { |file| file.write(entry.read) }
        # fall back to an empty list when the outbox has no "orderedItems"
        mastodon_posts = JSON.parse(File.read(json_path))["orderedItems"] || []
      elsif entry.name.include?("media_attachments")
        # extract to blog folder
        media_path = File.join(temp_blog_folder, entry.name)

        # refuse entry names (e.g. containing "..") that would land outside the blog folder
        unless File.expand_path(media_path).start_with?(blog_root + File::SEPARATOR)
          puts "Skipping media file with unsafe path: " + entry.name
          next
        end

        FileUtils.mkdir_p(File.dirname(media_path))
        puts "Extracting media file: " + File.basename(media_path)
        File.open(media_path, 'wb') { |file| file.write(entry.read) }
        files_to_zip << media_path
      end
    end
  ensure
    tar.close
  end
end

puts "Found #{mastodon_posts.size} posts in Mastodon archive."
# re-structure the posts for Blog Archive Format
# Each "Create" activity becomes one JSON Feed item (jsonfeed_posts) and one
# Microformats h-entry fragment (mf2_posts). Other activity types are skipped.
mastodon_posts.each do |post|
  next unless post["type"] == "Create"

  obj = post["object"]
  # without an inline object hash there are no fields to read
  next unless obj.is_a?(Hash)

  post_url = obj["url"]
  post_published = obj["published"]

  # "content" may be missing or null; start from a copy so the parsed JSON is not mutated
  html = obj["content"].to_s.dup

  # add media to HTML
  # "attachment" may be missing, null, a single hash or an array; normalise to an array
  [obj["attachment"]].flatten.compact.each do |a|
    next unless a.is_a?(Hash) && a["type"] == "Document"

    attachment_url = a["url"]
    media_type = a["mediaType"].to_s
    if media_type.include?("image/")
      html << "<img src=\"#{attachment_url}\">"
    elsif media_type.include?("video/")
      # unlike <img>, <video> and <audio> are not void elements: they must be closed or
      # they swallow the markup that follows; "controls" makes the player usable
      html << "<video src=\"#{attachment_url}\" controls></video>"
    elsif media_type.include?("audio/")
      html << "<audio src=\"#{attachment_url}\" controls></audio>"
    end
  end

  # relative path, so media resolves against the files stored inside the archive
  html = html.gsub("src=\"/media_attachments/", "src=\"media_attachments/")

  jsonfeed_posts << {
    id: obj["id"],
    url: post_url,
    date_published: post_published,
    content_html: html
  }

  # h-entry: the content followed by a permalink wrapping the published time
  mf2_posts << "<div class=\"h-entry\"><div class=\"e-content\">" \
    "#{html}" \
    "<div><a href=\"#{post_url}\" class=\"u-url\">" \
    "<time datetime=\"#{post_published}\" class=\"dt-published\">#{post_published}</time>" \
    "</a></div>" \
    "</div></div>"
end
# write the JSON Feed (feed.json) into the blog folder
puts "Saving posts to JSON Feed..."
jsonfeed_path = File.join(temp_blog_folder, "feed.json")
feed_document = { version: "https://jsonfeed.org/version/1.1", items: jsonfeed_posts }
File.write(jsonfeed_path, JSON.generate(feed_document))

# write index.html: every h-entry fragment, one per line, wrapped in a single h-feed
puts "Saving posts to HTML with Microformats..."
html_path = File.join(temp_blog_folder, "index.html")
page_parts = [
  "<html><body><div class=\"h-feed\">",
  mf2_posts.join("\n"),
  "</div></body></html>"
]
File.write(html_path, page_parts.join)
# delete the old file if it looks like a .bar archive
# (File.exists? was removed in Ruby 3.2; File.file? also rules out directories)
if File.file?(bar_path) && bar_path.include?(".bar")
  File.delete(bar_path)
end

files_to_zip << jsonfeed_path
files_to_zip << html_path

# zip the .bar
blog_prefix = temp_blog_folder + "/"
Zip::File.open(bar_path, Zip::File::CREATE) do |zipfile|
  files_to_zip.each do |filepath|
    # names inside the archive are relative to the blog folder; strip only the
    # leading prefix so a repeated occurrence deeper in the path is left alone
    entry_name = filepath.sub(blog_prefix, "")
    zipfile.add(entry_name, filepath)
  end
end
# remove the temporary working folder and everything in it, then report the result
puts "Cleaning up temporary folder #{temp_folder}..."
FileUtils.rm_rf(temp_folder)

puts "Done! Created new file in Blog Archive Format: #{bar_path}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment