@gwire
Last active December 2, 2022 18:19
Download missing mastodon avatars to the cache
#!/usr/bin/env ruby
# Remote mastodon accounts can be refreshed with the command
# tootctl accounts refresh --all
# https://docs.joinmastodon.org/admin/tootctl/#accounts-refresh
# however, tootctl was having issues, so I ended up writing this helper
# script to scan a mastodon cache for missing avatar/header images,
# prioritising recently active accounts, and download them directly
#
# note: make sure $cache_path is correct!
# assumes a path like .../accounts/avatars/888/888/888/888/888/888/original/filename.jpeg
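# for example (hypothetical id), account id 109356548607229436 would map to
#   .../accounts/avatars/109/356/548/607/229/436/original/filename.jpeg
# i.e. the 18-digit id split into six 3-digit groups, mirroring the SQL below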
require 'pg'
require 'net/http'
require 'uri'
require 'fileutils'
$cache_path = "/home/mastodon/live/public/system/cache" # local media cache root
$admin_url = "https://example.social/admin/accounts/"   # admin UI base for refresh links
$check_size = false # when true, also compare on-disk file sizes against the database values
conn = PG.connect(dbname: 'mastodon')
res = conn.exec("
  SELECT
    id,
    CONCAT(username, '@', domain) AS user,
    CONCAT('/accounts/avatars/',
      SUBSTRING(id::text, 1, 3), '/',
      SUBSTRING(id::text, 4, 3), '/',
      SUBSTRING(id::text, 7, 3), '/',
      SUBSTRING(id::text, 10, 3), '/',
      SUBSTRING(id::text, 13, 3), '/',
      SUBSTRING(id::text, 16, 3), '/original/') AS avatar_file_path,
    avatar_file_name,
    avatar_file_size,
    avatar_remote_url,
    CONCAT('/accounts/headers/',
      SUBSTRING(id::text, 1, 3), '/',
      SUBSTRING(id::text, 4, 3), '/',
      SUBSTRING(id::text, 7, 3), '/',
      SUBSTRING(id::text, 10, 3), '/',
      SUBSTRING(id::text, 13, 3), '/',
      SUBSTRING(id::text, 16, 3), '/original/') AS header_file_path,
    header_file_name,
    header_file_size,
    header_remote_url,
    updated_at
  FROM public.accounts
  ORDER BY updated_at DESC
")
# optional clauses to narrow or reorder the query above, e.g.
#   WHERE avatar_updated_at < '2022-11-29'::date
#   AND updated_at > '2022-11-24'::date
#   LIMIT 6000
#   ORDER BY RANDOM()
def download_file(type, user, id, url, path, filename, expected_size, attempt = 0)
  file_dest = path + filename
  uri = URI(url)
  resp = Net::HTTP.get_response(uri)
  case resp.code
  when "200"
    ### write the body even if it doesn't match expected_size
    ### (size mismatches are reported by the $check_size pass below)
    unless Dir.exist?(path)
      FileUtils.mkdir_p path
    end
    File.open(file_dest, "wb") { |file| file.write(resp.body) }
    unless File.exist?(file_dest)
      STDERR.puts "___ error writing file? " + file_dest
    end
  when "301", "302", "307"
    if attempt < 3
      download_file(type, user, id, resp.header['location'], path, filename, expected_size, attempt + 1)
    end
  when "401", "403", "404", "500", "502", "503", "520", "521", "522"
    ## 404 usually means the user has changed their uploads
    ## 403 usually means the user has changed their uploads (and an object store like s3 is in use)
    ## we want to trigger updating the account profile information at this point,
    ## but the easiest way for me is to output the account admin URL and manually click refresh
    #STDERR.print "+"
    STDOUT.puts $admin_url + id + '/#@' + user + " refresh needed?"
    STDOUT.puts " " + url + " " + resp.code
  else
    #STDERR.print "?"
    STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code
  end
rescue Net::OpenTimeout => e
  ## the most common cause for this appears to be servers that have an IPv6 address that doesn't respond
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " open_timeout"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: timed out while trying to connect #{e}"
rescue Net::ReadTimeout => e
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " read_timeout"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: timed out reading #{e}"
rescue OpenSSL::SSL::SSLError => e
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " ssl_error"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: SSL Error #{e}"
rescue SocketError => e
  ## the most common cause is that the domain is missing from DNS
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " socket_error"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: Socket Error #{e}"
rescue Errno::ECONNREFUSED => e
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " connection_refused"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: Errno::ECONNREFUSED #{e}"
rescue Errno::EHOSTUNREACH => e
  STDOUT.puts $admin_url + id + '/#@' + user
  STDOUT.puts " " + url + " unreachable"
  #STDERR.print "X"
  #STDERR.puts "___ ERROR: Errno::EHOSTUNREACH #{e}"
end
res.each do |row|
  if row['avatar_file_name']
    avatar_dir = $cache_path + row['avatar_file_path']
    #STDERR.print "."
    unless File.exist?(avatar_dir + row['avatar_file_name'])
      unless row['avatar_remote_url'].nil? || row['avatar_remote_url'].empty?
        #STDERR.print "|"
        download_file("avatar", row['user'], row['id'], row['avatar_remote_url'], avatar_dir, row['avatar_file_name'], row['avatar_file_size'].to_i)
      end
    end
    if $check_size && File.exist?(avatar_dir + row['avatar_file_name'])
      avatar_file_size = File.size(avatar_dir + row['avatar_file_name'])
      unless avatar_file_size == row['avatar_file_size'].to_i
        STDOUT.puts " " + avatar_dir + row['avatar_file_name'] + " size " + avatar_file_size.to_s + " is not db_size " + row['avatar_file_size'] + " " + row['user']
      end
    end
  end
  if row['header_file_name']
    header_dir = $cache_path + row['header_file_path']
    #STDERR.print ","
    unless File.exist?(header_dir + row['header_file_name'])
      unless row['header_remote_url'].nil? || row['header_remote_url'].empty?
        #STDERR.print "|"
        download_file("header", row['user'], row['id'], row['header_remote_url'], header_dir, row['header_file_name'], row['header_file_size'].to_i)
      end
    end
    if $check_size && File.exist?(header_dir + row['header_file_name'])
      header_file_size = File.size(header_dir + row['header_file_name'])
      unless header_file_size == row['header_file_size'].to_i
        STDOUT.puts " " + header_dir + row['header_file_name'] + " size " + header_file_size.to_s + " is not db_size " + row['header_file_size'] + " " + row['user']
      end
    end
  end
end