Skip to content

Instantly share code, notes, and snippets.

@zunda
Last active January 31, 2024 22:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zunda/f566e2bb85ca0616d351139363274ec5 to your computer and use it in GitHub Desktop.
crawler to measure size distribution of ActivityPub servers
#
# crawler to measure size distribution of ActivityPub servers
#
# usage: ruby crawl-activitypub.rb [initial-hostname]
# Creates a TSV file: apstat-%Y%m%d.tsv
#
# Copyright 2023 by zunda
#
# Permission is granted for use, copying, modification, distribution,
# and distribution of modified versions of this work as long as the
# above copyright notice is included.
#
require "net/http"
require "json"
require "uri"
require "time"
# Keeps one persistent HTTPS connection per host so repeated requests to
# the same server can reuse the TCP/TLS session.
class HttpsConnectionPool
  def initialize
    @pool = Hash.new
  end

  # Returns a started Net::HTTP connection for "host" or "host:port"
  # (port defaults to 443), creating and caching one when the pool has
  # no live connection for that key.
  def connection_to(host_port)
    if @pool[host_port] and @pool[host_port].started?
      return @pool[host_port]
    end
    domain, x = host_port.split(':', 2)
    port = x ? x.to_i : 443
    conn = Net::HTTP.start(
      domain, port,
      :use_ssl => true,
      :open_timeout => 5,
      :read_timeout => 15,
    )
    @pool[host_port] = conn
    return conn
  end

  # Closes and drops the connection for host_port, or every pooled
  # connection when host_port is nil.
  def finish(host_port = nil)
    # BUGFIX: iterate over keys, not values. @pool is indexed by
    # host_port strings, so looking entries up by the connection object
    # (@pool[connection]) always returned nil, silently leaking every
    # connection on a pool-wide finish.
    (host_port ? [host_port] : @pool.keys).each do |h|
      begin
        @pool[h]&.finish
      rescue IOError
        # connection already closed; nothing to do
      end
      @pool[h] = nil
    end
  end
end
# Queries a single ActivityPub server: fetches nodeinfo discovery, the
# nodeinfo document, and the peer list, recording the results (or the
# error encountered) for one TSV record.
class APServer
  attr_reader :domain
  attr_reader :software, :version, :users_total, :users_active_month, :local_posts
  attr_reader :peers
  attr_reader :error, :checked_at
  attr_reader :nodeinfo_versions
  attr_reader :nodeinfo_host

  # domain: host (optionally host:port) to query
  # connection_pool: HttpsConnectionPool used for the HTTPS requests;
  # the domain's connection is released when fetching completes.
  def initialize(domain, connection_pool)
    @domain = domain
    fetch!(connection_pool)
  end

  # One tab-separated record of the measured attributes. Missing values
  # are rendered as "*"; embedded whitespace is collapsed to single
  # spaces so the record stays on one line.
  def tsv
    [
      @domain&.gsub(/\s+/, " "),
      @software, @version, @users_total, @users_active_month, @local_posts,
      @peers&.size,
      # BUGFIX: &. on iso8601 too — `@checked_at&.utc.iso8601` called
      # .iso8601 on nil when @checked_at was unset (NoMethodError)
      @checked_at&.utc&.iso8601,
      @error&.to_s&.gsub(/\s+/, " "),
      @nodeinfo_versions&.join(","),
      @nodeinfo_host,
    ].map{|e| e ? e.to_s.gsub(/\s+/, " ") : "*"}.join("\t")
  end

  # Comment-prefixed header line matching the columns emitted by #tsv.
  def self.tsv_header
    "#" + [
      :domain,
      :software, :version, :users_total, :users_active_month, :local_posts,
      :peers,
      :checked_at,
      :error,
      :nodeinfo_versions,
      :nodeinfo_host
    ].join("\t")
  end

  private

  # Performs the three HTTP fetches and fills in the attributes.
  # Any failure is captured into @error rather than raised to the
  # caller; @checked_at is always stamped and the domain's pooled
  # connection is always released.
  def fetch!(connection_pool)
    begin
      header = {:accept => "application/json"}
      conn = connection_pool.connection_to(@domain)
      # nodeinfo discovery: well-known document listing schema versions
      r = conn.get("/.well-known/nodeinfo", header)
      if r.code != "200"
        raise "HTTP status code is #{r.code} for nodeinfo"
      end
      # Map schema version (e.g. "2.0") => href; &. after dig("rel")
      # tolerates link entries that lack a "rel" member
      x = JSON.parse(r.body).dig("links")&.map{|e| [
        e.dig("rel")&.scan(%r|http://nodeinfo.diaspora.software/ns/schema/([^/]+)|)&.flatten&.first,
        e.dig("href")
      ]}.to_h.compact
      @nodeinfo_versions = x.keys.compact.sort.reverse
      if @nodeinfo_versions.empty?
        raise "nodeinfo href not found"
      end
      # follow the newest advertised schema version
      v = @nodeinfo_versions.first
      u = URI.parse(x[v])
      if u.scheme != "https"
        raise "Scheme is #{u.scheme} for nodeinfo v#{v}"
      end
      nodeinfo_host = u.host.downcase
      if nodeinfo_host == domain
        nodeinfo_conn = conn
      else
        # nodeinfo is hosted elsewhere; record that and connect there
        @nodeinfo_host = nodeinfo_host
        nodeinfo_conn = connection_pool.connection_to(@nodeinfo_host)
      end
      # nodeinfo document proper
      x = JSON.parse(nodeinfo_conn.get(u.path, header).body)
      @software = x.dig("software", "name")
      @version = x.dig("software", "version")
      @users_total = x.dig("usage", "users", "total")
      @users_active_month = x.dig("usage", "users", "activeMonth")
      @local_posts = x.dig("usage", "localPosts")
      # peer list (Mastodon-compatible API)
      x = JSON.parse(conn.get("/api/v1/instance/peers", header).body)
      if x.kind_of?(Array)
        @peers = x
      elsif e = x.dig("error")
        # gotosocial returns an error object here by design; don't
        # count that as a failed measurement
        @error = e unless @software == "gotosocial"
      else
        @error = "Peers is not an array"
      end
    rescue JSON::ParserError => e
      @error = e.class.to_s
    rescue RuntimeError => e
      @error = e.message
    rescue => e
      @error = "#{e.class}: #{e.message}"
    ensure
      @checked_at = Time.now
      connection_pool.finish(@domain) # we don't expect querying again
    end
    self
  end
end
# Drives the crawl: maintains the queue of domains to check, normalizes
# and dedupes domain names, skips blocked domains, and persists progress
# so that an interrupted run can be resumed from its TSV file.
class Measurement
  # Domains (and any of their subdomains) excluded from the crawl
  Blocked = Regexp.union(%w(
    mastinator.com
    repl.co
    misskey-forkbomb.cf
    activitypub-troll.cf
    ngrok.io
    ngrok-free.app
    netlify.app
    activitypub-proxy.cf
    gab.best
    preview.app.github.dev
    onion
  ).map{|domain| /(?:\.|\A)#{Regexp.escape(domain)}\z/})

  # filename: TSV output path. If the file already exists, completed
  # rows (domain + data) are carried over and bare-domain rows are
  # re-queued, so the crawl resumes where it left off.
  def initialize(filename)
    @data_io = nil
    @queue = Array.new
    @n_queue = 0
    @checked_or_queued = Hash.new
    @n_checked_or_queued = 0
    queue_filename = filename + ".queued"
    @queue_io = File.new(queue_filename, "w")
    prev_filename = nil
    if File.exist?(filename)
      # BUGFIX: the message contained a broken interpolation
      $stderr.puts "Continuing measurement from #{filename}"
      prev_filename = filename + ".back"
      File.rename(filename, prev_filename)
    end
    @data_io = File.new(filename, "w")
    if prev_filename
      File.open(prev_filename).each_line do |line|
        if /^#/ =~ line
          @data_io.puts line # keep header/comment lines
        else
          domain, data = line.chomp.split("\t", 2)
          if data
            # already measured: keep the row and never revisit
            @data_io.puts line
            @checked_or_queued[domain] = true
            @n_checked_or_queued += 1
          elsif domain
            # bare domain: was queued but not yet measured
            queue(domain)
          end
        end
      end
    else
      @data_io.puts APServer.tsv_header
    end
  end

  # Keeps checking until the queue is drained.
  def crawl
    while @n_queue > 0
      check
    end
  end

  # Normalizes a domain (strip scheme, path, and user@ part; downcase)
  # and enqueues it unless it is nil, blocked, or already seen.
  def queue(domain)
    domain = domain&.chomp&.sub(/https?:\/\//i, "")&.split("/")&.first&.split("@")&.last&.downcase
    if not domain or Blocked.match(domain) or @checked_or_queued[domain]
      return
    end
    @queue << domain
    @n_queue += 1
    @checked_or_queued[domain] = true
    @n_checked_or_queued += 1
    @queue_io.puts domain
    @queue_io.flush # flush so the .queued file survives a crash
  end

  # The next domain that will be checked (nil when the queue is empty).
  def next_domain
    @queue.first
  end

  # Measures the next queued domain, appends its TSV row, and enqueues
  # the peers it reports.
  def check
    return if @n_queue == 0
    @connection_pool ||= HttpsConnectionPool.new
    domain = @queue.first
    $stderr.puts "Remaining #{@n_queue}/#{@n_checked_or_queued} #{"%3.0f%%" % (@n_queue*100.0/@n_checked_or_queued)}\tchecking #{domain}"
    x = APServer.new(domain, @connection_pool)
    @data_io.puts x.tsv
    @data_io.flush
    x.peers&.each do |peer|
      queue(peer)
    end
    @queue.shift
    @n_queue -= 1
  end

  # Writes any still-unchecked domains as bare rows (so the next run
  # re-queues them) and closes files and connections.
  def finish
    @data_io.puts @queue.join("\n") unless @queue.empty?
    @data_io.close
    @queue_io.close
    # BUGFIX: &. — the pool is created lazily in #check, so it is nil
    # when the run is stopped before any domain was actually checked
    @connection_pool&.finish
  end
end
# Entry point: crawl starting from the host given on the command line
# (default: mastodon.zunda.ninja), writing today's apstat TSV file.
# Ctrl-C is trapped so progress is always flushed before exit.
start_host = ARGV.shift || "mastodon.zunda.ninja"
measurement = Measurement.new(Time.now.strftime("apstat-%Y%m%d.tsv"))
begin
  measurement.queue(start_host)
  $stderr.puts "Crawling from #{measurement.next_domain}. Ctrl-C to safely terminate to continue later."
  measurement.crawl
rescue Interrupt
  # deliberate: fall through to the ensure block to persist progress
ensure
  $stderr.print "Writing data..."
  measurement.finish
  $stderr.puts "done"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment