Skip to content

Instantly share code, notes, and snippets.

@zunda
Last active January 31, 2024 22:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zunda/f566e2bb85ca0616d351139363274ec5 to your computer and use it in GitHub Desktop.
crawler to measure size distribution of ActivityPub servers
#
# crawler to measure size distribution of ActivityPub servers
#
# usage: ruby crawl-activitypub.rb [initial-hostname]
# Creates a TSV file: apstat-%Y%m%d.tsv
#
# Copyright 2023 by zunda
#
# Permission is granted for use, copying, modification, distribution,
# and distribution of modified versions of this work as long as the
# above copyright notice is included.
#
require "net/http"
require "json"
require "uri"
require "time"
# Keeps one persistent HTTPS connection per host so repeated requests to
# the same server can reuse the TCP/TLS session.
class HttpsConnectionPool
  def initialize
    @pool = Hash.new
  end

  # Returns a started Net::HTTP connection for "host" or "host:port"
  # (port defaults to 443), creating and caching one when the pool has
  # no live connection for that key.
  def connection_to(host_port)
    if @pool[host_port] and @pool[host_port].started?
      return @pool[host_port]
    end
    domain, x = host_port.split(':', 2)
    port = x ? x.to_i : 443
    conn = Net::HTTP.start(
      domain, port,
      :use_ssl => true,
      :open_timeout => 5,
      :read_timeout => 15,
    )
    @pool[host_port] = conn
    return conn
  end

  # Closes and drops the connection for host_port, or every pooled
  # connection when host_port is nil.
  def finish(host_port = nil)
    # BUGFIX: iterate over keys, not values. @pool is indexed by
    # host_port strings, so looking entries up by the connection object
    # (@pool[connection]) always returned nil, silently leaking every
    # connection on a pool-wide finish.
    (host_port ? [host_port] : @pool.keys).each do |h|
      begin
        @pool[h]&.finish
      rescue IOError
        # connection already closed; nothing to do
      end
      @pool[h] = nil
    end
  end
end
# Queries a single ActivityPub server: fetches nodeinfo discovery, the
# nodeinfo document, and the peer list, recording the results (or the
# error encountered) for one TSV record.
class APServer
  attr_reader :domain
  attr_reader :software, :version, :users_total, :users_active_month, :local_posts
  attr_reader :peers
  attr_reader :error, :checked_at
  attr_reader :nodeinfo_versions
  attr_reader :nodeinfo_host

  # domain: host (optionally host:port) to query
  # connection_pool: HttpsConnectionPool used for the HTTPS requests;
  # the domain's connection is released when fetching completes.
  def initialize(domain, connection_pool)
    @domain = domain
    fetch!(connection_pool)
  end

  # One tab-separated record of the measured attributes. Missing values
  # are rendered as "*"; embedded whitespace is collapsed to single
  # spaces so the record stays on one line.
  def tsv
    [
      @domain&.gsub(/\s+/, " "),
      @software, @version, @users_total, @users_active_month, @local_posts,
      @peers&.size,
      # BUGFIX: &. on iso8601 too — `@checked_at&.utc.iso8601` called
      # .iso8601 on nil when @checked_at was unset (NoMethodError)
      @checked_at&.utc&.iso8601,
      @error&.to_s&.gsub(/\s+/, " "),
      @nodeinfo_versions&.join(","),
      @nodeinfo_host,
    ].map{|e| e ? e.to_s.gsub(/\s+/, " ") : "*"}.join("\t")
  end

  # Comment-prefixed header line matching the columns emitted by #tsv.
  def self.tsv_header
    "#" + [
      :domain,
      :software, :version, :users_total, :users_active_month, :local_posts,
      :peers,
      :checked_at,
      :error,
      :nodeinfo_versions,
      :nodeinfo_host
    ].join("\t")
  end

  private

  # Performs the three HTTP fetches and fills in the attributes.
  # Any failure is captured into @error rather than raised to the
  # caller; @checked_at is always stamped and the domain's pooled
  # connection is always released.
  def fetch!(connection_pool)
    begin
      header = {:accept => "application/json"}
      conn = connection_pool.connection_to(@domain)
      # nodeinfo discovery: well-known document listing schema versions
      r = conn.get("/.well-known/nodeinfo", header)
      if r.code != "200"
        raise "HTTP status code is #{r.code} for nodeinfo"
      end
      # Map schema version (e.g. "2.0") => href; &. after dig("rel")
      # tolerates link entries that lack a "rel" member
      x = JSON.parse(r.body).dig("links")&.map{|e| [
        e.dig("rel")&.scan(%r|http://nodeinfo.diaspora.software/ns/schema/([^/]+)|)&.flatten&.first,
        e.dig("href")
      ]}.to_h.compact
      @nodeinfo_versions = x.keys.compact.sort.reverse
      if @nodeinfo_versions.empty?
        raise "nodeinfo href not found"
      end
      # follow the newest advertised schema version
      v = @nodeinfo_versions.first
      u = URI.parse(x[v])
      if u.scheme != "https"
        raise "Scheme is #{u.scheme} for nodeinfo v#{v}"
      end
      nodeinfo_host = u.host.downcase
      if nodeinfo_host == domain
        nodeinfo_conn = conn
      else
        # nodeinfo is hosted elsewhere; record that and connect there
        @nodeinfo_host = nodeinfo_host
        nodeinfo_conn = connection_pool.connection_to(@nodeinfo_host)
      end
      # nodeinfo document proper
      x = JSON.parse(nodeinfo_conn.get(u.path, header).body)
      @software = x.dig("software", "name")
      @version = x.dig("software", "version")
      @users_total = x.dig("usage", "users", "total")
      @users_active_month = x.dig("usage", "users", "activeMonth")
      @local_posts = x.dig("usage", "localPosts")
      # peer list (Mastodon-compatible API)
      x = JSON.parse(conn.get("/api/v1/instance/peers", header).body)
      if x.kind_of?(Array)
        @peers = x
      elsif e = x.dig("error")
        # gotosocial returns an error object here by design; don't
        # count that as a failed measurement
        @error = e unless @software == "gotosocial"
      else
        @error = "Peers is not an array"
      end
    rescue JSON::ParserError => e
      @error = e.class.to_s
    rescue RuntimeError => e
      @error = e.message
    rescue => e
      @error = "#{e.class}: #{e.message}"
    ensure
      @checked_at = Time.now
      connection_pool.finish(@domain) # we don't expect querying again
    end
    self
  end
end
# Drives the crawl: maintains the queue of domains to check, normalizes
# and dedupes domain names, skips blocked domains, and persists progress
# so that an interrupted run can be resumed from its TSV file.
class Measurement
  # Domains (and any of their subdomains) excluded from the crawl
  Blocked = Regexp.union(%w(
    mastinator.com
    repl.co
    misskey-forkbomb.cf
    activitypub-troll.cf
    ngrok.io
    ngrok-free.app
    netlify.app
    activitypub-proxy.cf
    gab.best
    preview.app.github.dev
    onion
  ).map{|domain| /(?:\.|\A)#{Regexp.escape(domain)}\z/})

  # filename: TSV output path. If the file already exists, completed
  # rows (domain + data) are carried over and bare-domain rows are
  # re-queued, so the crawl resumes where it left off.
  def initialize(filename)
    @data_io = nil
    @queue = Array.new
    @n_queue = 0
    @checked_or_queued = Hash.new
    @n_checked_or_queued = 0
    queue_filename = filename + ".queued"
    @queue_io = File.new(queue_filename, "w")
    prev_filename = nil
    if File.exist?(filename)
      # BUGFIX: the message contained a broken interpolation
      $stderr.puts "Continuing measurement from #{filename}"
      prev_filename = filename + ".back"
      File.rename(filename, prev_filename)
    end
    @data_io = File.new(filename, "w")
    if prev_filename
      File.open(prev_filename).each_line do |line|
        if /^#/ =~ line
          @data_io.puts line # keep header/comment lines
        else
          domain, data = line.chomp.split("\t", 2)
          if data
            # already measured: keep the row and never revisit
            @data_io.puts line
            @checked_or_queued[domain] = true
            @n_checked_or_queued += 1
          elsif domain
            # bare domain: was queued but not yet measured
            queue(domain)
          end
        end
      end
    else
      @data_io.puts APServer.tsv_header
    end
  end

  # Keeps checking until the queue is drained.
  def crawl
    while @n_queue > 0
      check
    end
  end

  # Normalizes a domain (strip scheme, path, and user@ part; downcase)
  # and enqueues it unless it is nil, blocked, or already seen.
  def queue(domain)
    domain = domain&.chomp&.sub(/https?:\/\//i, "")&.split("/")&.first&.split("@")&.last&.downcase
    if not domain or Blocked.match(domain) or @checked_or_queued[domain]
      return
    end
    @queue << domain
    @n_queue += 1
    @checked_or_queued[domain] = true
    @n_checked_or_queued += 1
    @queue_io.puts domain
    @queue_io.flush # flush so the .queued file survives a crash
  end

  # The next domain that will be checked (nil when the queue is empty).
  def next_domain
    @queue.first
  end

  # Measures the next queued domain, appends its TSV row, and enqueues
  # the peers it reports.
  def check
    return if @n_queue == 0
    @connection_pool ||= HttpsConnectionPool.new
    domain = @queue.first
    $stderr.puts "Remaining #{@n_queue}/#{@n_checked_or_queued} #{"%3.0f%%" % (@n_queue*100.0/@n_checked_or_queued)}\tchecking #{domain}"
    x = APServer.new(domain, @connection_pool)
    @data_io.puts x.tsv
    @data_io.flush
    x.peers&.each do |peer|
      queue(peer)
    end
    @queue.shift
    @n_queue -= 1
  end

  # Writes any still-unchecked domains as bare rows (so the next run
  # re-queues them) and closes files and connections.
  def finish
    @data_io.puts @queue.join("\n") unless @queue.empty?
    @data_io.close
    @queue_io.close
    # BUGFIX: &. — the pool is created lazily in #check, so it is nil
    # when the run is stopped before any domain was actually checked
    @connection_pool&.finish
  end
end
# Entry point: crawl starting from the host given on the command line
# (default: mastodon.zunda.ninja), writing today's apstat TSV file.
# Ctrl-C is trapped so progress is always flushed before exit.
start_host = ARGV.shift || "mastodon.zunda.ninja"
measurement = Measurement.new(Time.now.strftime("apstat-%Y%m%d.tsv"))
begin
  measurement.queue(start_host)
  $stderr.puts "Crawling from #{measurement.next_domain}. Ctrl-C to safely terminate to continue later."
  measurement.crawl
rescue Interrupt
  # deliberate: fall through to the ensure block to persist progress
ensure
  $stderr.print "Writing data..."
  measurement.finish
  $stderr.puts "done"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment