Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Generates a GDF of the Fediverse https://vis.social/@Luca/100606625507856187
'''
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
'''
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Generates a network file of the Fediverse as GDF. #
# (https://gephi.org/users/supported-graph-formats/gdf-format/) #
# Result can be analyzed and visualized with https://gephi.org/ #
# Because the script uses the Mastodon API other platforms like #
# Pleroma, Peertube, Pixelfed, Funkwhale won't have outgoing peers. #
# #
# Tutorial how to use Gephi: #
# lucahammer.com/guide-analyzing-twitter-networks-with-gephi-0-9-1/ #
# #
# GDF from 2018-08-24: #
# https://lucahammer.at/vis/fediverse/2018-08-24-fediverse-GDF.zip #
# Related thread: https://vis.social/@Luca/100606625507856187 #
# GDF by mastodon.social/@Gargron from 2018-01-30 #
# (not generated with this script): #
# https://gist.github.com/Gargron/48e67b1b14723cd178c951fe7f373a38 #
# #
# The script generates three files: #
# - nodes.csv #
# - edges.csv #
# - fediverse.gdf #
# #
# Change SEED to start from a different instance. #
# Change MAX_PROCESSES to change how many requests can run in parallel. #
# (Each process needs about 15MB RAM) #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
SEED = 'mastodon.social'  # instance whose peer list bootstraps the crawl
MAX_PROCESSES = 200  # worker processes / parallel requests (each needs ~15MB RAM)
TIMEOUT = 20  # per-HTTP-request timeout in seconds
NODES_FILENAME = 'fediverse-nodes.csv'  # GDF node section (one row per instance)
EDGES_FILENAME = 'fediverse-edges.csv'  # GDF edge section (instance -> peer)
GDF_FILENAME = 'fediverse.gdf'  # final combined GDF output
import requests
from multiprocessing import Pool
import json
import time
import fuckit
# collect info about instance
def get_instance_info(instance):
    """Fetch the /api/v1/instance metadata (stats, version, ...) of a domain.

    Raises on network errors / timeouts / non-JSON bodies; callers are
    expected to handle those.
    """
    endpoint = 'https://' + instance + '/api/v1/instance'
    response = requests.get(endpoint, timeout=TIMEOUT)
    return response.json()
# collect connected instances
def get_instance_peers(instance):
    """Fetch the list of peer domains from /api/v1/instance/peers.

    Raises on network errors / timeouts / non-JSON bodies; callers are
    expected to handle those.
    """
    endpoint = 'https://' + instance + '/api/v1/instance/peers'
    response = requests.get(endpoint, timeout=TIMEOUT)
    return response.json()
# work instance
def process_instance(instance):
    """Crawl one instance: fetch its metadata and its peer list.

    Returns a dict with 'status' == 'success' (keys: 'instance', 'info',
    'peers') or 'status' == 'failure' (keys: 'instance', 'name').  Any
    error while fetching (timeout, DNS failure, bad JSON, ...) yields a
    failure record instead of propagating, so a Pool worker never dies
    on a single broken instance.
    """
    data = dict()
    print('Processing: ' + str(instance))
    try:
        data['instance'] = instance
        data['info'] = get_instance_info(instance)
        data['peers'] = get_instance_peers(instance)
        data['status'] = 'success'
        return data
    except Exception:
        # Bare `except:` replaced: don't swallow KeyboardInterrupt/SystemExit.
        # Record 'instance' too (the success branch does), keeping the
        # legacy 'name' key for backward compatibility.
        data['instance'] = instance
        data['name'] = instance
        data['status'] = 'failure'
        return data
# save data
def save_data(data):
    """Append one crawl result to the node and edge CSVs (best effort).

    `data` is a record produced by process_instance.  Failure records and
    malformed payloads are skipped or logged instead of aborting the run
    (the original version relied on the `fuckit` decorator to swallow
    every exception, which also hid genuine bugs).

    NOTE: the literal file names below must stay in sync with
    NODES_FILENAME / EDGES_FILENAME, whose headers are written in __main__.
    """
    # instance, status, info, peers
    #todo_bag.update(data['peers'])
    if not data or data.get('status') != 'success':
        return
    #edges[data['instance']] = data['peers']
    #nodes[data['instance']] = data['info']
    try:
        # Build the whole row first so a missing key can never leave a
        # partially-written (corrupt) line in the CSV.
        stats = data['info']['stats']
        row = ','.join([
            data['instance'],
            str(stats['domain_count']),
            str(stats['status_count']),
            str(stats['user_count']),
            data['info']['version'],
        ])
        with open('fediverse-nodes.csv', 'a') as f:
            f.write(row + '\n')
    except Exception as e:
        print('Could not write node row for {}: {}'.format(data['instance'], e))
    try:
        with open('fediverse-edges.csv', 'a') as f:
            for peer in data['peers']:
                f.write(data['instance'] + ',' + peer + ',true\n')
    except Exception as e:
        print('Could not write edge rows for {}: {}'.format(data['instance'], e))
    try:
        done_bag.add(data['instance'])
    except NameError:
        # done_bag is created in the __main__ block; tolerate its absence
        # when this module is imported rather than run as a script.
        pass
if __name__ == '__main__':
    # Seed the two output files with their GDF section headers.  Use the
    # module constants instead of the hardcoded paths the original used,
    # so renaming the outputs needs a change in one place only.
    with open(NODES_FILENAME, 'w') as f:
        f.write('nodedef>name VARCHAR,domain_count INT,status_count INT,user_count INT,version VARCHAR\n')
    with open(EDGES_FILENAME, 'w') as f:
        f.write('edgedef>node1 VARCHAR,node2 VARCHAR,directed BOOLEAN\n')

    todo_bag = set()   # instances still to crawl
    done_bag = set()   # instances already saved (updated by save_data)
    nodes = dict()     # instance -> info dict (unused; kept for the commented code in save_data)
    edges = dict()     # instance -> [edges] (unused; kept for the commented code in save_data)

    todo_bag.update(get_instance_peers(SEED))
    # NOTE(review): nothing refills todo_bag after the seed's peers — the
    # `todo_bag.update(data['peers'])` line in save_data is commented out,
    # so the crawl is effectively one hop from SEED.  Confirm intended.
    while len(todo_bag) > 0:
        # Drain up to MAX_PROCESSES not-yet-done instances into a batch.
        work_bag = []
        while len(todo_bag) > 0 and len(work_bag) < MAX_PROCESSES:
            instance = todo_bag.pop()
            if instance not in done_bag:
                work_bag.append(instance)
        print("work_bag filled: {} todo_bag: {}".format(str(len(work_bag)), str(len(todo_bag))))
        with Pool(MAX_PROCESSES) as p:
            result = p.map_async(process_instance, work_bag)
            while not result.ready():
                # _number_left is a private AsyncResult attribute — fine for a
                # progress display, but may break across Python versions.
                print("Instances left in current bag: {}".format(result._number_left))
                time.sleep(10)
            # get() must run inside the `with` block: leaving the Pool
            # context terminates the workers.
            datalist = result.get()
        for data in datalist:
            save_data(data)

    # combine files: concatenate node and edge sections into the final GDF.
    filenames = [NODES_FILENAME, EDGES_FILENAME]
    with open(GDF_FILENAME, 'w') as outfile:
        for fname in filenames:
            with open(fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)
    print('All done.')
@lucahammer
Copy link
Author

lucahammer commented Aug 25, 2018

To include data of other platforms than Mastodon the following could be tried:

  • Nodeinfo /nodeinfo/1.0
  • Mastodon’s /api/v1/instance
  • Statusnet-like /api/statusnet/config, /api/statusnet/version
  • PeerTube’s /api/v1/config

Inspiration: https://fediverse.network/info

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment