Skip to content

Instantly share code, notes, and snippets.

@benob
Created December 4, 2023 15:13
Show Gist options
  • Save benob/924dfe90a9a62faf0fbcdd8e3db7ae77 to your computer and use it in GitHub Desktop.
Save benob/924dfe90a9a62faf0fbcdd8e3db7ae77 to your computer and use it in GitHub Desktop.
Find connected components in speaker-conversation graphs
import json
import sys
from collections import defaultdict
# speaker user_id -> ids of the conversations that speaker takes part in
speakers = defaultdict(list)
# conversation id -> user_ids of the speakers taking part in it
conversations = defaultdict(list)

# Each command-line argument is a JSON metadata file that must provide an
# 'id' and a 'speakers' list whose entries each carry a 'user_id'.
for filename in sys.argv[1:]:
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(filename, encoding='utf-8') as fp:
        metadata = json.load(fp)
    # The file is closed before the indexes are filled.
    conversation_id = metadata['id']
    for speaker in metadata['speakers']:
        speakers[speaker['user_id']].append(conversation_id)
        conversations[conversation_id].append(speaker['user_id'])
# color -> set of conversation ids belonging to that connected component
parts = defaultdict(set)
# speaker user_id -> color of the component the speaker was assigned to
speaker_colors = {}


def mark_reachable(target, color):
    """Color every speaker reachable from ``target`` with ``color``.

    Two speakers are connected when they share a conversation. Starting
    from ``target``, every speaker of the same connected component is
    recorded in ``speaker_colors`` and every conversation visited on the
    way is added to ``parts[color]``. If ``target`` already has a color the
    call does nothing.

    The traversal uses an explicit stack instead of recursion so that
    large components cannot exceed the interpreter's recursion limit.

    Reads the module-level ``speakers`` and ``conversations`` indexes and
    mutates ``speaker_colors`` and ``parts``. Returns ``None``.
    """
    if target in speaker_colors:
        return
    speaker_colors[target] = color
    pending = [target]
    while pending:
        current = pending.pop()
        for conversation in speakers[current]:
            parts[color].add(conversation)
            for speaker in conversations[conversation]:
                # Color on discovery so a speaker is queued at most once.
                if speaker not in speaker_colors:
                    speaker_colors[speaker] = color
                    pending.append(speaker)
# Offer every speaker a fresh color; mark_reachable ignores speakers that
# were already colored as part of an earlier component.
for color, target in enumerate(speakers):
    mark_reachable(target, color)

# TODO: drop conversations to cut cliques
# One line per component, largest first: rank, number of conversations,
# then the conversation ids.
for i, part in enumerate(sorted(parts.values(), key=len, reverse=True)):
    # Ids are converted to text and sorted so that the line is reproducible
    # across runs and str.join does not raise TypeError on non-string ids.
    print(i, len(part), ' '.join(sorted(map(str, part))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment