# Created
# September 14, 2021 18:35
# -
# -
# Save Recursing/52192d19ad3ee9ff923227b95d2a9c4b to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
import collections
import json
import re
import typing
import unicodedata
# Generic type variables for the element and key types in helper signatures.
T = typing.TypeVar("T")
K = typing.TypeVar("K")
# Messages come straight out of json.load, so no stricter type is declared.
Message = typing.Any

# Pin the encoding: without it open() falls back to the locale's preferred
# encoding, which is not UTF-8 on every platform and would then fail on (or
# mis-decode) non-ASCII text in the JSON file.
with open("result.json", encoding="utf-8") as infile:
    data = json.load(infile)

chats: list[Message] = data["chats"]["list"]
# Flatten the per-chat message lists into a single list.
all_messages = [m for chat in chats for m in chat["messages"]]
def normalize(s: str) -> str:
    """Return *s* lowercased with nonspacing combining marks removed.

    The string is first decomposed with Unicode NFD, so that an accented
    letter becomes a base character followed by combining marks. Every
    character whose Unicode category is ``Mn`` is then dropped and the
    remainder is lowercased.
    """
    decomposed = unicodedata.normalize("NFD", s)
    return "".join(
        c for c in decomposed if unicodedata.category(c) != "Mn"
    ).lower()
def groupby(l: list[T], f: typing.Callable[[T], K]) -> dict[K, list[T]]:
    """Group the elements of *l* by the key *f* computes for each of them.

    Unlike ``itertools.groupby`` the input does not need to be sorted: all
    elements sharing a key end up in the same list, wherever they occur.

    Returns:
        A dict mapping each key to the list of elements that produced it.
        Elements keep their input order within each list, and keys appear
        in the order in which they were first seen.
    """
    groups: dict[K, list[T]] = {}
    for element in l:
        groups.setdefault(f(element), []).append(element)
    return groups
def get_text(message: Message) -> str:
    """Return the plain text carried by *message*.

    Accepted shapes:

    * ``str``: returned unchanged;
    * ``list``: the text of every item, concatenated in order;
    * ``dict``: the text of its ``"text"`` entry, which may itself be any
      of these shapes.

    Raises:
        ValueError: if a dict has no ``"text"`` key, or if *message* (or
            anything nested inside it) is of any other type.
    """
    if isinstance(message, str):
        return message
    if isinstance(message, list):
        return "".join(map(get_text, message))
    if isinstance(message, dict):
        # Raise explicitly rather than assert: assert statements are removed
        # under ``python -O``, which would turn this into a bare KeyError.
        if "text" not in message:
            raise ValueError(f"message has no 'text' field: {message!r}")
        return get_text(message["text"])
    raise ValueError(
        f"unsupported message type {type(message).__name__}: {message!r}"
    )
# Thresholds used by the analysis below.
MIN_MESSAGES_PER_SENDER = 1000  # senders with fewer messages are ignored
MIN_WORD_OCCURRENCES = 20  # a word must occur more often than this to be shown
TOP_WORDS_PER_SENDER = 10  # how many words are printed for each sender

# Bucket messages by their "from" field; messages without one share the "" key.
by_sender = groupby(all_messages, lambda message: message.get("from", ""))

total_considered_words = 0
considered_words_occurrences = collections.Counter()
# Matches maximal runs of ASCII lowercase letters. It is applied to
# normalize()d text, which is already lowercased and stripped of accents.
wordfinder = re.compile("(?<![a-z])[a-z]+(?![a-z])")
frequencies: dict[str, dict[str, float]] = {}

for sender, messages in by_sender.items():
    # Check the threshold first, so low-volume senders are skipped before
    # their messages are tokenised rather than after.
    if len(messages) < MIN_MESSAGES_PER_SENDER:
        continue
    # set() makes a word count at most once per message, so a single message
    # that repeats a word many times cannot dominate the statistics.
    sender_words = [
        word
        for message in messages
        for word in set(wordfinder.findall(normalize(get_text(message))))
    ]
    total_considered_words += len(sender_words)
    occurrences = collections.Counter(sender_words)
    considered_words_occurrences += occurrences
    # Share of this sender's counted words taken up by each word.
    frequencies[sender] = {
        word: number / len(sender_words) for word, number in occurrences.items()
    }

# The same share, computed over all retained senders together.
global_frequencies = {
    word: considered_words_occurrences[word] / total_considered_words
    for word in considered_words_occurrences
}

# Report senders from most to fewest messages. For each, print the words whose
# per-sender frequency is highest relative to the global frequency.
for sender, word_frequency in sorted(
    frequencies.items(),
    key=lambda name_freq: len(by_sender[name_freq[0]]),
    reverse=True,
):
    print(
        sender,
        sorted(
            (
                (freq / global_frequencies[word], word)
                for word, freq in word_frequency.items()
                # Ignore rare words: their ratios are too noisy to be useful.
                if considered_words_occurrences[word] > MIN_WORD_OCCURRENCES
            ),
            reverse=True,
        )[:TOP_WORDS_PER_SENDER],
    )
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment