Skip to content

Instantly share code, notes, and snippets.

@Recursing
Created September 14, 2021 18:35
Show Gist options
  • Save Recursing/52192d19ad3ee9ff923227b95d2a9c4b to your computer and use it in GitHub Desktop.
import collections
import typing
import unicodedata
import json
import re
# Generic placeholders for the groupby() signature: T is the element type,
# K is the type of the key computed for each element.
T = typing.TypeVar("T")
K = typing.TypeVar("K")
# Messages are left untyped: get_text() accepts str, list and dict values.
Message = typing.Any

# Read the chat export. The encoding is spelled out because JSON text is
# UTF-8, whereas open() would otherwise use the platform's locale encoding.
# NOTE(review): the chats -> list -> messages layout looks like a Telegram
# Desktop JSON export — confirm against the actual data.
with open("result.json", encoding="utf-8") as infile:
    data = json.load(infile)

# Every chat entry is indexed by "messages" below, so it is a mapping.
chats: list[dict[str, typing.Any]] = data["chats"]["list"]
# Flatten the messages of all chats into a single list, in chat order.
all_messages: list[Message] = [m for chat in chats for m in chat["messages"]]
def normalize(s: str) -> str:
    """Return *s* lower-cased with its combining marks removed.

    The text is first decomposed with Unicode NFD, so an accented letter
    becomes a base letter followed by combining marks. Every character whose
    Unicode category is "Mn" is then dropped and the remainder is
    lower-cased.
    """
    decomposed = unicodedata.normalize("NFD", s)
    return "".join(
        c for c in decomposed if unicodedata.category(c) != "Mn"
    ).lower()
def groupby(l: typing.Iterable[T], f: typing.Callable[[T], K]) -> dict[K, list[T]]:
    """Group the elements of *l* by the key that *f* returns for each one.

    Args:
        l: The elements to group; any iterable works, it is traversed once.
        f: Called once per element; its return value is used as a dict key.

    Returns:
        A dict mapping each key to the list of elements that produced it.
        Keys appear in order of first occurrence and every list keeps the
        input order of its elements.
    """
    groups: dict[K, list[T]] = {}
    for element in l:
        groups.setdefault(f(element), []).append(element)
    return groups
def get_text(message: Message) -> str:
    """Return the plain text contained in *message*.

    Three shapes are handled, recursively:

    * a ``str`` is returned unchanged;
    * a ``list`` gives the concatenation of the text of its items;
    * a ``dict`` gives the text of its ``"text"`` entry.

    Raises:
        ValueError: If *message* is a dict without a ``"text"`` key, or is
            neither a str, a list nor a dict. The offending value is the
            exception argument.
    """
    if isinstance(message, str):
        return message
    if isinstance(message, list):
        return "".join(map(get_text, message))
    if isinstance(message, dict):
        # An explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would leave only a bare KeyError here.
        if "text" not in message:
            raise ValueError(message)
        return get_text(message["text"])
    raise ValueError(message)
# Thresholds used by the report below.
MIN_MESSAGES_PER_SENDER = 1000  # senders with fewer messages are left out
MIN_WORD_OCCURRENCES = 20  # a word is ranked only if its count exceeds this
TOP_WORDS_PER_SENDER = 10  # number of words printed for each sender

# Messages that have no "from" key are grouped under the empty string.
by_sender = groupby(all_messages, lambda message: message.get("from", ""))

# Running totals over the senders that pass the message threshold.
total_considered_words = 0
considered_words_occurrences: collections.Counter[str] = collections.Counter()
# Matches maximal runs of the ASCII letters a-z; input is normalized first.
wordfinder = re.compile("(?<![a-z])[a-z]+(?![a-z])")
# sender -> word -> share of that sender's counted word occurrences.
frequencies: dict[str, dict[str, float]] = {}

for sender, messages in by_sender.items():
    # Apply the threshold first so that senders who are going to be skipped
    # are never tokenized.
    if len(messages) < MIN_MESSAGES_PER_SENDER:
        continue
    # set() makes a word count at most once per message, so the numbers
    # below are "messages containing the word", not raw repetitions.
    sender_words = [
        word
        for message in messages
        for word in set(wordfinder.findall(normalize(get_text(message))))
    ]
    total_considered_words += len(sender_words)
    occurrences = collections.Counter(sender_words)
    # update() adds the counts in place without rescanning the accumulator.
    considered_words_occurrences.update(occurrences)
    frequencies[sender] = {
        word: number / len(sender_words) for word, number in occurrences.items()
    }

# Share of each word among all counted occurrences of the kept senders.
global_frequencies = {
    word: considered_words_occurrences[word] / total_considered_words
    for word in considered_words_occurrences
}

# Report the senders with the most messages first.
for sender, word_frequency in sorted(
    frequencies.items(),
    key=lambda name_freq: len(by_sender[name_freq[0]]),
    reverse=True,
):
    # Rank words by the ratio between the sender's frequency and the global
    # one; words at or below the occurrence threshold are ignored.
    print(
        sender,
        sorted(
            (
                (freq / global_frequencies[word], word)
                for word, freq in word_frequency.items()
                if considered_words_occurrences[word] > MIN_WORD_OCCURRENCES
            ),
            reverse=True,
        )[:TOP_WORDS_PER_SENDER],
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment