Skip to content

Instantly share code, notes, and snippets.

@neverstew
Created August 14, 2023 19:33
Show Gist options
  • Save neverstew/20b520670817774c557c2e9578294b60 to your computer and use it in GitHub Desktop.
Save neverstew/20b520670817774c557c2e9578294b60 to your computer and use it in GitHub Desktop.
Most common words from WhatsApp export
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
import re
import ssl
# Point the ssl module's default HTTPS context factory at the unverified one,
# when this interpreter exposes it, and then fetch the NLTK stop-word corpus.
# NOTE(review): this turns off certificate verification for every HTTPS
# request made through ssl's default context in this process -- presumably a
# workaround for the NLTK download failing verification; confirm it is needed.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Runs at import time, before any function below asks for the stop words.
nltk.download('stopwords')
def load_text_file(file_path):
    """Read a chat export and strip everything that is not message text.

    The file is read as UTF-8 and cleaned in this order:

    1. Line prefixes shaped like ``dd/dd/dddd, dd:dd - Sender:`` are removed
       (the sender may contain word characters, spaces and hyphens only).
    2. The ``<Media omitted>`` placeholder is removed.
    3. URLs with an explicit ``http://`` or ``https://`` scheme are removed
       in full, including path, query string and fragment.
    4. Remaining bare host names such as ``example.com`` are removed.

    Args:
        file_path: Path of the exported chat text file.

    Returns:
        The cleaned text as a single string; line breaks are preserved.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Drop the "date, time - sender:" prefix at the start of each message.
    text = re.sub(r"^\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2} - [\w -]+:", "", text, flags=re.M)
    text = re.sub(r"<Media omitted>", "", text)

    # Remove scheme-qualified URLs as a whole first.  The host-only pattern
    # below stops after the domain, which would otherwise leave the path and
    # query ("/some/path?q=1") in the text to be counted as words.
    text = re.sub(r"https?://\S+", "", text)

    # Bare host names without a scheme.  Note this also matches any
    # "word.word" run written without a space after the full stop.
    text = re.sub(r"(?:https?://)?(?:[\w]+\.)(?:\.?[\w]{2,})+", "", text)
    return text
def find_most_common_words(text, n, stop_words=None):
    """Return the ``n`` most frequent words in ``text``, ignoring stop words.

    The text is lower-cased and split into runs of word characters, so
    counting is case-insensitive and punctuation is discarded.

    Args:
        text: The text to analyse.
        n: Maximum number of ``(word, count)`` pairs to return.
        stop_words: Optional iterable of words to exclude.  They are
            lower-cased before comparison.  When omitted, the NLTK English
            stop-word list is used, as before.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``n`` entries.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # The tokens are lower-cased, so the exclusion set has to be as well.
    excluded = {word.lower() for word in stop_words}

    tokens = re.findall(r'\b\w+\b', text.lower())
    word_counts = Counter(token for token in tokens if token not in excluded)
    return word_counts.most_common(n)
if __name__ == "__main__":
    # Script entry point: clean the export stored in corpus.txt, rank its
    # words and print one "word: count" line per entry under a heading.
    corpus = load_text_file("corpus.txt")
    ranking = find_most_common_words(corpus, 50)

    print(f"\nTop 50 most common words (excluding stop words):")
    for word, count in ranking:
        print(f"{word}: {count}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment