Last active
December 25, 2021 22:48
-
-
Save henziger/0c968fa054f86a25c267689737970457 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import re | |
# Prepare the input for a word cloud generator: read a text file, keep the
# words that occur often, and print every kept word as many times as it occurs.

# Text file that is analysed when the script is run directly.
INPUT_PATH = "tre-bra.txt"

# Punctuation characters that are stripped from every word.
PUNCTUATION = re.compile(r'[(){}<>,.?!"]')

# Uninteresting words that are never counted. A frozenset gives constant-time
# membership tests and makes accidental duplicates harmless.
removals = frozenset({
    "1", "2", "3", "4", "ut", "få", "blev", "inte", "höll", "och", "in",
    "massa", "sista", "den", "mot", "upp", "var", "par", "hade", "med", "på",
    "om", "i", "att", "en", "för", "av", "från", "ett", "till", "lite", "som",
    "det", "när", "är", "de", "ta", "sa", "så", "utan", "vad", "har", "tog",
    "min", "mina", "över", "ihop", "fick", "mycket", "ville", "många", "mig",
    "jag", "gjorde",
})

# Pseudonymize and style words, e.g. replace sensitive names with K-pop idols.
aliases = {
    "name1": "Yves",
    "name2": "Choerry",
    "name3": "Chuu",
    "name4": "Vivi",
    "eim": "EIM",
    "zelda": "Zelda",
}


def read_words(path, encoding="utf-8"):
    """Return the whitespace-separated tokens of the text file at *path*.

    The file is closed as soon as it has been read. The encoding is explicit
    because the stop words contain non-ASCII letters; a wrongly decoded file
    would silently bypass the stop-word filter.
    """
    with open(path, "r", encoding=encoding) as text_file:
        return text_file.read().split()


def clean_words(words):
    """Lowercase *words*, strip punctuation and drop uninteresting ones.

    Tokens that are empty after the punctuation has been removed (for example
    a lone "...") are dropped so that they are not counted as a word.
    """
    stripped = (PUNCTUATION.sub("", word.lower()) for word in words)
    return [word for word in stripped if word and word not in removals]


def count_popular(words, threshold=3):
    """Return ``{word: count}`` for words occurring more than *threshold* times.

    The words keep the order of their first occurrence in *words*.
    """
    return {
        word: count
        for word, count in Counter(words).items()
        if count > threshold
    }


def apply_aliases(counts, replacements):
    """Return a copy of *counts* with keys renamed according to *replacements*.

    Keys of *replacements* that are not present in *counts* are ignored
    instead of raising ``KeyError``. If the replacement name is already
    present, the two counts are added together.
    """
    renamed = dict(counts)
    for original, alias in replacements.items():
        if original not in renamed:
            continue
        # Pop before inserting so that an identity alias keeps its entry.
        count = renamed.pop(original)
        renamed[alias] = renamed.get(alias, 0) + count
    return renamed


def render(counts):
    """Return every word repeated ``count`` times, each preceded by a space.

    The text ends with a single newline.
    """
    # NOTE(review): assumes the original emitted one trailing newline after
    # all words (not one per word); the source indentation was lost - confirm.
    return "".join((" " + word) * count for word, count in counts.items()) + "\n"


def main(path=INPUT_PATH):
    """Print all popular words of *path* for the word cloud generator."""
    words = clean_words(read_words(path))
    popular_words = apply_aliases(count_popular(words), aliases)
    print(render(popular_words), end="")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment