Skip to content

Instantly share code, notes, and snippets.

@henziger
Last active December 25, 2021 22:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save henziger/0c968fa054f86a25c267689737970457 to your computer and use it in GitHub Desktop.
Save henziger/0c968fa054f86a25c267689737970457 to your computer and use it in GitHub Desktop.
from collections import Counter
import re
# Text file whose words are counted when the script is run directly.
INPUT_PATH = "tre-bra.txt"

# A word must occur MORE than this many times to count as popular.
POPULAR_THRESHOLD = 3

# Punctuation characters stripped from every word; compiled once up front.
_PUNCTUATION = re.compile(r'[(){}<>,.?!"]')

# Uninteresting words that are never printed. A set gives constant-time
# membership tests and makes repeated entries harmless.
removals = {
    "1", "2", "3", "4", "ut", "få", "blev", "inte", "höll", "och", "in",
    "massa", "sista", "den", "mot", "upp", "var", "par", "hade", "med", "på",
    "om", "i", "att", "en", "för", "av", "från", "ett", "till", "lite", "som",
    "det", "när", "är", "de", "ta", "sa", "så", "utan", "vad", "har", "tog",
    "min", "mina", "över", "ihop", "fick", "mycket", "ville", "många", "mig",
    "jag", "gjorde",
}

# Pseudonymize and style words, e.g. replace sensitive names with K-pop idols.
aliases = {
    "name1": "Yves",
    "name2": "Choerry",
    "name3": "Chuu",
    "name4": "Vivi",
    "eim": "EIM",
    "zelda": "Zelda",
}


def _clean_words(text):
    """Return the interesting words of *text*, lowercased and without punctuation.

    Tokens that are empty once the punctuation is stripped (e.g. a lone "!")
    are dropped so they cannot be counted as a word.
    """
    stripped = (_PUNCTUATION.sub("", token.lower()) for token in text.split())
    return [word for word in stripped if word and word not in removals]


def _popular_words(words, threshold=POPULAR_THRESHOLD):
    """Map each word occurring more than *threshold* times to its count."""
    return {
        word: count
        for word, count in Counter(words).items()
        if count > threshold
    }


def _apply_aliases(counts):
    """Return a copy of *counts* with every aliased word renamed.

    Alias keys that are not present in *counts* are skipped instead of raising
    KeyError. A renamed word moves to the end of the mapping, in the order the
    aliases are listed.
    """
    renamed = dict(counts)
    for original, alias in aliases.items():
        if original in renamed:
            renamed[alias] = renamed.pop(original)
    return renamed


def main(path=INPUT_PATH, threshold=POPULAR_THRESHOLD):
    """Print the popular words of the file at *path* for a word cloud generator.

    Each popular word is written as many times as it occurs, every occurrence
    preceded by a single space, all on one line ended by a newline.

    Args:
        path: Text file to read; decoded as UTF-8 (assumed encoding of the
            input, chosen so the Swedish stop words match on every platform).
        threshold: A word is popular when it occurs more than this many times.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as source:
        text = source.read()

    popular = _apply_aliases(_popular_words(_clean_words(text), threshold))
    print("".join((" " + word) * count for word, count in popular.items()))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment