Skip to content

Instantly share code, notes, and snippets.

@SubhrajitPrusty
Last active April 22, 2019 15:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SubhrajitPrusty/c3c9646c2eb3c9548e5b3cd9fe89530e to your computer and use it in GitHub Desktop.
Save SubhrajitPrusty/c3c9646c2eb3c9548e5b3cd9fe89530e to your computer and use it in GitHub Desktop.
Create a word cloud from the Codex Telegram group chat.
from bs4 import BeautifulSoup as bs4
import os
from wordcloud import WordCloud
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import numpy as np
from PIL import Image
import random
# Returns a random cyan-looking color.
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random cyan-looking color as an ``"rgb(0, G, B)"`` string.

    The red channel is fixed at 0; green and blue are each drawn uniformly
    from 200..255 (inclusive).

    Args:
        word: Unused; accepted so the function fits the color-callback call.
        font_size: Unused.
        position: Unused.
        orientation: Unused.
        random_state: Optional generator exposing ``randint(a, b)`` (for
            example a ``random.Random`` instance). When given, both channels
            are drawn from it so a seeded caller gets reproducible colors.
            When ``None``, the module-level ``random`` generator is used.
        **kwargs: Ignored; absorbs any extra keyword arguments the caller
            passes.

    Returns:
        str: A color string of the form ``"rgb(0, G, B)"``.
    """
    # Honor the caller's generator; previously it was silently ignored, so a
    # seed passed alongside this callback had no effect on the colors.
    rng = random if random_state is None else random_state
    green = rng.randint(200, 255)
    blue = rng.randint(200, 255)
    return "rgb(0, {}, {})".format(green, blue)
# Parse data from a Telegram chat export. The export stores messages as HTML.
def parse_data(directory="."):
    """Collect the filtered, lower-cased tokens from exported chat pages.

    Every file in ``directory`` whose name ends with ``"html"`` is parsed,
    and the text of each ``<div class="text">`` element is gathered. The
    combined text is lower-cased, tokenized with ``word_tokenize``, and
    tokens that are English stop words or single punctuation characters are
    dropped.

    Args:
        directory: Folder to scan for exported pages. Defaults to the
            current working directory.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    html_files = [x for x in os.listdir(directory) if x.endswith("html")]

    messages = []
    for f in html_files:
        # "\r" keeps the progress output on one line: each name overwrites
        # the previous one.
        print(f, end="\r")
        with open(os.path.join(directory, f), "r", encoding="utf-8") as fdata:
            soup = bs4(fdata, "html.parser")
            messages.extend(
                txt.get_text() for txt in soup.find_all("div", {"class": "text"})
            )

    # Join with a space so the last word of one message cannot fuse with the
    # first word of the next (plain concatenation produced merged tokens).
    text = " ".join(messages)

    # NOTE(review): these look like leftovers of mangled apostrophe entities
    # ("'s", "'t", "'ll"); the replacement drops the trailing letters - confirm.
    text = text.replace("aposs", "'")
    text = text.replace("apost", "'")
    text = text.replace("aposll", "'")
    # NOTE(review): "https" is turned into an apostrophe rather than removed;
    # kept as in the original - confirm intent.
    text = text.replace("https", "'")

    stop = set(stopwords.words('english'))
    stop.update(set(punctuation))

    words = [w for w in word_tokenize(text.lower()) if w not in stop]
    return words  # the final word list
# Gather the cleaned tokens and report how many there are; the trailing tabs
# overwrite what is left of the "\r" progress line.
words = parse_data()
print(f"{len(words)} words\t\t")

# The logo image, loaded as an array, is the mask passed to the word cloud.
codex_logo = np.array(Image.open("codex_logo.png"))

# NOTE(review): the wordcloud docs say width/height are ignored when a mask is
# supplied - confirm. Optional outline: contour_width=3, contour_color="white".
cloud_settings = {
    "width": 1920,
    "height": 1080,
    "mask": codex_logo,
    "max_words": 2000,
    "background_color": "rgb(44,44,44)",
}
wc = WordCloud(**cloud_settings)

# Lay out the words, repaint them with the cyan palette, then save the image.
joined_words = " ".join(words)
wc.generate(joined_words)
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file("codex_wordcloud.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment