Skip to content

Instantly share code, notes, and snippets.

@xMajedz
Created March 4, 2018 05:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xMajedz/980a6880970171c865b532e61f2c6514 to your computer and use it in GitHub Desktop.
Save xMajedz/980a6880970171c865b532e61f2c6514 to your computer and use it in GitHub Desktop.
Someone else's code: https://pastebin.com/zsW2JG8M
import requests
from html2text import html2text
from nltk.probability import FreqDist
from nltk.corpus import brown
from collections import Counter
from pickle import dump, load
from wordcloud import WordCloud
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
import os
# See https://github.com/4chan/4chan-API for more info
#
# For every board listed by the API this script:
#   1. counts the alphabetic tokens in post subjects and comments (cached on
#      disk as '<board>.counts'),
#   2. divides each count by the word's frequency in the Brown corpus so that
#      ordinary English words are down-weighted, and
#   3. saves a word cloud to '<board>.png' unless that file already exists.
#
# NOTE(review): no request throttling is done here; check the API rules linked
# above and add a delay between requests if they require one.

API_ROOT = 'https://a.4cdn.org'
MAX_THREADS = 200  # Upper bound on thread IDs processed per board (faster)
REQUEST_TIMEOUT = 30  # Seconds; without a timeout a stalled request never returns
WORD_FIELDS = ('sub', 'com')  # Post fields whose text is counted
_NO_DEFAULT = object()  # Sentinel: "raise instead of returning a default"

tokenizer = TweetTokenizer()


def _get_json(url, default=_NO_DEFAULT):
    """Fetch *url* and return its decoded JSON body.

    If *default* is given and the server answers 404, *default* is returned
    instead of raising. Any other 4xx/5xx status raises requests.HTTPError,
    network problems raise requests.RequestException, and a body that is not
    valid JSON raises ValueError.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code == 404 and default is not _NO_DEFAULT:
        return default
    response.raise_for_status()
    return response.json()


def _thread_ids(name):
    """Return at most MAX_THREADS thread IDs for the board called *name*.

    IDs from threads.json come first, followed by the IDs from archive.json;
    the combined list is then truncated to its first MAX_THREADS entries.
    """
    live = [
        thread['no']
        for page in _get_json('{}/{}/threads.json'.format(API_ROOT, name))
        for thread in page['threads']
    ]
    # Presumably a 404 here means the board has no archive (TODO confirm
    # against the API docs); fall back to the live threads only instead of
    # abandoning the whole board.
    archived = _get_json('{}/{}/archive.json'.format(API_ROOT, name), default=[])
    return (live + archived)[:MAX_THREADS]


def _count_words(name):
    """Download the threads of board *name* and return a Counter of words.

    Only tokens for which str.isalpha() is true are counted, after the text
    has been converted from HTML and lower-cased.
    """
    counts = Counter()
    threads = _thread_ids(name)
    for position, thread in enumerate(threads, 1):  # 1-based progress display
        print('Thread {} ({} / {})'.format(thread, position, len(threads)))
        url = '{}/{}/thread/{}.json'.format(API_ROOT, name, thread)
        data = _get_json(url, default=None)
        if data is None:
            # Thread vanished between listing and download; one missing
            # thread should not discard the rest of the board.
            print('Thread {} not found, skipping'.format(thread))
            continue
        for post in data['posts']:
            for field in WORD_FIELDS:
                if field in post:
                    text = html2text(post[field]).lower()
                    counts.update(
                        token for token in tokenizer.tokenize(text)
                        if token.isalpha()
                    )
    return counts


def _load_counts(path):
    """Return the word counts pickled at *path*, or None if unavailable.

    A missing file is the normal "not cached yet" case. Any other failure
    (for example a truncated file) is reported and also treated as a cache
    miss so the counts get rebuilt.
    """
    try:
        with open(path, 'rb') as cache:
            # SECURITY: pickle executes code from the file it loads; only
            # ever point this at cache files written by this script.
            return load(cache)
    except FileNotFoundError:
        return None
    except Exception as e:  # Unlike a bare except, lets Ctrl-C through
        print('WARNING: ignoring unreadable cache {}: {}'.format(path, e))
        return None


def _save_cloud(board, counts, reference):
    """Weight *counts* against *reference* and save the board's word cloud.

    *counts* is modified in place: every count is divided by one plus the
    word's count in *reference*, then the board name and the words of the
    board title are removed. The image is written to '<board>.png'.
    """
    name = board['board']
    for word in counts:
        counts[word] /= 1 + reference[word]
    counts.pop(name, None)  # Remove board name
    for word in tokenizer.tokenize(board['title'].lower()):
        counts.pop(word, None)  # Remove other keywords
    cloud = WordCloud(scale=10, max_words=2000).fit_words(counts)
    plt.imsave('{}.png'.format(name), cloud)


def main():
    """Build (or reuse) word counts and a word cloud image for every board."""
    reference = None  # Brown-corpus frequencies, built once on first use
    for board in _get_json('{}/boards.json'.format(API_ROOT))['boards']:
        try:
            name = board['board']
            print('Board: {} ({})'.format(name, board['title']))

            # Load word counts for this board from a file, if available
            counts_path = '{}.counts'.format(name)
            counts = _load_counts(counts_path)
            if counts is None:
                counts = _count_words(name)
                # Store word counts for this board in a file
                with open(counts_path, 'wb') as cache:
                    dump(counts, cache)

            # Create word cloud and save it if it doesn't exist
            if not os.path.isfile('{}.png'.format(name)):
                if reference is None:
                    # Reference frequencies of English words; the same for
                    # every board, so it is computed a single time.
                    reference = FreqDist(word.lower() for word in brown.words())
                _save_cloud(board, counts, reference)
        except Exception as e:
            print('ERROR: {}'.format(e))
            continue  # Error occurred, skip this board for now


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment