Extracting popular topics from subreddits
import json
from functools import lru_cache
from multiprocessing import Pool
import sys

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag

if __name__ == '__main__':
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
# Lemmatization is slow, so memoize it; each worker process gets its own cache.
lemmatize = lru_cache(maxsize=1000000)(lemma.lemmatize)
# Map Penn Treebank tag prefixes to the WordNet POS codes that lemmatize expects.
morphy_tag = {'NN': 'n', 'JJ': 'a',
              'VB': 'v', 'RB': 'r'}


def to_lemmas(text):
    for sentence in sent_tokenize(text):
        for token, tag in pos_tag(word_tokenize(sentence)):
            token = token.lower()
            if token in stop_words:
                continue
            if len(token) < 2:
                continue
            yield lemmatize(token, pos=morphy_tag.get(tag[:2], 'n'))


def to_ngrams(text):
    tokens = [*to_lemmas(text)]
    for token in tokens:
        yield token
    if len(tokens) >= 2:
        for bigram in ngrams(tokens, 2):
            yield ' '.join(bigram)
    if len(tokens) >= 3:
        for trigram in ngrams(tokens, 3):
            yield ' '.join(trigram)


def handle_line(line):
    data = json.loads(line)
    data['ngrams'] = [*to_ngrams(data['body'])]
    return json.dumps(data)


if __name__ == '__main__':
    with Pool(processes=12) as pool:
        for handled in pool.imap_unordered(handle_line, sys.stdin):
            print(handled)
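For reference, a minimal sketch of what one record looks like going through this script; the field names (`body`, `subreddit`, `created_utc`) are the ones the scripts below rely on, and the exact n-grams are only approximate, since they depend on the NLTK tagger.

# Illustrative only: build one input line and describe the expected output.
import json

sample_line = json.dumps({
    'subreddit': 'programming',
    'created_utc': 1538352000,
    'body': 'Python is eating the world',
})
# handle_line(sample_line) returns the same record with an added 'ngrams'
# field, roughly:
#   ["python", "eat", "world", "python eat", "eat world", "python eat world"]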
import json
from glob import glob
from collections import defaultdict
import sys


def read_file(path):
    with open(path) as f:
        return [json.loads(line) for line in f]


subreddit = sys.argv[1]
counter = defaultdict(int)
paths = list(glob(f'counted/*_{subreddit}'))
for n, path in enumerate(paths):
    # Keep only per-day files named counted/<YYYY-MM-DD>_<subreddit>;
    # the length check filters out anything not matching that pattern.
    if len(path) == len(f'counted/2018-10-10_{subreddit}'):
        for ngram, count in read_file(path):
            counter[ngram] += count
    sys.stderr.write(f'\r{n + 1} of {len(paths)}')
sys.stderr.write('\n')
print(json.dumps(list(counter.items())))
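A rough usage sketch for the aggregation script above; the script name is a placeholder, but the input format (one JSON `[ngram, count]` pair per line in each `counted/<date>_<subreddit>` file) and the single-JSON-list output follow from the code, and the redirected file is what the notebook later reads.

# Illustrative only -- assumed invocation (the script name is hypothetical):
#   python aggregate.py askreddit > aggregated/askreddit_whole.json
# The printed result has the shape of this toy example:
import json
print(json.dumps([['people', 5], ['one thing', 2]]))  # [["people", 5], ["one thing", 2]]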
from glob import glob
import json
from multiprocessing import Pool
from collections import defaultdict
import os
import sys


def count(paths):
    input_path, output_path = paths
    counter = defaultdict(int)  # per-ngram occurrence counts for this file
    with open(input_path) as f:
        for line in f:
            ngram = json.loads(line)
            counter[ngram] += 1
    with open(output_path, 'w') as f:
        for ngram, count in counter.items():
            f.write(json.dumps([ngram, count]) + '\n')


if __name__ == '__main__':
    # Pair each input file with an output file of the same name in the target dir.
    paths = [(input_path, os.path.join(sys.argv[2], os.path.split(input_path)[-1]))
             for input_path in glob(os.path.join(sys.argv[1], '*'))]
    amount = len(paths)
    with Pool(processes=12) as pool:
        for n, _ in enumerate(pool.imap_unordered(count, paths)):
            if n % 100 == 0:
                sys.stdout.write(f'\r{n} of {amount}')
    print('done')
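To make the file formats concrete, a small self-contained sketch of what `count` does per file: the input holds one JSON-encoded n-gram per line, the output one JSON `[ngram, count]` pair per line.

# Illustrative only: the same counting logic on in-memory lines.
import json
from collections import defaultdict

counter = defaultdict(int)
for line in ['"python"', '"python"', '"eat world"']:
    counter[json.loads(line)] += 1
print([json.dumps([ngram, amount]) for ngram, amount in counter.items()])
# ['["python", 2]', '["eat world", 1]']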
#!/usr/bin/env python
# coding: utf-8

# Very ugly, but the `.ipynb` itself wouldn't upload.
# In[2]:
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')
get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:
import json
import re
from glob import glob
from datetime import datetime

import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns
from matplotlib import pyplot as plt

# In[4]:
plt.rc('figure', facecolor='w')

# In[76]:
norm = lambda x: (x - x.mean()) / (x.max() - x.min())


def diff_n(n, base, path, ngram_size=0, min_amount=0, exclude=None):
    df = (pd.read_json(path, lines=True)
            .rename(columns={0: 'ngram', 1: 'amount'}))
    if exclude is not None:
        df = df[~df.ngram.isin(exclude)]
    if ngram_size > 0:
        df = df[df.ngram.str.count(' ') >= ngram_size]
    if min_amount:
        df = df[df.amount > min_amount]
    df['amount_norm'] = norm(df.amount)
    df = (df.merge(base, how='left', on='ngram', suffixes=('_left', '_right'))
            .fillna(0))
    df['diff'] = df.amount_norm_left - df.amount_norm_right
    return (df.sort_values('diff', ascending=False)
              .head(n))


def diff_n_by_day(n, base, subreddit, start, end, ngram_size=0, min_amount=0, exclude=None):
    by_day_df = None
    for ts in pd.date_range(start, end).tolist():
        current_df = diff_n(n, base, f"counted/{ts.strftime('%Y-%m-%d')}_{subreddit}",
                            ngram_size, min_amount, exclude)
        current_df['date'] = ts
        if by_day_df is None:
            by_day_df = current_df
        else:
            by_day_df = by_day_df.append(current_df)
    return by_day_df

# In[87]:
def weekly_heatmap(by_day_df, title):
    top_topics = (by_day_df.groupby('ngram')
                           .sum()['diff']
                           .reset_index()
                           .sort_values('diff', ascending=False)
                           .head(10)
                           .ngram)
    only_top_df = (by_day_df[['date', 'ngram', 'diff']]
                   [by_day_df.ngram.isin(top_topics)]
                   .groupby([pd.Grouper(key='date', freq='W-MON'), 'ngram'])
                   .mean()
                   .reset_index()
                   .sort_values('date'))
    pivot = (only_top_df.pivot(index='ngram', columns='date', values='diff')
                        .fillna(-1))
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, xticklabels=only_top_df.date.dt.week.unique(), ax=ax)
    plt.title(title)
    plt.xlabel('Week')
    plt.tight_layout()

# In[88]:
def weekday_heatmap(by_day_df, title):
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekdays_type = CategoricalDtype(categories=weekdays, ordered=True)
    by_day_df['weekday'] = by_day_df.date.dt.weekday_name.astype(weekdays_type)
    top_topics = (by_day_df.groupby('ngram')
                           .sum()['diff']
                           .reset_index()
                           .sort_values('diff', ascending=False)
                           .head(10)
                           .ngram)
    pivot = (by_day_df[by_day_df.ngram.isin(top_topics)]
             .groupby(['weekday', 'ngram'])
             .mean()['diff']
             .reset_index()
             .pivot(index='ngram', columns='weekday', values='diff')
             .fillna(-1))
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, ax=ax)
    plt.title(title)
    plt.xlabel('Day of week')
    plt.tight_layout()

# # Base words from AskReddit

# In[8]:
whole_askreddit_df = pd.read_json('aggregated/askreddit_whole.json', orient='values')

# In[9]:
whole_askreddit_df = whole_askreddit_df.rename(columns={0: 'ngram', 1: 'amount'})

# In[10]:
whole_askreddit_df['amount_norm'] = norm(whole_askreddit_df.amount)

# In[11]:
whole_askreddit_df = whole_askreddit_df[whole_askreddit_df.amount > 99]

# # Compare with others

# In[84]:
r_programming_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'programming', '2018-08-01', '2018-10-31',
    exclude=['gt', 'use', 'write'],
)

# In[85]:
weekly_heatmap(r_programming_by_day, 'r/programming')

# In[89]:
weekday_heatmap(r_programming_by_day, 'r/programming by weekday')

# In[96]:
r_sports_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'sports', '2018-08-01', '2018-10-31',
    exclude=['r/sports'],
)

# In[102]:
weekly_heatmap(r_sports_by_day, 'r/sports')

# In[98]:
weekday_heatmap(r_sports_by_day, 'r/sports by weekday')

# In[204]:
r_television_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'television', '2018-08-01', '2018-10-31',
    exclude=['r/television'],
)

# In[208]:
weekly_heatmap(r_television_by_day, 'r/television')

# In[90]:
weekday_heatmap(r_television_by_day, 'r/television by weekday')
from datetime import datetime
import json
import os
import sys

# Keep at most `limit` output files open at once; the oldest handle is closed
# when the limit is exceeded. Files are opened in append mode, so a path can
# safely be reopened later.
limit = 1000
fhs = {}
opened = []


def get_fh(day, subreddit):
    path = os.path.join(sys.argv[1], f'{day}_{subreddit}')
    if path not in fhs:
        if len(opened) > limit:
            to_close = opened.pop(0)
            fhs[to_close].close()
            del fhs[to_close]
        fhs[path] = open(path, 'a')
        opened.append(path)
    return fhs[path]


def write(day, subreddit, ngram):
    get_fh(day, subreddit).write(json.dumps(ngram) + '\n')


for line in sys.stdin:
    data = json.loads(line)
    day = datetime.fromtimestamp(data['created_utc']).date().isoformat()
    for ngram in data['ngrams']:
        write(day, data['subreddit'], ngram)
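Taken together, the batch scripts form a pipeline; the script names below are placeholders (the gist doesn't name its files), but the data flow follows from the code: comments gain an `ngrams` field, are split into per-day, per-subreddit files, counted per file into `counted/`, and finally aggregated per subreddit.

# Hypothetical end-to-end run; only counted/ and aggregated/askreddit_whole.json
# are names the code itself uses, everything else is a placeholder:
#   cat comments.jsonl | python extract_ngrams.py | python split_by_day.py by_day/
#   python count_ngrams.py by_day/ counted/
#   python aggregate.py askreddit > aggregated/askreddit_whole.json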
#!/usr/bin/env python
# coding: utf-8

# Very ugly, but the `.ipynb` itself wouldn't upload.
# In[1]:
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

# In[2]:
import json

options = json.load(open('credentials.json'))
subreddits = ['all', 'tifu', 'linux', 'trees', 'worldnews', 'news',
              'personalfinance', 'europe', 'gaming',
              'docker', 'food', 'python', 'drunk']

# In[3]:
from collections import Counter
from datetime import datetime
from functools import wraps
from itertools import takewhile
from multiprocessing import Pool
import re

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag
import praw
from praw.models import Comment

reddit = praw.Reddit(**options)
sid = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
# In[4]:
def eager(fn):
    @wraps(fn)
    def wrapper(*args, **kwargs):
        return list(fn(*args, **kwargs))
    return wrapper

# In[6]:
def get_comments(comments, level=0):
    if level > 3:
        return
    for comment in comments:
        if isinstance(comment, Comment):
            yield comment
            yield from get_comments(comment.replies, level + 1)


def fetch_texts(subreddit):
    [*posts] = reddit.subreddit(subreddit).top(time_filter='year', limit=1000)
    for n, post in enumerate(posts):
        yield (f'{post.id}_title', subreddit, post.id, 'title',
               post.title, post.created, post.score)
        yield (f'{post.id}_selftext', subreddit, post.id, 'selftext',
               post.selftext, post.created, post.score)
        for comment in get_comments(post.comments):
            yield (f'{post.id}_comment_{comment.id}', subreddit, post.id,
                   'comment', comment.body, comment.created, comment.score)
        print(f"{subreddit}: {n + 1} of {len(posts)}")


def get_subreddit_df(subreddit):
    try:
        texts_df = pd.read_pickle(f'r_{subreddit}_texts_df.pkl')
    except FileNotFoundError:
        texts_df = pd.DataFrame(
            list(fetch_texts(subreddit)),
            columns=['id', 'subreddit', 'post_id', 'kind', 'text', 'created', 'score'])
        texts_df.to_pickle(f'r_{subreddit}_texts_df.pkl')
    return texts_df

# In[7]:
subreddits_dfs = {subreddit: get_subreddit_df(subreddit)
                  for subreddit in subreddits}

# In[10]:
subreddits_dfs['all'].head()
# In[12]:
# From https://stackoverflow.com/questions/50992974/nltk-wordnetlemmatizer-not-lemmatizing-as-expected
def penn2morphy(penntag):
    """Converts Penn Treebank tags to WordNet POS codes."""
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except KeyError:
        return 'n'


clean_lemma_re = re.compile(r'\W')


def to_lemmas(text):
    for sentence in sent_tokenize(text):
        for token, tag in pos_tag(word_tokenize(sentence)):
            token = token.lower()
            if token in stop_words:
                continue
            token = clean_lemma_re.sub('', token)
            if not token:
                continue
            yield lemma.lemmatize(token, pos=penn2morphy(tag))


@eager
def to_tokens(text):
    tokens = [*to_lemmas(text)]
    for token in tokens:
        yield token
    if len(tokens) >= 2:
        for bigram in ngrams(tokens, 2):
            yield ' '.join(bigram)
    if len(tokens) >= 3:
        for trigram in ngrams(tokens, 3):
            yield ' '.join(trigram)


pool = Pool(4)


def to_counted_tokens(series):
    counter = Counter(token for text_tokens in pool.map(to_tokens, series.tolist())
                      for token in text_tokens)
    return counter.items()


def to_tokens_df(series):
    return pd.DataFrame([*to_counted_tokens(series)],
                        columns=['token', 'amount'])
def get_tokens_df(subreddit):
    try:
        tokens_df = pd.read_pickle(f'r_{subreddit}_tokens_2_df.pkl')
    except FileNotFoundError:
        tokens_df = to_tokens_df(subreddits_dfs[subreddit].text)
        tokens_df.to_pickle(f'r_{subreddit}_tokens_2_df.pkl')
    return tokens_df

# In[13]:
tokens_dfs = {subreddit: get_tokens_df(subreddit)
              for subreddit in subreddits}

# In[14]:
df = tokens_dfs['all']
df.head()

# In[15]:
for df in tokens_dfs.values():
    df['amount_norm'] = (df.amount - df.amount.mean()) / (df.amount.max() - df.amount.min())

# In[16]:
df.head()

# In[18]:
def diff_tokens(left_df, right_df):
    tokens_df = (left_df.merge(right_df, on='token', how='outer', suffixes=('_left', '_right'))
                        .fillna(-1))
    tokens_df['amount_diff'] = tokens_df['amount_left'] - tokens_df['amount_right']
    tokens_df['amount_norm_diff'] = tokens_df['amount_norm_left'] - tokens_df['amount_norm_right']
    return (tokens_df[['token', 'amount_diff', 'amount_norm_diff']]
            .sort_values('amount_norm_diff', ascending=False))

# In[20]:
diff_tokens(tokens_dfs['linux'], tokens_dfs['all']).head()

# In[21]:
diff_tokens(tokens_dfs['drunk'], tokens_dfs['all']).head()

# In[22]:
diff_tokens(tokens_dfs['personalfinance'], tokens_dfs['all']).head()
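To sanity-check `diff_tokens`, a tiny hand-made example (the values are made up); it reuses `pd` and the `diff_tokens` defined in the cells above.

# Illustrative only: tokens that are common on the left and missing on the
# right should float to the top.
left = pd.DataFrame({'token': ['linux', 'game'], 'amount': [30, 5]})
right = pd.DataFrame({'token': ['game', 'cat'], 'amount': [50, 40]})
for toy_df in (left, right):
    toy_df['amount_norm'] = ((toy_df.amount - toy_df.amount.mean())
                             / (toy_df.amount.max() - toy_df.amount.min()))
diff_tokens(left, right)  # 'linux' ends up first, 'game' last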