Skip to content

Instantly share code, notes, and snippets.

@nvbn

nvbn/add_ngrams.py

Last active Jan 8, 2019
Embed
What would you like to do?
Extracting popular topics from subreddits
from datetime import datetime
import json
from functools import lru_cache
from multiprocessing import Pool
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag
if __name__ == '__main__':
    # Download the NLTK data sets once, and only when this file is executed
    # as a script (not when it is merely imported).
    import nltk

    for resource in ('stopwords', 'punkt',
                     'averaged_perceptron_tagger', 'wordnet'):
        nltk.download(resource)
# English stop words, kept in a set so membership tests in to_lemmas are cheap.
stop_words = set(stopwords.words('english'))

lemma = WordNetLemmatizer()
# Memoised front-end to lemma.lemmatize; the cache holds up to one million
# distinct argument combinations.
lemmatize = lru_cache(maxsize=10 ** 6)(lemma.lemmatize)

# First two characters of a part-of-speech tag -> pos code handed to the
# lemmatizer. Tags with any other prefix fall back to 'n' in to_lemmas.
morphy_tag = {
    'NN': 'n',
    'JJ': 'a',
    'VB': 'v',
    'RB': 'r',
}
def to_lemmas(text):
    """Yield the lemma of each kept word of ``text``, in reading order.

    The text is split into sentences, each sentence is tokenized and
    part-of-speech tagged, and tokens are lower-cased. Stop words and
    tokens shorter than two characters are dropped; everything else is
    lemmatized with the pos code looked up in ``morphy_tag`` (default 'n').
    """
    for sentence in sent_tokenize(text):
        tagged_words = pos_tag(word_tokenize(sentence))
        for raw_token, tag in tagged_words:
            word = raw_token.lower()
            if word in stop_words:
                continue
            if len(word) < 2:
                continue
            wordnet_pos = morphy_tag.get(tag[:2], 'n')
            yield lemmatize(word, pos=wordnet_pos)
def to_ngrams(text):
    """Yield the 1-, 2- and 3-grams built from the lemmas of ``text``.

    All unigrams come first, then all bigrams, then all trigrams. Multi-word
    n-grams are returned as a single string with the words joined by a space.
    """
    lemmas = list(to_lemmas(text))
    yield from lemmas
    for size in (2, 3):
        # Only build n-grams the text is long enough to contain.
        if len(lemmas) >= size:
            for words in ngrams(lemmas, size):
                yield ' '.join(words)
def handle_line(line):
    """Add an ``'ngrams'`` list to one JSON-encoded record.

    ``line`` is parsed as JSON, the n-grams of its ``'body'`` value are
    stored under ``'ngrams'``, and the record is returned re-serialized
    as a JSON string.
    """
    record = json.loads(line)
    record['ngrams'] = list(to_ngrams(record['body']))
    return json.dumps(record)
if __name__ == '__main__':
    # Fan the stdin lines out to 12 worker processes and print each enriched
    # record as soon as it is ready; output order is therefore not the
    # input order.
    with Pool(processes=12) as workers:
        enriched_lines = workers.imap_unordered(handle_line, sys.stdin)
        for enriched in enriched_lines:
            print(enriched)
import json
import re
from glob import glob
from collections import defaultdict
from datetime import datetime
import sys
def read_file(path):
    """Return the JSON value decoded from each line of the file at ``path``."""
    with open(path) as source:
        return list(map(json.loads, source))
# Sum the per-day n-gram counts of one subreddit (first CLI argument) and
# print the totals to stdout as a JSON list of [ngram, count] pairs.
subreddit = sys.argv[1]
counter = defaultdict(int)
paths = glob(f'counted/*_{subreddit}')
# The glob also matches longer names that merely end in `_<subreddit>`;
# only paths exactly as long as `counted/<10-char date>_<subreddit>` are read.
expected_length = len(f'counted/2018-10-10_{subreddit}')
for n, path in enumerate(paths):
    if len(path) == expected_length:
        for ngram, count in read_file(path):
            counter[ngram] += count
    # Progress goes to stderr so it does not mix with the JSON on stdout.
    sys.stderr.write(f'\r{n} of {len(paths)}')
sys.stderr.write('\n')
print(json.dumps(list(counter.items())))
from glob import glob
import json
from multiprocessing import Pool
from collections import Counter, defaultdict
import os
import sys
def count(paths):
    """Count the n-grams of one file and write the totals to another.

    Args:
        paths: ``(input_path, output_path)`` pair. The input file holds one
            JSON-encoded n-gram per line; the output file receives one
            JSON ``[ngram, count]`` pair per line.

    Blank input lines are ignored.
    """
    input_path, output_path = paths
    with open(input_path) as source:
        # Counter starts every key at 0, so an n-gram seen k times is
        # reported as k (a default of 1 would report k + 1).
        occurrences = Counter(
            json.loads(line) for line in source if line.strip()
        )
    with open(output_path, 'w') as target:
        target.writelines(
            json.dumps([ngram, amount]) + '\n'
            for ngram, amount in occurrences.items()
        )
if __name__ == '__main__':
    # Pair every file in the input directory (argv[1]) with a file of the
    # same name in the output directory (argv[2]).
    paths = [
        (input_path, os.path.join(sys.argv[2], os.path.basename(input_path)))
        for input_path in glob(os.path.join(sys.argv[1], '*'))
    ]
    amount = len(paths)
    with Pool(processes=12) as pool:
        finished = pool.imap_unordered(count, paths)
        for n, _ in enumerate(finished):
            # Report progress on every 100th completed file.
            if n % 100 == 0:
                sys.stdout.write(f'\r{n} of {amount}')
    print('done')
#!/usr/bin/env python
# coding: utf-8
# Very ugly, it didn't want to upload `.ipynb`
# NOTE: the `# In[n]:` comments below mark the cells of the exported notebook.
# `get_ipython` is not imported anywhere in this file; presumably it is
# supplied by the IPython/Jupyter runtime, so this section will not run as a
# plain script.
# In[2]:
# Notebook conveniences: greedy tab completion and inline matplotlib output.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')
get_ipython().run_line_magic('matplotlib', 'inline')
# In[3]:
import json
import re
from glob import glob
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns
from matplotlib import pyplot as plt
# In[4]:
# Default figure face colour 'w' (white) for every figure created below.
plt.rc('figure', facecolor='w')
# In[76]:
def norm(x):
    """Centre ``x`` on its mean and divide by its range (``max - min``)."""
    spread = x.max() - x.min()
    return (x - x.mean()) / spread
def diff_n(n, base, path, ngram_size=0, min_amount=0, exclude=None):
    """Return the ``n`` n-grams of one counts file that most exceed ``base``.

    Args:
        n: number of rows to return.
        base: frame with ``ngram``, ``amount`` and ``amount_norm`` columns
            to compare against.
        path: JSON-lines file whose rows are ``[ngram, amount]``.
        ngram_size: when positive, keep only n-grams containing at least
            this many spaces.
        min_amount: when truthy, keep only rows with a larger ``amount``.
        exclude: optional collection of n-grams to drop.

    Returns:
        The merged frame, sorted by ``diff`` (normalized amount here minus
        normalized amount in ``base``, with n-grams missing from ``base``
        counted as 0), largest first, cut to ``n`` rows.
    """
    df = pd.read_json(path, lines=True)
    df = df.rename(columns={0: 'ngram', 1: 'amount'})
    if exclude is not None:
        is_excluded = df.ngram.isin(exclude)
        df = df[~is_excluded]
    if ngram_size > 0:
        space_count = df.ngram.str.count(' ')
        df = df[space_count >= ngram_size]
    if min_amount:
        df = df[df.amount > min_amount]
    # Normalize after filtering, so mean and range describe the kept rows.
    df['amount_norm'] = norm(df.amount)
    merged = df.merge(base, how='left', on='ngram',
                      suffixes=('_left', '_right'))
    merged = merged.fillna(0)
    merged['diff'] = merged.amount_norm_left - merged.amount_norm_right
    ranked = merged.sort_values('diff', ascending=False)
    return ranked.head(n)
def diff_n_by_day(n, base, subreddit, start, end, ngram_size=0, min_amount=0, exclude=None):
    """Run ``diff_n`` for every day in ``[start, end]`` and stack the results.

    Each day's counts are read from ``counted/<YYYY-MM-DD>_<subreddit>``;
    the remaining arguments are passed through to ``diff_n``. Every daily
    frame gets a ``date`` column holding that day.

    Returns:
        One frame with all daily results in date order, or ``None`` when
        the date range is empty.
    """
    daily_frames = []
    for ts in pd.date_range(start, end):
        current_df = diff_n(
            n, base, f"counted/{ts.strftime('%Y-%m-%d')}_{subreddit}",
            ngram_size, min_amount, exclude)
        current_df['date'] = ts
        daily_frames.append(current_df)
    if not daily_frames:
        return None
    # A single concat replaces repeated DataFrame.append calls: append was
    # removed in pandas 2.0 and copied the accumulated frame on every day.
    return pd.concat(daily_frames)
# In[87]:
def weekly_heatmap(by_day_df, title):
    """Plot a heatmap of the mean weekly ``diff`` of the ten top n-grams.

    Args:
        by_day_df: frame with ``date``, ``ngram`` and ``diff`` columns.
        title: title of the plot.

    The ten n-grams with the largest summed ``diff`` are kept, averaged per
    week (weekly bins, ``freq='W-MON'``) and drawn as an n-gram x week grid.
    Cells without data are filled with -1.
    """
    # Aggregate only the 'diff' column: summing the whole frame would also
    # try to sum the datetime 'date' column, which newer pandas rejects.
    top_topics = (
        by_day_df
        .groupby('ngram')['diff']
        .sum()
        .sort_values(ascending=False)
        .head(10)
        .index
    )
    is_top = by_day_df.ngram.isin(top_topics)
    only_top_df = (
        by_day_df[is_top][['date', 'ngram', 'diff']]
        .groupby([pd.Grouper(key='date', freq='W-MON'), 'ngram'])
        .mean()
        .reset_index()
        .sort_values('date')
    )
    pivot = (
        only_top_df
        .pivot(index='ngram', columns='date', values='diff')
        .fillna(-1)
    )
    # ISO week number of each pivot column. Taking the labels from the
    # columns themselves guarantees one label per column, and avoids
    # Series.dt.week, which was removed in pandas 2.0.
    week_numbers = [week.isocalendar()[1] for week in pivot.columns]
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, xticklabels=week_numbers, ax=ax)
    plt.title(title)
    plt.xlabel('Week')
    plt.tight_layout()
# In[88]:
def weekday_heatmap(by_day_df, title):
    """Plot a heatmap of the mean ``diff`` per weekday of the ten top n-grams.

    Args:
        by_day_df: frame with ``date``, ``ngram`` and ``diff`` columns.
            It is not modified.
        title: title of the plot.

    Columns run Monday to Sunday; cells without data are filled with -1.
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']
    weekdays_type = CategoricalDtype(categories=weekdays, ordered=True)
    # dt.day_name() replaces dt.weekday_name, which was removed in pandas 1.0.
    weekday = by_day_df.date.dt.day_name().astype(weekdays_type)
    # Work on a narrow copy instead of adding a column to the caller's frame;
    # `.values` assigns positionally, so a non-unique index is not a problem.
    frame = by_day_df[['ngram', 'diff']].assign(weekday=weekday.values)
    top_topics = (
        frame
        .groupby('ngram')['diff']
        .sum()
        .sort_values(ascending=False)
        .head(10)
        .index
    )
    pivot = (
        frame[frame.ngram.isin(top_topics)]
        # observed=False keeps all seven weekdays as columns even when a
        # weekday has no rows.
        .groupby(['weekday', 'ngram'], observed=False)['diff']
        .mean()
        .reset_index()
        .pivot(index='ngram', columns='weekday', values='diff')
        .fillna(-1)
    )
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, ax=ax)
    plt.title(title)
    plt.xlabel('Day of week')
    plt.tight_layout()
# # Base words from AskReddit
# In[8]:
# Load the aggregated AskReddit counts: a JSON list of [ngram, amount] rows.
whole_askreddit_df = pd.read_json('aggregated/askreddit_whole.json', orient='values')
# In[9]:
# Give the two positional columns the names used by diff_n.
whole_askreddit_df = whole_askreddit_df.rename(columns={0: 'ngram', 1: 'amount'})
# In[10]:
# NOTE(review): normalization happens before the rare n-grams are dropped in
# the next cell, so mean and range here include them (diff_n filters first
# and normalizes afterwards) — confirm this ordering is intended.
whole_askreddit_df['amount_norm'] = norm(whole_askreddit_df.amount)
# In[11]:
# Keep only n-grams that occur at least 100 times.
whole_askreddit_df = whole_askreddit_df[whole_askreddit_df.amount > 99]
# # Compare with others
# Each subreddit below gets the same treatment: the 50 n-grams per day that
# most exceed the AskReddit baseline for 2018-08-01 .. 2018-10-31, followed
# by a per-week and a per-weekday heatmap of the result.
# In[84]:
# r/programming, with a few hand-picked n-grams excluded.
r_programming_by_day = diff_n_by_day(
50, whole_askreddit_df, 'programming', '2018-08-01', '2018-10-31',
exclude=['gt', 'use', 'write'],
)
# In[85]:
weekly_heatmap(r_programming_by_day, 'r/programming')
# In[89]:
weekday_heatmap(r_programming_by_day, 'r/programming by weekday')
# In[96]:
# r/sports, excluding the subreddit's own name.
r_sports_by_day = diff_n_by_day(
50, whole_askreddit_df, 'sports', '2018-08-01', '2018-10-31',
exclude=['r/sports'],
)
# In[102]:
weekly_heatmap(r_sports_by_day, 'r/sports')
# In[98]:
weekday_heatmap(r_sports_by_day, 'r/sports by weekday')
# In[204]:
# r/television, excluding the subreddit's own name.
r_television_by_day = diff_n_by_day(
50, whole_askreddit_df, 'television', '2018-08-01', '2018-10-31',
exclude=['r/television'],
)
# In[208]:
weekly_heatmap(r_television_by_day, 'r/television')
# In[90]:
weekday_heatmap(r_television_by_day, 'r/television by weekday')
from datetime import datetime
import json
from functools import lru_cache
from collections import defaultdict
import os
import sys
# get_fh closes the oldest handle before opening a new one once more than
# this many files are open.
limit = 1000
# Currently open output file handles, keyed by file path.
fhs = {}
# Paths of the open handles in the order they were opened (oldest first).
opened = []
def get_fh(day, subreddit):
    """Return an append-mode handle for ``<argv[1]>/<day>_<subreddit>``.

    Handles are cached in ``fhs``. When a new file has to be opened while
    more than ``limit`` handles are open, the handle that was opened first
    is closed and forgotten beforehand.
    """
    path = os.path.join(sys.argv[1], f'{day}_{subreddit}')
    if path in fhs:
        return fhs[path]
    if len(opened) > limit:
        oldest = opened.pop(0)
        fhs[oldest].close()
        del fhs[oldest]
    # Append mode: a file evicted earlier keeps its content when reopened.
    fhs[path] = open(path, 'a')
    opened.append(path)
    return fhs[path]
def write(day, subreddit, ngram):
    """Append ``ngram`` as one JSON line to the file of ``day``/``subreddit``."""
    target = get_fh(day, subreddit)
    target.write(json.dumps(ngram) + '\n')
# Read JSON records from stdin and append every n-gram of a record to the
# file of the record's day and subreddit.
from datetime import timezone

try:
    for line in sys.stdin:
        data = json.loads(line)
        # 'created_utc' is an epoch timestamp; convert it explicitly in UTC
        # so the day bucket does not depend on the machine's local timezone.
        created = datetime.fromtimestamp(data['created_utc'], tz=timezone.utc)
        day = created.date().isoformat()
        for ngram in data['ngrams']:
            write(day, data['subreddit'], ngram)
finally:
    # Flush and close every handle still cached by get_fh.
    for fh in fhs.values():
        fh.close()
#!/usr/bin/env python
# coding: utf-8
# Very ugly, it didn't want to upload `.ipynb`
# NOTE: the `# In[n]:` comments mark the cells of the exported notebook;
# `get_ipython` is not imported here and is presumably supplied by the
# IPython/Jupyter runtime.
# In[1]:
# Enable greedy tab completion in the notebook.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')
# In[2]:
import json
# Keyword arguments for praw.Reddit, read from a local JSON file. The
# `with` block closes the file once it has been parsed.
with open('credentials.json') as credentials_file:
    options = json.load(credentials_file)

# Subreddits to analyse.
subreddits = ['all', 'tifu', 'linux', 'trees', 'worldnews', 'news',
              'personalfinance', 'europe', 'gaming',
              'docker', 'food', 'python', 'drunk']
# In[3]:
from collections import Counter
from datetime import datetime
from functools import wraps
from itertools import takewhile
from multiprocessing import Pool
import re
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.ticker import FuncFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag
import praw
from praw.models import Comment
# Reddit API client, configured with the options loaded from credentials.json.
reddit = praw.Reddit(**options)
# NOTE(review): `sid` is not referenced anywhere else in the visible code.
sid = SentimentIntensityAnalyzer()
# English stop words as a set, for the membership test in to_lemmas.
stop_words = set(stopwords.words('english'))
# Lemmatizer shared by every to_lemmas call.
lemma = WordNetLemmatizer()
# In[4]:
def eager(fn):
    """Decorate ``fn`` so that the iterable it returns is turned into a list."""
    @wraps(fn)
    def materialized(*args, **kwargs):
        lazy_result = fn(*args, **kwargs)
        return [*lazy_result]
    return materialized
# In[6]:
def get_comments(comments, level=0):
    """Yield the comments of a tree depth-first, parents before replies.

    Only ``Comment`` instances are yielded; any other item is skipped along
    with its replies. Recursion stops below nesting level 3 (the top level
    is level 0).
    """
    if level <= 3:
        for item in comments:
            if not isinstance(item, Comment):
                continue
            yield item
            yield from get_comments(item.replies, level + 1)
def fetch_texts(subreddit):
    """Yield one row per text of the subreddit's top posts of the last year.

    Up to 1000 posts are requested. Each post contributes a ``'title'`` row,
    a ``'selftext'`` row and one ``'comment'`` row per comment returned by
    ``get_comments``. A row is the tuple
    ``(id, subreddit, post_id, kind, text, created, score)``.
    Progress is printed after each post.
    """
    listing = reddit.subreddit(subreddit).top(time_filter='year', limit=1000)
    # Materialize the listing up front so the total is known for the
    # progress message.
    posts = list(listing)
    for n, post in enumerate(posts):
        yield (f'{post.id}_title', subreddit, post.id, 'title',
               post.title, post.created, post.score)
        yield (f'{post.id}_selftext', subreddit, post.id, 'selftext',
               post.selftext, post.created, post.score)
        for reply in get_comments(post.comments):
            yield (f'{post.id}_comment_{reply.id}', subreddit, post.id,
                   'comment', reply.body, reply.created, reply.score)
        print(f"{subreddit}: {n + 1} of {len(posts)}")
def get_subreddit_df(subreddit):
    """Return the texts frame of ``subreddit``, using a pickle file as cache.

    The frame is read from ``r_<subreddit>_texts_df.pkl`` when that file
    exists; otherwise it is built from ``fetch_texts`` and written there.
    """
    cache_path = f'r_{subreddit}_texts_df.pkl'
    column_names = ['id', 'subreddit', 'post_id', 'kind',
                    'text', 'created', 'score']
    try:
        texts_df = pd.read_pickle(cache_path)
    except FileNotFoundError:
        rows = list(fetch_texts(subreddit))
        texts_df = pd.DataFrame(rows, columns=column_names)
        texts_df.to_pickle(cache_path)
    return texts_df
# In[7]:
# One texts frame per configured subreddit, keyed by subreddit name.
subreddits_dfs = {subreddit: get_subreddit_df(subreddit)
for subreddit in subreddits}
# In[10]:
# Preview the first rows of the 'all' frame.
subreddits_dfs['all'].head()
# In[12]:
# From https://stackoverflow.com/questions/50992974/nltk-wordnetlemmatizer-not-lemmatizing-as-expected
# Built once at import time instead of on every call.
_PENN_TO_MORPHY = {'NN': 'n', 'JJ': 'a',
                   'VB': 'v', 'RB': 'r'}


def penn2morphy(penntag):
    """Convert a Penn Treebank tag to a WordNet pos code.

    Only the first two characters of ``penntag`` are considered. Tags with
    an unknown prefix, and values that cannot be sliced or hashed (such as
    ``None``), map to ``'n'``.
    """
    try:
        return _PENN_TO_MORPHY[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unknown tag prefix; TypeError: not a sliceable/hashable
        # tag. Anything else is a real error and is allowed to propagate.
        return 'n'
# Matches every non-word character; used to strip them from tokens.
clean_lemma_re = re.compile(r'\W')


def to_lemmas(text):
    """Yield the lemma of each kept word of ``text``, in reading order.

    Tokens are lower-cased and stop words are dropped. Non-word characters
    are then removed from the token, and tokens left empty by that are
    dropped as well. The rest is lemmatized using the pos code derived from
    the token's part-of-speech tag.
    """
    for sentence in sent_tokenize(text):
        tagged_words = pos_tag(word_tokenize(sentence))
        for raw_token, tag in tagged_words:
            word = raw_token.lower()
            if word in stop_words:
                continue
            # The stop-word test above runs on the uncleaned token.
            word = clean_lemma_re.sub('', word)
            if word:
                yield lemma.lemmatize(word, pos=penn2morphy(tag))
@eager
def to_tokens(text):
    """Return the 1-, 2- and 3-grams built from the lemmas of ``text``.

    All unigrams come first, then all bigrams, then all trigrams; multi-word
    n-grams are joined by a space. ``@eager`` turns the result into a list.
    """
    lemmas = list(to_lemmas(text))
    yield from lemmas
    for size in (2, 3):
        # Only build n-grams the text is long enough to contain.
        if len(lemmas) >= size:
            for words in ngrams(lemmas, size):
                yield ' '.join(words)
# Four worker processes shared by every to_counted_tokens call; created at
# import time, after to_tokens has been defined.
pool = Pool(4)


def to_counted_tokens(series):
    """Return ``(token, amount)`` pairs counted over all texts in ``series``.

    The texts are tokenized in parallel by ``pool`` and the tokens of all
    texts are tallied together.
    """
    tally = Counter()
    for text_tokens in pool.map(to_tokens, series.tolist()):
        tally.update(text_tokens)
    return tally.items()
def to_tokens_df(series):
    """Return a frame with one ``token``/``amount`` row per distinct token."""
    rows = list(to_counted_tokens(series))
    return pd.DataFrame(rows, columns=['token', 'amount'])
def get_tokens_df(subreddit):
    """Return the token counts of ``subreddit``, using a pickle file as cache.

    The frame is read from ``r_<subreddit>_tokens_2_df.pkl`` when that file
    exists; otherwise it is computed from the subreddit's texts and written
    there.
    """
    cache_path = f'r_{subreddit}_tokens_2_df.pkl'
    try:
        tokens_df = pd.read_pickle(cache_path)
    except FileNotFoundError:
        texts = subreddits_dfs[subreddit].text
        tokens_df = to_tokens_df(texts)
        tokens_df.to_pickle(cache_path)
    return tokens_df
# In[13]:
# One token-count frame per configured subreddit, keyed by subreddit name.
tokens_dfs = {name: get_tokens_df(name) for name in subreddits}
# In[14]:
df = tokens_dfs['all']
df.head()
# In[15]:
# Add a mean-centred, range-scaled copy of `amount` to every frame.
# The loop reuses the name `df`, so afterwards `df` is the last frame visited.
for df in tokens_dfs.values():
    df['amount_norm'] = (
        (df.amount - df.amount.mean())
        / (df.amount.max() - df.amount.min())
    )
# In[16]:
df.head()
# In[18]:
def diff_tokens(left_df, right_df):
    """Rank tokens by how much more prominent they are on the left side.

    Both frames need ``token``, ``amount`` and ``amount_norm`` columns. They
    are outer-joined on ``token``, and values missing on either side are
    replaced with -1.

    Returns:
        A frame with ``token``, ``amount_diff`` and ``amount_norm_diff``
        (left minus right), sorted by ``amount_norm_diff``, largest first.
    """
    merged = left_df.merge(right_df, on='token', how='outer',
                           suffixes=('_left', '_right'))
    merged = merged.fillna(-1)
    with_diffs = merged.assign(
        amount_diff=merged['amount_left'] - merged['amount_right'],
        amount_norm_diff=(merged['amount_norm_left']
                          - merged['amount_norm_right']),
    )
    report = with_diffs[['token', 'amount_diff', 'amount_norm_diff']]
    return report.sort_values('amount_norm_diff', ascending=False)
# In[20]:
# Tokens most over-represented in r/linux relative to r/all (first rows).
diff_tokens(tokens_dfs['linux'], tokens_dfs['all']).head()
# In[21]:
# Same comparison for r/drunk.
diff_tokens(tokens_dfs['drunk'], tokens_dfs['all']).head()
# In[22]:
# Same comparison for r/personalfinance.
diff_tokens(tokens_dfs['personalfinance'], tokens_dfs['all']).head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment