"""Stream reddit submissions with PRAW. Additionally finds URLs in submissions and extracts their text."""
import os

import pandas as pd
import praw
import requests
from newspaper import fulltext

import subreddit   # local module providing the list of subreddits to stream
import utils_func  # local module providing get_timestamp()
reddit = praw.Reddit(client_id='my client id',
                     client_secret='my client secret',
                     user_agent='my user agent')
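# Alternatively (a setup note, not part of the original gist), PRAW can read
# the credentials from a praw.ini site section instead of hard-coding them:
#
#     [bot1]
#     client_id=my client id
#     client_secret=my client secret
#     user_agent=my user agent
#
# reddit = praw.Reddit('bot1')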
timestamp = utils_func.get_timestamp()
os.makedirs('reddit_stream', exist_ok=True)  # make sure the output directory exists
file = 'reddit_stream/submissions_0_{}.csv'.format(timestamp)
def get_url_text(url):
    """Download a URL and extract the main article text; return '' on failure."""
    try:
        html = requests.get(url, timeout=10).text
        return fulltext(html)
    except Exception as e:
        print(url)
        print('Error:', e)
        return ''
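# Example (illustrative): get_url_text('https://example.com/article') returns
# the extracted body text of the page, or '' if the download or extraction fails.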
def process_submission(sub, file):
    # Collect the submission fields into a single-row dataframe.
    # Columns: 0=title, 1=selftext, 2=created_utc, 3=subreddit, 4=author, 5=url, 6=id
    df = pd.DataFrame([sub.title,
                       sub.selftext,
                       sub.created_utc,
                       sub.subreddit.display_name,
                       sub.author,
                       sub.url,
                       sub.id
                       ]).T
    # Self posts link back to reddit itself; only fetch text for external URLs.
    if not df[5][0].startswith('https://www.reddit.com'):
        df['urltext'] = df[5].apply(get_url_text)
    else:
        df['urltext'] = ''
    # Try writing to file; appending may raise encoding errors on odd characters.
    try:
        with open(file, 'a', encoding='utf-8') as f:
            df.to_csv(f, header=False)
    except Exception as e:
        print(e, df[1])
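# To load the output later (illustrative; the column names are an assumption
# matching the order written above, plus the pandas index written first):
#
#     cols = ['row', 'title', 'selftext', 'created_utc', 'subreddit',
#             'author', 'url', 'id', 'urltext']
#     df = pd.read_csv('reddit_stream/submissions_0_<timestamp>.csv', names=cols)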
# Build the multireddit string, e.g. 'politics+worldnews+news'.
subreddits = '+'.join(pd.DataFrame(subreddit.subreddits)[0].drop_duplicates().values)
print('Subreddits:', subreddits)
errors = 0
while True:
    try:
        for i, submission in enumerate(reddit.subreddit(subreddits).stream.submissions()):
            print(i)
            process_submission(submission, file)
            # Roll over to a fresh output file every 1000 submissions.
            if (i + 1) % 1000 == 0:
                timestamp = utils_func.get_timestamp()
                file = 'reddit_stream/submissions_{}_{}.csv'.format(i, timestamp)
                print('Starting new file..', file)
    except Exception as e:
        errors += 1
        print('Error')
        print(e)
        print('Restarting service')
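# The gist depends on two local helper modules that are not included here.
# Minimal sketches of what they might contain (assumptions, not the author's
# actual code):
#
# utils_func.py:
#     import datetime
#     def get_timestamp():
#         return datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
#
# subreddit.py:
#     subreddits = ['politics', 'worldnews', 'news']  # subreddits to stream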