Created
April 24, 2018 16:22
-
-
Save davidlenz/7ab9d2b138fb0bb856842e9f53c94f5a to your computer and use it in GitHub Desktop.
Downloads all comments from the given subreddits and also extracts the text from URLs found in comments. Demonstrates usage of the Reddit `praw` module.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Imports -----------------------------------------------------------------
# Standard library
import os
import time

# Third-party
import pandas as pd
import praw
import requests
from newspaper import fulltext

# Local modules
import findurls
import newsapi_v2
import subreddit
import utils_func

# --- Configuration -----------------------------------------------------------
# Reddit API credentials -- replace the placeholders with real values
# before running (https://www.reddit.com/prefs/apps).
CLIENT_ID = 'XXX'
CLIENT_SECRET = 'XXX'
USER_AGENT = 'XXX'

# Authenticated reddit client used by the comment stream below.
_credentials = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'user_agent': USER_AGENT,
}
reddit = praw.Reddit(**_credentials)

# Each run appends to its own timestamped CSV so earlier data is never
# clobbered; the file name is rolled over in the main loop below.
timestamp = utils_func.get_timestamp()
file = 'reddit_stream/comments_0_{}.csv'.format(timestamp)
def get_url_text(url, timeout=10):
    """Download a web page and extract its main article text.

    Parameters
    ----------
    url : str
        URL to fetch.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10).  Without a
        timeout a single unresponsive server would stall the comment
        stream indefinitely.

    Returns
    -------
    str
        Extracted article text, or ``''`` when the download or the
        extraction fails.
    """
    try:
        # timeout keeps one dead link from blocking the whole stream
        html = requests.get(url, timeout=timeout).text
        return fulltext(html)
    except Exception as e:
        # Best-effort scraping: log the failing URL and carry on with
        # empty text so the streaming loop keeps running.
        print(url)
        print('Error:', e)
        return ''
def process_comment(comment, file):
    """Flatten one praw comment into a single-row DataFrame and append it
    to the CSV at *file*.

    The row uses integer column labels 0-6 in this order: submission,
    body, created_utc, subreddit, author, link_url, fullname; an extra
    'urltext' column holds article text scraped from external links.

    Parameters
    ----------
    comment : praw comment (or any object with the attributes above)
    file : str
        Path of the CSV file to append to.
    """
    row = [comment.submission,
           comment.body,
           comment.created_utc,
           comment.subreddit,
           comment.author,
           comment.link_url,
           comment.fullname]
    df = pd.DataFrame([row])
    # Only scrape external links; reddit-internal links carry no
    # article text worth extracting.
    if not df.at[0, 5].startswith('https://www.reddit.com'):
        df['urltext'] = df[5].apply(get_url_text)
    else:
        df['urltext'] = ''
    # Try to append the row; never let a single bad comment kill the stream.
    try:
        # encoding belongs on open() -- to_csv ignores/rejects it when a
        # text-mode handle is passed; newline='' avoids doubled newlines
        # on Windows.
        with open(file, 'a', encoding='utf-8', newline='') as f:
            df.to_csv(f, header=False)
    except Exception as e:
        print(e, df[1])
if __name__ == "__main__":
    # Build the multi-subreddit stream spec ("a+b+c"), dropping duplicates.
    subreddits = '+'.join(
        pd.DataFrame(subreddit.subreddits)[0].drop_duplicates().values)
    print('Subreddits:', subreddits)

    # Stream comments forever, rolling over to a fresh CSV file after
    # every 1000 comments.  On any error, wait 15 seconds and reconnect.
    while True:
        try:
            stream = reddit.subreddit(subreddits).stream.comments()
            for idx, comment in enumerate(stream):
                print(idx)
                process_comment(comment, file)
                if idx % 1000 == 999:
                    timestamp = utils_func.get_timestamp()
                    file = 'reddit_stream/comments_{}_{}.csv'.format(idx, timestamp)
                    print('Starting new file..', file)
        except Exception as e:
            print('Error')
            print(e)
            print('sleeping...')
            time.sleep(15)
            print('Restarting service')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Subreddits to stream; extend this list to follow more communities.
subreddits = ['Bitcoin']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment