Skip to content

Instantly share code, notes, and snippets.

@mostlyinteresting
Created July 18, 2020 02:31
Show Gist options
  • Save mostlyinteresting/651aeea24e9acfafc538f3360d1a5e7a to your computer and use it in GitHub Desktop.
Save mostlyinteresting/651aeea24e9acfafc538f3360d1a5e7a to your computer and use it in GitHub Desktop.
import logging
import sqlite3

import numpy as np
import pandas as pd
import praw
# Column layout shared by every comment frame this module produces.
_COMMENT_COLUMNS = (
    'author_fullname', 'comment', 'score', 'ups', 'downs',
    'total_awards_received', 'controversiality', 'depth', 'created',
)

# Submission attributes copied into the one-row post frame; each attribute
# name doubles as the column name.
_SUBMISSION_FIELDS = (
    'all_awardings', 'allow_live_comments', 'approved_by', 'archived', 'author',
    'author_flair_type', 'author_fullname', 'awarders', 'can_mod_post', 'category',
    'clicked', 'comment_sort', 'comments', 'content_categories', 'contest_mode',
    'created', 'created_utc', 'discussion_type', 'distinguished', 'domain', 'downs',
    'edited', 'fullname', 'gilded', 'hidden', 'hide_score', 'id', 'is_crosspostable',
    'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
    'is_self', 'is_video', 'likes', 'locked', 'media', 'media_only', 'name', 'num_comments',
    'num_crossposts', 'num_reports', 'over_18', 'parent_whitelist_status', 'permalink',
    'pinned', 'pwls', 'quarantine', 'report_reasons', 'score', 'secure_media',
    'selftext', 'send_replies', 'shortlink', 'spoiler', 'stickied', 'subreddit_id',
    'subreddit_name_prefixed', 'subreddit_subscribers', 'subreddit_type',
    'title', 'total_awards_received', 'ups', 'upvote', 'upvote_ratio',
    'url', 'view_count', 'visited', 'whitelist_status', 'wls',
)


def _empty_comment_frame():
    """Return a zero-row DataFrame that still carries the comment columns."""
    return pd.DataFrame(data=dict.fromkeys(_COMMENT_COLUMNS), index=range(0))


def comment2df(comment):
    """Convert one comment object into a single-row DataFrame.

    Parameters
    ----------
    comment : object
        Anything exposing ``author_fullname``, ``body``, ``score``, ``ups``,
        ``downs``, ``total_awards_received``, ``controversiality``, ``depth``
        and ``created`` attributes.

    Returns
    -------
    pandas.DataFrame
        One row with the comment columns, or a zero-row frame with the same
        columns when the object lacks any of the attributes above.
    """
    try:
        record = {
            'author_fullname': comment.author_fullname,
            'comment': comment.body,
            'score': comment.score,
            'ups': comment.ups,
            'downs': comment.downs,
            'total_awards_received': comment.total_awards_received,
            'controversiality': comment.controversiality,
            'depth': comment.depth,
            'created': comment.created,
        }
    except AttributeError:
        # A missing attribute means the object cannot be turned into a row;
        # contribute nothing instead of aborting the whole submission.
        return _empty_comment_frame()
    return pd.DataFrame(data=record, index=range(1))


def _comments_to_frame(comments):
    """Stack the rows of every convertible comment into one re-indexed frame."""
    frames = [frame for frame in map(comment2df, comments) if not frame.empty]
    if not frames:
        return _empty_comment_frame()
    return pd.concat(frames).reset_index(drop=True)


class redditMethods:
    """Stateless helpers that flatten Reddit submissions/comments into DataFrames."""

    # The converter lives at module level (existing callers use the bare
    # name); it is additionally reachable through the class.
    comment2df = staticmethod(comment2df)

    @staticmethod
    def getComments(x):
        """Return every comment of submission ``x``, replies included.

        All "load more comments" placeholders are expanded first
        (``replace_more(limit=None)``), then the flattened forest from
        ``x.comments.list()`` is converted, so nested replies are part of the
        result and the ``depth`` column distinguishes the levels.
        """
        x.comments.replace_more(limit=None)
        return _comments_to_frame(x.comments.list())

    @staticmethod
    def getTopComments(x):
        """Return only the top-level comments of submission ``x``.

        Placeholders are dropped rather than expanded
        (``replace_more(limit=0)``) before iterating ``x.comments``.
        """
        x.comments.replace_more(limit=0)
        return _comments_to_frame(x.comments)

    @staticmethod
    def getMedia(x):
        """Return ``x.preview['images'][0]['source']['url']`` or ``None``.

        ``None`` is returned when the submission has no ``preview`` attribute
        or the preview structure does not contain that path.
        """
        try:
            return x.preview['images'][0]['source']['url']
        except (AttributeError, KeyError, IndexError, TypeError):
            return None

    @staticmethod
    def getSubmission(x):
        """Flatten submission ``x`` into a one-row, object-dtype DataFrame.

        Every name in ``_SUBMISSION_FIELDS`` is read from ``x`` and stored in
        a column of the same name; empty lists are stored as ``None``.

        Raises
        ------
        AttributeError
            If ``x`` lacks any of the expected attributes (callers use this
            to skip objects that are not submissions).
        """
        values = [getattr(x, field) for field in _SUBMISSION_FIELDS]
        # Fill the object array cell by cell: np.array() on the mixed list
        # would try to broadcast list-valued attributes and fail on a ragged
        # shape instead of keeping the list as a single cell.
        row = np.empty((1, len(values)), dtype=object)
        for position, value in enumerate(values):
            if isinstance(value, list) and not value:
                value = None
            row[0, position] = value
        return pd.DataFrame(row, columns=list(_SUBMISSION_FIELDS), index=range(1))
def obj2pd(obj, red):
    """Convert submissions into post, comment and top-comment DataFrames.

    Parameters
    ----------
    obj : iterable
        Submission objects to convert.
    red : object
        Provider of ``getSubmission``, ``getComments``, ``getTopComments``
        and ``getMedia`` (e.g. a ``redditMethods`` instance).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(posts, comments, topComments)``. Posts gain a ``picture`` column;
        every comment row is tagged with its post's ``id`` and
        ``subreddit_name_prefixed``. Three empty frames are returned when
        ``obj`` yields nothing.
    """
    posts = []
    comments = []
    topComments = []
    for item in obj:
        iPost = red.getSubmission(item)
        iComments = red.getComments(item)
        iTopComments = red.getTopComments(item)
        iPost['picture'] = red.getMedia(item)
        # Tag with scalars: assigning the one-row Series itself would align
        # on the index and fill only comment row 0, leaving the rest NaN.
        post_id = iPost['id'].iloc[0]
        subreddit = iPost['subreddit_name_prefixed'].iloc[0]
        for frame in (iComments, iTopComments):
            frame['id'] = post_id
            frame['subreddit_name_prefixed'] = subreddit
        posts.append(iPost)
        comments.append(iComments)
        topComments.append(iTopComments)
    if not posts:
        # pd.concat refuses an empty list; report "nothing" as empty frames.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    return pd.concat(posts), pd.concat(comments), pd.concat(topComments)
def posts2pd(obj, red):
    """Convert submissions into a single DataFrame, skipping failures.

    Parameters
    ----------
    obj : iterable
        Objects to pass to ``red.getSubmission``.
    red : object
        Provider of ``getSubmission`` (e.g. a ``redditMethods`` instance).

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of every item that converted successfully, or
        an empty DataFrame when none did.
    """
    log = logging.getLogger(__name__)
    posts = []
    skipped = 0
    for item in obj:
        try:
            posts.append(red.getSubmission(item))
        except Exception:
            # Best effort: one unconvertible item must not abort the batch,
            # but keep a trace of it instead of swallowing it silently.
            skipped += 1
            log.debug('could not convert %r', item, exc_info=True)
    if skipped:
        log.warning('posts2pd skipped %d item(s) that could not be converted', skipped)
    if not posts:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    return pd.concat(posts)
if __name__ == '__main__':
    # Script entry point: pull up to 1000 posts from each listing of every
    # subreddit below and append them to the `data` table of reddit.sqlite.
    reddit = praw.Reddit(client_id='<YOUR_CLIENT_ID>',
                         client_secret='<YOUR_CLIENT_SECRET>',
                         password='<YOUR_PASSWORD>',
                         user_agent='<YOUR_UA>',
                         username='<YOUR_USERNAME>')
    # Credential smoke test; a failure is reported but does not stop the run.
    try:
        print(reddit.user.me())
    except Exception:
        print('ruh roh')
    red = redditMethods()
    subreddits = ['introvert', 'extroverts', 'istj', 'istp', 'isfj', 'isfp',
                  'infj', 'infp', 'intj', 'intp', 'estp', 'estj2', 'esfp',
                  'esfj', 'enfp', 'enfj', 'entp', 'entj']
    # sort types: controversial, gilded, hot, new, rising, top
    listings = ('hot', 'new', 'rising', 'top', 'controversial', 'gilded')
    allPosts = []
    for subreddit in subreddits:
        try:
            source = reddit.subreddit(subreddit)
            obj = []
            for listing in listings:
                obj.extend(getattr(source, listing)(limit=1000))
            allPosts.append(posts2pd(obj, red))
        except Exception:
            # A subreddit that cannot be fetched or converted is skipped as a
            # whole, but the reason is recorded.
            logging.getLogger(__name__).warning(
                'skipping subreddit %s', subreddit, exc_info=True)
    allPosts = [frame for frame in allPosts if not frame.empty]
    if not allPosts:
        logging.getLogger(__name__).warning('no posts collected; nothing written')
    else:
        allPosts = pd.concat(allPosts)
        allPosts.drop(['all_awardings', 'upvote', 'media', 'secure_media'], axis=1, inplace=True)
        # The same post can show up in several listings; store it once per run.
        allPosts = allPosts.drop_duplicates(subset='id')
        # Stringify these two columns before handing the frame to SQLite.
        allPosts[['author', 'comments']] = allPosts[['author', 'comments']].astype(str)
        conn = sqlite3.connect('reddit.sqlite')
        try:
            allPosts.to_sql('data', conn, if_exists='append', index=False)
        finally:
            conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment