Skip to content

Instantly share code, notes, and snippets.

@nmtake
Created June 26, 2017 15:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmtake/9f01997a3f1ce20d5a84d5c0f3f45057 to your computer and use it in GitHub Desktop.
Save nmtake/9f01997a3f1ce20d5a84d5c0f3f45057 to your computer and use it in GitHub Desktop.
import sys
import dataset
import praw
# Value passed as the `limit` argument to the subreddit.new() and
# subreddit.comments() listing calls in main().
# NOTE(review): None presumably means "fetch as many items as the API
# allows" in PRAW -- confirm against the PRAW listing documentation.
LIMIT = None
def normalize_submission(submission):
    """Flatten a submission-like object into a plain dict of column values.

    The instance ``__dict__`` is walked in order and each attribute is
    handled as follows:

    * ``author`` and ``subreddit`` are converted with ``str()`` (they are
      Redditor / Subreddit instances rather than plain values).
    * ``id`` is parsed as a base-36 integer.
    * attributes whose name starts with ``_`` are dropped.
    * attributes whose value is a ``dict`` or ``list`` are dropped.
    * everything else is copied through unchanged.

    Returns the resulting dict.
    """
    row = {}
    for name, value in vars(submission).items():
        if name in ('author', 'subreddit'):
            row[name] = str(value)
        elif name == 'id':
            row[name] = int(value, 36)
        elif not (name.startswith('_') or isinstance(value, (dict, list))):
            row[name] = value
    return row
def normalize_comment(comment):
    """Flatten a comment into a dict of column values.

    Comments are normalized with exactly the same rules as submissions,
    so this simply delegates to ``normalize_submission``.
    """
    row = normalize_submission(comment)
    return row
def normalize_subreddit(subreddit):
    """Flatten a subreddit mapping into a plain dict of column values.

    ``subreddit`` is a mapping (anything with ``.items()``). Each entry is
    handled as follows:

    * ``banner_size`` and ``icon_size`` are joined into a comma-separated
      string of their elements; a ``None`` value is kept as ``None``.
    * ``id`` is parsed as a base-36 integer.
    * keys starting with ``_`` are dropped.
    * values that are a ``dict`` or ``list`` are dropped.
    * everything else is copied through unchanged.

    Returns the resulting dict.
    """
    row = {}
    for key, value in subreddit.items():
        if key in ('banner_size', 'icon_size'):
            # A size may be absent (None); joining it unconditionally
            # would raise TypeError, so only join when a value is present.
            if value is not None:
                value = ','.join(str(part) for part in value)
        elif key == 'id':
            value = int(value, 36)
        elif key.startswith('_'):
            continue
        elif isinstance(value, (dict, list)):
            continue
        row[key] = value
    return row
def normalize_redditor(redditor):
    """Flatten a redditor-like object into a plain dict of column values.

    Reads the instance ``__dict__`` and keeps every attribute except those
    whose name starts with ``_`` and the ``subreddit`` attribute. The
    ``id`` attribute is parsed as a base-36 integer; all other kept values
    are copied through unchanged.
    """
    return {
        name: (int(value, 36) if name == 'id' else value)
        for name, value in vars(redditor).items()
        if not name.startswith('_') and name != 'subreddit'
    }
def main():
    """Crawl one subreddit into ``reddit.db`` and list the profile subreddits.

    Expects exactly two command-line arguments: ``PROFILE`` (passed to
    ``praw.Reddit``) and ``SUBREDDIT`` (passed to ``reddit.subreddit``).
    With any other argument count a usage line is written to stderr and
    the process exits with status 1.

    Steps:

    1. Insert every submission from ``subreddit.new`` into the
       ``submission`` table and every comment from ``subreddit.comments``
       into the ``comment`` table.
    2. Collect the distinct ``author`` values of both tables.
    3. For each author, insert the redditor into the ``redditor`` table
       and, when the redditor has a truthy ``subreddit``, insert that into
       the ``subreddit`` table.
    4. Print display name, title and public description of every stored
       subreddit row, tab-separated.
    """
    if len(sys.argv) != 3:
        # Interpolate the program name; without it a literal "%s" is shown.
        sys.stderr.write('Usage: %s PROFILE SUBREDDIT\n' % sys.argv[0])
        sys.exit(1)

    reddit = praw.Reddit(sys.argv[1])
    reddit.read_only = True
    subreddit = reddit.subreddit(sys.argv[2])
    database = dataset.connect('sqlite:///reddit.db')

    table = database['submission']
    for submission in subreddit.new(limit=LIMIT):
        print('submission %s' % submission.id)
        table.insert(normalize_submission(submission))

    table = database['comment']
    for comment in subreddit.comments(limit=LIMIT):
        print('comment %s' % comment.id)
        table.insert(normalize_comment(comment))

    # Authors are stored as strings by the normalizers, so the same name
    # appearing in both tables collapses to one entry in the set.
    authors = set()
    for table_name in ('submission', 'comment'):
        for row in database[table_name].distinct('author'):
            authors.add(row['author'])

    redditor_table = database['redditor']
    subreddit_table = database['subreddit']
    for author in authors:
        redditor = reddit.redditor(author)
        # NOTE(review): reading .id before normalizing presumably triggers
        # PRAW's lazy fetch so that __dict__ is populated -- keep this
        # order unless confirmed otherwise.
        print('redditor %s' % redditor.id)
        redditor_table.insert(normalize_redditor(redditor))
        if redditor.subreddit:
            print('subreddit %s' % redditor.subreddit['name'])
            subreddit_table.insert(normalize_subreddit(redditor.subreddit))

    for row in database['subreddit']:
        print('\t'.join((row['display_name'], row['title'], row['public_description'])))
# Run the crawl. There is no `__name__ == '__main__'` guard, so this call
# also executes when the file is imported as a module.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment