Skip to content

Instantly share code, notes, and snippets.

@yi-jiayu
Last active April 25, 2018 05:01
Show Gist options
  • Save yi-jiayu/4359e3f2fdb108da4bcacdcc55ab9dd2 to your computer and use it in GitHub Desktop.
Save yi-jiayu/4359e3f2fdb108da4bcacdcc55ab9dd2 to your computer and use it in GitHub Desktop.
import argparse
import re
import sys
import time
import praw
# Reddit API credentials handed to praw.Reddit in get_comments.
# The values below are placeholders; replace them with your own before running.
CLIENT_ID = 'YOUR_CLIENT_ID'
CLIENT_SECRET = 'YOUR_CLIENT_SECRET'
# Appended to the user agent string that get_comments builds for the Reddit client.
REDDIT_USERNAME = 'your Reddit username'
def get_comments(submission_id):
    """Fetch every comment of a Reddit submission.

    Builds a PRAW client from the module-level credentials, looks up the
    submission, expands all "more comments" placeholders and returns the
    result of ``submission.comments.list()`` (per the PRAW docs, a flattened
    list of the comments). Progress messages go to standard error.

    :param submission_id: ID of the submission, passed to ``reddit.submission(id=...)``.
    :return: whatever ``submission.comments.list()`` returns.
    """

    def log(message):
        # Timestamped progress line on stderr so stdout stays clean for output.
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    client = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent='Markov Text Generation from Reddit Comments by /u/' + REDDIT_USERNAME,
    )

    log('Downloading comments...')
    thread = client.submission(id=submission_id)

    log('Expanding more children...')
    # limit=None asks PRAW to replace every placeholder rather than a capped number.
    thread.comments.replace_more(limit=None)

    return thread.comments.list()
def clean_comments(comments):
    """Yield a lightly cleaned version of each comment's text.

    For every comment, the body is split into lines, each line is stripped of
    surrounding whitespace, empty lines are dropped, and a full stop is
    appended to any line that does not already end with ``.``, ``?`` or ``!``.
    The remaining lines are joined with single line breaks.

    Lines are stripped *before* empty ones are discarded, so whitespace-only
    lines (and ``\\r\\n`` line endings) can never leave a blank line or a stray
    "." line inside a cleaned comment. That matters because the caller uses a
    blank line as the separator between comments.

    :param comments: iterable of objects exposing the comment text as ``.body``.
    :return: generator of cleaned comment strings, one per input comment
        (an empty or whitespace-only body yields an empty string).
    """
    for comment in comments:
        cleaned_lines = []
        for line in comment.body.split('\n'):
            line = line.strip()
            if not line:
                # Collapse any run of blank / whitespace-only lines.
                continue
            # add a full stop if the line doesn't end with a punctuation mark already
            if not line.endswith(('.', '?', '!')):
                line += '.'
            cleaned_lines.append(line)
        yield '\n'.join(cleaned_lines)
def main(submission_id, out_file):
    """Download a submission's comments, clean them and write them out.

    :param submission_id: ID of the Reddit submission, forwarded to ``get_comments``.
    :param out_file: writable text file object that receives the cleaned comments.
    """

    def log(message):
        # Progress goes to stderr so it never mixes with the comment output.
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    raw_comments = get_comments(submission_id)

    log('Cleaning comments...')
    for cleaned in clean_comments(raw_comments):
        # A blank line after every comment lets readers tell comments apart.
        print(cleaned, end='\n\n', file=out_file)

    log('Done!')
if __name__ == '__main__':
    # Command-line entry point: parse the submission ID and the optional output file,
    # then hand both to main().
    cli = argparse.ArgumentParser(
        description='''Download and lightly clean Reddit comments. Multiple line breaks will be collapsed and full
stops will be added. Comments will be separated by double line breaks.''',
    )
    cli.add_argument(
        'submission_id',
        help='Base36 ID of Reddit thread to download comments from',
    )
    cli.add_argument(
        '-o', '--out-file',
        metavar='FILE',
        dest='out_file',
        help='Write comments to FILE or standard output',
        # argparse opens FILE for writing as UTF-8; without -o we write to stdout.
        type=argparse.FileType('w', encoding='utf-8'),
        default=sys.stdout,
    )
    options = cli.parse_args()
    main(options.submission_id, options.out_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment