Last active
April 25, 2018 05:01
-
-
Save yi-jiayu/4359e3f2fdb108da4bcacdcc55ab9dd2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import re | |
import sys | |
import time | |
import praw | |
# Reddit API credentials handed to praw.Reddit() in get_comments().
# NOTE(review): the values look like placeholders - presumably they are meant
# to be replaced with real application credentials before running.
CLIENT_ID = 'YOUR_CLIENT_ID'
CLIENT_SECRET = 'YOUR_CLIENT_SECRET'
# Appended to the user agent string built in get_comments() ("... by /u/<name>").
REDDIT_USERNAME = 'your Reddit username'
def get_comments(submission_id):
    """Return the comments of a Reddit submission as a flat list.

    A PRAW client is created from the module-level CLIENT_ID, CLIENT_SECRET
    and REDDIT_USERNAME values, the submission identified by
    ``submission_id`` is looked up, ``replace_more(limit=None)`` is called on
    its comments, and the result of ``comments.list()`` is returned.

    Timestamped progress messages are written to standard error.
    """
    def log(message):
        # Progress goes to stderr so it never mixes with comment output.
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    agent = 'Markov Text Generation from Reddit Comments by /u/' + REDDIT_USERNAME
    client = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=agent,
    )

    log('Downloading comments...')
    thread = client.submission(id=submission_id)

    log('Expanding more children...')
    # limit=None: presumably asks PRAW to resolve every "more comments"
    # placeholder rather than stopping after a fixed number - see PRAW docs.
    thread.comments.replace_more(limit=None)
    return thread.comments.list()
def clean_comments(comments):
    """Yield a lightly cleaned version of each comment's text.

    For every object in ``comments`` its ``body`` attribute is split into
    lines; each line is stripped of surrounding whitespace, empty lines are
    dropped, and a full stop is appended to any line that does not already
    end with ``.``, ``?`` or ``!``. The remaining lines are joined with single
    line breaks.

    The yielded text never contains two consecutive line breaks, so callers
    can safely use a blank line as the separator between comments. An empty
    or whitespace-only body yields an empty string.

    Args:
        comments: iterable of objects exposing a string ``body`` attribute.

    Yields:
        str: the cleaned text of each comment, in input order.
    """
    for comment in comments:
        # Strip first and filter afterwards: a line holding only spaces or
        # tabs becomes empty only after stripping, so collapsing line breaks
        # beforehand would let blank lines survive inside a comment.
        stripped = (line.strip() for line in comment.body.split('\n'))
        lines = [line for line in stripped if line]

        # Add a full stop to lines lacking terminal punctuation.
        yield '\n'.join(
            line if line.endswith(('.', '?', '!')) else line + '.'
            for line in lines
        )
def main(submission_id, out_file):
    """Download a submission's comments, clean them and write them out.

    Comments are fetched with get_comments(), passed through
    clean_comments(), and printed to ``out_file`` one after another, each
    followed by two line breaks. Timestamped progress messages are written
    to standard error.
    """
    def log(message):
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    raw_comments = get_comments(submission_id)

    log('Cleaning comments...')
    for text in clean_comments(raw_comments):
        # A blank line after every comment keeps them distinguishable.
        print(text, file=out_file, end='\n\n')

    log('Done!')
if __name__ == '__main__':
    # Command-line entry point: parse the submission ID and the optional
    # output file, then run main().
    parser = argparse.ArgumentParser(
        description='''Download and lightly clean Reddit comments. Multiple line breaks will be collapsed and full
        stops will be added. Comments will be separated by double line breaks.''')
    parser.add_argument('submission_id', help='Base36 ID of Reddit thread to download comments from')
    parser.add_argument('-o', '--out-file', metavar='FILE', dest='out_file',
                        help='Write comments to FILE or standard output',
                        type=argparse.FileType('w', encoding='utf-8'),
                        default=sys.stdout)
    args = parser.parse_args()
    try:
        main(args.submission_id, args.out_file)
    finally:
        # argparse.FileType opens the file during parsing and nothing else
        # closes it, so close it here to make sure output is flushed even if
        # main() raises. Standard output (the default, or "-") is left open.
        if args.out_file is not sys.stdout:
            args.out_file.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment