yi-jiayu/download_comments.py

## download_comments.py
import argparse
import re
import sys
import time

import praw

# Reddit API credentials passed to praw.Reddit() in get_comments().
# NOTE(review): the values below look like placeholders -- presumably they
# must be replaced with the ID and secret of a real Reddit app before running.
CLIENT_ID = 'YOUR_CLIENT_ID'
CLIENT_SECRET = 'YOUR_CLIENT_SECRET'
# Appended to the user_agent string passed to praw.Reddit() in get_comments().
REDDIT_USERNAME = 'your Reddit username'


def get_comments(submission_id):
    """Fetch all comments of a Reddit submission.

    A praw client is built from the module-level credentials, the submission
    with the given ID is looked up, its "more comments" placeholders are
    expanded without a limit, and the result of ``comments.list()`` is
    returned. Progress messages prefixed with the current time are written to
    standard error along the way.
    """
    def log(message):
        # progress goes to stderr so that stdout stays free for the comments
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    user_agent = 'Markov Text Generation from Reddit Comments by /u/' + REDDIT_USERNAME
    client = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=user_agent)

    log('Downloading comments...')
    thread = client.submission(id=submission_id)

    log('Expanding more children...')
    thread.comments.replace_more(limit=None)

    return thread.comments.list()


def clean_comments(comments):
    """Yield a lightly cleaned version of each comment's text.

    For every object in *comments* the ``body`` attribute is split into lines;
    each line is stripped of surrounding whitespace, blank lines are dropped,
    and a full stop is appended to any line that does not already end with
    ``.``, ``?`` or ``!``. The remaining lines are joined with single line
    breaks, so a cleaned comment never contains an empty line.

    Exactly one string is yielded per input comment; it is the empty string
    when the body has no non-blank lines.
    """
    terminators = ('.', '?', '!')

    for comment in comments:
        # Strip *before* discarding blank lines: a line holding only spaces or
        # a stray '\r' would otherwise survive as an empty line and leave a
        # double line break inside the comment -- the very separator main()
        # prints between comments.
        lines = (line.strip() for line in comment.body.split('\n'))

        yield '\n'.join(
            line if line.endswith(terminators) else line + '.'
            for line in lines
            if line
        )


def main(submission_id, out_file):
    """Download, clean and write out the comments of one Reddit thread.

    ``submission_id`` is handed to get_comments(); every cleaned comment is
    printed to ``out_file`` followed by an empty line. Timestamped progress
    messages are written to standard error.
    """
    def log(message):
        print(time.strftime('%H:%M:%S'), message, file=sys.stderr)

    raw_comments = get_comments(submission_id)

    log('Cleaning comments...')
    for cleaned in clean_comments(raw_comments):
        # the trailing empty line is what delimits one comment from the next
        print(cleaned, file=out_file, end='\n\n')

    log('Done!')


if __name__ == '__main__':
    # Command-line entry point: parse the arguments, then run main().
    out_file_type = argparse.FileType('w', encoding='utf-8')

    cli = argparse.ArgumentParser(
        description='''Download and lightly clean Reddit comments. Multiple line breaks will be collapsed and full
        stops will be added. Comments will be separated by double line breaks.''')
    cli.add_argument('submission_id', help='Base36 ID of Reddit thread to download comments from')
    # comments go to standard output unless -o/--out-file names a file
    cli.add_argument('-o', '--out-file', dest='out_file', metavar='FILE', default=sys.stdout,
                     type=out_file_type, help='Write comments to FILE or standard output')
    options = cli.parse_args()

    main(options.submission_id, options.out_file)
	import argparse
	import re
	import sys
	import time

	import praw

	# Reddit API credentials passed to praw.Reddit() in get_comments().
	# NOTE(review): the values below look like placeholders -- presumably they
	# must be replaced with the ID and secret of a real Reddit app before running.
	CLIENT_ID = 'YOUR_CLIENT_ID'
	CLIENT_SECRET = 'YOUR_CLIENT_SECRET'
	# Appended to the user_agent string passed to praw.Reddit() in get_comments().
	REDDIT_USERNAME = 'your Reddit username'


	def get_comments(submission_id):
	    """Fetch all comments of a Reddit submission.

	    A praw client is built from CLIENT_ID, CLIENT_SECRET and a user agent
	    that ends in REDDIT_USERNAME. The submission with the given ID is
	    looked up, its "more comments" placeholders are expanded without a
	    limit, and the result of ``comments.list()`` is returned. Timestamped
	    progress messages are written to standard error.
	    """
	    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET,
	                         user_agent='Markov Text Generation from Reddit Comments by /u/' + REDDIT_USERNAME)

	    print(time.strftime('%H:%M:%S'), 'Downloading comments...', file=sys.stderr)
	    submission = reddit.submission(id=submission_id)

	    # limit=None asks praw to resolve every placeholder, not just the first few
	    print(time.strftime('%H:%M:%S'), 'Expanding more children...', file=sys.stderr)
	    submission.comments.replace_more(limit=None)

	    return submission.comments.list()


	def clean_comments(comments):
	    """Yield a lightly cleaned version of each comment's text.

	    For every object in *comments* the ``body`` attribute is split into
	    lines; each line is stripped of surrounding whitespace, blank lines are
	    dropped, and a full stop is appended to any line that does not already
	    end with ``.``, ``?`` or ``!``. The remaining lines are joined with
	    single line breaks, so a cleaned comment never contains an empty line.

	    Exactly one string is yielded per input comment; it is the empty string
	    when the body has no non-blank lines.
	    """
	    terminators = ('.', '?', '!')

	    for comment in comments:
	        # Strip *before* discarding blank lines: a line holding only spaces
	        # or a stray '\r' would otherwise survive as an empty line and leave
	        # a double line break inside the comment -- the very separator
	        # main() prints between comments.
	        lines = (line.strip() for line in comment.body.split('\n'))

	        # A plain suffix check replaces the previous regex, whose escaped
	        # pipe made the "newline or end of text" alternation unmatchable.
	        yield '\n'.join(
	            line if line.endswith(terminators) else line + '.'
	            for line in lines
	            if line
	        )


	def main(submission_id, out_file):
	    """Download, clean and write out the comments of one Reddit thread.

	    ``submission_id`` is handed to get_comments(); every cleaned comment is
	    printed to ``out_file`` followed by an empty line. Timestamped progress
	    messages are written to standard error.
	    """
	    comments = get_comments(submission_id)

	    print(time.strftime('%H:%M:%S'), 'Cleaning comments...', file=sys.stderr)
	    cleaned_comments = clean_comments(comments)

	    for comment in cleaned_comments:
	        # separate comments with two line breaks so that we can tell them apart
	        print(comment, end='\n\n', file=out_file)

	    print(time.strftime('%H:%M:%S'), 'Done!', file=sys.stderr)


	if __name__ == '__main__':
	    # Command-line entry point: parse the arguments, then run main().
	    parser = argparse.ArgumentParser(
	        description='''Download and lightly clean Reddit comments. Multiple line breaks will be collapsed and full
	stops will be added. Comments will be separated by double line breaks.''')
	    parser.add_argument('submission_id', help='Base36 ID of Reddit thread to download comments from')
	    # comments go to standard output unless -o/--out-file names a file
	    parser.add_argument('-o', '--out-file', metavar='FILE', dest='out_file',
	                        help='Write comments to FILE or standard output',
	                        type=argparse.FileType('w', encoding='utf-8'),
	                        default=sys.stdout)
	    args = parser.parse_args()

	    main(args.submission_id, args.out_file)