Skip to content

Instantly share code, notes, and snippets.

@avoliva
Last active May 27, 2016 14:11
Show Gist options
  • Save avoliva/ba8d157adb5c19f7dd2d to your computer and use it in GitHub Desktop.
Save avoliva/ba8d157adb5c19f7dd2d to your computer and use it in GitHub Desktop.
Reddit comment loader/parser
from BeautifulSoup import BeautifulSoup
import requests
from xml.sax import saxutils as su
class Comment(object):
    """Plain record describing one comment in a thread.

    Attributes:
        id: Identifier of the comment, stored as given.
        content: Body of the comment (initialised from ``html``).
        user: Author of the comment.
        points: Score of the comment.
        created: Creation timestamp, stored as given.
        level: Nesting depth assigned by the caller.
        parent: Identifier of the parent item, or ``None``.
        parent_content: Body of the parent comment; always starts as
            ``None`` and is meant to be filled in by the caller.
    """

    def __init__(self, id, html, author,
                 points, postedOn, level, parent=None):
        # Parameter names (including ``id`` and ``postedOn``) are part of the
        # keyword interface used by callers, so they are kept as-is.
        self.id = id
        self.content = html
        self.user = author
        self.points = points
        self.created = postedOn
        self.level = level
        self.parent = parent
        self.parent_content = None

    def __repr__(self):
        return 'Comment(id={!r}, user={!r}, level={!r}, parent={!r})'.format(
            self.id, self.user, self.level, self.parent)
class Reddit(object):
    """Small helper that fetches Reddit ``.json`` pages and flattens comments.

    Comment trees are walked depth-first and collected into a flat list of
    ``Comment`` objects, each tagged with its nesting level.
    """

    # User-Agent header sent with every request.
    USER_AGENT = 'python/requests'
    # Seconds to wait for a response, so a stalled connection cannot hang
    # the caller forever.
    TIMEOUT = 10

    def load_comment(self, data, level):
        """Build a ``Comment`` from one comment's ``data`` dict.

        Args:
            data: Mapping with the keys ``name``, ``body_html``, ``author``,
                ``score``, ``created_utc`` and ``parent_id``; missing keys
                become ``None``.
            level: Nesting depth to record on the comment.

        Returns:
            The newly created ``Comment``.
        """
        return Comment(
            id=data.get('name'),
            html=data.get('body_html'),
            author=data.get('author'),
            points=data.get('score'),
            postedOn=data.get('created_utc'),
            level=level,
            parent=data.get('parent_id')
        )

    @staticmethod
    def _flatten_html(escaped_html):
        """Join the first child of each ``<p>`` in the body with ``<br/><br/>``.

        The input is entity-unescaped before parsing. Returns an empty string
        when there is no body or no enclosing ``<div>``.
        """
        if not escaped_html:
            return ''
        container = BeautifulSoup(su.unescape(escaped_html)).find('div')
        if container is None:
            return ''
        # Empty paragraphs have no first child; skip them instead of raising.
        return '<br/><br/>'.join(
            [str(p.contents[0]) for p in container.findAll('p') if p.contents])

    def process(self, comments, c, level):
        """Append the comments found in ``c`` (and their replies) to ``comments``.

        Args:
            comments: List that collected ``Comment`` objects are appended to;
                it is mutated in place.
            c: Iterable of listing children (dicts with ``kind`` and ``data``).
            level: Nesting depth of the items in ``c``.

        Returns:
            The same ``comments`` list that was passed in.
        """
        for item in c:
            # Only "t1" items are treated as comments; anything else
            # (including items without a kind) is skipped.
            if item.get('kind') != 't1':
                continue
            data = item.get('data') or {}
            comment = self.load_comment(data, level)
            comment.content = self._flatten_html(comment.content)

            parent_id = comment.parent or ''
            if parent_id.split('_')[0] == 't1':
                # The parent may be absent from the list (for example when it
                # was skipped for having no author), so guard against None.
                parent = next(
                    (known for known in comments if known.id == parent_id),
                    None)
                if parent is not None:
                    comment.parent_content = parent.content

            if comment.user:
                comments.append(comment)
            # Replies are walked even when the comment itself was not kept.
            self.add_replies(comments, data, level + 1)
        return comments

    def add_replies(self, comments, parent, level):
        """Process the replies nested under ``parent``, if there are any.

        Args:
            comments: List that collected ``Comment`` objects are appended to.
            parent: ``data`` dict of the parent comment; a missing or falsy
                ``replies`` entry means there is nothing to do.
            level: Nesting depth to assign to the replies.
        """
        replies = parent.get('replies')
        if not replies:
            return
        self.process(comments, replies['data']['children'], level)

    def load_subreddit_list(self, subreddit):
        """Fetch a subreddit listing and return one summary dict per post.

        Args:
            subreddit: Name of the subreddit, inserted into the request URL.

        Returns:
            A list of dicts with the keys ``subreddit``, ``score``,
            ``message_count``, ``user``, ``permalink``, ``created_utc``,
            ``url``, ``title``, ``ups`` and ``downs``.
        """
        listing = requests.get(
            'https://reddit.com/r/{}.json'.format(subreddit),
            headers={'User-Agent': self.USER_AGENT},
            timeout=self.TIMEOUT
        )
        posts = [child['data'] for child in listing.json()['data']['children']]
        return [dict(
            subreddit=post['subreddit'],
            score=post['score'],
            # Previously this duplicated the score; the comment count is
            # taken from "num_comments" (None if the field is absent).
            message_count=post.get('num_comments'),
            user=post['author'],
            permalink=post['permalink'],
            created_utc=post['created_utc'],
            url=post['url'],
            title=post['title'],
            ups=post['ups'],
            downs=post['downs'],
        ) for post in posts]

    def load_subreddit_posts(self, subreddit, topic_id, title):
        """Fetch one post's comment page and return its comments as a flat list.

        Args:
            subreddit: Name of the subreddit.
            topic_id: Identifier of the post, inserted into the request URL.
            title: Title slug of the post, inserted into the request URL.

        Returns:
            A list of ``Comment`` objects in depth-first order.
        """
        response = requests.get(
            'https://www.reddit.com/r/{}/comments/{}/{}/.json'.format(
                subreddit, topic_id, title),
            headers={'User-Agent': self.USER_AGENT},
            timeout=self.TIMEOUT
        )
        # The second element of the response carries the comment listing.
        children = response.json()[1]['data']['children']
        return self.process([], children, 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment