Last active
May 27, 2016 14:11
-
-
Save avoliva/ba8d157adb5c19f7dd2d to your computer and use it in GitHub Desktop.
Reddit comment loader/parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from BeautifulSoup import BeautifulSoup | |
import requests | |
from xml.sax import saxutils as su | |
class Comment(object):
    """One reddit comment: rendered HTML body plus display metadata.

    `parent_content` starts as None and is filled in later (by
    Reddit.process) when the parent is another comment already seen.
    """

    def __init__(self, id, html, author,
                 points, postedOn, level, parent=None):
        # Map constructor arguments onto the attribute names the rest of
        # the parser uses (html -> content, author -> user, ...).
        (self.id, self.content, self.user, self.points,
         self.created, self.level, self.parent) = (
            id, html, author, points, postedOn, level, parent)
        self.parent_content = None
class Reddit(object):
    """Fetch subreddit listings and recursively flatten comment trees
    from reddit's public .json endpoints."""

    def load_comment(self, data, level):
        """Build a Comment from the 'data' dict of one 't1' thing.

        data: the 'data' mapping of a reddit comment item.
        level: nesting depth (0 for top-level comments).
        Returns the new Comment (content still raw escaped body_html).
        """
        # BUG FIX: original read `=data.get('body_html')` — the `html`
        # keyword was missing, which is a syntax error.
        return Comment(
            id=data.get('name'),
            html=data.get('body_html'),
            author=data.get('author'),
            points=data.get('score'),
            postedOn=data.get('created_utc'),
            level=level,
            parent=data.get('parent_id'),
        )

    def process(self, comments, c, level):
        """Parse listing children `c`, appending Comment objects to
        `comments` and recursing into replies. Returns `comments`."""
        for item in c:
            # Only real comments ('t1' things); skips 'more' stubs and
            # items with no kind. (Collapses the original two checks.)
            if item.get('kind') != 't1':
                continue
            data = item.get('data')
            comment = self.load_comment(data, level)
            # body_html arrives HTML-escaped; unescape it, then join the
            # <p> children of the wrapping <div> with <br/><br/>.
            soup = BeautifulSoup(su.unescape(comment.content))
            comment.content = '<br/><br/>'.join(
                str(p.contents[0]) for p in soup.find('div').findAll('p'))
            if comment.parent and comment.parent.split('_')[0] == 't1':
                # Parent is another comment: copy its parsed content.
                # BUG FIX: guard against the parent not being in `comments`
                # (e.g. filtered out below) — original raised
                # AttributeError on the None fallback.
                parent = next(
                    (x for x in comments if x.id == comment.parent), None)
                if parent is not None:
                    comment.parent_content = parent.content
            # Drop author-less (deleted) comments but still walk replies.
            if comment.user:
                comments.append(comment)
            self.add_replies(comments, data, level + 1)
        return comments

    def add_replies(self, comments, parent, level):
        """Recurse into `parent`'s replies listing (if any) at `level`."""
        # 'replies' is an empty string when absent, else a listing dict.
        replies = parent.get('replies')
        if not replies:
            return
        self.process(comments, replies['data']['children'], level)

    def load_subreddit_list(self, subreddit):
        """Return a list of post-summary dicts for the subreddit's
        front-page listing."""
        headers = {
            'User-Agent': 'python/requests',
        }
        # Use www.reddit.com, consistent with load_subreddit_posts
        # (the bare domain just redirects there).
        listing = requests.get(
            'https://www.reddit.com/r/{}.json'.format(subreddit),
            headers=headers)
        return [dict(
            subreddit=c['data']['subreddit'],
            score=c['data']['score'],
            # BUG FIX: original duplicated 'score' here; the comment
            # count lives under 'num_comments' in the listing API.
            message_count=c['data']['num_comments'],
            user=c['data']['author'],
            permalink=c['data']['permalink'],
            created_utc=c['data']['created_utc'],
            url=c['data']['url'],
            title=c['data']['title'],
            ups=c['data']['ups'],
            downs=c['data']['downs'],
        ) for c in listing.json()['data']['children']]

    def load_subreddit_posts(self, subreddit, topic_id, title):
        """Return the flattened Comment list for one post.

        The endpoint returns a two-element array: [0] is the post
        itself, [1] is the comment listing — we parse only [1].
        """
        headers = {
            'User-Agent': 'python/requests',
        }
        posts = requests.get(
            'https://www.reddit.com/r/{}/comments/{}/{}/.json'.format(
                subreddit, topic_id, title),
            headers=headers,
        )
        return self.process(list(), posts.json()[1]['data']['children'], 0)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.