Created
November 13, 2011 21:57
-
-
Save andrew/1362791 to your computer and use it in GitHub Desktop.
Get a list of subreddits (with Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Requires lxml 2.0.3 and httplib2. Public domain.

import urlparse

import httplib2
import lxml.html.soupparser

# First page of reddit's subreddit listing; the crawl starts here.
first_uri = 'http://www.reddit.com/reddits/'
def get_page(uri): | |
print 'Processing %s' % uri | |
http = httplib2.Http() | |
response, content = http.request(uri) | |
return lxml.html.soupparser.fromstring(content) | |
def fetch_reddits():
    """Crawl the paginated subreddit listing and return a list of tuples.

    Each tuple is (name, uri, description, subscribers); description is
    None when a subreddit has no blurb. Follows the "next" links until the
    last page.
    """
    reddit_list = []
    current_uri = first_uri
    while True:
        page = get_page(current_uri)
        for reddit in page.xpath("//div[contains(@class, 'subreddit')]"):
            info = reddit.xpath("descendant::a[@class='title']")[0]
            # Some titles have no text; fall back to the link target.
            name = info.text or info.attrib['href']
            uri = urlparse.urljoin(current_uri, info.attrib['href'])
            try:
                description = reddit.xpath("descendant::p[@class='description']/text()")[0]
            except IndexError:
                # No description paragraph for this subreddit.
                description = None
            # Score text looks like "12345 subscribers"; keep the number.
            subscribers = int(reddit.xpath("descendant::span[contains(@class, 'score')]/text()")[0].split()[0])
            reddit_list.append((name, uri, description, subscribers))
        try:
            next_link = page.xpath("//p[@class='nextprev']/a[contains(text(),'next')]")[0]
        except IndexError:
            # No "next" link: this was the last page of the listing.
            break
        current_uri = urlparse.urljoin(current_uri, next_link.attrib['href'])
    return reddit_list
if __name__ == '__main__': | |
reddits = fetch_reddits() | |
reddits.sort(key=lambda reddit: reddit[3]) | |
reddits.reverse() | |
html = open('subreddits.html', 'w+') | |
html.write('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"><title>List of subreddits</title><h1>List of subreddits</h1><ul>') | |
for reddit in reddits: | |
name, uri, description, subscribers = reddit[0].encode('utf-8'), reddit[1].encode('utf-8'), (reddit[2] or '').encode('utf-8'), reddit[3] | |
html.write('<li><h2><a href="%s">%s</a></h2><p>%d subscribers</p><p>%s</p></li>' % (uri, name, subscribers, description)) | |
html.write('</ul><p>Generated with <a href="subreddits.py">this Python script</a>.</p>') | |
html.close() | |
print 'Found %d subreddits, saved to subreddits.html.' % len(reddits) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I pulled out 2.7 million subreddits using pushshift.
You can view the full list, as well as a filtered down list of 50,000 here: https://github.com/chapmanjacobd/reddit_mining/blob/main/insights.md#out-of-millions-of-subreddits-these-are-the-top-50000-link-post-subreddits