Skip to content

Instantly share code, notes, and snippets.

@andrew
Created November 13, 2011 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrew/1362791 to your computer and use it in GitHub Desktop.
Save andrew/1362791 to your computer and use it in GitHub Desktop.
Get a list of subreddits (with Python)
#!/usr/bin/env python
# Requires Python 2, lxml 2.0.3 (plus BeautifulSoup, which lxml.html.soupparser
# depends on) and httplib2. Public domain.
first_uri = 'http://www.reddit.com/reddits/'
import httplib2
import urlparse
import lxml.html.soupparser
def get_page(uri):
    """Download *uri* and return its body parsed by lxml's soupparser.

    The URI is echoed to stdout before the request is made.  The response
    headers/status returned by httplib2 are not inspected; only the body
    is handed to ``lxml.html.soupparser.fromstring``.
    """
    print('Processing %s' % uri)
    _headers, body = httplib2.Http().request(uri)
    return lxml.html.soupparser.fromstring(body)
def fetch_reddits():
    """Scrape the paginated subreddit listing that starts at ``first_uri``.

    Every page is fetched with ``get_page``.  Each ``div`` whose class
    contains ``subreddit`` contributes one entry, and the "next" link inside
    the ``nextprev`` paragraph is followed until a page has no such link
    (or the link leads back to a page that was already processed).

    Returns:
        A list of ``(name, uri, description, subscribers)`` tuples in the
        order they were encountered:

        * ``name`` -- text of the entry's title link, or its ``href`` when
          the link has no text.
        * ``uri`` -- the title link's ``href`` resolved against the URI of
          the page it was found on.
        * ``description`` -- first text node of the entry's description
          paragraph, or ``None`` when there is none.
        * ``subscribers`` -- first whitespace-separated token of the
          entry's score text, converted to ``int``.

    Raises:
        IndexError: an entry has no title link or no score text.
        KeyError: an entry's title link has no ``href`` attribute.
        ValueError: the score token is not an integer literal.
            These indicate an unexpected page layout and are deliberately
            not swallowed.
    """
    reddit_list = []
    current_uri = first_uri
    # Remember processed pages so a "next" link that points back to an
    # earlier page ends the crawl instead of looping forever.
    visited = set()

    while current_uri not in visited:
        visited.add(current_uri)
        page = get_page(current_uri)

        for reddit in page.xpath("//div[contains(@class, 'subreddit')]"):
            info = reddit.xpath("descendant::a[@class='title']")[0]
            href = info.attrib['href']
            name = info.text or href
            uri = urlparse.urljoin(current_uri, href)

            # The description is optional; test for it explicitly rather
            # than hiding every possible error behind a bare ``except``.
            descriptions = reddit.xpath(
                "descendant::p[@class='description']/text()")
            description = descriptions[0] if descriptions else None

            score_text = reddit.xpath(
                "descendant::span[contains(@class, 'score')]/text()")[0]
            subscribers = int(score_text.split()[0])

            reddit_list.append((name, uri, description, subscribers))

        # No "next" link (or one without an href) means this was the last page.
        next_links = page.xpath(
            "//p[@class='nextprev']/a[contains(text(),'next')]")
        next_href = next_links[0].attrib.get('href') if next_links else None
        if next_href is None:
            break
        current_uri = urlparse.urljoin(current_uri, next_href)

    return reddit_list
if __name__ == '__main__':
    # Script entry point: scrape every subreddit, order them by subscriber
    # count (largest first) and write the result to subreddits.html.
    import io
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    reddits = fetch_reddits()
    # Index 3 of each tuple is the subscriber count.  The sort is stable, so
    # entries with equal counts stay in the order they were scraped.
    reddits.sort(key=lambda reddit: reddit[3], reverse=True)

    # Write text and let the file object encode it once, as UTF-8, matching
    # the charset declared in the page's <meta> tag.  ``with`` guarantees the
    # file is closed even if a write fails part-way through.
    with io.open('subreddits.html', 'w', encoding='utf-8') as out:
        out.write(u'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"><title>List of subreddits</title><h1>List of subreddits</h1><ul>')
        for name, uri, description, subscribers in reddits:
            # Names, descriptions and links come from scraped pages, so they
            # are escaped (quotes included, for the href attribute) before
            # being placed into the markup.
            out.write(u'<li><h2><a href="%s">%s</a></h2><p>%d subscribers</p><p>%s</p></li>' % (
                escape(uri, True),
                escape(name, True),
                subscribers,
                escape(description or u'', True),
            ))
        out.write(u'</ul><p>Generated with <a href="subreddits.py">this Python script</a>.</p>')

    print('Found %d subreddits, saved to subreddits.html.' % len(reddits))
@chapmanjacobd
Copy link

I pulled out 2.7 million subreddits using Pushshift.

You can view the full list, as well as a filtered-down list of 50,000, here: https://github.com/chapmanjacobd/reddit_mining/blob/main/insights.md#out-of-millions-of-subreddits-these-are-the-top-50000-link-post-subreddits

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment