Skip to content

Instantly share code, notes, and snippets.

@anvaka
Created February 29, 2012 13:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anvaka/1941011 to your computer and use it in GitHub Desktop.
Save anvaka/1941011 to your computer and use it in GitHub Desktop.
Extract related subreddits from subreddit description.
import re
from models import RedditInfo
from google.appengine.ext.db import BadValueError
# Matches a subreddit reference such as "/r/science". A trailing slash is
# never part of the match.
reLink = re.compile(r'/r/[a-zA-Z0-9_-]+', re.I)


def parseRedditLinks(text, exclude=None):
    """
    Extracts the distinct subreddit links (e.g. '/r/science') mentioned in text.

    text    -- free-form text to scan; None yields an empty list.
    exclude -- a subreddit link to leave out of the result (typically the
               subreddit the text itself belongs to). It is compared
               case-insensitively and ignoring trailing slashes, so
               '/r/Science/' excludes '/r/science'. None excludes nothing.

    Returns a sorted list of unique, lower-cased links.
    """
    if text is None:
        return []

    # reLink never captures a trailing slash and every match is lower-cased
    # below, so bring `exclude` into the same shape before comparing.
    excluded = exclude.rstrip('/').lower() if exclude is not None else None

    links = set(match.lower() for match in reLink.findall(text))
    links.discard(excluded)
    return sorted(links)
def convertJsonToRedditInfoModel(entry):
    """
    Converts JSON data entry of subreddit description (e.g. http://www.reddit.com/r/science/about.json )
    to the model entry.

    entry -- mapping that provides the keys 'url', 'description',
             'display_name', 'name', 'title', 'created', 'created_utc',
             'over18', 'subscribers' and 'id'. A missing key raises KeyError.

    Returns a RedditInfo whose key_name is the lower-cased url and whose
    parsedLinks come from parseRedditLinks(description, exclude=url), or
    None when constructing the model raises BadValueError.
    """
    url = entry['url'].lower()
    description = entry['description']
    parsedLinks = parseRedditLinks(description, exclude=url)

    # Only the model constructor is guarded: it is the call that validates
    # the values and is expected to raise BadValueError.
    try:
        return RedditInfo(key_name=url,
                          url=url,
                          display_name=entry['display_name'],
                          name=entry['name'],
                          title=entry['title'],
                          created=entry['created'],
                          created_utc=entry['created_utc'],
                          over18=entry['over18'],
                          subscribers=entry['subscribers'],
                          id=entry['id'],
                          description=description,
                          parsedLinks=parsedLinks)
    except BadValueError:
        # Some entries have multiline urls which crash this code... I assume they are invalid.
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment