"""
The reddit image downloader, done as an exercise as per
advice from feroc on r/learnprogramming.

See the docstring of SubredditScraper for more info on what
this does.

usage: python redditscraper.py [user-name]

With no argument, images are downloaded from the top users on
the frontpage of the subreddit specified in main. Given a user
name, only that user's submissions to the subreddit are
downloaded.
"""
import json
import os
import re
import sys
import urllib
import urllib2


class ImageRetriever(object):
    """
    Base class for retrieving image urls from a specific source.
    """
    def is_valid(self, url):
        raise NotImplementedError

    def retrieve_image_urls(self, url):
        raise NotImplementedError
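
# A new source could be supported by subclassing ImageRetriever, e.g.
# (hypothetical host, sketch only):
#
#     class ExampleRetriever(ImageRetriever):
#         def is_valid(self, url):
#             return "example.com" in url
#
#         def retrieve_image_urls(self, url):
#             return [url]  # assumes example.com only serves direct links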


class MinusRetriever(ImageRetriever):
    """
    Image retriever for the min.us site. Doesn't support albums.
    """
    def retrieve_image_urls(self, url):
        html = urllib.urlopen(url).read()
        # findall returns (url, extension) tuples here, since the
        # pattern contains two groups; only the full url is wanted
        images = re.findall(
            '<meta property="og:image"'
            ' content="(http:\/\/i1?\.minus\.com\/[a-zA-Z0-9]+'
            '\.(gif|jpeg|jpg|png))"',
            html)
        if images:
            return [images[0][0]]
        return []

    def is_valid(self, url):
        return url.find("min.us") != -1


class ImgurRetriever(ImageRetriever):
    """
    Image retriever for the imgur.com site. Supports albums.
    """
    def is_valid(self, url):
        return url.find("imgur.com") != -1

    def retrieve_image_urls(self, url):
        extension = self._get_extension(url)
        if extension is None:
            if self.is_album(url):
                return self._extract_urls_from_album(url)
            else:
                return [self._find_direct_url(url)]
        else:
            return [url]
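
    # Three url shapes are handled above: a direct image link (has a
    # file extension), an album ("/a/<id>"), and a single-image page,
    # which is scraped for its direct link.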

    def is_album(self, url):
        return re.search("\/a\/[0-9a-zA-Z]{5}", url) is not None

    def _get_extension(self, url):
        ext = re.findall('\.(gif|jpeg|jpg|png)', url)
        if ext:
            return ext[0]
        return None

    def _find_direct_url(self, url):
        html = urllib.urlopen(url).read()
        images = re.findall(
            'img src="(\/\/i\.imgur\.com\/([a-zA-Z0-9]+)'
            '\.(jpg|jpeg|png|gif))"',
            html)
        if images:
            return "http:" + images[0][0]
        return None

    def _extract_urls_from_album(self, url):
        html = urllib.urlopen(url).read()
        images = re.findall(
            '(?<!alt="") data-src="(\/\/i\.imgur\.com\/([a-zA-Z0-9]+)'
            '\.(jpg|jpeg|png|gif))"',
            html)
        return ["http:" + image[0] for image in images]


class SubredditScraper(object):
    """
    Retrieves images from the top users on a particular subreddit.

    Builds a list of N=frontpage_limit users and downloads images
    from their top M=user_limit posts to the same subreddit.
    Images are saved into the r_<subreddit-name>_dump directory,
    with subfolders organized by user.
    """
    def __init__(self, subreddit, user_limit=None, frontpage_limit=10):
        self.subreddit = subreddit
        self.user_limit = user_limit
        self.frontpage_limit = frontpage_limit
        # Standard image sources, only min.us and imgur.com supported
        self.image_retrievers = [ImgurRetriever(), MinusRetriever()]
        self.queue = {}
        # Check for the dump directory and create it if necessary
        self.img_dir = os.path.join(os.getcwd(),
                                    "r_{}_dump".format(self.subreddit))
        if not os.path.exists(self.img_dir):
            print "Creating /r/{} dump directory".format(self.subreddit)
            os.mkdir(self.img_dir)
        else:
            print "Subreddit directory already exists. Adding to it."

    def enqueue_frontpage(self, sorting=None):
        """
        Fills the download queue with image links from the top
        N=frontpage_limit users.
        """
        for user in self._get_users_on_frontpage(sorting):
            self.enqueue_user(user)

    def enqueue_user(self, user):
        """
        Adds images from the user's top M=user_limit submissions to
        this subreddit to the download queue.
        """
        urls = self._get_user_submissions(user)
        direct = []
        for url in urls:
            # Hand each submission url to the first retriever that
            # recognizes its host
            for r in self.image_retrievers:
                if r.is_valid(url):
                    direct += r.retrieve_image_urls(url)
                    break
        self.queue[user] = direct
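
    # Typical use (names hypothetical):
    #
    #     scraper = SubredditScraper("aww", user_limit=25)
    #     scraper.enqueue_user("some_user")
    #     scraper.download_all()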

    def _safely_read_json_content(self, url):
        """
        Safely retrieves json content from reddit as per the API
        documentation.
        """
        # A descriptive user agent is required by the API
        hdr = {'User-Agent':
               'Educational reddit post scraper for imgur albums'}
        request = urllib2.Request(url, headers=hdr)
        try:
            html = urllib2.urlopen(request).read()
            return json.loads(html.decode('utf8'))['data']['children']
        except (urllib2.URLError, ValueError, KeyError):
            print "Error retrieving listing from {}".format(url)
            return []
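
    # The listing endpoints used below return json shaped roughly like:
    #
    #     {"kind": "Listing",
    #      "data": {"children": [
    #          {"kind": "t3",
    #           "data": {"author": ..., "subreddit": ...,
    #                    "is_self": ..., "url": ...}},
    #          ...]}}
    #
    # so ['data']['children'] above is the list of posts.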

    def _get_user_submissions(self, user):
        """
        Retrieves json data consisting of user submissions.
        """
        url = ("http://www.reddit.com/user/" + user +
               "/submitted/.json")
        if self.user_limit is not None:
            url += "?limit=" + str(self.user_limit)
        json_content = self._safely_read_json_content(url)
        urls = []
        posts = (x for x in json_content if x['data']['subreddit'] ==
                 self.subreddit)
        for post in posts:
            if not post['data']['is_self']:
                urls.append(post['data']['url'])
        return urls

    def _get_users_on_frontpage(self, sorting=None):
        """
        Retrieves a list of current frontpage submission authors.
        sorting is accepted for future use but not currently applied.
        """
        subreddit_url = (
            'http://www.reddit.com/r/' +
            self.subreddit +
            '/.json?limit=' +
            str(self.frontpage_limit)
        )
        posts = self._safely_read_json_content(subreddit_url)
        return [post['data']['author'] for post in posts]

    def print_banner(self, text, length):
        print "-" * length + '\n' + text + '\n' + '-' * length + '\n'

    def check_user_directory(self, user):
        """
        Checks for the existence of the user directory and creates
        one if it does not exist.
        """
        user_dir = os.path.join(self.img_dir, user)
        if not os.path.exists(user_dir):
            print "Creating directory for {}.\n".format(user)
            os.mkdir(user_dir)
        else:
            print "Checking existing directory for {}.\n".format(user)
        return user_dir

    def download_all(self):
        """
        Empties the download queue, retrieving each image that is
        not already downloaded.
        """
        total_downloaded = 0
        downloaded_users = {}
        for user, images in self.queue.items():
            downloaded = 0
            user_dir = self.check_user_directory(user)
            self.print_banner(user, 35)
            for url in images:
                if url is not None and self.save_image(user_dir, url):
                    downloaded += 1
            total_downloaded += downloaded
            if downloaded > 0:
                downloaded_users[user] = downloaded
        self.print_banner("Summary", 35)
        if total_downloaded != 0:
            print "{} images downloaded from {} users.".format(
                total_downloaded,
                len(downloaded_users))
            for user in downloaded_users:
                print "{}: {}".format(user, downloaded_users[user])
        else:
            print "No images downloaded."

    def save_image(self, folder, url):
        """
        Checks for the existence of an image and downloads it
        if it is not there.
        """
        # Derive the filename from the last path segment rather than
        # assuming an imgur-length prefix, since min.us urls differ
        image_name = url.rsplit('/', 1)[-1]
        image_path = os.path.join(folder, image_name)
        if not os.path.exists(image_path):
            print "Downloading image: {}".format(url)
            urllib.urlretrieve(url, image_path)
            return True
        return False
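
    # e.g. "http://i.imgur.com/AbCd123.jpg".rsplit('/', 1)[-1]
    # yields "AbCd123.jpg"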


def main():
    scraper = SubredditScraper("aww")
    if len(sys.argv) != 2:
        # Enqueue from the entire frontpage
        scraper.enqueue_frontpage()
    else:
        # Enqueue from a particular user
        scraper.enqueue_user(sys.argv[1])
    # Needed to actually empty the queue
    scraper.download_all()


if __name__ == '__main__':
    main()