"""
The reddit image downloader, done as an exercise following
advice from feroc on r/learnprogramming.

See the docstring of SubredditScraper for more info on what
this does.

usage: python redditscraper.py [user-name]

Run with no arguments to download images from the top posters on the
subreddit's front page, or pass a user name to download only that
user's submissions to the subreddit specified in main().
"""
import urllib
import urllib2
import json
import re
import sys
import os


class ImageRetriever(object):
    """
    Base class for retrieving image urls from a specific source.
    """
    def is_valid(self, url):
        raise NotImplementedError

    def retrieve_image_urls(self, url):
        raise NotImplementedError
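

# A new image host could be supported by subclassing ImageRetriever and
# implementing both methods. A minimal sketch (FlickrRetriever and its
# pattern are hypothetical, not part of this script):
#
#     class FlickrRetriever(ImageRetriever):
#         def is_valid(self, url):
#             return "flickr.com" in url
#
#         def retrieve_image_urls(self, url):
#             html = urllib.urlopen(url).read()
#             return re.findall(r'"(http:\/\/[^"]+\.jpg)"', html)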


class MinusRetriever(ImageRetriever):
    """
    Image retriever for the min.us site. Doesn't support albums.
    """
    def retrieve_image_urls(self, url):
        html = urllib.urlopen(url).read()
        # The extension group is non-capturing, so findall returns the
        # full image urls directly
        images = re.findall(
            r'<meta property="og:image"'
            r' content="(http:\/\/i1?\.minus\.com\/[a-zA-Z0-9]+'
            r'\.(?:gif|jpeg|jpg|png))"',
            html)
        return images

    def is_valid(self, url):
        return "min.us" in url


class ImgurRetriever(ImageRetriever):
    """
    Image retriever for the imgur.com site. Supports albums.
    """
    def is_valid(self, url):
        return "imgur.com" in url

    def retrieve_image_urls(self, url):
        # Direct links can be used as-is; otherwise resolve the page
        # to either an album or a single hosted image
        extension = self._get_extension(url)
        if extension is None:
            if self.is_album(url):
                return self._extract_urls_from_album(url)
            else:
                return [self._find_direct_url(url)]
        else:
            return [url]

    def is_album(self, url):
        return re.search(r'\/a\/[0-9a-zA-Z]{5}', url) is not None

    def _get_extension(self, url):
        ext = re.findall(r'\.(gif|jpeg|jpg|png)', url)
        if ext:
            return ext[0]
        return None

    def _find_direct_url(self, url):
        html = urllib.urlopen(url).read()
        # Capture the protocol-relative url of the hosted image
        images = re.findall(
            r'img src="(\/\/i\.imgur\.com\/[a-zA-Z0-9]+'
            r'\.(?:jpg|jpeg|png|gif))"',
            html)
        if images:
            return "http:" + images[0]
        return None

    def _extract_urls_from_album(self, url):
        html = urllib.urlopen(url).read()
        # The negative lookbehind skips lazy-load entries preceded by an
        # empty alt attribute
        images = re.findall(
            r'(?<!alt="") data-src="(\/\/i\.imgur\.com\/[a-zA-Z0-9]+'
            r'\.(?:jpg|jpeg|png|gif))"',
            html)
        return ["http:" + image for image in images]


class SubredditScraper(object):
    """
    Retrieves images from the top users on a particular subreddit.
    Builds a list of N=frontpage_limit users and downloads images
    from their top M=user_limit posts to the same subreddit.
    Images are saved into the r_<subreddit>_dump directory,
    with subfolders organized by user.
    """
    def __init__(self, subreddit, user_limit=None, frontpage_limit=10):
        self.subreddit = subreddit
        self.user_limit = user_limit
        self.frontpage_limit = frontpage_limit
        # Standard image sources; only min.us and imgur.com are supported
        self.image_retrievers = [ImgurRetriever(), MinusRetriever()]
        self.queue = {}
        # Check for the dump directory and create it if needed
        cwd = os.getcwd()
        self.img_dir = "{}{}r_{}_dump".format(cwd, os.sep, self.subreddit)
        if not os.path.exists(self.img_dir):
            print "Creating /r/{} dump directory".format(self.subreddit)
            os.mkdir(self.img_dir)
        else:
            print "Subreddit directory already exists. Adding to it."
    def enqueue_frontpage(self, sorting=None):
        """
        Fills the download queue with image links from the top
        N=frontpage_limit users.
        """
        for user in self._get_users_on_frontpage(sorting):
            self.enqueue_user(user)

    def enqueue_user(self, user):
        """
        Adds images from the top M=user_limit submissions to this
        subreddit to the download queue.
        """
        urls = self._get_user_submissions(user)
        direct = []
        for url in urls:
            # Hand each link to the first retriever that recognizes it
            for r in self.image_retrievers:
                if r.is_valid(url):
                    direct += r.retrieve_image_urls(url)
                    break
        self.queue[user] = direct
    def _safely_read_json_content(self, url):
        """
        Safely retrieves json content from reddit as per the API
        documentation.
        """
        # reddit's API requires a descriptive User-Agent header
        hdr = {'User-Agent':
               'Educational reddit post scraper for imgur albums'}
        request = urllib2.Request(url, headers=hdr)
        try:
            html = urllib2.urlopen(request).read()
            return json.loads(html.decode('utf8'))['data']['children']
        except (urllib2.URLError, ValueError, KeyError):
            print "Error retrieving content from {}".format(url)
            return []
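
    # The listing returned by reddit looks roughly like this (abridged);
    # the fields used below are 'author', 'subreddit', 'is_self' and 'url':
    #
    #   {"data": {"children": [
    #       {"data": {"author": "...", "subreddit": "...",
    #                 "is_self": false, "url": "http://imgur.com/..."}},
    #       ...
    #   ]}}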
    def _get_user_submissions(self, user):
        """
        Retrieves the urls of a user's link submissions to this subreddit.
        """
        url = ("http://www.reddit.com/user/" + user +
               "/submitted/.json")
        if self.user_limit is not None:
            url += "?limit=" + str(self.user_limit)
        json_content = self._safely_read_json_content(url)
        urls = []
        posts = (x for x in json_content if x['data']['subreddit'] ==
                 self.subreddit)
        for post in posts:
            # Self posts carry no external image url
            if not post['data']['is_self']:
                urls.append(post['data']['url'])
        return urls
    def _get_users_on_frontpage(self, sorting=None):
        """
        Retrieves a list of current frontpage submission authors.
        The sorting argument is accepted but not yet used.
        """
        subreddit_url = (
            'http://www.reddit.com/r/' +
            self.subreddit +
            '/.json?limit=' +
            str(self.frontpage_limit)
        )
        posts = self._safely_read_json_content(subreddit_url)
        return [post['data']['author'] for post in posts]
    def print_banner(self, text, length):
        print "-" * length + '\n' + text + '\n' + '-' * length + '\n'

    def check_user_directory(self, user):
        """
        Checks for the existence of a user directory and creates one
        if it does not exist.
        """
        user_dir = self.img_dir + os.sep + user
        if not os.path.exists(user_dir):
            print "Creating directory for {}.\n".format(user)
            os.mkdir(user_dir)
        else:
            print "Adding to existing directory for {}.\n".format(user)
        return user_dir
    def download_all(self):
        """
        Empties the download queue, retrieving each image that has
        not already been downloaded.
        """
        total_downloaded = 0
        downloaded_users = {}
        for user in self.queue:
            downloaded = 0
            user_dir = self.check_user_directory(user)
            self.print_banner(user, 35)
            for url in self.queue[user]:
                if url is not None and self.save_image(user_dir, url):
                    downloaded += 1
            total_downloaded += downloaded
            if downloaded > 0:
                downloaded_users[user] = downloaded
        self.print_banner("Summary", 35)
        if total_downloaded != 0:
            print "{} images downloaded from {} users.".format(
                total_downloaded,
                len(downloaded_users))
            for user in downloaded_users:
                print "{}: {}".format(user, downloaded_users[user])
        else:
            print "No images downloaded."
    def save_image(self, folder, url):
        """
        Checks for the existence of an image and downloads it
        if it is not already there.
        """
        # Use the final path segment as the file name so that urls from
        # any supported host resolve correctly
        image_name = url.split('/')[-1]
        image_path = folder + os.sep + image_name
        if not os.path.exists(image_path):
            print "Downloading image: {}".format(url)
            urllib.urlretrieve(url, image_path)
            return True
        return False


def main():
    scraper = SubredditScraper("aww")
    if len(sys.argv) != 2:
        # Enqueue from the entire frontpage
        scraper.enqueue_frontpage()
    else:
        # Enqueue from a particular user
        scraper.enqueue_user(sys.argv[1])
    # Needed to actually empty the queue
    scraper.download_all()


if __name__ == '__main__':
    main()
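
# The scraper can also be driven programmatically; a minimal sketch
# (the subreddit name and limits here are arbitrary examples):
#
#     scraper = SubredditScraper("pics", user_limit=25, frontpage_limit=5)
#     scraper.enqueue_frontpage()
#     scraper.download_all()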