"""
The reddit image downloader, done as an exercise following
advice from feroc on r/learnprogramming.

See the docstring of SubredditScraper for more info on what
this does.

usage: python redditscraper.py [user-name]

Run with no arguments to download images from the top posters on the
subreddit's front page, or pass a user name to download only that
user's submissions to the subreddit specified in main().
"""
import urllib
import urllib2
import json
import re
import sys
import os


class ImageRetriever(object):
    """
    Base class for retrieving image urls from a specific source.
    """
    def is_valid(self, url):
        raise NotImplementedError

    def retrieve_image_urls(self, url):
        raise NotImplementedError
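

# A new image host could be supported by subclassing ImageRetriever and
# implementing both methods. A minimal sketch (FlickrRetriever and its
# pattern are hypothetical, not part of this script):
#
#     class FlickrRetriever(ImageRetriever):
#         def is_valid(self, url):
#             return "flickr.com" in url
#
#         def retrieve_image_urls(self, url):
#             html = urllib.urlopen(url).read()
#             return re.findall(r'"(http:\/\/[^"]+\.jpg)"', html)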


class MinusRetriever(ImageRetriever):
    """
    Image retriever for the min.us site. Doesn't support albums.
    """
    def retrieve_image_urls(self, url):
        html = urllib.urlopen(url).read()
        # The extension group is non-capturing, so findall returns the
        # full image urls directly
        images = re.findall(
            r'<meta property="og:image"'
            r' content="(http:\/\/i1?\.minus\.com\/[a-zA-Z0-9]+'
            r'\.(?:gif|jpeg|jpg|png))"',
            html)
        return images

    def is_valid(self, url):
        return "min.us" in url


class ImgurRetriever(ImageRetriever):
    """
    Image retriever for the imgur.com site. Supports albums.
    """
    def is_valid(self, url):
        return "imgur.com" in url

    def retrieve_image_urls(self, url):
        # Direct links can be used as-is; otherwise resolve the page
        # to either an album or a single hosted image
        extension = self._get_extension(url)
        if extension is None:
            if self.is_album(url):
                return self._extract_urls_from_album(url)
            else:
                return [self._find_direct_url(url)]
        else:
            return [url]

    def is_album(self, url):
        return re.search(r'\/a\/[0-9a-zA-Z]{5}', url) is not None

    def _get_extension(self, url):
        ext = re.findall(r'\.(gif|jpeg|jpg|png)', url)
        if ext:
            return ext[0]
        return None

    def _find_direct_url(self, url):
        html = urllib.urlopen(url).read()
        # Capture the protocol-relative url of the hosted image
        images = re.findall(
            r'img src="(\/\/i\.imgur\.com\/[a-zA-Z0-9]+'
            r'\.(?:jpg|jpeg|png|gif))"',
            html)
        if images:
            return "http:" + images[0]
        return None

    def _extract_urls_from_album(self, url):
        html = urllib.urlopen(url).read()
        # The negative lookbehind skips lazy-load entries preceded by an
        # empty alt attribute
        images = re.findall(
            r'(?<!alt="") data-src="(\/\/i\.imgur\.com\/[a-zA-Z0-9]+'
            r'\.(?:jpg|jpeg|png|gif))"',
            html)
        return ["http:" + image for image in images]


class SubredditScraper(object):
    """
    Retrieves images from the top users on a particular subreddit.
    Builds a list of N=frontpage_limit users and downloads images
    from their top M=user_limit posts to the same subreddit.
    Images are saved into the r_<subreddit>_dump directory,
    with subfolders organized by user.
    """
    def __init__(self, subreddit, user_limit=None, frontpage_limit=10):
        self.subreddit = subreddit
        self.user_limit = user_limit
        self.frontpage_limit = frontpage_limit
        # Standard image sources; only min.us and imgur.com are supported
        self.image_retrievers = [ImgurRetriever(), MinusRetriever()]
        self.queue = {}
        # Check for the dump directory and create it if needed
        cwd = os.getcwd()
        self.img_dir = "{}{}r_{}_dump".format(cwd, os.sep, self.subreddit)
        if not os.path.exists(self.img_dir):
            print "Creating /r/{} dump directory".format(self.subreddit)
            os.mkdir(self.img_dir)
        else:
            print "Subreddit directory already exists. Adding to it."
    def enqueue_frontpage(self, sorting=None):
        """
        Fills the download queue with image links from the top
        N=frontpage_limit users.
        """
        for user in self._get_users_on_frontpage(sorting):
            self.enqueue_user(user)

    def enqueue_user(self, user):
        """
        Adds images from the top M=user_limit submissions to this
        subreddit to the download queue.
        """
        urls = self._get_user_submissions(user)
        direct = []
        for url in urls:
            # Hand each link to the first retriever that recognizes it
            for r in self.image_retrievers:
                if r.is_valid(url):
                    direct += r.retrieve_image_urls(url)
                    break
        self.queue[user] = direct
    def _safely_read_json_content(self, url):
        """
        Safely retrieves json content from reddit as per the API
        documentation.
        """
        # reddit's API requires a descriptive User-Agent header
        hdr = {'User-Agent':
               'Educational reddit post scraper for imgur albums'}
        request = urllib2.Request(url, headers=hdr)
        try:
            html = urllib2.urlopen(request).read()
            return json.loads(html.decode('utf8'))['data']['children']
        except (urllib2.URLError, ValueError, KeyError):
            print "Error retrieving content from {}".format(url)
            return []
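
    # The listing returned by reddit looks roughly like this (abridged);
    # the fields used below are 'author', 'subreddit', 'is_self' and 'url':
    #
    #   {"data": {"children": [
    #       {"data": {"author": "...", "subreddit": "...",
    #                 "is_self": false, "url": "http://imgur.com/..."}},
    #       ...
    #   ]}}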
    def _get_user_submissions(self, user):
        """
        Retrieves the urls of a user's link submissions to this subreddit.
        """
        url = ("http://www.reddit.com/user/" + user +
               "/submitted/.json")
        if self.user_limit is not None:
            url += "?limit=" + str(self.user_limit)
        json_content = self._safely_read_json_content(url)
        urls = []
        posts = (x for x in json_content if x['data']['subreddit'] ==
                 self.subreddit)
        for post in posts:
            # Self posts carry no external image url
            if not post['data']['is_self']:
                urls.append(post['data']['url'])
        return urls
    def _get_users_on_frontpage(self, sorting=None):
        """
        Retrieves a list of current frontpage submission authors.
        The sorting argument is accepted but not yet used.
        """
        subreddit_url = (
            'http://www.reddit.com/r/' +
            self.subreddit +
            '/.json?limit=' +
            str(self.frontpage_limit)
        )
        posts = self._safely_read_json_content(subreddit_url)
        return [post['data']['author'] for post in posts]
    def print_banner(self, text, length):
        print "-" * length + '\n' + text + '\n' + '-' * length + '\n'

    def check_user_directory(self, user):
        """
        Checks for the existence of a user directory and creates one
        if it does not exist.
        """
        user_dir = self.img_dir + os.sep + user
        if not os.path.exists(user_dir):
            print "Creating directory for {}.\n".format(user)
            os.mkdir(user_dir)
        else:
            print "Adding to existing directory for {}.\n".format(user)
        return user_dir
    def download_all(self):
        """
        Empties the download queue, retrieving each image that has
        not already been downloaded.
        """
        total_downloaded = 0
        downloaded_users = {}
        for user in self.queue:
            downloaded = 0
            user_dir = self.check_user_directory(user)
            self.print_banner(user, 35)
            for url in self.queue[user]:
                if url is not None and self.save_image(user_dir, url):
                    downloaded += 1
            total_downloaded += downloaded
            if downloaded > 0:
                downloaded_users[user] = downloaded
        self.print_banner("Summary", 35)
        if total_downloaded != 0:
            print "{} images downloaded from {} users.".format(
                total_downloaded,
                len(downloaded_users))
            for user in downloaded_users:
                print "{}: {}".format(user, downloaded_users[user])
        else:
            print "No images downloaded."
    def save_image(self, folder, url):
        """
        Checks for the existence of an image and downloads it
        if it is not already there.
        """
        # Use the final path segment as the file name so that urls from
        # any supported host resolve correctly
        image_name = url.split('/')[-1]
        image_path = folder + os.sep + image_name
        if not os.path.exists(image_path):
            print "Downloading image: {}".format(url)
            urllib.urlretrieve(url, image_path)
            return True
        return False


def main():
    scraper = SubredditScraper("aww")
    if len(sys.argv) != 2:
        # Enqueue from the entire frontpage
        scraper.enqueue_frontpage()
    else:
        # Enqueue from a particular user
        scraper.enqueue_user(sys.argv[1])
    # Needed to actually empty the queue
    scraper.download_all()


if __name__ == '__main__':
    main()
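
# The scraper can also be driven programmatically; a minimal sketch
# (the subreddit name and limits here are arbitrary examples):
#
#     scraper = SubredditScraper("pics", user_limit=25, frontpage_limit=5)
#     scraper.enqueue_frontpage()
#     scraper.download_all()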