@nicokoch
Last active August 29, 2015 14:03
Command line script to download images from reddit image-posts
#!/usr/bin/env python2
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA


def get_submissions(subreddit, count, filter):
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return filter(sr, count)


def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res


def filter_for_imgur(urls):
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res


def get_content_type(response):
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1]


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"


def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default='10', type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()


def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual


def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }


def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# To use this, install the following python packages (for example with pip):
# beautifulsoup4
# praw
# py3-progressbar
import os
import getpass
import urllib.request, urllib.error, urllib.parse
from io import BytesIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA


def get_submissions(subreddit, count, filter):
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return list(filter(sr, count))


def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res


def filter_for_imgur(urls):
    res = []
    for url in urls:
        if "imgur" in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib.request.urlopen(url)
                except urllib.error.HTTPError as e:
                    print(url + ": \t" + str(e.code) + " " + e.msg)
                    continue
                except urllib.error.URLError as e:
                    print("Could not download " + url)
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.HTTPError as e:
                print(url + ": \t" + str(e.code) + " " + e.msg)
                continue
            except urllib.error.URLError as e:
                print("Could not download " + url)
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res


def get_content_type(response):
    return response.info().get("Content-Type")


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"


def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default='10', type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()


def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print(directory + " could not be found")
    for i, url in enumerate(urls):
        try:
            response = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            print(url + ": \t" + str(e.code) + " " + e.msg)
            continue
        except urllib.error.URLError as e:
            print("Could not download " + url)
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = BytesIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.read())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print("Could not read the following urls:")
        for url in not_read:
            print(url)
    return actual


def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }


def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print("Found " + str(len(urls)) + " reddit threads")
    urls = filter_for_imgur(urls)
    print("Found " + str(len(urls)) + " image links")
    actual = download_images(urls, args.output)
    print("Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory"))


if __name__ == "__main__":
    main()
nicokoch commented Jul 6, 2014

This script can be used to download images from reddit posts.

Dependencies (all installable by pip)

beautifulsoup4
praw
progressbar

The other imports are part of the Python standard library.
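
Assuming pip is set up for the Python version you run the script with (the command may be called pip, pip2, or pip3 on your system), the three packages can be installed in one step:

pip install beautifulsoup4 praw progressbar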

Usage

For example, you can download wallpapers from /r/wallpapers.

usage: reddit_dl.py [-h] [--count COUNT] [--output OUTPUT]
                    [--category {top,top-all,top-day,top-hour,top-month,top-week,top-year,con,con-all,con-day,con-hour,con-month,con-week,con-year,hot,new,new-bydate,new-byrising,random,rising}]
                    subreddit

Example:

./reddit_dl.py --output ~/wallpapers --count 10 --category top-all wallpapers
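
If the script is not marked executable, the same call can be made through the interpreter directly (assuming the file is saved as reddit_dl.py in the current directory):

python2 reddit_dl.py --output ~/wallpapers --count 10 --category top-all wallpapers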

@vizakenjaro

Is there any way to contact you privately? This script returns an error when I try to run it.


nicokoch commented Jul 5, 2015

Just ask here, so others can benefit from a solution. What's the error message?

Edit: So I just tested the script and it still works fine for me. Make sure you follow the steps from the first comment (install all the dependencies). In case you are using python3 instead of python2, I just added a new script for that (the dependencies of the python3 script are described in the code comment).
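
For the python3 script, installing its dependencies would look roughly like this (package names taken from the script's header comment; adjust pip3 to whatever pip points at your Python 3 installation):

pip3 install beautifulsoup4 praw py3-progressbar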
