@nicokoch
Last active August 29, 2015 14:03
Command line script to download images from reddit image-posts
#!/usr/bin/env python2
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA


def get_submissions(subreddit, count, filter):
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return filter(sr, count)


def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res


def filter_for_imgur(urls):
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res


def get_content_type(response):
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1]


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"


def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default='10', type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()


def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual


def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }


def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# To use this, install the following python packages (for example with pip):
# beautifulsoup4
# praw
# py3-progressbar
import os
import getpass
import urllib.request, urllib.error, urllib.parse
from io import BytesIO
import time
import datetime
import argparse
import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA


def get_submissions(subreddit, count, filter):
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return list(filter(sr, count))


def get_links(submissions):
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res


def filter_for_imgur(urls):
    res = []
    for url in urls:
        if "imgur" in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib.request.urlopen(url)
                except urllib.error.HTTPError as e:
                    print(url + ": \t" + str(e.code) + " " + e.msg)
                    continue
                except urllib.error.URLError as e:
                    print("Could not download " + url)
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                soup = BeautifulSoup(response.read())
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.HTTPError as e:
                print(url + ": \t" + str(e.code) + " " + e.msg)
                continue
            except urllib.error.URLError as e:
                print("Could not download " + url)
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res


def get_content_type(response):
    return response.info().get("Content-Type")


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"


def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default='10', type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()


def download_images(urls, directory):
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print(directory + " could not be found")
    for i, url in enumerate(urls):
        try:
            response = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            print(url + ": \t" + str(e.code) + " " + e.msg)
            continue
        except urllib.error.URLError as e:
            print("Could not download " + url)
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = BytesIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.read())
            actual += 1
            time.sleep(2)
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print("Could not read the following urls:")
        for url in not_read:
            print(url)
    return actual


def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }


def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print("Found " + str(len(urls)) + " reddit threads")
    urls = filter_for_imgur(urls)
    print("Found " + str(len(urls)) + " image links")
    actual = download_images(urls, args.output)
    print("Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory"))


if __name__ == "__main__":
    main()
nicokoch commented Jul 6, 2014

This script can be used to download images from reddit posts.

Dependencies (all installable by pip)

beautifulsoup4
praw
progressbar

The other imports are part of the Python standard library.
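
Assuming pip is set up for the Python version you run the script with (the command may be called pip, pip2, or pip3 on your system), the three packages can be installed in one step:

pip install beautifulsoup4 praw progressbar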

Usage

For example, you can download wallpapers from /r/wallpapers.

usage: reddit_dl.py [-h] [--count COUNT] [--output OUTPUT]
                    [--category {top,top-all,top-day,top-hour,top-month,top-week,top-year,con,con-all,con-day,con-hour,con-month,con-week,con-year,hot,new,new-bydate,new-byrising,random,rising}]
                    subreddit

Example:

./reddit_dl.py --output ~/wallpapers --count 10 --category top-all wallpapers
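
If the script is not marked executable, the same call can be made through the interpreter directly (assuming the file is saved as reddit_dl.py in the current directory):

python2 reddit_dl.py --output ~/wallpapers --count 10 --category top-all wallpapers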

@vizakenjaro

Is there any way to contact you privately? This script returns an error when I try to run it.


nicokoch commented Jul 5, 2015

Just ask here, so others can benefit from a solution. What's the error message?

Edit: So I just tested the script and it still works fine for me. Make sure you follow the steps from the first comment (install all the dependencies). In case you are using python3 instead of python2, I just added a new script for that (the dependencies of the python3 script are described in the code comment).
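
For the python3 script, installing its dependencies would look roughly like this (package names taken from the script's header comment; adjust pip3 to whatever pip points at your Python 3 installation):

pip3 install beautifulsoup4 praw py3-progressbar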
