@dentearl
Created December 22, 2012 02:19
A script to return valid imgur URLs for images posted to a subreddit. Has a cache mode to ensure no repeated images.
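Note: the script targets Python 2 (it uses cPickle and the print statement) and depends on the third-party requests library.
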
#!/usr/bin/env python
"""
imgurURLs.py,
21 December 2012
dent earl, dent.earl (a) gmail com
a script to return valid imgur URLs for images posted to a subreddit
based in large part on a script from Tankor Smash:
http://blog.tankorsmash.com/?p=266
"""
from argparse import ArgumentParser
import cPickle
import json
import os
import requests

def initArgs(parser):
    parser.add_argument('--limit', dest='limit', type=int, default=1,
                        help='number of URLs to output. default=%(default)s')
    parser.add_argument('--subreddit', dest='subreddit', type=str, default='cats',
                        help='subreddit to scrape. default=%(default)s')
    parser.add_argument('--page', dest='page', type=int, default=0,
                        help='imgur page to start searching. default=%(default)s')
    parser.add_argument('--novel', '--remember', dest='isNovel', default=False, action='store_true',
                        help=('Remembers which images are returned, tries not to return '
                              'previously seen images.'))
    parser.add_argument('--nsfwOK', dest='isNSFWOK', default=False, action='store_true',
                        help='Normally NSFW images are ignored; this option allows them.')
    parser.add_argument('--forget', dest='isForget', default=False, action='store_true',
                        help='before going to imgur, forgets the cache.')

def getUrls(history, args):
    # get a json object from the imgur gallery. the gallery url can be
    # appended with /month or /week for more recent entries.
    args.page -= 1
    urls = []  # list of full image URL strings
    while len(urls) < args.limit:
        args.page += 1
        r = requests.get(r'http://imgur.com/r/%s/top/page/%d.json' % (args.subreddit, args.page))
        j = json.loads(r.text)  # creates a python dict from the JSON object
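        # illustrative shape of the parsed response, inferred from the fields
        # used below (the real imgur payload carries many more keys):
        #   {"data": [{"hash": "abc123", "ext": ".jpg", "nsfw": false, ...}, ...]}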
        if not j['data']:
            break  # no more entries for this subreddit; stop rather than loop forever
        for entry in j['data']:
            if len(urls) == args.limit:
                break
            name = entry['hash']  # the raw image name
            ext = entry['ext']  # the image extension (.jpg, .gif, etc.)
            if entry['nsfw'] and not args.isNSFWOK:
                continue
            url = r'http://imgur.com/%s%s' % (name, ext)
            if url not in history:
                # history will be empty if --novel is off
                urls.append(url)
                history.add(url)
    return urls

def reportUrls(urls, history, args):
    for u in urls:
        print u
    recordHistory(history, args)

def checkHistory(args):
    history = set()
    if not args.isNovel:
        return history
    cachePath = os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')
    if os.path.exists(cachePath):
        # the cache is written with a binary pickle protocol, so read in 'rb' mode
        f = open(cachePath, 'rb')
        history = cPickle.load(f)
        f.close()
    return history

def recordHistory(history, args):
    if not args.isNovel:
        return
    f = open(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'), 'wb')
    cPickle.dump(history, f, 1)  # protocol 1 is the old binary format (protocol 0 is ascii)
    f.close()

def forgetful(args):
    cachePath = os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')
    if args.isForget and os.path.exists(cachePath):
        os.remove(cachePath)

def main():
    parser = ArgumentParser()
    initArgs(parser)
    args = parser.parse_args()
    forgetful(args)
    history = checkHistory(args)
    urls = getUrls(history, args)
    reportUrls(urls, history, args)


if __name__ == '__main__':
    main()
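
For reference, a typical invocation might look like the following; the printed hashes are placeholders, not real imgur images:

    $ python imgurURLs.py --subreddit cats --limit 2 --novel
    http://imgur.com/AbCdEfG.jpg
    http://imgur.com/HiJkLmN.gif

With --novel, the returned URLs are recorded in .imgurURLsHistory.pickle in the current working directory, so later runs skip anything already seen; pass --forget to delete that cache before fetching.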