Skip to content

Instantly share code, notes, and snippets.

@brwnj
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brwnj/650a00da6b8141e71e70 to your computer and use it in GitHub Desktop.
Download the best resolution of the top <limit> videos from the subreddit 'videos' to the <out> directory.
#!/usr/bin/env python
# coding=utf-8
"""
Download the best resolution of the top <limit> video from subreddit 'videos'
to <out> directory.
"""
import multiprocessing
import os
import pafy
import praw
import sys
import time
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
USERAGENT = "yt_download"
def downloader(job, callback=None):
    """Download one queued video.

    Fix: the original signature used Python-2-only tuple parameter
    unpacking (``def downloader((video, title, out)), ...``), removed in
    Python 3 (PEP 3113). Unpacking inside the body keeps the call
    interface identical — ``Pool.map`` still passes one 3-tuple per job.

    job      -- (pafy_video, title, filepath) tuple as queued by main()
    callback -- unused; kept for interface compatibility
    """
    video, title, out = job
    # progress goes to stderr so stdout stays clean for debug output
    sys.stderr.write("Downloading %s\n" % title)
    video.download(quiet=True, filepath=out)
def main(out, limit, pool, debug):
    """Queue the top reddit.com/r/videos YouTube links and download them.

    out   -- output directory (created if missing)
    limit -- number of hot submissions to scan (not all are YouTube links)
    pool  -- number of simultaneous downloads
    debug -- print per-submission progress to stdout
    """
    try:
        os.mkdir(out)
    except OSError:
        # directory already exists; downloads are skipped per-file below
        pass
    r = praw.Reddit(user_agent=USERAGENT)
    p = multiprocessing.Pool(pool)
    # list of tuples (pafy_obj, video_title, filename)
    videos = []
    # track duplicate links on reddit
    seen = set()
    sys.stderr.write("Retrieving URLs from Reddit\n")
    for sub in r.get_subreddit("videos").get_hot(limit=limit):
        if debug:
            print("processing %s" % sub.title)
        if 'youtu' not in sub.url:
            if debug:
                print(" >> non-youtube")
            continue
        if 'attribution_link' in sub.url:
            # unwrap the URL-encoded attribution redirect to a direct watch URL
            watch_id = sub.url.split("watch%3Fv%3D")[1].split("%")[0]
            sub.url = "http://www.youtube.com/watch?v=%s" % watch_id
        try:
            best = pafy.new(sub.url).getbest()
            # sanitize the title into a filesystem-safe identifier
            title = ''.join(x for x in best.title.replace(" ", "_") if x.isalnum() or x == "_")
            if debug:
                print(" >> new title from youtube: %s" % title)
            fn = os.path.join(out, title + "." + best.extension)
            # flat guard ordering replaces the original nested if/else pyramid
            if os.path.exists(fn):
                if debug:
                    print(" >> video exists in %s" % out)
            elif fn in seen:
                if debug:
                    print(" >> duplicate link")
            else:
                if debug:
                    print(" >> url added to queue")
                videos.append((best, title, fn))
                seen.add(fn)
        except IOError:
            # video no longer available on youtube
            if debug:
                print(" >> failed to retrieve youtube link")
    p.map(downloader, videos)
    # release worker processes once all downloads finish
    p.close()
    p.join()
if __name__ == '__main__':
    # Command-line interface; defaults shown via ArgumentDefaultsHelpFormatter.
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-o', '--out',
                        default=time.strftime("%Y%m%d"),
                        help='output directory')
    parser.add_argument('-l', '--limit', type=int, default=50,
                        help='number of reddit links to download (not all will be youtube videos)')
    parser.add_argument('-p', '--pool', type=int, default=20,
                        help='simultaneous downloads')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    main(args.out, args.limit, args.pool, args.debug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment