PeterDing/download_tumblr_images.py

## download_tumblr_images.py
#!/usr/bin/env python2
# vim: set fileencoding=utf8

import os, sys, re, json, requests, argparse, random, multiprocessing, time, subprocess

# Tumblr API key, sent as the ``api_key`` query parameter of every API call.
# Empty by default.
api_key = ''

############################################################
# wget exit status
#
# Human-readable messages for a finished wget run.  The keys are raw
# os.system() wait statuses: wget's own exit code N is stored as N << 8,
# while the bare 2 stands for a child killed by signal 2 (Ctrl-C).
wget_es = {
    0: "No problems occurred.",
    2: "User interference.",
    1 << 8: "Generic error code.",
    2 << 8: "Parse error - for instance, when parsing command-line options, the .wgetrc or .netrc...",
    3 << 8: "File I/O error.",
    4 << 8: "Network failure.",
    5 << 8: "SSL verification failure.",
    6 << 8: "Username/password authentication failure.",
    7 << 8: "Protocol errors.",
    8 << 8: "Server issued an error response.",
}
############################################################

# Terminal colour template: ``s % (sgr_code, text)`` wraps ``text`` in a bold
# ANSI colour escape and resets the attributes afterwards.
s = '\x1b[1;%dm%s\x1b[0m'

# Default HTTP request headers, imitating a desktop Chrome browser.  They are
# installed on the shared requests session; the User-Agent entry is also
# handed to wget for the image downloads.
# NOTE(review): "text/html" is a media type rather than a content-coding, so
# the Accept-Encoding value looks unintended -- confirm before changing it.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Accept-Encoding": "text/html",
    "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "https://api.tumblr.com/console//calls/blog/posts",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
    ),
}

# One shared HTTP session, so every API request is sent with the browser-like
# default headers defined in ``headers``.
ss = requests.Session()
ss.headers.update(headers)

class tumblr(object):
    """Download the photos of one Tumblr blog through the v2 API and wget.

    Photos are saved under ``<cwd>/<blog host>/`` as
    ``<post id>_<index>.<ext>``.  The paging offset is persisted in
    ``json.json`` inside that directory so an interrupted run can resume.

    The class reads these module-level names: ``args`` (parsed command
    line), ``api_key``, ``ss`` (requests session), ``headers``, ``s``
    (colour template) and ``wget_es`` (wget status messages).
    """

    # Number of posts requested per API call; also the resume step-back.
    _PAGE_SIZE = 20

    def __init__(self, burl):
        """Prepare the download directory and the starting offset.

        burl -- blog url; it must start with ``http://`` or ``https://``.

        Raises ValueError when no host can be extracted from ``burl``.
        """
        match = re.search(r'http(s|)://(.+?)($|/)', burl)
        if match is None:
            raise ValueError('expected an http(s):// blog url, got %r' % (burl,))
        self.infos = {'host': match.group(2)}
        self.infos['dir_'] = os.path.join(os.getcwd(), self.infos['host'])
        # A batch size of 0 (or less) would break the slicing in download().
        self.processes = max(1, int(args.processes))
        self.json_path = os.path.join(self.infos['dir_'], 'json.json')
        self.offset = 0

        if not os.path.exists(self.infos['dir_']):
            os.makedirs(self.infos['dir_'])
        elif os.path.exists(self.json_path):
            # do() saves the advanced offset before the page is downloaded,
            # so step back one page to redo the possibly unfinished one.
            with open(self.json_path) as f:
                saved = json.load(f)['offset']
            self.offset = max(0, saved - self._PAGE_SIZE)
        print('%s offset = %s' % (s % (92, '\n   ## begin'), self.offset))

    def save_json(self):
        """Persist the current paging offset to ``json.json``."""
        with open(self.json_path, 'w') as g:
            g.write(json.dumps({'offset': self.offset}, indent=4, sort_keys=True))

    def get_infos(self, postid=None):
        """Fetch one batch of photo posts into ``self.infos['photos']``.

        postid -- when given, only that post is requested (used by the
                  ``--check`` repair mode); otherwise the page starting at
                  ``self.offset`` is requested.

        Exits the process with status 0 when a normal page is empty (the
        whole blog has been walked) and with status 1 on an API error.  In
        ``postid`` mode a missing post is reported and skipped instead.
        """
        self.infos['photos'] = []
        self.url = 'http://api.tumblr.com/v2/blog/%s/posts/photo' % self.infos['host']
        params = {'type': 'photo', 'filter': 'text', 'api_key': api_key}
        if postid:
            # Without the id filter the request is not tied to the post that
            # owns the stale .tmp file.
            params['id'] = postid
        else:
            params['offset'] = self.offset
            params['limit'] = self._PAGE_SIZE

        # Network failures are retried forever, five seconds apart.
        while True:
            try:
                r = ss.get(self.url, params=params, timeout=10)
                break
            except requests.RequestException as e:
                print('%s %s' % (s % (91, '  !! Error, ss.get'), e))
                time.sleep(5)

        if postid and r.status_code == 404:
            # A post deleted since the first run must not abort the repair
            # of the remaining ones.
            print(s % (91, '\n   !! post %s not found, skipped' % postid))
            return
        if not r.ok:
            print(s % (91, '\n   !! Error, get_infos'))
            print('%s %s' % (r.status_code, r.content))
            sys.exit(1)

        posts = r.json()['response']['posts']
        if posts:
            self.infos['photos'] = self._photo_entries(posts)
        elif not postid:
            print(s % (92, '\n   --- job over ---'))
            sys.exit(0)

    def _photo_entries(self, posts):
        """Turn API post dicts into download jobs (filepath, durl, filename)."""
        entries = []
        for post in posts:
            # Posts without a "photos" list simply contribute nothing.
            for index, photo in enumerate(post.get('photos', ()), 1):
                durl = photo['original_size']['url']
                if not isinstance(durl, str):
                    # Python 2: JSON text is unicode; keep utf8 bytes.
                    durl = durl.encode('utf8')
                filepath = os.path.join(
                    self.infos['dir_'],
                    '%s_%s.%s' % (post['id'], index, durl.split('.')[-1]))
                entries.append({
                    'filepath': filepath,
                    'durl': durl,
                    'filename': os.path.split(filepath)[-1],
                })
        return entries

    @staticmethod
    def _fetch(photo):
        """Worker: download one photo to ``<filepath>.tmp`` and rename it.

        Runs in a child process.  On failure it reports wget's status and
        exits that child with status 1, leaving the ``.tmp`` file behind
        for a later ``--check`` run.
        """
        colour = 90 + random.randint(0, 7) % 7
        print('\n  ++ 正在下载: %s' % (s % (colour, photo['filepath'])))

        tmp_path = '%s.tmp' % photo['filepath']
        # An argument list (no shell) keeps quotes or "$" inside a url or a
        # path from being interpreted by a shell.
        cmd = [
            'wget', '-c', '-T', '4', '-q', '-O', tmp_path,
            '--header', 'Referer: http://www.tumblr.com',
            '--user-agent', headers['User-Agent'],
            photo['durl'],
        ]
        try:
            status = subprocess.call(cmd)
        except OSError as e:
            print('%s %s' % (s % (91, '  !! Error, cannot run wget'), e))
            sys.exit(1)

        if status == 0:
            os.rename(tmp_path, photo['filepath'])
            return

        # wget_es is keyed by os.system()-style wait statuses: exit code N
        # is stored as N << 8 and death by signal N as plain N.
        key = -status if status < 0 else status << 8
        reason = wget_es.get(key, 'Unknown wget exit status.')
        print('\n\n ----###   \x1b[1;91mERROR\x1b[0m ==> \x1b[1;91m%d (%s)\x1b[0m   ###--- \n\n' % (status, reason))
        print('%s %s' % (s % (91, '  ===> '), ' '.join(cmd)))
        sys.exit(1)

    def download(self):
        """Download ``self.infos['photos']`` in batches of ``self.processes``.

        Each photo not yet on disk gets its own worker process; a batch is
        joined completely before the next one starts.
        """
        photos = self.infos['photos']
        for start in range(0, len(photos), self.processes):
            workers = []
            for photo in photos[start:start + self.processes]:
                if os.path.exists(photo['filepath']):
                    continue
                worker = multiprocessing.Process(target=self._fetch, args=(photo,))
                worker.start()
                print(worker)
                workers.append(worker)

            for worker in workers:
                worker.join()

    def _pending_post_ids(self):
        """Return the sorted post ids that still own a ``.tmp`` file."""
        ids = set()
        for name in os.listdir(self.infos['dir_']):
            match = re.match(r'(\d+)_', name)
            if match and name.endswith('.tmp'):
                ids.add(match.group(1))
        return sorted(ids)

    def do(self):
        """Run the job selected on the command line.

        With ``--check`` every post that left a ``.tmp`` file behind is
        fetched again; otherwise the blog is walked page by page until
        get_infos() ends the process.
        """
        if args.check:
            for postid in self._pending_post_ids():
                self.get_infos(postid)
                self.download()
        else:
            while True:
                self.get_infos()
                # Saved before downloading; __init__ steps back one page on
                # resume to make up for it.
                self.offset += self._PAGE_SIZE
                self.save_json()
                self.download()

if __name__ == '__main__':
    # Command-line entry point: parse the options, then run the downloader
    # for the given blog url.
    parser = argparse.ArgumentParser(description='download from tumblr.com')
    parser.add_argument('url', help='url')
    parser.add_argument(
        '-p', '--processes', action='store', default=4,
        help='amount of process')
    parser.add_argument(
        '-c', '--check', action='store_true',
        help='fix reminded tmp')
    # ``args`` has to stay a module-level name: the tumblr class reads it.
    args = parser.parse_args()
    tumblr(args.url).do()
	#!/usr/bin/env python2
	# vim: set fileencoding=utf8

	import os, sys, re, json, requests, argparse, random, multiprocessing, time, subprocess

	# Tumblr API key, sent as the ``api_key`` query parameter of every API call.
	# Empty by default.
	api_key = ''

	############################################################
	# wget exit status
	#
	# Human-readable messages for a finished wget run.  The keys are raw
	# os.system() wait statuses: wget's own exit code N is stored as N << 8,
	# while the bare 2 stands for a child killed by signal 2 (Ctrl-C).
	wget_es = {
	    0: "No problems occurred.",
	    2: "User interference.",
	    1 << 8: "Generic error code.",
	    2 << 8: "Parse error - for instance, when parsing command-line options, the .wgetrc or .netrc...",
	    3 << 8: "File I/O error.",
	    4 << 8: "Network failure.",
	    5 << 8: "SSL verification failure.",
	    6 << 8: "Username/password authentication failure.",
	    7 << 8: "Protocol errors.",
	    8 << 8: "Server issued an error response.",
	}
	############################################################

	# Terminal colour template: ``s % (sgr_code, text)`` wraps ``text`` in a bold
	# ANSI colour escape and resets the attributes afterwards.
	s = '\x1b[1;%dm%s\x1b[0m'

	# Default HTTP request headers, imitating a desktop Chrome browser.  They are
	# installed on the shared requests session; the User-Agent entry is also
	# handed to wget for the image downloads.
	# NOTE(review): "text/html" is a media type rather than a content-coding, so
	# the Accept-Encoding value looks unintended -- confirm before changing it.
	headers = {
	    # The catch-all media range is "*/*"; a bare "/" is not a valid range.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	    "Accept-Encoding": "text/html",
	    "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2",
	    "Content-Type": "application/x-www-form-urlencoded",
	    "Referer": "https://api.tumblr.com/console//calls/blog/posts",
	    "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36",
	}

	# One shared HTTP session, so every API request is sent with the browser-like
	# default headers defined in ``headers``.
	ss = requests.Session()
	ss.headers.update(headers)

	class tumblr(object):
	    """Download the photos of one Tumblr blog through the v2 API and wget.

	    Photos are saved under ``<cwd>/<blog host>/`` as
	    ``<post id>_<index>.<ext>``.  The paging offset is persisted in
	    ``json.json`` inside that directory so an interrupted run can resume.

	    The class reads these module-level names: ``args`` (parsed command
	    line), ``api_key``, ``ss`` (requests session), ``headers``, ``s``
	    (colour template) and ``wget_es`` (wget status messages).
	    """

	    # Number of posts requested per API call; also the resume step-back.
	    _PAGE_SIZE = 20

	    def __init__(self, burl):
	        """Prepare the download directory and the starting offset.

	        burl -- blog url; it must start with ``http://`` or ``https://``.

	        Raises ValueError when no host can be extracted from ``burl``.
	        """
	        # Plain "|" alternation: an escaped "\|" would only match a
	        # literal pipe character and never a real url.
	        match = re.search(r'http(s|)://(.+?)($|/)', burl)
	        if match is None:
	            raise ValueError('expected an http(s):// blog url, got %r' % (burl,))
	        self.infos = {'host': match.group(2)}
	        self.infos['dir_'] = os.path.join(os.getcwd(), self.infos['host'])
	        # A batch size of 0 (or less) would break the slicing in download().
	        self.processes = max(1, int(args.processes))
	        self.json_path = os.path.join(self.infos['dir_'], 'json.json')
	        self.offset = 0

	        if not os.path.exists(self.infos['dir_']):
	            os.makedirs(self.infos['dir_'])
	        elif os.path.exists(self.json_path):
	            # do() saves the advanced offset before the page is downloaded,
	            # so step back one page to redo the possibly unfinished one.
	            with open(self.json_path) as f:
	                saved = json.load(f)['offset']
	            self.offset = max(0, saved - self._PAGE_SIZE)
	        print('%s offset = %s' % (s % (92, '\n   ## begin'), self.offset))

	    def save_json(self):
	        """Persist the current paging offset to ``json.json``."""
	        with open(self.json_path, 'w') as g:
	            g.write(json.dumps({'offset': self.offset}, indent=4, sort_keys=True))

	    def get_infos(self, postid=None):
	        """Fetch one batch of photo posts into ``self.infos['photos']``.

	        postid -- when given, only that post is requested (used by the
	                  ``--check`` repair mode); otherwise the page starting at
	                  ``self.offset`` is requested.

	        Exits the process with status 0 when a normal page is empty (the
	        whole blog has been walked) and with status 1 on an API error.  In
	        ``postid`` mode a missing post is reported and skipped instead.
	        """
	        self.infos['photos'] = []
	        self.url = 'http://api.tumblr.com/v2/blog/%s/posts/photo' % self.infos['host']
	        params = {'type': 'photo', 'filter': 'text', 'api_key': api_key}
	        if postid:
	            # Without the id filter the request is not tied to the post that
	            # owns the stale .tmp file.
	            params['id'] = postid
	        else:
	            params['offset'] = self.offset
	            params['limit'] = self._PAGE_SIZE

	        # Network failures are retried forever, five seconds apart.
	        while True:
	            try:
	                r = ss.get(self.url, params=params, timeout=10)
	                break
	            except requests.RequestException as e:
	                print('%s %s' % (s % (91, '  !! Error, ss.get'), e))
	                time.sleep(5)

	        if postid and r.status_code == 404:
	            # A post deleted since the first run must not abort the repair
	            # of the remaining ones.
	            print(s % (91, '\n   !! post %s not found, skipped' % postid))
	            return
	        if not r.ok:
	            print(s % (91, '\n   !! Error, get_infos'))
	            print('%s %s' % (r.status_code, r.content))
	            sys.exit(1)

	        posts = r.json()['response']['posts']
	        if posts:
	            self.infos['photos'] = self._photo_entries(posts)
	        elif not postid:
	            print(s % (92, '\n   --- job over ---'))
	            sys.exit(0)

	    def _photo_entries(self, posts):
	        """Turn API post dicts into download jobs (filepath, durl, filename)."""
	        entries = []
	        for post in posts:
	            # Posts without a "photos" list simply contribute nothing.
	            for index, photo in enumerate(post.get('photos', ()), 1):
	                durl = photo['original_size']['url']
	                if not isinstance(durl, str):
	                    # Python 2: JSON text is unicode; keep utf8 bytes.
	                    durl = durl.encode('utf8')
	                filepath = os.path.join(
	                    self.infos['dir_'],
	                    '%s_%s.%s' % (post['id'], index, durl.split('.')[-1]))
	                entries.append({
	                    'filepath': filepath,
	                    'durl': durl,
	                    'filename': os.path.split(filepath)[-1],
	                })
	        return entries

	    @staticmethod
	    def _fetch(photo):
	        """Worker: download one photo to ``<filepath>.tmp`` and rename it.

	        Runs in a child process.  On failure it reports wget's status and
	        exits that child with status 1, leaving the ``.tmp`` file behind
	        for a later ``--check`` run.
	        """
	        colour = 90 + random.randint(0, 7) % 7
	        print('\n  ++ 正在下载: %s' % (s % (colour, photo['filepath'])))

	        tmp_path = '%s.tmp' % photo['filepath']
	        # An argument list (no shell) keeps quotes or "$" inside a url or a
	        # path from being interpreted by a shell.
	        cmd = [
	            'wget', '-c', '-T', '4', '-q', '-O', tmp_path,
	            '--header', 'Referer: http://www.tumblr.com',
	            '--user-agent', headers['User-Agent'],
	            photo['durl'],
	        ]
	        try:
	            status = subprocess.call(cmd)
	        except OSError as e:
	            print('%s %s' % (s % (91, '  !! Error, cannot run wget'), e))
	            sys.exit(1)

	        if status == 0:
	            os.rename(tmp_path, photo['filepath'])
	            return

	        # wget_es is keyed by os.system()-style wait statuses: exit code N
	        # is stored as N << 8 and death by signal N as plain N.
	        key = -status if status < 0 else status << 8
	        reason = wget_es.get(key, 'Unknown wget exit status.')
	        print('\n\n ----###   \x1b[1;91mERROR\x1b[0m ==> \x1b[1;91m%d (%s)\x1b[0m   ###--- \n\n' % (status, reason))
	        print('%s %s' % (s % (91, '  ===> '), ' '.join(cmd)))
	        sys.exit(1)

	    def download(self):
	        """Download ``self.infos['photos']`` in batches of ``self.processes``.

	        Each photo not yet on disk gets its own worker process; a batch is
	        joined completely before the next one starts.
	        """
	        photos = self.infos['photos']
	        for start in range(0, len(photos), self.processes):
	            workers = []
	            for photo in photos[start:start + self.processes]:
	                if os.path.exists(photo['filepath']):
	                    continue
	                worker = multiprocessing.Process(target=self._fetch, args=(photo,))
	                worker.start()
	                print(worker)
	                workers.append(worker)

	            for worker in workers:
	                worker.join()

	    def _pending_post_ids(self):
	        """Return the sorted post ids that still own a ``.tmp`` file."""
	        ids = set()
	        for name in os.listdir(self.infos['dir_']):
	            match = re.match(r'(\d+)_', name)
	            if match and name.endswith('.tmp'):
	                ids.add(match.group(1))
	        return sorted(ids)

	    def do(self):
	        """Run the job selected on the command line.

	        With ``--check`` every post that left a ``.tmp`` file behind is
	        fetched again; otherwise the blog is walked page by page until
	        get_infos() ends the process.
	        """
	        if args.check:
	            for postid in self._pending_post_ids():
	                self.get_infos(postid)
	                self.download()
	        else:
	            while True:
	                self.get_infos()
	                # Saved before downloading; __init__ steps back one page on
	                # resume to make up for it.
	                self.offset += self._PAGE_SIZE
	                self.save_json()
	                self.download()

	if __name__ == '__main__':
	    # Command-line entry point: parse the options, then run the downloader
	    # for the given blog url.
	    p = argparse.ArgumentParser(description='download from tumblr.com')
	    p.add_argument('url', help='url')
	    p.add_argument('-p', '--processes', action='store', default=4, help='amount of process')
	    p.add_argument('-c', '--check', action='store_true', help='fix reminded tmp')
	    # ``args`` has to stay a module-level name: the tumblr class reads it.
	    args = p.parse_args()
	    url = args.url
	    x = tumblr(url)
	    x.do()