-
-
Save 872409/e29557ff750052d8bd03114c8ada44ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding=utf-8 -*- | |
import Queue | |
import functools | |
import logging | |
import os | |
import re | |
import sys | |
import time | |
from functools import partial | |
from itertools import repeat | |
from multiprocessing import Pool, cpu_count, freeze_support | |
from threading import Thread | |
import requests | |
from retry import retry | |
from tqdm import tqdm | |
# Shared work queue of paginated API URLs; getpost() fills it and the
# Consumer threads drain it.
UQueue = Queue.Queue()

# Template for one page of a blog's read API: substitute the blog name for
# %s, then append the post offset after 'start='.
api_url = 'http://%s.tumblr.com/api/read?&num=50&start='
def getpost(uid, queue):
    """Enqueue one paginated API URL for every page of 50 posts of a blog.

    Fetches the first API page for ``uid``, reads the ``total`` attribute of
    its ``<posts>`` element, and puts ``api_url % uid`` followed by each
    offset (0, 50, 100, ...) below that total onto ``queue``.

    :param uid: blog name, substituted into the ``<uid>.tumblr.com`` host.
    :param queue: object with a ``put`` method that receives the page URLs.
    :raises IndexError: if the response contains no ``<posts ... total=...>``
        element (same exception type the previous ``findall(...)[0]`` raised,
        now with an explanatory message).
    :raises ValueError: if the ``total`` attribute is not an integer.
    """
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    page = requests.get(url).content
    match = re.search('<posts start="0" total="(.*?)">', page)
    if match is None:
        raise IndexError(
            'no <posts start="0" total="..."> element in API response for %r'
            % uid)
    total = int(match.group(1))
    ul = api_url % uid
    # Step through the offsets directly instead of filtering a fixed
    # range(1000): the old form silently stopped at 50,000 posts.
    for start in range(0, total, 50):
        queue.put(ul + str(start))
# Accumulators that the Consumer threads extend with every link they find.
pic_links = []
video_links = []

# Download URL template; %s receives the id captured by extractvideore.
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'

# Matches the URL of the maximum-size rendition of a picture: the text
# between '<photo-url max-width="1280">' and the next '</photo-url>'.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)', re.DOTALL)

# Captures whatever sits between '/tumblr_' and the closing quote of an
# attribute that is immediately followed by type="video/mp4".
extractvideore = re.compile('/tumblr_(.*?)" type="video/mp4"')
class Consumer(Thread):
    """Worker thread that pulls API page URLs off a queue and harvests links.

    For every URL it fetches the page and appends the picture URLs and the
    video download URLs found in it to the module-level ``pic_links`` and
    ``video_links`` lists.
    """

    def __init__(self, l_queue):
        """Remember the queue of page URLs this worker should drain.

        :param l_queue: queue whose items are URL strings.
        """
        super(Consumer, self).__init__()
        self.queue = l_queue

    def run(self):
        """Process queued URLs until the queue is empty, then return."""
        session = requests.Session()
        while True:
            # A non-blocking get makes "is there work left?" and "take it"
            # one atomic step. The previous blocking get() followed by a
            # later empty() check let a worker block forever when another
            # thread took the last item in between.
            try:
                link = self.queue.get(block=False)
            except Queue.Empty:
                break
            print('start parse post: ' + link)
            try:
                content = session.get(link).content
                videos = extractvideore.findall(content)
                video_links.extend([vhead % v for v in videos])
                pic_links.extend(extractpicre.findall(content))
            except Exception:
                # Best effort: report the failed page and move on to the next.
                print('url: %s parse failed\n' % link)
def main():
    """Run up to 10 Consumer threads over UQueue and wait for all of them.

    The number of workers is the smaller of 10 and the number of URLs
    currently queued, so an empty queue starts no threads at all.
    """
    workers = [Consumer(UQueue) for _ in range(min(10, UQueue.qsize()))]
    for worker in workers:
        worker.start()
    # The original wrote `t.join` without calling it, then compensated with
    # a busy loop that removed items from the list it was iterating.
    # Actually joining each thread waits correctly without spinning the CPU.
    for worker in workers:
        worker.join()
def write():
    """Dump the de-duplicated links collected so far into two text files.

    Picture links go to ``pictures.txt`` and video links, with every
    ``'/480'`` fragment removed, go to ``videos.txt``; one link per line.
    Both files are overwritten.
    """
    cleaned_videos = [link.replace('/480', '') for link in video_links]
    unique_videos = list(set(cleaned_videos))
    unique_pictures = list(set(pic_links))

    with open('pictures.txt', 'w') as out:
        out.writelines('%s\n' % link for link in unique_pictures)

    with open('videos.txt', 'w') as out:
        out.writelines('%s\n' % link for link in unique_videos)
class TqdmHandler(logging.StreamHandler):
    """Logging handler that emits records through ``tqdm.write``."""

    def __init__(self):
        """Initialise the underlying StreamHandler with its defaults."""
        logging.StreamHandler.__init__(self)

    def emit(self, record):
        """Format ``record`` and hand the resulting text to ``tqdm.write``."""
        tqdm.write(self.format(record))
@retry(tries=3, delay=2)
def download(url, local_filename=None, proxies=None, root_dir=None, if_exist='ignore'):
    """Stream ``url`` to a local file.

    The function is wrapped in ``@retry(tries=3, delay=2)``, so any exception
    raised here triggers another attempt.

    :param url: address of the resource to fetch.
    :param local_filename: target file name; defaults to the last path
        segment of ``url``.
    :param proxies: passed straight through to ``requests.get``.
    :param root_dir: optional directory the file name is joined onto.
    :param if_exist: when ``'ignore'`` (the default) an existing target file
        is left untouched and the function returns immediately; any other
        value overwrites it.
    :returns: always ``None``.
    :raises Exception: if the server answers with a status other than 200.
    :raises requests.exceptions.RequestException: on request or transfer
        errors.
    """
    if not local_filename:
        local_filename = url.split('/')[-1]
    if root_dir:
        local_filename = os.path.join(root_dir, local_filename)
    if if_exist == 'ignore' and os.path.exists(local_filename):
        return None

    response = requests.get(url, proxies=proxies, stream=True)
    try:
        if response.status_code != 200:
            raise Exception("Unexpected response.")
        try:
            with open(local_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=4096):
                    if chunk:
                        f.write(chunk)
        except BaseException:
            # A half-written file must not survive: the next retry (or the
            # next run) would see that it exists and skip the download.
            if os.path.exists(local_filename):
                os.remove(local_filename)
            raise
    finally:
        # With stream=True the connection stays open until it is closed.
        response.close()

    print("finish download %s" % local_filename)
    time.sleep(.5)
if __name__ == '__main__':
    freeze_support()  # for Windows support

    # The blog name may be given as the first command-line argument; with no
    # (or a blank) argument the previously hard-coded blog is used.
    name = (sys.argv[1].strip() if len(sys.argv) > 1 else '') or 'wanimal1983'

    # Collect page URLs, harvest the links from them, then persist the lists.
    getpost(name, UQueue)
    main()
    write()

    with open('pictures.txt', 'r') as f:
        # Skip blank lines so download() never receives an empty URL.
        urls = [line.strip() for line in f if line.strip()]

    if not os.path.isdir(name):
        os.mkdir(name)

    download_base_dir = partial(download, root_dir=name)
    # Create a single pool. The original built Pool(cpu_count()) and then
    # replaced it with Pool(), leaking the first pool's worker processes.
    pool = Pool(cpu_count())
    try:
        # Draining the iterator is what drives the downloads; the results
        # themselves are not needed, tqdm only counts them.
        for _ in tqdm(pool.imap_unordered(download_base_dir, urls), total=len(urls)):
            pass
    except KeyboardInterrupt:
        logging.warning("got Ctrl+C")
    finally:
        pool.terminate()
        pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment