@za2016
Last active October 28, 2016 23:53
# -*- coding=utf-8 -*-
from threading import Thread
import Queue
import re
import sys

import requests

api_url = 'http://%s.tumblr.com/api/read?&num=50&start='
UQueue = Queue.Queue()


def getpost(uid, queue):
    # Fetch the first API page to learn the blog's total post count,
    # then enqueue one URL per 50-post page (capped at 10 pages here).
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    page = requests.get(url).content
    total = int(re.findall('<posts start="0" total="(.*?)">', page)[0])
    offsets = [i * 50 for i in range(1000) if i * 50 < total]
    ul = api_url % uid
    for i in offsets[:10]:
        queue.put(ul + str(i))


# Search for the URL of the maximum-size (1280px) version of a picture,
# which sits between '<photo-url max-width="1280">' and '</photo-url>'.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)', flags=re.S)
extractvideore = re.compile('/tumblr_(.*?)" type="video/mp4"')

video_links = []
pic_links = []
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'


class Consumer(Thread):
    def __init__(self, l_queue):
        super(Consumer, self).__init__()
        self.queue = l_queue

    def run(self):
        session = requests.Session()
        while 1:
            link = self.queue.get()
            print 'start parse post: ' + link
            try:
                content = session.get(link).content
                videos = extractvideore.findall(content)
                video_links.extend([vhead % v for v in videos])
                pic_links.extend(extractpicre.findall(content))
            except Exception:
                print 'url: %s parse failed\n' % link
            if self.queue.empty():
                break


def main():
    task = []
    for i in range(min(10, UQueue.qsize())):
        t = Consumer(UQueue)
        task.append(t)
    for t in task:
        t.start()
    for t in task:
        t.join()  # join() is a call; with it, no extra wait loop is needed


def write():
    # Captured video ids look like 'xxxx/480'; stripping '/480' turns the
    # URL into the full-resolution .mp4.
    videos = [i.replace('/480', '') for i in video_links]
    pictures = pic_links
    with open('pictures.txt', 'w') as f:
        for i in pictures:
            f.write('%s\n' % i)
    with open('videos.txt', 'w') as f:
        for i in videos:
            f.write('%s\n' % i)


if __name__ == '__main__':
    # name = sys.argv[1]
    # name = name.strip()
    name = 'mzcyx2011'
    getpost(name, UQueue)
    main()
    write()
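
To make the two extraction patterns concrete, here is a quick sanity check against a hand-made XML fragment shaped the way the regexes expect (the fragment is illustrative, not a captured API response):

# -*- coding=utf-8 -*-
import re

extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)', flags=re.S)
extractvideore = re.compile('/tumblr_(.*?)" type="video/mp4"')

# Hand-made fragment mimicking the shape the regexes above assume.
sample = '''<posts start="0" total="123">
  <post><photo-url max-width="1280">http://40.media.tumblr.com/example_1280.jpg</photo-url></post>
  <post><source src="https://vt.tumblr.com/tumblr_abc123/480" type="video/mp4"></post>
</posts>'''

print extractpicre.findall(sample)
# ['http://40.media.tumblr.com/example_1280.jpg']
print extractvideore.findall(sample)
# ['abc123/480'] -> vhead % 'abc123/480' gives '.../tumblr_abc123/480.mp4',
# and write() strips the '/480' to reach the full-resolution video.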
@zhiyue commented Oct 28, 2016

Picking up the baton: added a download step.
# -*- coding=utf-8 -*-
import Queue
import functools
import logging
import os
import re
import sys
import time
from functools import partial
from multiprocessing import Pool, cpu_count, freeze_support
from threading import Thread

import requests
from retry import retry
from tqdm import tqdm

api_url = 'http://%s.tumblr.com/api/read?&num=50&start='
UQueue = Queue.Queue()



def getpost(uid, queue):
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    page = requests.get(url).content
    total = re.findall('<posts start="0" total="(.*?)">', page)[0]
    total = int(total)
    a = [i * 50 for i in range(1000) if i * 50 - total < 0]
    ul = api_url % uid
    for i in a:
        queue.put(ul + str(i))


# Search for the URL of the maximum-size (1280px) version of a picture,
# which sits between '<photo-url max-width="1280">' and '</photo-url>'.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)', flags=re.S)
extractvideore = re.compile('/tumblr_(.*?)" type="video/mp4"')

video_links = []
pic_links = []
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'


class Consumer(Thread):
    def __init__(self, l_queue):
        super(Consumer, self).__init__()
        self.queue = l_queue

    def run(self):
        session = requests.Session()
        while 1:
            link = self.queue.get()
            print 'start parse post: ' + link
            try:
                content = session.get(link).content
                videos = extractvideore.findall(content)
                video_links.extend([vhead % v for v in videos])
                pic_links.extend(extractpicre.findall(content))
            except Exception:
                print 'url: %s parse failed\n' % link
            if self.queue.empty():
                break


def main():
    task = []
    for i in range(min(10, UQueue.qsize())):
        t = Consumer(UQueue)
        task.append(t)
    for t in task:
        t.start()
    for t in task:
        t.join()  # join() is a call; with it, no extra wait loop is needed


def write():
    videos = [i.replace('/480', '') for i in video_links]
    pictures = pic_links
    videos = list(set(videos))
    pictures = list(set(pictures))
    with open('pictures.txt', 'w') as f:
        for i in pictures:
            f.write('%s\n' % i)
    with open('videos.txt', 'w') as f:
        for i in videos:
            f.write('%s\n' % i)


# Send log records through tqdm.write() so logging does not garble the
# progress bar.
class TqdmHandler(logging.StreamHandler):
    def __init__(self):
        logging.StreamHandler.__init__(self)

    def emit(self, record):
        msg = self.format(record)
        tqdm.write(msg)


def config_log(func):
    # Pass-through stand-in for the config_log decorator that is applied
    # below but never defined in the snippet (it presumably wired logging
    # up to TqdmHandler); without some definition the script fails at import.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper


@retry(tries=3, delay=2)
@config_log
def download(url, local_filename=None, proxies=None, root_dir=None, if_exist='ignore'):

    if not local_filename:
        local_filename = url.split('/')[-1]
    if root_dir:
        local_filename = os.path.join(root_dir, local_filename)
    if os.path.exists(local_filename) and if_exist == 'ignore':
        return None
    r = requests.get(url, proxies=proxies, stream=True)
    if r.status_code == 200:
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=4096):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
        print "finish download %s" % local_filename
        time.sleep(.5)  # brief pause to go easy on the server
    else:
        # a non-200 response raises so that @retry re-runs the download
        raise Exception("Unexpected response.")


if __name__ == '__main__':
    freeze_support()  # for Windows support
    # name=sys.argv[1]
    # name=name.strip()
    name = 'mzcyx2011'
    getpost(name, UQueue)
    main()
    write()
    with open('pictures.txt', 'r') as f:
        urls = [url.strip() for url in f.readlines()]

    if not os.path.exists(name):
        os.mkdir(name)

    download_base_dir = partial(download, root_dir=name)
    pool = Pool(cpu_count())  # one worker process per CPU core
    try:
        # imap_unordered yields as each download finishes; tqdm shows progress
        for _ in tqdm(pool.imap_unordered(download_base_dir, urls), total=len(urls)):
            pass
    except KeyboardInterrupt:
        logging.warning("got Ctrl+C")
    finally:
        pool.terminate()
        pool.join()

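One gap worth noting: only pictures.txt is fed to the pool, so videos.txt is written but never downloaded. A minimal sketch of the same pattern applied to the video list, assuming it sits at the end of the __main__ block above so download, name, and the imports are in scope:

    with open('videos.txt', 'r') as f:
        video_urls = [url.strip() for url in f.readlines()]

    pool = Pool()  # Pool() defaults to cpu_count() workers
    try:
        for _ in tqdm(pool.imap_unordered(partial(download, root_dir=name),
                                          video_urls), total=len(video_urls)):
            pass
    except KeyboardInterrupt:
        logging.warning("got Ctrl+C")
    finally:
        pool.terminate()
        pool.join()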