Skip to content

Instantly share code, notes, and snippets.

@demonkit
Created May 9, 2015 04:08
Show Gist options
  • Save demonkit/69f387745a908287419d to your computer and use it in GitHub Desktop.
Save demonkit/69f387745a908287419d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Get images from http://www.douban.com/photos/album/145486923/.
"""
__author__ = 'demonkit'
import os
import re
import threading
import time
from Queue import Empty, Queue

import requests
from lxml import etree
# Module-level completion flags; both start out False.
# PUBLISH_FINISHED: meant to become True once the producer thread has queued
# everything, so that an empty queue can be read as "no more work".
PUBLISH_FINISHED = False
# CONSUME_FINISHED: meant to become True once the consumer thread has stopped.
CONSUME_FINISHED = False
class Pulibsher(object):
    """Producer-side wrapper around a shared queue.

    The (misspelled) class name is kept as-is because other code in this
    file refers to it by that name.
    """

    def __init__(self, queue):
        """
        :param queue: queue-like object exposing ``put``; stored as
            ``self.queue``.
        """
        self.queue = queue

    def put(self, item, block=True, timeout=None):
        """Add *item* to the underlying queue.

        :param item: object to enqueue.
        :param block: forwarded to ``queue.put``; wait for a free slot
            when True (the default).
        :param timeout: forwarded to ``queue.put``; maximum number of
            seconds to wait when blocking, ``None`` meaning no limit.
        """
        self.queue.put(item, block, timeout)
class Consumer(object):
    """Consumer-side wrapper around a shared queue."""

    def __init__(self, queue):
        """
        :param queue: queue-like object exposing ``get``; stored as
            ``self.queue``.
        """
        self.queue = queue

    def get(self, block=True, timeout=None):
        """Remove and return the next item from the underlying queue.

        :param block: forwarded to ``queue.get``; wait for an item when
            True (the default), fail immediately when False.
        :param timeout: forwarded to ``queue.get``; maximum number of
            seconds to wait when blocking, ``None`` meaning no limit.
        :return: whatever ``queue.get`` returns.
        :raises: whatever ``queue.get`` raises when no item is available
            (``Empty`` for the standard library queue).
        """
        return self.queue.get(block=block, timeout=timeout)
class ImgMaker(Pulibsher, threading.Thread):
    """Producer thread: walks the album listing pages and queues photo ids.

    Every listing page is downloaded with ``requests`` and parsed with
    ``lxml``; for each link matching ``IMG_RE_PATTERN`` the captured
    numeric id (a string) is put on the shared queue.
    """

    KICK_START_URL = "http://www.douban.com/photos/album/145486923/"
    # Name of the query-string parameter that carries the page offset.
    param = 'start'
    # Selects the <a> elements whose href is matched against IMG_RE_PATTERN.
    XPATH_PATTERN = '//*[@id="content"]/div[3]/div[1]/div[2]/div/a'
    # Raw string so the ``\d`` escape reaches the regex engine unchanged.
    IMG_RE_PATTERN = re.compile(r'http://www.douban.com/photos/photo/(\d+)/')
    # Step between two consecutive page offsets.
    PIC_NUM_PER_PAGE = 18

    def __init__(self, queue, page_no):
        """
        :param queue: queue that receives the photo ids.
        :param page_no: number of offset steps to walk; the offsets
            0, 18, ..., ``page_no * 18`` (inclusive) are requested.
        """
        threading.Thread.__init__(self)
        Pulibsher.__init__(self, queue)
        self.setName(self.__class__.__name__)
        self.page_no = page_no

    def _photo_ids(self, offset):
        """Yield the photo ids linked from the listing page at *offset*."""
        url = self.KICK_START_URL + "?%s=%s" % (self.param, str(offset))
        tree = etree.HTML(requests.get(url).text)
        for anchor in tree.xpath(self.XPATH_PATTERN):
            href = anchor.get('href')
            if href is None:
                continue
            matcher = self.IMG_RE_PATTERN.match(href)
            if matcher:
                yield matcher.group(1)

    def run(self):
        """Queue the ids found on every listing page, then flag completion."""
        # Without this declaration the assignment below would only bind a
        # local name and the module-level flag would stay False forever.
        global PUBLISH_FINISHED
        last_offset = self.page_no * self.PIC_NUM_PER_PAGE
        try:
            for offset in range(0, last_offset + 1, self.PIC_NUM_PER_PAGE):
                for img_no in self._photo_ids(offset):
                    self.put(img_no)
        finally:
            # Raise the flag even when a request or parse step fails, so
            # that whoever waits on it is not left waiting indefinitely.
            PUBLISH_FINISHED = True
class ImgSaver(Consumer, threading.Thread):
    """Consumer thread: downloads every queued photo id into ``FOLDER``."""

    # ``%s`` is replaced by the photo id taken from the queue.
    REAL_IMG_URL_PATTERN = 'http://img3.douban.com/view/photo/photo/public/p%s.jpg'
    # Directory (relative to the working directory) the images are saved to.
    FOLDER = 'emoji'

    def __init__(self, queue):
        """
        :param queue: queue the photo ids are read from.
        """
        threading.Thread.__init__(self)
        Consumer.__init__(self, queue)
        self.setName(self.__class__.__name__)

    def _save(self, img_no):
        """Download photo *img_no* and write it to ``FOLDER/<img_no>.jpg``."""
        resp = requests.get(self.REAL_IMG_URL_PATTERN % img_no, stream=True)
        target = os.path.join(self.FOLDER, "%s.jpg" % img_no)
        with open(target, 'wb') as fout:
            fout.write(resp.content)

    def run(self):
        """Save queued photos until the queue is empty and publishing is over."""
        # Without this declaration the assignment in ``finally`` would only
        # bind a local name and the module-level flag would never change.
        global CONSUME_FINISHED
        try:
            # open() does not create missing directories.
            if not os.path.isdir(self.FOLDER):
                os.makedirs(self.FOLDER)
            while True:
                # Sample the flag BEFORE polling the queue: if it was already
                # set and the queue is empty, nothing more can arrive.
                # Checking it afterwards could drop an item queued in between.
                publishing_done = PUBLISH_FINISHED
                try:
                    img_no = self.get(block=False)
                except Empty:
                    if publishing_done:
                        break
                    time.sleep(1)
                    continue
                self._save(img_no)
        finally:
            # Raise the flag even when a download or write fails.
            CONSUME_FINISHED = True
if __name__ == '__main__':
    # Script entry point: one producer and one consumer sharing a queue.
    queue = Queue()
    publisher = ImgMaker(queue, 21)
    consumer = ImgSaver(queue)
    publisher.start()
    consumer.start()
    # Wait on the threads themselves instead of polling CONSUME_FINISHED:
    # join() returns as soon as a thread exits, including after an uncaught
    # exception, whereas a flag is only as reliable as the code that sets it.
    publisher.join()
    # At module level this rebinds the real global, so the consumer's exit
    # condition is met even if the publisher ended without setting the flag.
    PUBLISH_FINISHED = True
    consumer.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment