# demonkit/emoji.py

## emoji.py
# -*- coding: utf-8 -*-

"""
Get images from http://www.douban.com/photos/album/145486923/.
"""
__author__ = 'demonkit'


import re
import threading
import time
from Queue import Empty, Queue

import requests
from lxml import etree


# Module-level completion flags, both initially False.
# PUBLISH_FINISHED is read by ImgSaver.run to decide when an empty queue
# means "no more work"; CONSUME_FINISHED is read by the __main__ wait loop.
# A function that wants to flip either flag has to declare it ``global``
# first; a plain assignment inside a function only binds a local name.
PUBLISH_FINISHED = False
CONSUME_FINISHED = False


class Pulibsher(object):
    """Producer-side mixin that forwards items to a shared queue."""

    def __init__(self, queue):
        """Store the queue that published items are written to.

        :param queue: object exposing a ``put(item)`` method.
        """
        self.queue = queue

    def put(self, item):
        """Pass ``item`` to the stored queue's ``put`` method."""
        sink = self.queue
        sink.put(item)


class Consumer(object):
    """Consumer-side mixin that reads items from a shared queue."""

    def __init__(self, queue):
        """Store the queue that items are read from.

        :param queue: object exposing a ``get(block=...)`` method.
        """
        self.queue = queue

    def get(self, block=True):
        """Return the next item from the stored queue.

        :param block: forwarded unchanged as ``queue.get(block=block)``.
        """
        source = self.queue
        item = source.get(block=block)
        return item


class ImgMaker(Pulibsher, threading.Thread):
    """Producer thread: scrape album listing pages and publish photo ids.

    For every listing page the thread downloads ``KICK_START_URL`` with the
    ``start`` query parameter, selects anchors with ``XPATH_PATTERN`` and
    publishes the numeric id captured by ``IMG_RE_PATTERN`` from each
    anchor's ``href``.

    :param queue: queue that receives the photo ids (as strings).
    :param page_no: the requested ``start`` values are
        ``0, PIC_NUM_PER_PAGE, ..., page_no * PIC_NUM_PER_PAGE``.
    """

    KICK_START_URL = "http://www.douban.com/photos/album/145486923/"
    # Name of the query-string parameter; its value steps by PIC_NUM_PER_PAGE.
    param = 'start'
    XPATH_PATTERN = '//*[@id="content"]/div[3]/div[1]/div[2]/div/a'
    # Raw string so that ``\d`` reaches the regex engine untouched.
    IMG_RE_PATTERN = re.compile(r'http://www.douban.com/photos/photo/(\d+)/')

    PIC_NUM_PER_PAGE = 18

    def __init__(self, queue, page_no):
        threading.Thread.__init__(self)
        Pulibsher.__init__(self, queue)
        self.setName(self.__class__.__name__)
        self.page_no = page_no

    def run(self):
        """Publish every photo id found on the listing pages.

        Sets the module-level ``PUBLISH_FINISHED`` flag when done, including
        when a request or parse raises, so a consumer polling the flag is
        not left waiting forever.
        """
        # Without this declaration the assignment below would only bind a
        # local name and the module-level flag would stay False.
        global PUBLISH_FINISHED
        try:
            # The ``+ 1`` makes the stop value inclusive, so
            # ``page_no + 1`` listing pages are requested.
            last_start = self.page_no * self.PIC_NUM_PER_PAGE
            for page in range(0, last_start + 1, self.PIC_NUM_PER_PAGE):
                url = self.KICK_START_URL + "?%s=%s" % (self.param, str(page))
                content = requests.get(url).text
                tree = etree.HTML(content)
                for ele in tree.xpath(self.XPATH_PATTERN):
                    img_url = ele.get('href')
                    if img_url is None:
                        continue
                    matcher = self.IMG_RE_PATTERN.match(img_url)
                    if matcher:
                        self.put(matcher.group(1))
        finally:
            PUBLISH_FINISHED = True


class ImgSaver(Consumer, threading.Thread):
    """Consumer thread: download each queued photo id into ``FOLDER``.

    Each id taken from the queue is formatted into ``REAL_IMG_URL_PATTERN``,
    fetched, and written to ``FOLDER/<id>.jpg``.

    :param queue: queue that photo ids are read from.
    """

    REAL_IMG_URL_PATTERN = 'http://img3.douban.com/view/photo/photo/public/p%s.jpg'
    FOLDER = 'emoji'

    def __init__(self, queue):
        threading.Thread.__init__(self)
        Consumer.__init__(self, queue)
        self.setName(self.__class__.__name__)

    def run(self):
        """Drain the queue until the publisher is finished and it is empty.

        Sets the module-level ``CONSUME_FINISHED`` flag on exit, including
        when a download or file write raises.
        """
        # Without this declaration the assignment in ``finally`` would only
        # bind a local name and the module-level flag would stay False.
        global CONSUME_FINISHED
        import os  # local import: only needed to prepare the output folder

        try:
            # open() below does not create missing directories.
            if not os.path.isdir(self.FOLDER):
                os.makedirs(self.FOLDER)
            while True:
                # Read the flag BEFORE polling the queue.  If it were read
                # after a failed get(), the publisher could enqueue its last
                # id and set the flag in between, and that id would be lost.
                publisher_done = PUBLISH_FINISHED
                try:
                    img_no = self.get(block=False)
                except Empty:
                    if publisher_done:
                        break
                    time.sleep(1)
                    continue
                real_img_url = self.REAL_IMG_URL_PATTERN % img_no
                resp = requests.get(real_img_url, stream=True)
                with open("%s/%s.jpg" % (self.FOLDER, img_no), 'wb') as fout:
                    fout.write(resp.content)
        finally:
            CONSUME_FINISHED = True


if __name__ == '__main__':
    # One producer scraping the album pages, one consumer saving the images,
    # connected through a single shared queue.
    queue = Queue()
    publisher = ImgMaker(queue, 21)
    consumer = ImgSaver(queue)
    publisher.start()
    consumer.start()
    # Wait on the consumer thread itself rather than on a module flag, so
    # the wait ends even if the thread exits without setting the flag.
    # join() with a timeout keeps the main thread responsive to Ctrl-C.
    while consumer.is_alive():
        consumer.join(1)
	# -*- coding: utf-8 -*-

	"""
	Get images from http://www.douban.com/photos/album/145486923/.
	"""
	__author__ = 'demonkit'


	import re
	import threading
	import time
	from Queue import Empty, Queue

	import requests
	from lxml import etree


	# Module-level completion flags, both initially False.
	# PUBLISH_FINISHED is read by ImgSaver.run; CONSUME_FINISHED is read by
	# the __main__ wait loop.  A function that wants to flip either flag has
	# to declare it ``global`` first; a plain assignment inside a function
	# only binds a local name.
	PUBLISH_FINISHED = False
	CONSUME_FINISHED = False


	class Pulibsher(object):
	    """Producer-side mixin that forwards items to a shared queue."""

	    def __init__(self, queue):
	        """Store the queue that published items are written to.

	        :param queue: object exposing a ``put(item)`` method.
	        """
	        self.queue = queue

	    def put(self, item):
	        """Pass ``item`` to the stored queue's ``put`` method."""
	        self.queue.put(item)



	class Consumer(object):
	    """Consumer-side mixin that reads items from a shared queue."""

	    def __init__(self, queue):
	        """Store the queue that items are read from.

	        :param queue: object exposing a ``get(block=...)`` method.
	        """
	        self.queue = queue

	    def get(self, block=True):
	        """Return the next item from the stored queue.

	        :param block: forwarded unchanged as ``queue.get(block=block)``.
	        """
	        return self.queue.get(block=block)


	class ImgMaker(Pulibsher, threading.Thread):
	    """Producer thread: scrape album listing pages and publish photo ids.

	    For every listing page the thread downloads ``KICK_START_URL`` with
	    the ``start`` query parameter, selects anchors with ``XPATH_PATTERN``
	    and publishes the numeric id captured by ``IMG_RE_PATTERN`` from each
	    anchor's ``href``.

	    :param queue: queue that receives the photo ids (as strings).
	    :param page_no: the requested ``start`` values are
	        ``0, PIC_NUM_PER_PAGE, ..., page_no * PIC_NUM_PER_PAGE``.
	    """

	    KICK_START_URL = "http://www.douban.com/photos/album/145486923/"
	    # Name of the query-string parameter; its value steps by
	    # PIC_NUM_PER_PAGE.
	    param = 'start'
	    XPATH_PATTERN = '//*[@id="content"]/div[3]/div[1]/div[2]/div/a'
	    # Raw string so that ``\d`` reaches the regex engine untouched.
	    IMG_RE_PATTERN = re.compile(r'http://www.douban.com/photos/photo/(\d+)/')

	    PIC_NUM_PER_PAGE = 18

	    def __init__(self, queue, page_no):
	        threading.Thread.__init__(self)
	        Pulibsher.__init__(self, queue)
	        self.setName(self.__class__.__name__)
	        self.page_no = page_no

	    def run(self):
	        """Publish every photo id found on the listing pages.

	        Sets the module-level ``PUBLISH_FINISHED`` flag when done,
	        including when a request or parse raises, so a consumer polling
	        the flag is not left waiting forever.
	        """
	        # Without this declaration the assignment below would only bind
	        # a local name and the module-level flag would stay False.
	        global PUBLISH_FINISHED
	        try:
	            # The ``+ 1`` makes the stop value inclusive, so
	            # ``page_no + 1`` listing pages are requested.
	            last_start = self.page_no * self.PIC_NUM_PER_PAGE
	            for page in range(0, last_start + 1, self.PIC_NUM_PER_PAGE):
	                url = self.KICK_START_URL + "?%s=%s" % (self.param, str(page))
	                content = requests.get(url).text
	                tree = etree.HTML(content)
	                for ele in tree.xpath(self.XPATH_PATTERN):
	                    img_url = ele.get('href')
	                    if img_url is None:
	                        continue
	                    matcher = self.IMG_RE_PATTERN.match(img_url)
	                    if matcher:
	                        self.put(matcher.group(1))
	        finally:
	            PUBLISH_FINISHED = True


	class ImgSaver(Consumer, threading.Thread):
	    """Consumer thread: download each queued photo id into ``FOLDER``.

	    Each id taken from the queue is formatted into
	    ``REAL_IMG_URL_PATTERN``, fetched, and written to ``FOLDER/<id>.jpg``.

	    :param queue: queue that photo ids are read from.
	    """

	    REAL_IMG_URL_PATTERN = 'http://img3.douban.com/view/photo/photo/public/p%s.jpg'
	    FOLDER = 'emoji'

	    def __init__(self, queue):
	        threading.Thread.__init__(self)
	        Consumer.__init__(self, queue)
	        self.setName(self.__class__.__name__)

	    def run(self):
	        """Drain the queue until the publisher is finished and it is empty.

	        Sets the module-level ``CONSUME_FINISHED`` flag on exit, including
	        when a download or file write raises.
	        """
	        # Without this declaration the assignment in ``finally`` would
	        # only bind a local name and the module-level flag would stay
	        # False.
	        global CONSUME_FINISHED
	        import os  # local import: only needed to prepare the output folder

	        try:
	            # open() below does not create missing directories.
	            if not os.path.isdir(self.FOLDER):
	                os.makedirs(self.FOLDER)
	            while True:
	                # Read the flag BEFORE polling the queue.  If it were read
	                # after a failed get(), the publisher could enqueue its
	                # last id and set the flag in between, and that id would
	                # be lost.
	                publisher_done = PUBLISH_FINISHED
	                try:
	                    img_no = self.get(block=False)
	                except Empty:
	                    if publisher_done:
	                        break
	                    time.sleep(1)
	                    continue
	                real_img_url = self.REAL_IMG_URL_PATTERN % img_no
	                resp = requests.get(real_img_url, stream=True)
	                with open("%s/%s.jpg" % (self.FOLDER, img_no), 'wb') as fout:
	                    fout.write(resp.content)
	        finally:
	            CONSUME_FINISHED = True


	if __name__ == '__main__':
	    # One producer scraping the album pages, one consumer saving the
	    # images, connected through a single shared queue.
	    queue = Queue()
	    publisher = ImgMaker(queue, 21)
	    consumer = ImgSaver(queue)
	    publisher.start()
	    consumer.start()
	    # Wait on the consumer thread itself rather than on a module flag, so
	    # the wait ends even if the thread exits without setting the flag.
	    # join() with a timeout keeps the main thread responsive to Ctrl-C.
	    while consumer.is_alive():
	        consumer.join(1)