muxuezi/alldoulist.py

## alldoulist.py
# -*- coding: utf-8 -*-
import urllib2
import thread
import Queue
import time
from bs4 import BeautifulSoup


def findlen(item):
    url = 'http://dongxi.douban.com/doulists/%s/?start=0' % (item)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    tempage = soup.find('div', class_='paginator').text
    lenitem = int(tempage.split()[-2])
    return lenitem


def getname_link(item, d):
    url = 'http://dongxi.douban.com/doulists/%s/?start=%d' % (item)
    html_doc = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html_doc, from_encoding="gb18030")
    name_link = soup.findAll('a', class_='doulist-title')
    for unit in name_link:
        doulist_name_link.append((unit.text.strip(), unit.get('href')))


def test(idx, name, url, allpage, dataQueue):
    for page in allpage:
        aurl = '%s?start=%d' % (url.split('?')[0], page)
        html_doc = urllib2.urlopen(aurl).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        for itemchild in soup.findAll('li', class_='carditem card-story-large '):
            name = itemchild.find('a').get('title').strip()
            link = itemchild.find('a').get('href')
            try:
                price = itemchild.find('span', class_="commodity-price").text
            except AttributeError:
                price = 'None'
            finally:
                cons = itemchild.find('ul', class_="stats-list").text.split()
                data = [str(idx), name, link, price] + cons
                # print data
                dataQueue.put(data)


def producer(doulist_name_link, idnum, dataQueue):
    for msgnum in range(nummessages):
        msgnum += idnum * nummessages
        name, url = doulist_name_link[msgnum].split('\t')
        url = url.strip()
        # print url
        html_doc = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html_doc, from_encoding="gb18030")
        try:
            tempage = soup.find('div', class_='paginator').text
        except AttributeError:
            allpage = [0]
        else:
            lenpage = int(tempage.split()[-2])
            allpage = map(lambda x: 20 * x, range(lenpage))
        finally:
            # print url,allpage
            test(msgnum, name, url, allpage, dataQueue)


def consumer(idnum, dataQueue):
    while True:
        time.sleep(0.1)
        try:
            data = dataQueue.get(block=False)
        except Queue.Empty:
            pass
        else:
            with safeprint:
                with open('doutemp2.txt', 'a+') as fileout:
                    fileout.write('consumer %s got => %s+\n' %
                                  (str(idnum), '\t'.join(data).encode('utf-8')))
                print 'consumer', idnum, 'got =>', data[0]


def alldoulist():
    doulist_name_link = []
    list_all = {'life': 83, 'interesting': 25,
                'fashion': 35, 'tech': 10}  # 20140525
    for k, v in list_all.items():
        v = findlen(k)
        list_all[k] = v  # update dict
        print k, v
        allpage = map(lambda x: 20 * x, range(v))
        for d in allpage:
            getname_link(k, d)  # write list name
    with open('doulist.txt', 'w') as filelist:
        for x in doulist_name_link:
            filelist.write('\t'.join(x) + '\n')
    return doulist_name_link

if __name__ == '__main__':
    doulist_name_link = alldoulist()

    numconsumers = 4    # how many consumers to start
    numproducers = 4     # how many producers to start
    # messages per producer to put
    nummessages = len(doulist_name_link) / numproducers

    safeprint = thread.allocate_lock()    # else prints may overlap
    dataQueue = Queue.Queue()             # shared global, infinite size

    with open('doutemp2.txt', 'w') as fileout:
        fileout.write('')
    for i in range(numconsumers):
        thread.start_new_thread(consumer, (i, dataQueue))
    for i in range(numproducers):
        thread.start_new_thread(producer, (doulist_name_link, i, dataQueue))
    print 'Main thread exit.'
	# -- coding: utf-8 --
	import urllib2
	import thread
	import Queue
	import time
	from bs4 import BeautifulSoup


	def findlen(item):
	url = 'http://dongxi.douban.com/doulists/%s/?start=0' % (item)
	html_doc = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html_doc, from_encoding="gb18030")
	tempage = soup.find('div', class_='paginator').text
	lenitem = int(tempage.split()[-2])
	return lenitem


	def getname_link(item, d):
	url = 'http://dongxi.douban.com/doulists/%s/?start=%d' % (item)
	html_doc = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html_doc, from_encoding="gb18030")
	name_link = soup.findAll('a', class_='doulist-title')
	for unit in name_link:
	doulist_name_link.append((unit.text.strip(), unit.get('href')))


	def test(idx, name, url, allpage, dataQueue):
	for page in allpage:
	aurl = '%s?start=%d' % (url.split('?')[0], page)
	html_doc = urllib2.urlopen(aurl).read()
	soup = BeautifulSoup(html_doc, from_encoding="gb18030")
	for itemchild in soup.findAll('li', class_='carditem card-story-large '):
	name = itemchild.find('a').get('title').strip()
	link = itemchild.find('a').get('href')
	try:
	price = itemchild.find('span', class_="commodity-price").text
	except AttributeError:
	price = 'None'
	finally:
	cons = itemchild.find('ul', class_="stats-list").text.split()
	data = [str(idx), name, link, price] + cons
	# print data
	dataQueue.put(data)


	def producer(doulist_name_link, idnum, dataQueue):
	for msgnum in range(nummessages):
	msgnum += idnum * nummessages
	name, url = doulist_name_link[msgnum].split('\t')
	url = url.strip()
	# print url
	html_doc = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html_doc, from_encoding="gb18030")
	try:
	tempage = soup.find('div', class_='paginator').text
	except AttributeError:
	allpage = [0]
	else:
	lenpage = int(tempage.split()[-2])
	allpage = map(lambda x: 20 * x, range(lenpage))
	finally:
	# print url,allpage
	test(msgnum, name, url, allpage, dataQueue)


	def consumer(idnum, dataQueue):
	while True:
	time.sleep(0.1)
	try:
	data = dataQueue.get(block=False)
	except Queue.Empty:
	pass
	else:
	with safeprint:
	with open('doutemp2.txt', 'a+') as fileout:
	fileout.write('consumer %s got => %s+\n' %
	(str(idnum), '\t'.join(data).encode('utf-8')))
	print 'consumer', idnum, 'got =>', data[0]


	def alldoulist():
	doulist_name_link = []
	list_all = {'life': 83, 'interesting': 25,
	'fashion': 35, 'tech': 10} # 20140525
	for k, v in list_all.items():
	v = findlen(k)
	list_all[k] = v # update dict
	print k, v
	allpage = map(lambda x: 20 * x, range(v))
	for d in allpage:
	getname_link(k, d) # write list name
	with open('doulist.txt', 'w') as filelist:
	for x in doulist_name_link:
	filelist.write('\t'.join(x) + '\n')
	return doulist_name_link

	if __name__ == '__main__':
	doulist_name_link = alldoulist()

	numconsumers = 4 # how many consumers to start
	numproducers = 4 # how many producers to start
	# messages per producer to put
	nummessages = len(doulist_name_link) / numproducers

	safeprint = thread.allocate_lock() # else prints may overlap
	dataQueue = Queue.Queue() # shared global, infinite size

	with open('doutemp2.txt', 'w') as fileout:
	fileout.write('')
	for i in range(numconsumers):
	thread.start_new_thread(consumer, (i, dataQueue))
	for i in range(numproducers):
	thread.start_new_thread(producer, (doulist_name_link, i, dataQueue))
	print 'Main thread exit.'