udonmai/gevent_bookmeter_crawler.py

## gevent_bookmeter_crawler.py
#!/usr/bin/python
# coding=utf-8

#import os
import sys
#import string
#import urllib
import urllib2
import re
import json
from bs4 import BeautifulSoup

import gevent
from gevent import monkey, queue, event, pool
monkey.patch_all()


def req(url):
    """Download *url* with a browser-like User-Agent and parse the body.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the raw response body.

    Raises:
        Nothing is caught here: errors raised by ``urllib2.urlopen`` (for
        example the ``urllib2.HTTPError`` that callers in this file handle)
        propagate to the caller.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    request = urllib2.Request(url, headers={'User-Agent': user_agent})

    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Always release the connection; the script runs many requests
        # concurrently and previously never closed any of them.
        response.close()

    return BeautifulSoup(html)


def eightbooks_processor(page_content):
    """Collect the books listed after the "finished reading" heading.

    Finds the ``<h3>`` whose text matches the Japanese heading literal used
    below, then walks the siblings that follow it. For every sibling tag
    whose second CSS class is ``book_box_inline_3r`` it records the cover
    image URL, title, relative URL, ISBNs (resolved through
    ``book_amazon_url_processor`` and ``amazon_processor``) and author.

    Args:
        page_content: Parsed user page exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        dict: Maps a running counter to a dict with the keys ``imageurl``,
        ``book_name``, ``url``, ``isbn`` and ``author_name``. The counter
        advances for every sibling tag that is not skipped, so keys are not
        necessarily contiguous. Empty when the heading is absent.
    """
    book_collector = {}

    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if not heading:
        print('***********************')
        return book_collector

    num = 0
    for book in heading.next_siblings:
        # Text nodes have no tag name; neither they nor <br> separators are
        # book boxes. Indexing a text node with ['class'] used to crash.
        tag_name = getattr(book, 'name', None)
        if tag_name is None or tag_name == 'br':
            continue

        # A tag may carry no class attribute at all, or only one class.
        classes = book.get('class') or []
        if classes and classes[0] == 'more':
            continue

        if len(classes) > 1 and classes[1] == 'book_box_inline_3r':
            print('+' * num)

            image_box = book.find(attrs={'class': 'book_box_book_image'})
            title_box = book.find(attrs={'class': 'book_box_book_title'})
            author_box = book.find(attrs={'class': 'book_box_book_author'})

            entry = {
                'imageurl': image_box.a.img['src'],
                'book_name': title_box.a.contents,
                'url': title_box.a['href'],
            }

            # The ISBNs are not on the user page: follow the book page to its
            # Amazon link and read them from there.
            amazon_url = book_amazon_url_processor(
                'http://bookmeter.com' + entry['url'])
            entry['isbn'] = amazon_processor(amazon_url)

            if author_box:
                entry['author_name'] = author_box.a.contents
            else:
                entry['author_name'] = ''

            book_collector[num] = entry

        num += 1

    return book_collector


def book_amazon_url_processor(bookurl):
    """Return the Amazon link found on a book detail page.

    Args:
        bookurl: Absolute URL of the book's detail page.

    Returns:
        The ``href`` of the anchor inside the element with class
        ``book_detail_amazon_right``, or ``''`` when the page cannot be
        fetched or that element, its anchor, or the ``href`` is missing.
    """
    try:
        page_content = req(bookurl)
    except urllib2.HTTPError as e:
        # Same policy as amazon_processor: an unavailable page means
        # "no link" instead of aborting the whole user crawl.
        print('We failed with error code - %s.' % e.code)
        print('This link (' + bookurl + ')is not available.')
        return ''

    amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
    if not amazon_box or amazon_box.a is None:
        return ''
    return amazon_box.a.get('href', '')


def _extract_isbn(page_content, label):
    """Return the value following the ``<b>`` element matching *label*, or ''."""
    marker = page_content.find('b', text=re.compile(label))
    if not marker:
        return ''
    try:
        # The value is taken as the second space-separated piece of the node
        # that directly follows the label.
        return marker.next_sibling.split(' ')[1]
    except (AttributeError, IndexError, TypeError):
        # No following node, a non-text node, or no space-separated value.
        return ''


def amazon_processor(amazon_url):
    """Look up the ISBN-10 and ISBN-13 shown on an Amazon product page.

    Args:
        amazon_url: URL of the product page, or ``''`` when none is known.

    Returns:
        dict: Always has the keys ``'10'`` and ``'13'``. A value is ``''``
        when the URL is empty, the page answers with an HTTP error, or the
        corresponding label cannot be found or parsed.
    """
    isbn = {'10': '', '13': ''}
    if amazon_url == '':
        return isbn

    try:
        page_content = req(amazon_url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('This link (' + amazon_url + ')is not available.')
        return isbn

    isbn['10'] = _extract_isbn(page_content, 'ISBN-10:')
    isbn['13'] = _extract_isbn(page_content, 'ISBN-13:')
    return isbn


def basicworker(user, user_id):
    """Crawl one user's page and store the collected books in *user*.

    Args:
        user: Shared dict mutated in place. ``user[user_id]`` is first set to
            ``{}`` and then replaced by the result of
            ``eightbooks_processor``; it stays ``{}`` when the page cannot
            be fetched.
        user_id: Id appended to ``http://bookmeter.com/u/`` to build the URL.

    Side effects:
        Prints progress messages and, after each successful crawl, rewrites
        ``data_gevent_15_46.json`` with the entire *user* dict.
    """
    # Original url
    ori_url = 'http://bookmeter.com/u/'

    user[user_id] = {}
    url = ori_url + str(user_id)

    try:
        page_content = req(url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('user ' + str(user_id) + ' is not existed.')
        print('- - - - - - - - - - -')
        # There is no page to parse. Falling through used to hit an unbound
        # ``page_content`` and raise UnboundLocalError.
        return

    print('- - - - - - - - - - -')
    print(url)
    print('- - - - - - - - - - -')

    user[user_id] = eightbooks_processor(page_content)
    print(str(user_id) + ' => 成功！')

    # NOTE(review): every finished worker rewrites the complete file; kept
    # because nothing else in the script persists the results.
    with open('data_gevent_15_46.json', 'w') as outfile:
        json.dump(user, outfile)

if __name__ == '__main__':
    # Results shared by all workers, keyed by user id.
    user = {}

    # Crawl ids (batch - 1) * 1000 + 1 through (batch + 30) * 1000 inclusive.
    batch = 16
    first_id = (batch - 1) * 1000 + 1
    last_id = (batch + 30) * 1000

    # Use a distinct name so the imported gevent ``pool`` module is not
    # shadowed by the Pool instance.
    workers = pool.Pool(300)
    for user_id in xrange(first_id, last_id + 1):
        workers.spawn(basicworker, user, user_id)
    workers.join()
	#!/usr/bin/python
	# coding=utf-8

	#import os
	import sys
	#import string
	#import urllib
	import urllib2
	import re
	import json
	from bs4 import BeautifulSoup

	import gevent
	from gevent import monkey, queue, event, pool
	monkey.patch_all()


	def req(url):
		"""Download *url* with a browser-like User-Agent and parse the body.

		Args:
			url: Absolute URL to fetch.

		Returns:
			A ``BeautifulSoup`` document built from the raw response body.

		Raises:
			Nothing is caught here: errors raised by ``urllib2.urlopen``
			propagate to the caller.
		"""
		user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
		request = urllib2.Request(url, headers={'User-Agent': user_agent})

		response = urllib2.urlopen(request)
		try:
			html = response.read()
		finally:
			# Always release the connection, even when the read fails.
			response.close()

		return BeautifulSoup(html)


	def eightbooks_processor(page_content):
		"""Collect the books listed after the "finished reading" heading.

		Finds the ``<h3>`` whose text matches the Japanese heading literal
		used below and walks the siblings that follow it. Every sibling tag
		whose second CSS class is ``book_box_inline_3r`` yields one entry.

		Args:
			page_content: Parsed user page exposing a BeautifulSoup-style
				``find`` method.

		Returns:
			dict: Maps a running counter to a dict with the keys
			``imageurl``, ``book_name``, ``url``, ``isbn`` and
			``author_name``. The counter advances for every sibling tag that
			is not skipped. Empty when the heading is absent.
		"""
		book_collector = {}

		heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
		if not heading:
			print('***********************')
			return book_collector

		num = 0
		for book in heading.next_siblings:
			# Text nodes have no tag name; they and <br> are not book boxes.
			tag_name = getattr(book, 'name', None)
			if tag_name is None or tag_name == 'br':
				continue

			# A tag may carry no class attribute at all, or only one class.
			classes = book.get('class') or []
			if classes and classes[0] == 'more':
				continue

			if len(classes) > 1 and classes[1] == 'book_box_inline_3r':
				print('+' * num)

				image_box = book.find(attrs={'class': 'book_box_book_image'})
				title_box = book.find(attrs={'class': 'book_box_book_title'})
				author_box = book.find(attrs={'class': 'book_box_book_author'})

				entry = {
					'imageurl': image_box.a.img['src'],
					'book_name': title_box.a.contents,
					'url': title_box.a['href'],
				}

				# ISBNs come from the Amazon page linked on the book page.
				amazon_url = book_amazon_url_processor(
					'http://bookmeter.com' + entry['url'])
				entry['isbn'] = amazon_processor(amazon_url)

				if author_box:
					entry['author_name'] = author_box.a.contents
				else:
					entry['author_name'] = ''

				book_collector[num] = entry

			num += 1

		return book_collector


	def book_amazon_url_processor(bookurl):
		"""Return the Amazon link found on a book detail page.

		Args:
			bookurl: Absolute URL of the book's detail page.

		Returns:
			The ``href`` of the anchor inside the element with class
			``book_detail_amazon_right``, or ``''`` when the page cannot be
			fetched or that element, its anchor, or the ``href`` is missing.
		"""
		try:
			page_content = req(bookurl)
		except urllib2.HTTPError as e:
			# An unavailable page means "no link" rather than a crash.
			print('We failed with error code - %s.' % e.code)
			print('This link (' + bookurl + ')is not available.')
			return ''

		amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
		if not amazon_box or amazon_box.a is None:
			return ''
		return amazon_box.a.get('href', '')


	def _extract_isbn(page_content, label):
		"""Return the value following the ``<b>`` element matching *label*, or ''."""
		marker = page_content.find('b', text=re.compile(label))
		if not marker:
			return ''
		try:
			# Second space-separated piece of the node after the label.
			return marker.next_sibling.split(' ')[1]
		except (AttributeError, IndexError, TypeError):
			# No following node, a non-text node, or no separable value.
			return ''


	def amazon_processor(amazon_url):
		"""Look up the ISBN-10 and ISBN-13 shown on an Amazon product page.

		Args:
			amazon_url: URL of the product page, or ``''`` when none is known.

		Returns:
			dict: Always has the keys ``'10'`` and ``'13'``. A value is
			``''`` when the URL is empty, the page answers with an HTTP
			error, or the corresponding label cannot be found or parsed.
		"""
		isbn = {'10': '', '13': ''}
		if amazon_url == '':
			return isbn

		try:
			page_content = req(amazon_url)
		except urllib2.HTTPError as e:
			print('We failed with error code - %s.' % e.code)
			print('This link (' + amazon_url + ')is not available.')
			return isbn

		isbn['10'] = _extract_isbn(page_content, 'ISBN-10:')
		isbn['13'] = _extract_isbn(page_content, 'ISBN-13:')
		return isbn


	def basicworker(user, user_id):
		"""Crawl one user's page and store the collected books in *user*.

		Args:
			user: Shared dict mutated in place. ``user[user_id]`` is first
				set to ``{}`` and then replaced by the result of
				``eightbooks_processor``; it stays ``{}`` when the page
				cannot be fetched.
			user_id: Id appended to ``http://bookmeter.com/u/``.

		Side effects:
			Prints progress messages and, after each successful crawl,
			rewrites ``data_gevent_15_46.json`` with the entire *user* dict.
		"""
		# Original url
		ori_url = 'http://bookmeter.com/u/'

		user[user_id] = {}
		url = ori_url + str(user_id)

		try:
			page_content = req(url)
		except urllib2.HTTPError as e:
			print('We failed with error code - %s.' % e.code)
			print('user ' + str(user_id) + ' is not existed.')
			print('- - - - - - - - - - -')
			# No page to parse; continuing would use an unbound variable.
			return

		print('- - - - - - - - - - -')
		print(url)
		print('- - - - - - - - - - -')

		user[user_id] = eightbooks_processor(page_content)
		print(str(user_id) + ' => 成功！')

		with open('data_gevent_15_46.json', 'w') as outfile:
			json.dump(user, outfile)

	if __name__ == '__main__':
		# Results shared by all workers, keyed by user id.
		user = {}

		# Crawl ids (batch - 1) * 1000 + 1 through (batch + 30) * 1000.
		batch = 16
		first_id = (batch - 1) * 1000 + 1
		last_id = (batch + 30) * 1000

		# Distinct name so the imported gevent ``pool`` module is not shadowed.
		workers = pool.Pool(300)
		for user_id in xrange(first_id, last_id + 1):
			workers.spawn(basicworker, user, user_id)
		workers.join()