Skip to content

Instantly share code, notes, and snippets.

@udonmai
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save udonmai/019829532db98f6866bb to your computer and use it in GitHub Desktop.
Save udonmai/019829532db98f6866bb to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# coding=utf-8
#import os
import sys
#import string
#import urllib
import urllib2
import re
import json
from bs4 import BeautifulSoup
import gevent
from gevent import monkey, queue, event, pool
# Swap the blocking standard-library modules (sockets and friends) for
# gevent's cooperative versions so network calls made inside greenlets yield
# to other greenlets instead of blocking the whole process.
monkey.patch_all()
def req(url):
    """Download *url* and return its body parsed with BeautifulSoup.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        urllib2.HTTPError: Propagated unchanged from ``urllib2.urlopen``
            when the server answers with an error status.
    """
    user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/35.0.1916.153 Safari/537.36")
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Always release the connection; this function is called from many
        # concurrent workers and would otherwise leak one socket per call.
        response.close()
    # No parser is named, so BeautifulSoup chooses among the installed ones.
    return BeautifulSoup(html)
def eightbooks_processor(page_content):
    """Collect the book boxes that follow the "読み終わった本" heading.

    Every sibling after the matching ``<h3>`` is scanned. Only tags whose
    second CSS class is ``book_box_inline_3r`` are treated as books; text
    nodes, ``<br>`` tags, the ``more`` link and tags without the expected
    classes are skipped instead of raising.

    Args:
        page_content: Parsed user page offering ``find`` (a BeautifulSoup
            document in this file).

    Returns:
        A dict keyed by a running integer index starting at 0. Each value is
        a dict with the keys ``imageurl``, ``book_name``, ``url``, ``isbn``
        and ``author_name``. The dict is empty when the heading is absent.
    """
    book_collector = {}
    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if heading is None:
        print('***********************')
        return book_collector

    num = 0
    for book in heading.next_siblings:
        # Text nodes carry no tag name; they and <br> separators hold no data.
        tag_name = getattr(book, 'name', None)
        if tag_name is None or tag_name == 'br':
            continue
        # A tag without a class attribute yields None here rather than the
        # KeyError that indexing book['class'] would raise.
        classes = book.get('class') or []
        if len(classes) < 2 or classes[0] == 'more':
            continue
        if classes[1] != 'book_box_inline_3r':
            continue

        entry = {}
        book_collector[num] = entry
        # Progress marker: one '+' per book already collected for this user.
        print('+' * num)

        doc_book_image = book.find(attrs={'class': 'book_box_book_image'})
        entry['imageurl'] = doc_book_image.a.img['src']

        doc_book_title = book.find(attrs={'class': 'book_box_book_title'})
        entry['book_name'] = doc_book_title.a.contents
        entry['url'] = doc_book_title.a['href']

        # The title link is site-relative; the book page is fetched to find
        # the Amazon link, and the Amazon page in turn supplies the ISBNs.
        bookurl = 'http://bookmeter.com' + entry['url']
        amazon_url = book_amazon_url_processor(bookurl)
        entry['isbn'] = amazon_processor(amazon_url)

        doc_book_author = book.find(attrs={'class': 'book_box_book_author'})
        author_link = None
        if doc_book_author is not None:
            author_link = doc_book_author.a
        if author_link is not None:
            entry['author_name'] = author_link.contents
        else:
            entry['author_name'] = ''
        num += 1
    return book_collector
def book_amazon_url_processor(bookurl):
    """Return the Amazon link shown on a bookmeter book page.

    Args:
        bookurl: Absolute URL of the book's page.

    Returns:
        The ``href`` of the first ``<a>`` inside the element with class
        ``book_detail_amazon_right``, or ``''`` when the page cannot be
        fetched or contains no such link.
    """
    try:
        page_content = req(bookurl)
    except urllib2.HTTPError as e:
        # One unavailable book page must not abort the whole user; report it
        # and fall back to "no Amazon link".
        print('We failed with error code - %s.' % e.code)
        print('This link (' + bookurl + ')is not available.')
        return ''

    amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
    if amazon_box is None:
        return ''
    link = amazon_box.a
    if link is None:
        return ''
    return link.get('href', '')
def amazon_processor(amazon_url):
    """Read the ISBN-10 and ISBN-13 values from an Amazon product page.

    Args:
        amazon_url: URL of the product page, or ``''`` when none is known.

    Returns:
        A new dict with exactly the keys ``'10'`` and ``'13'``. A value is
        ``''`` when the URL is empty, the page answers with an HTTP error,
        or the corresponding label is not found on the page.
    """
    isbn = {'10': '', '13': ''}
    if amazon_url == '':
        return isbn
    try:
        page_content = req(amazon_url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('This link (' + amazon_url + ')is not available.')
        return isbn

    for key, label in (('10', 'ISBN-10:'), ('13', 'ISBN-13:')):
        isbn[key] = _isbn_after_label(page_content, label)
    return isbn


def _isbn_after_label(page_content, label):
    """Return the token that follows the ``<b>`` tag matching *label*, or ''."""
    label_tag = page_content.find('b', text=re.compile(label))
    if label_tag is None:
        return ''
    sibling = label_tag.next_sibling
    # Only a text node can hold the number; a missing sibling or another tag
    # (anything that has a tag name) means there is nothing to read.
    if sibling is None or getattr(sibling, 'name', None) is not None:
        return ''
    # The text is expected to look like " 1234567890": the value is the
    # piece after the first space.
    parts = sibling.split(' ')
    if len(parts) < 2:
        return ''
    return parts[1]
def basicworker(user, user_id):
    """Scrape one bookmeter user page and record the result in *user*.

    ``user[user_id]`` is first reset to ``{}``. When the page is fetched
    successfully it is replaced by the output of ``eightbooks_processor`` and
    the whole *user* dict is written to ``data_gevent_15_46.json``.

    Args:
        user: Shared dict mapping user id to collected books; mutated in
            place.
        user_id: Id appended to ``http://bookmeter.com/u/`` to build the URL.

    Returns:
        None. Fetch failures are reported on stdout and leave the empty
        entry in place.
    """
    # Original url
    ori_url = 'http://bookmeter.com/u/'
    user[user_id] = {}
    url = ori_url + str(user_id)
    try:
        page_content = req(url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('user ' + str(user_id) + ' is not existed.')
        print('- - - - - - - - - - -')
        return
    except urllib2.URLError as e:
        # Connection-level failure (no HTTP status available); without this
        # handler the error would terminate the worker with a traceback.
        print('We failed to reach the server - %s.' % e.reason)
        print('- - - - - - - - - - -')
        return

    print('- - - - - - - - - - -')
    print(url)
    print('- - - - - - - - - - -')
    user[user_id] = eightbooks_processor(page_content)
    print(str(user_id) + ' => 成功!')
    # The complete dict is rewritten after each successful user so partial
    # results survive an interrupted run.
    # NOTE(review): assumed to belong to the success path only - confirm.
    with open('data_gevent_15_46.json', 'w') as outfile:
        json.dump(user, outfile)
if __name__ == '__main__':
    # Shared result store handed to every worker: user id -> collected books.
    user = {}

    # Users are addressed in batches of 1000 ids; crawl from the start of
    # batch 16 through the end of batch 46 (ids 15001..46000).
    first_batch = 16
    start_id = (first_batch - 1) * 1000 + 1
    stop_id = (first_batch + 30) * 1000 + 1

    # Named `workers` so the imported gevent `pool` module is not shadowed.
    # At most 300 greenlets run at once; spawn() blocks while the pool is full.
    workers = pool.Pool(300)
    for user_id in xrange(start_id, stop_id):
        workers.spawn(basicworker, user, user_id)
    workers.join()
    # An earlier variant spawned the jobs in batches of 100 with
    # gevent.spawn/gevent.joinall; the bounded pool above replaces it.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment