Skip to content

Instantly share code, notes, and snippets.

@udonmai
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save udonmai/fba1629821115b97d544 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# coding=utf-8
import os
#import sys
#import string
#import urllib
import urllib2
import re
import json
from bs4 import BeautifulSoup
def req(url):
    """Fetch the page at *url* and return it parsed as a BeautifulSoup tree.

    Sends a desktop-Chrome User-Agent header so the site serves its normal
    desktop markup rather than a bot or mobile page.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    request = urllib2.Request(url, headers={'User-Agent': user_agent})
    body = urllib2.urlopen(request).read()
    return BeautifulSoup(body)
def eightbooks_processor(page_content):
    """Extract the finished-books (読み終わった本) section from a bookmeter
    user page.

    Parameters:
        page_content: BeautifulSoup tree of a bookmeter user page.

    Returns a dict mapping a running integer index to a per-book dict with
    keys 'imageurl', 'book_name', 'url', 'isbn' and 'author_name'.  Returns
    an empty dict when the section is not present on the page.
    """
    book_collector = {}
    num = 0
    # The book boxes are the siblings that follow the section's <h3> heading.
    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if not heading:
        print('***********************')  # marker: section missing on this page
        return book_collector
    for book in heading.next_siblings:
        # Skip whitespace text nodes and <br> separators between boxes.
        if book == '\n' or book.name == 'br':
            continue
        # Guard: tags without a class attribute (or with too few classes)
        # cannot be book boxes; the original code raised KeyError/IndexError
        # here instead of skipping them.
        classes = book.get('class') or []
        if not classes or classes[0] == 'more':
            continue
        print(classes[0])
        if len(classes) > 1 and classes[1] == 'book_box_inline_3r':
            book_collector[num] = {}
            print('&&&&&&&&&&&&&&&&&&&&&')
            image_box = book.find(attrs={'class': 'book_box_book_image'})
            book_collector[num]['imageurl'] = image_box.a.img['src']
            title_box = book.find(attrs={'class': 'book_box_book_title'})
            book_collector[num]['book_name'] = title_box.a.contents
            book_collector[num]['url'] = title_box.a['href']
            bookurl = 'http://bookmeter.com' + book_collector[num]['url']
            # Follow the book's detail page to its Amazon link, then scrape
            # the ISBNs from Amazon (both '' when unavailable).
            amazon_url = book_amazon_url_processor(bookurl)
            book_collector[num]['isbn'] = amazon_processor(amazon_url)
            author_box = book.find(attrs={'class': 'book_box_book_author'})
            if author_box:
                book_collector[num]['author_name'] = author_box.a.contents
            else:
                book_collector[num]['author_name'] = ''
            num += 1
    return book_collector
def book_amazon_url_processor(bookurl):
    """Return the Amazon product link found on a bookmeter book detail page.

    Returns '' when the page has no Amazon link box.
    """
    soup = req(bookurl)
    link_box = soup.find(attrs={'class': 'book_detail_amazon_right'})
    return link_box.a['href'] if link_box else ''
def amazon_processor(amazon_url):
if amazon_url == '':
isbn = {}
isbn['10'] = ''
isbn['13'] = ''
return isbn
try:
page_content = req(amazon_url)
except urllib2.HTTPError, e:
print 'We failed with error code - %s.' % e.code
print('This link (' + amazon_url + ')is not available.')
isbn = {}
isbn['10'] = ''
isbn['13'] = ''
return isbn
else:
isbn = {}
isbn_tmp = page_content.find('b', text=re.compile('ISBN-10:'))
if isbn_tmp:
isbn['10'] = isbn_tmp.next_sibling.split(' ')[1]
isbn_tmp = page_content.find('b', text=re.compile('ISBN-13:'))
if isbn_tmp:
isbn['13'] = isbn_tmp.next_sibling.split(' ')[1]
#print(isbn)
return isbn
if __name__ == '__main__':
    # Crawl bookmeter user pages sequentially starting from this id and
    # dump everything scraped so far to a JSON file after each user.
    # Total users
    num = 99999
    # 2nd
    #num = 277
    # 3nd
    #num = 966
    # User
    user = {}
    # Original url
    ori_url = 'http://bookmeter.com/u/'
    # Main cycle -- runs forever; stop the script manually.
    while True:
        user[num] = {}
        # NOTE(review): the URL is built from num + 1 while the dict key and
        # all log messages use num -- looks off by one; confirm which user id
        # the dumped keys are meant to refer to.
        url = ori_url + str(num + 1)
        try:
            page_content = req(url)
        except urllib2.HTTPError, e:
            # Page fetch failed (e.g. 404 for a nonexistent user); leave the
            # empty dict placeholder for this id and move on.
            print 'We failed with error code - %s.' % e.code
            print('user ' + str(num) + ' is not existed.')
            print('- - - - - - - - - - -')
            #not_exist = page_content.find_all(text='このページはご利用いただけません')
            #if not_exist:
            #print('user ' + num + ' is not existed.\n')
        else:
            #print(page_content)
            print('- - - - - - - - - - -')
            print(url)
            user[num] = eightbooks_processor(page_content)
            print(user)
        # Checkpoint: rewrite the whole accumulated dict after every user so
        # a crash loses at most one page of work.
        with open('data_100000.json', 'w') as outfile:
            json.dump(user, outfile)
        num += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment