# Gist by @laughingman7743, created November 5, 2014
# https://gist.github.com/laughingman7743/6ad901257aea1acf7bf1

# -*- coding: utf-8 -*-
# ramendb/items.py
from scrapy.item import Item, Field


class RamenStyle(Item):
    shop_id = Field()
    style = Field()
    soup = Field()


class RamenReview(Item):
    review_id = Field()
    shop_id = Field()
    url = Field()
    user_name = Field()
    score = Field()
    title = Field()
    comment = Field()
    image = Field()
    create_date = Field()
    modified_date = Field()


class RamenShop(Item):
    shop_id = Field()
    url = Field()
    name = Field()
    address = Field()
    tel = Field()
    business_hours = Field()
    closed_on = Field()
    capacity = Field()
    smoking = Field()
    station = Field()
    access = Field()
    parking = Field()
    opening_date = Field()
    menu = Field()
    note = Field()
    tags = Field()
    score = Field()
    review_count = Field()
    review_user_count = Field()
    average = Field()
    ranking = Field()
    like = Field()
    lat = Field()
    lon = Field()
    image = Field()
    retire = Field()


class RamenShopImage(Item):
    shop_id = Field()
    blob = Field()


# -*- coding: utf-8 -*-
# ramendb/pipelines.py (filename assumed by Scrapy convention; it must match
# the entry in ITEM_PIPELINES)
"""
MySQL Pipeline
http://snipplr.com/view/66986/mysql-pipeline/
"""
import datetime
import MySQLdb.cursors
from twisted.enterprise import adbapi
from decimal import Decimal
from scrapy import log
from ramendb import settings
from ramendb.items import RamenShop, RamenReview, RamenStyle, RamenShopImage


class MySQLPipeline(object):

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            host=settings.MYSQL_HOST,
                                            port=settings.MYSQL_PORT,
                                            db=settings.MYSQL_DATABASE,
                                            user=settings.MYSQL_USER,
                                            passwd=settings.MYSQL_PASSWORD,
                                            unix_socket=settings.MYSQL_SOCKET,
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)
        self.dbpool.runInteraction(self._truncate_table)

    def _truncate_table(self, tx):
        # Despite the name, this does not truncate: it archives the previous
        # crawl's tables by renaming them with a timestamp suffix, then
        # recreates empty ones. (The renames assume the tables already exist
        # from a previous run.)
        today = datetime.datetime.today()
        tx.execute('alter table RamenShop rename to RamenShop_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenReview rename to RamenReview_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenStyle rename to RamenStyle_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('create table RamenShop (\
                    ShopID integer unsigned not null, \
                    Url text, \
                    Name text, \
                    Address text, \
                    Tel text, \
                    BusinessHours text, \
                    ClosedOn text, \
                    Capacity text, \
                    Smoking text, \
                    Station text, \
                    Access text, \
                    Parking text, \
                    OpeningDate text, \
                    Menu text, \
                    Note text, \
                    Tags text, \
                    Score float, \
                    ReviewCount integer, \
                    ReviewUserCount integer, \
                    Average float, \
                    Ranking integer, \
                    `Like` integer, \
                    Latitude decimal(9,6), \
                    Longitude decimal(9,6), \
                    Image text, \
                    ImageBlob mediumblob, \
                    Retire bit, \
                    CreateDate datetime, \
                    ModifiedDate datetime, \
                    primary key(ShopID)\
                    ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenReview (\
                    ReviewID integer not null, \
                    ShopID integer not null, \
                    Url text, \
                    UserName text, \
                    Score float, \
                    Title text, \
                    Comment mediumtext, \
                    Image text, \
                    CreateDate text, \
                    ModifiedDate text, \
                    primary key(ReviewID)\
                    ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenStyle (\
                    ShopID integer not null, \
                    Style varchar(255) not null, \
                    Soup varchar(255) not null, \
                    primary key(ShopID, Style, Soup)\
                    ) engine=InnoDB default charset=utf8')

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        if isinstance(item, RamenShop):
            self._insert_shop(tx, item)
        elif isinstance(item, RamenReview):
            self._insert_review(tx, item)
        elif isinstance(item, RamenStyle):
            self._insert_style(tx, item)
        elif isinstance(item, RamenShopImage):
            self._insert_shop_image(tx, item)

    def _insert_shop(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set \
                        Url = %s, \
                        Name = %s, \
                        Address = %s, \
                        Tel = %s, \
                        BusinessHours = %s, \
                        ClosedOn = %s, \
                        Capacity = %s, \
                        Smoking = %s, \
                        Station = %s, \
                        Access = %s, \
                        Parking = %s, \
                        OpeningDate = %s, \
                        Menu = %s, \
                        Note = %s, \
                        Tags = %s, \
                        Score = %s, \
                        ReviewCount = %s, \
                        ReviewUserCount = %s, \
                        Average = %s, \
                        Ranking = %s, \
                        `Like` = %s, \
                        Latitude = %s, \
                        Longitude = %s, \
                        Image = %s, \
                        Retire = %s, \
                        CreateDate = %s \
                        where ShopID = %s',
                       (item['url'] if 'url' in item else None,
                        item['name'] if 'name' in item else None,
                        item['address'] if 'address' in item else None,
                        item['tel'] if 'tel' in item else None,
                        item['business_hours'] if 'business_hours' in item else None,
                        item['closed_on'] if 'closed_on' in item else None,
                        item['capacity'] if 'capacity' in item else None,
                        item['smoking'] if 'smoking' in item else None,
                        item['station'] if 'station' in item else None,
                        item['access'] if 'access' in item else None,
                        item['parking'] if 'parking' in item else None,
                        item['opening_date'] if 'opening_date' in item else None,
                        item['menu'] if 'menu' in item else None,
                        item['note'] if 'note' in item else None,
                        item['tags'] if 'tags' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        int(item['review_count']) if 'review_count' in item else None,
                        int(item['review_user_count']) if 'review_user_count' in item else None,
                        Decimal(item['average']) if 'average' in item else None,
                        int(item['ranking']) if 'ranking' in item else None,
                        int(item['like']) if 'like' in item else None,
                        Decimal(item['lat']) if 'lat' in item else None,
                        Decimal(item['lon']) if 'lon' in item else None,
                        item['image'] if 'image' in item else None,
                        item['retire'] if 'retire' in item else None,
                        datetime.datetime.now(),
                        item['shop_id']))
        else:
            tx.execute('insert into RamenShop ( \
                        ShopID, \
                        Url, \
                        Name, \
                        Address, \
                        Tel, \
                        BusinessHours, \
                        ClosedOn, \
                        Capacity, \
                        Smoking, \
                        Station, \
                        Access, \
                        Parking, \
                        OpeningDate, \
                        Menu, \
                        Note, \
                        Tags, \
                        Score, \
                        ReviewCount, \
                        ReviewUserCount, \
                        Average, \
                        Ranking, \
                        `Like`, \
                        Latitude, \
                        Longitude, \
                        Image, \
                        Retire, \
                        CreateDate \
                        ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                       (int(item['shop_id']),
                        item['url'] if 'url' in item else None,
                        item['name'] if 'name' in item else None,
                        item['address'] if 'address' in item else None,
                        item['tel'] if 'tel' in item else None,
                        item['business_hours'] if 'business_hours' in item else None,
                        item['closed_on'] if 'closed_on' in item else None,
                        item['capacity'] if 'capacity' in item else None,
                        item['smoking'] if 'smoking' in item else None,
                        item['station'] if 'station' in item else None,
                        item['access'] if 'access' in item else None,
                        item['parking'] if 'parking' in item else None,
                        item['opening_date'] if 'opening_date' in item else None,
                        item['menu'] if 'menu' in item else None,
                        item['note'] if 'note' in item else None,
                        item['tags'] if 'tags' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        int(item['review_count']) if 'review_count' in item else None,
                        int(item['review_user_count']) if 'review_user_count' in item else None,
                        Decimal(item['average']) if 'average' in item else None,
                        int(item['ranking']) if 'ranking' in item else None,
                        int(item['like']) if 'like' in item else None,
                        Decimal(item['lat']) if 'lat' in item else None,
                        Decimal(item['lon']) if 'lon' in item else None,
                        item['image'] if 'image' in item else None,
                        item['retire'] if 'retire' in item else None,
                        datetime.datetime.now()))

    def _insert_shop_image(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set ImageBlob = %s where ShopID = %s',
                       (item['blob'] if 'blob' in item else None, item['shop_id']))
        else:
            # The conditional must apply to the blob value only, not to the
            # whole parameter tuple (which would be None when blob is missing).
            tx.execute('insert into RamenShop (ShopID, ImageBlob) values (%s, %s)',
                       (int(item['shop_id']),
                        item['blob'] if 'blob' in item else None))

    def _insert_review(self, tx, item):
        tx.execute('select ReviewID from RamenReview where ReviewID = %s', item['review_id'])
        result = tx.fetchone()
        if not result:
            tx.execute('insert into RamenReview ( \
                        ReviewID, \
                        ShopID, \
                        Url, \
                        UserName, \
                        Score, \
                        Title, \
                        Comment, \
                        Image, \
                        CreateDate, \
                        ModifiedDate \
                        ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                       (int(item['review_id']),
                        int(item['shop_id']),
                        item['url'] if 'url' in item else None,
                        item['user_name'] if 'user_name' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        item['title'] if 'title' in item else None,
                        item['comment'] if 'comment' in item else None,
                        item['image'] if 'image' in item else None,
                        item['create_date'] if 'create_date' in item else None,
                        item['modified_date'] if 'modified_date' in item else None))

    def _insert_style(self, tx, item):
        tx.execute('select ShopID from RamenStyle where ShopID = %s and Style = %s and Soup = %s',
                   [item['shop_id'], item['style'], item['soup']])
        result = tx.fetchone()
        if not result:
            tx.execute('insert into RamenStyle ( \
                        ShopID, \
                        Style, \
                        Soup \
                        ) values (%s,%s,%s)',
                       (int(item['shop_id']),
                        item['style'],
                        item['soup']))

    def handle_error(self, e):
        log.err(e)
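

# -*- coding: utf-8 -*-
# ramendb/settings.py -- a minimal sketch, NOT part of the original gist.
# The pipeline above reads the MYSQL_* names from `ramendb.settings`; every
# value below is a placeholder assumption, shown only to document the
# configuration surface the pipeline expects. The pipeline module name in
# ITEM_PIPELINES is likewise assumed (Scrapy convention: pipelines.py).
BOT_NAME = 'ramendb'
SPIDER_MODULES = ['ramendb.spiders']

# Enable the MySQL pipeline (dict syntax used by Scrapy 0.20+).
ITEM_PIPELINES = {'ramendb.pipelines.MySQLPipeline': 100}

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'ramendb'
MYSQL_USER = 'scrapy'
MYSQL_PASSWORD = 'secret'
MYSQL_SOCKET = '/var/run/mysqld/mysqld.sock'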


# -*- coding: utf-8 -*-
# ramendb/spiders/ramendb_spider.py (filename assumed; the package path comes
# from the `ramendb.spiders` import below)
import re
import json

from boto.s3.key import Key
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from ramendb.items import RamenReview, RamenStyle, RamenShop, RamenShopImage
from ramendb.spiders import s3bucket


def _remove_html_tags(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def _remove_anchor_tag(data):
    p = re.compile(r'<a.*?>.*?</a>')
    return p.sub('', data)


def _remove_tab_code(data):
    return data.replace('\t', '')


def _remove_lf_code(data):
    return data.replace('\n', '')


def _parse_break_tag(data):
    p = re.compile(r'<br.*?>')
    return p.sub('\n', data.replace('\n', ''))


def _find_keyword(value):
    # http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9
    m = re.search(r'q=(.*?)$', value)
    if m:
        return m.group(1)


def _find_unique_id(value):
    # http://ramendb.supleks.jp/s/12559.html
    m = re.search(r'\/(\d+?).html', value)
    if m:
        return m.group(1)


def _find_next_link(value):
    # window.location.href='/search/shop?page=100&q=%E9%86%A4%E6%B2%B9';return false;
    m = re.search(r"href='(.*?)'", value)
    if m:
        return m.group(1)


def _find_lat_lon(value):
    # http://maps.google.com/maps/api/staticmap?center=35.88240910824,139.82840538025&amp;zoom=15&amp;size=204x180&amp;markers=icon:http://s1.supleks.jp/shopdb/icon16_star_yellow.png|35.88240910824,139.82840538025&amp;language=ja&amp;sensor=false
    m = re.search(r'center=([-|\.|\d]+?),([-|\.|\d]+?)&', value)
    if m:
        return m.group(1, 2)


def _find_create_modified_date(value):
    # 2012年7月3日投稿(2012年7月4日更新) |  (posted ... / updated ...)
    # Group 1 is the post date, group 4 the update date; the parentheses
    # around the update date are matched literally.
    m = re.search(u'^(\d+年\d+月\d+日)?(投稿)?(\()?(\d+年\d+月\d+日)?(更新\))? |', value, re.U)
    if m:
        return m.group(1, 4)


def _find_style(value):
    # [汁なし/鶏白湯]  (style/soup, e.g. soupless/chicken paitan)
    m = re.search(r'\[(.*?)/(.*?)\]', value)
    if m:
        return m.group(1, 2)


def _find_graph_array(key, value):
    # Extract the array passed to reviewGraph.addData('<key>', ...) and patch
    # it into valid JSON (quote the bare keys, swap single quotes for double).
    m = re.search(r"reviewGraph\.addData\('" + key + r"', (.*?)\);", value)
    if m:
        return m.group(1).replace('name', '"name"').replace('data', '"data"').replace("'", '"')


class RamenDBSpider(BaseSpider):
    name = 'ramendb'
    allowed_domains = ['ramendb.supleks.jp']
    start_urls = [
        # 醤油 (soy sauce)
        # 'http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9',
        # 味噌 (miso)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%91%b3%e5%99%8c',
        # 塩 (salt)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%a1%a9',
        # 豚骨 (tonkotsu)
        # 'http://ramendb.supleks.jp/search/shop?q=%e8%b1%9a%e9%aa%a8',
        # 鶏白湯 (chicken paitan)
        # 'http://ramendb.supleks.jp/search/shop?q=%e9%b6%8f%e7%99%bd%e6%b9%af',
        # 担々麺 (tantanmen)
        # 'http://ramendb.supleks.jp/search/shop?q=%e6%8b%85%e3%80%85%e9%ba%ba',
        # カレー (curry)
        # 'http://ramendb.supleks.jp/search/shop?q=%e3%82%ab%e3%83%ac%e3%83%bc',
        # Ranking
        # 'http://ramendb.supleks.jp/rank',
        # Search
        'http://ramendb.supleks.jp/search',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        """
        for a in hxs.select('//ul[@class="searched"]//a[@class="name"]'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        for a in hxs.select('//span[@class="name"]/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        for a in hxs.select('//div[@class="name"]/h4/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        for a in hxs.select('//div[@class="menu"]/span/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_review)
            yield request
        """
        for page in hxs.select('//a[@class="page"]'):
            href = _find_next_link(page.select('@onclick').extract()[0])
            request = Request('http://ramendb.supleks.jp' + href, self.parse)
            yield request

    def parse_review(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//*[@id="waku"]')
        if len(alert) > 0:
            return
        style = RamenStyle()
        style['shop_id'] = _find_unique_id(hxs.select('//div[@class="props"]/span[1]/a[1]/@href').extract()[0])
        style['style'], style['soup'] = _find_style(hxs.select('//span[@class="style"]/text()').extract()[0])
        yield style
        review = RamenReview()
        review['shop_id'] = style['shop_id']
        review['review_id'] = _find_unique_id(hxs.select('//div[@class="menu"]/span/a/@href').extract()[0])
        review['url'] = response.url
        review['user_name'] = hxs.select('//div[@class="props"]/span[2]/a/text()').extract()[0]
        review['score'] = hxs.select('//div[@class="score"]/text()').extract()[0]
        review['title'] = hxs.select('//div[@class="menu"]/span/a/text()').extract()[0]
        review['comment'] = _remove_html_tags(_parse_break_tag(hxs.select('//div[@class="comment"]/p').extract()[0]))
        image = hxs.select('//div[@class="comment"]/p/a/img/@src').extract()
        if len(image) > 0:
            review['image'] = image[0]
        review['create_date'], review['modified_date'] = _find_create_modified_date(hxs.select('//div[@class="foot"]//div[@class="right"]/text()').extract()[0])
        yield review

    def parse_shop(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//div[@class="alertbox"]')
        if len(alert) > 0:
            return
        """
        for a in hxs.select('//a[@class="btn-more"]'):
            href = a.select('@href').extract()[0]
            if not href.endswith('/photo/review'):
                request = Request('http://ramendb.supleks.jp' + href, self.parse)
                yield request
        """
        shop = RamenShop()
        shop['retire'] = len(hxs.select('//span[@class="plate retire"]/text()')) > 0
        # shop['url'] = hxs.select('//meta[@property="og:url"]/@content').extract()[0]
        shop['url'] = response.url
        shop['shop_id'] = _find_unique_id(shop['url'])
        shop['image'] = hxs.select('//meta[@property="og:image"]/@content').extract()[0]
        shop['lat'], shop['lon'] = _find_lat_lon(hxs.select('//a[@id="minimap"]//img/@src').extract()[0])
        point = hxs.select('//div[@id="point"]')
        point_float = point.select('span[@class="float"]')
        if len(point) > 0 and len(point_float) > 0:
            shop['score'] = point.select('span[@class="int"]/text()').extract()[0] + point.select('span[@class="float"]/text()').extract()[0]
        for row in hxs.select('//table[@id="data-table"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'店名':  # shop name
                shop['name'] = row.select('td/strong/text()').extract()[0]
            elif th == u'住所':  # address
                shop['address'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'電話番号':  # phone number
                shop['tel'] = row.select('td/text()').extract()[0]
            elif th == u'営業時間':  # business hours
                shop['business_hours'] = row.select('td/text()').extract()[0]
            elif th == u'定休日':  # closed on
                shop['closed_on'] = row.select('td/text()').extract()[0]
            elif th == u'席数':  # seating capacity
                shop['capacity'] = row.select('td/text()').extract()[0]
            elif th == u'喫煙':  # smoking
                shop['smoking'] = row.select('td/text()').extract()[0]
            elif th == u'最寄り駅':  # nearest station
                shop['station'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'アクセス':  # access
                shop['access'] = row.select('td/text()').extract()[0]
            elif th == u'駐車場':  # parking
                shop['parking'] = row.select('td/text()').extract()[0]
            elif th == u'開店日':  # opening date
                shop['opening_date'] = row.select('td/text()').extract()[0]
            elif th == u'メニュー':  # menu
                shop['menu'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]').extract()[0])))
            elif th == u'備考':  # notes
                shop['note'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]/node()').extract()[0])))
            elif th == u'タグ':  # tags
                shop['tags'] = ','.join(row.select('td/a/text()').extract())
        for row in hxs.select('//table[@class="key-value"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'レビュー件数':  # review count
                shop['review_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'レビューユーザー数':  # review user count
                shop['review_user_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'平均点':  # average score
                shop['average'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'総合順位':  # overall ranking
                ranking = row.select('td/text()').extract()[0][:-1].replace(',', '')
                if len(ranking) > 0:
                    shop['ranking'] = ranking
            elif th == u'スキ':  # likes
                shop['like'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
        yield shop
        graph = hxs.select('//div[@id="graph"]/following-sibling::node()[name()="script"]/text()').extract()
        if len(graph) > 0:
            styles = []
            graph_array_style = _find_graph_array('all_0', graph[0])
            if graph_array_style is not None:
                json_styles = json.loads(graph_array_style)
                for json_style in json_styles:
                    if sum(json_style['data']) > 0:
                        styles.append(json_style['name'])
            soups = []
            graph_array_soup = _find_graph_array('all_1', graph[0])
            if graph_array_soup is not None:
                json_soups = json.loads(graph_array_soup)
                for json_soup in json_soups:
                    if sum(json_soup['data']) > 0:
                        soups.append(json_soup['name'])
            for style in styles:
                for soup in soups:
                    ramen_style = RamenStyle()
                    ramen_style['shop_id'] = shop['shop_id']
                    ramen_style['style'] = style
                    ramen_style['soup'] = soup
                    yield ramen_style
        if shop['image']:
            request = Request(
                headers={'shop_id': shop['shop_id']},
                url=shop['image'],
                callback=self.parse_shop_image)
            yield request

    def parse_shop_image(self, response):
        # The shop ID was smuggled through a request header in parse_shop so
        # the downloaded bytes can be associated with the shop.
        image = RamenShopImage()
        image['shop_id'] = response.request.headers.get('shop_id')
        image['blob'] = response.body
        # Also upload the raw image bytes to S3.
        key = Key(s3bucket)
        key.key = 'ramendb/img/shop/%s.jpg' % image['shop_id']
        key.set_contents_from_string(response.body, replace=True, headers={'Content-Type': 'image/jpeg'})
        key.set_acl('public-read')
        return image
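

# -*- coding: utf-8 -*-
# ramendb/spiders/__init__.py -- a minimal sketch, NOT part of the original
# gist. The spider does `from ramendb.spiders import s3bucket`, so this
# package module must expose an S3 bucket handle. This sketch uses the classic
# boto (pre-boto3) API, matching the `boto.s3.key.Key` import above; the
# bucket name is an assumption.
import boto

# connect_s3() picks up AWS credentials from the environment or ~/.boto.
_conn = boto.connect_s3()
s3bucket = _conn.get_bucket('your-ramendb-bucket')

# With the settings module and this package in place, the crawl would be
# started with:
#   scrapy crawl ramendb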