# Gist by @laughingman7743, created November 5, 2014
# https://gist.github.com/laughingman7743/6ad901257aea1acf7bf1

# -*- coding: utf-8 -*-
# ramendb/items.py
from scrapy.item import Item, Field


class RamenStyle(Item):
    shop_id = Field()
    style = Field()
    soup = Field()


class RamenReview(Item):
    review_id = Field()
    shop_id = Field()
    url = Field()
    user_name = Field()
    score = Field()
    title = Field()
    comment = Field()
    image = Field()
    create_date = Field()
    modified_date = Field()


class RamenShop(Item):
    shop_id = Field()
    url = Field()
    name = Field()
    address = Field()
    tel = Field()
    business_hours = Field()
    closed_on = Field()
    capacity = Field()
    smoking = Field()
    station = Field()
    access = Field()
    parking = Field()
    opening_date = Field()
    menu = Field()
    note = Field()
    tags = Field()
    score = Field()
    review_count = Field()
    review_user_count = Field()
    average = Field()
    ranking = Field()
    like = Field()
    lat = Field()
    lon = Field()
    image = Field()
    retire = Field()


class RamenShopImage(Item):
    shop_id = Field()
    blob = Field()


# -*- coding: utf-8 -*-
# ramendb/pipelines.py (filename assumed by Scrapy convention; it must match
# the entry in ITEM_PIPELINES)
"""
MySQL Pipeline
http://snipplr.com/view/66986/mysql-pipeline/
"""
import datetime
import MySQLdb.cursors
from twisted.enterprise import adbapi
from decimal import Decimal
from scrapy import log
from ramendb import settings
from ramendb.items import RamenShop, RamenReview, RamenStyle, RamenShopImage


class MySQLPipeline(object):

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            host=settings.MYSQL_HOST,
                                            port=settings.MYSQL_PORT,
                                            db=settings.MYSQL_DATABASE,
                                            user=settings.MYSQL_USER,
                                            passwd=settings.MYSQL_PASSWORD,
                                            unix_socket=settings.MYSQL_SOCKET,
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)
        self.dbpool.runInteraction(self._truncate_table)

    def _truncate_table(self, tx):
        # Despite the name, this does not truncate: it archives the previous
        # crawl's tables by renaming them with a timestamp suffix, then
        # recreates empty ones. (The renames assume the tables already exist
        # from a previous run.)
        today = datetime.datetime.today()
        tx.execute('alter table RamenShop rename to RamenShop_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenReview rename to RamenReview_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenStyle rename to RamenStyle_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('create table RamenShop (\
                    ShopID integer unsigned not null, \
                    Url text, \
                    Name text, \
                    Address text, \
                    Tel text, \
                    BusinessHours text, \
                    ClosedOn text, \
                    Capacity text, \
                    Smoking text, \
                    Station text, \
                    Access text, \
                    Parking text, \
                    OpeningDate text, \
                    Menu text, \
                    Note text, \
                    Tags text, \
                    Score float, \
                    ReviewCount integer, \
                    ReviewUserCount integer, \
                    Average float, \
                    Ranking integer, \
                    `Like` integer, \
                    Latitude decimal(9,6), \
                    Longitude decimal(9,6), \
                    Image text, \
                    ImageBlob mediumblob, \
                    Retire bit, \
                    CreateDate datetime, \
                    ModifiedDate datetime, \
                    primary key(ShopID)\
                    ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenReview (\
                    ReviewID integer not null, \
                    ShopID integer not null, \
                    Url text, \
                    UserName text, \
                    Score float, \
                    Title text, \
                    Comment mediumtext, \
                    Image text, \
                    CreateDate text, \
                    ModifiedDate text, \
                    primary key(ReviewID)\
                    ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenStyle (\
                    ShopID integer not null, \
                    Style varchar(255) not null, \
                    Soup varchar(255) not null, \
                    primary key(ShopID, Style, Soup)\
                    ) engine=InnoDB default charset=utf8')

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        if isinstance(item, RamenShop):
            self._insert_shop(tx, item)
        elif isinstance(item, RamenReview):
            self._insert_review(tx, item)
        elif isinstance(item, RamenStyle):
            self._insert_style(tx, item)
        elif isinstance(item, RamenShopImage):
            self._insert_shop_image(tx, item)

    def _insert_shop(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set \
                        Url = %s, \
                        Name = %s, \
                        Address = %s, \
                        Tel = %s, \
                        BusinessHours = %s, \
                        ClosedOn = %s, \
                        Capacity = %s, \
                        Smoking = %s, \
                        Station = %s, \
                        Access = %s, \
                        Parking = %s, \
                        OpeningDate = %s, \
                        Menu = %s, \
                        Note = %s, \
                        Tags = %s, \
                        Score = %s, \
                        ReviewCount = %s, \
                        ReviewUserCount = %s, \
                        Average = %s, \
                        Ranking = %s, \
                        `Like` = %s, \
                        Latitude = %s, \
                        Longitude = %s, \
                        Image = %s, \
                        Retire = %s, \
                        CreateDate = %s \
                        where ShopID = %s',
                       (item['url'] if 'url' in item else None,
                        item['name'] if 'name' in item else None,
                        item['address'] if 'address' in item else None,
                        item['tel'] if 'tel' in item else None,
                        item['business_hours'] if 'business_hours' in item else None,
                        item['closed_on'] if 'closed_on' in item else None,
                        item['capacity'] if 'capacity' in item else None,
                        item['smoking'] if 'smoking' in item else None,
                        item['station'] if 'station' in item else None,
                        item['access'] if 'access' in item else None,
                        item['parking'] if 'parking' in item else None,
                        item['opening_date'] if 'opening_date' in item else None,
                        item['menu'] if 'menu' in item else None,
                        item['note'] if 'note' in item else None,
                        item['tags'] if 'tags' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        int(item['review_count']) if 'review_count' in item else None,
                        int(item['review_user_count']) if 'review_user_count' in item else None,
                        Decimal(item['average']) if 'average' in item else None,
                        int(item['ranking']) if 'ranking' in item else None,
                        int(item['like']) if 'like' in item else None,
                        Decimal(item['lat']) if 'lat' in item else None,
                        Decimal(item['lon']) if 'lon' in item else None,
                        item['image'] if 'image' in item else None,
                        item['retire'] if 'retire' in item else None,
                        datetime.datetime.now(),
                        item['shop_id']))
        else:
            tx.execute('insert into RamenShop ( \
                        ShopID, \
                        Url, \
                        Name, \
                        Address, \
                        Tel, \
                        BusinessHours, \
                        ClosedOn, \
                        Capacity, \
                        Smoking, \
                        Station, \
                        Access, \
                        Parking, \
                        OpeningDate, \
                        Menu, \
                        Note, \
                        Tags, \
                        Score, \
                        ReviewCount, \
                        ReviewUserCount, \
                        Average, \
                        Ranking, \
                        `Like`, \
                        Latitude, \
                        Longitude, \
                        Image, \
                        Retire, \
                        CreateDate \
                        ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                       (int(item['shop_id']),
                        item['url'] if 'url' in item else None,
                        item['name'] if 'name' in item else None,
                        item['address'] if 'address' in item else None,
                        item['tel'] if 'tel' in item else None,
                        item['business_hours'] if 'business_hours' in item else None,
                        item['closed_on'] if 'closed_on' in item else None,
                        item['capacity'] if 'capacity' in item else None,
                        item['smoking'] if 'smoking' in item else None,
                        item['station'] if 'station' in item else None,
                        item['access'] if 'access' in item else None,
                        item['parking'] if 'parking' in item else None,
                        item['opening_date'] if 'opening_date' in item else None,
                        item['menu'] if 'menu' in item else None,
                        item['note'] if 'note' in item else None,
                        item['tags'] if 'tags' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        int(item['review_count']) if 'review_count' in item else None,
                        int(item['review_user_count']) if 'review_user_count' in item else None,
                        Decimal(item['average']) if 'average' in item else None,
                        int(item['ranking']) if 'ranking' in item else None,
                        int(item['like']) if 'like' in item else None,
                        Decimal(item['lat']) if 'lat' in item else None,
                        Decimal(item['lon']) if 'lon' in item else None,
                        item['image'] if 'image' in item else None,
                        item['retire'] if 'retire' in item else None,
                        datetime.datetime.now()))

    def _insert_shop_image(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set ImageBlob = %s where ShopID = %s',
                       (item['blob'] if 'blob' in item else None, item['shop_id']))
        else:
            # The conditional must apply to the blob value only, not to the
            # whole parameter tuple (which would be None when blob is missing).
            tx.execute('insert into RamenShop (ShopID, ImageBlob) values (%s, %s)',
                       (int(item['shop_id']),
                        item['blob'] if 'blob' in item else None))

    def _insert_review(self, tx, item):
        tx.execute('select ReviewID from RamenReview where ReviewID = %s', item['review_id'])
        result = tx.fetchone()
        if not result:
            tx.execute('insert into RamenReview ( \
                        ReviewID, \
                        ShopID, \
                        Url, \
                        UserName, \
                        Score, \
                        Title, \
                        Comment, \
                        Image, \
                        CreateDate, \
                        ModifiedDate \
                        ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                       (int(item['review_id']),
                        int(item['shop_id']),
                        item['url'] if 'url' in item else None,
                        item['user_name'] if 'user_name' in item else None,
                        Decimal(item['score']) if 'score' in item else None,
                        item['title'] if 'title' in item else None,
                        item['comment'] if 'comment' in item else None,
                        item['image'] if 'image' in item else None,
                        item['create_date'] if 'create_date' in item else None,
                        item['modified_date'] if 'modified_date' in item else None))

    def _insert_style(self, tx, item):
        tx.execute('select ShopID from RamenStyle where ShopID = %s and Style = %s and Soup = %s',
                   [item['shop_id'], item['style'], item['soup']])
        result = tx.fetchone()
        if not result:
            tx.execute('insert into RamenStyle ( \
                        ShopID, \
                        Style, \
                        Soup \
                        ) values (%s,%s,%s)',
                       (int(item['shop_id']),
                        item['style'],
                        item['soup']))

    def handle_error(self, e):
        log.err(e)
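

# -*- coding: utf-8 -*-
# ramendb/settings.py -- a minimal sketch, NOT part of the original gist.
# The pipeline above reads the MYSQL_* names from `ramendb.settings`; every
# value below is a placeholder assumption, shown only to document the
# configuration surface the pipeline expects. The pipeline module name in
# ITEM_PIPELINES is likewise assumed (Scrapy convention: pipelines.py).
BOT_NAME = 'ramendb'
SPIDER_MODULES = ['ramendb.spiders']

# Enable the MySQL pipeline (dict syntax used by Scrapy 0.20+).
ITEM_PIPELINES = {'ramendb.pipelines.MySQLPipeline': 100}

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'ramendb'
MYSQL_USER = 'scrapy'
MYSQL_PASSWORD = 'secret'
MYSQL_SOCKET = '/var/run/mysqld/mysqld.sock'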


# -*- coding: utf-8 -*-
# ramendb/spiders/ramendb_spider.py (filename assumed; the package path comes
# from the `ramendb.spiders` import below)
import re
import json

from boto.s3.key import Key
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from ramendb.items import RamenReview, RamenStyle, RamenShop, RamenShopImage
from ramendb.spiders import s3bucket


def _remove_html_tags(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def _remove_anchor_tag(data):
    p = re.compile(r'<a.*?>.*?</a>')
    return p.sub('', data)


def _remove_tab_code(data):
    return data.replace('\t', '')


def _remove_lf_code(data):
    return data.replace('\n', '')


def _parse_break_tag(data):
    p = re.compile(r'<br.*?>')
    return p.sub('\n', data.replace('\n', ''))


def _find_keyword(value):
    # http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9
    m = re.search(r'q=(.*?)$', value)
    if m:
        return m.group(1)


def _find_unique_id(value):
    # http://ramendb.supleks.jp/s/12559.html
    m = re.search(r'\/(\d+?).html', value)
    if m:
        return m.group(1)


def _find_next_link(value):
    # window.location.href='/search/shop?page=100&q=%E9%86%A4%E6%B2%B9';return false;
    m = re.search(r"href='(.*?)'", value)
    if m:
        return m.group(1)


def _find_lat_lon(value):
    # http://maps.google.com/maps/api/staticmap?center=35.88240910824,139.82840538025&amp;zoom=15&amp;size=204x180&amp;markers=icon:http://s1.supleks.jp/shopdb/icon16_star_yellow.png|35.88240910824,139.82840538025&amp;language=ja&amp;sensor=false
    m = re.search(r'center=([-|\.|\d]+?),([-|\.|\d]+?)&', value)
    if m:
        return m.group(1, 2)


def _find_create_modified_date(value):
    # 2012年7月3日投稿(2012年7月4日更新) |  (posted ... / updated ...)
    # Group 1 is the post date, group 4 the update date; the parentheses
    # around the update date are matched literally.
    m = re.search(u'^(\d+年\d+月\d+日)?(投稿)?(\()?(\d+年\d+月\d+日)?(更新\))? |', value, re.U)
    if m:
        return m.group(1, 4)


def _find_style(value):
    # [汁なし/鶏白湯]  (style/soup, e.g. soupless/chicken paitan)
    m = re.search(r'\[(.*?)/(.*?)\]', value)
    if m:
        return m.group(1, 2)


def _find_graph_array(key, value):
    # Extract the array passed to reviewGraph.addData('<key>', ...) and patch
    # it into valid JSON (quote the bare keys, swap single quotes for double).
    m = re.search(r"reviewGraph\.addData\('" + key + r"', (.*?)\);", value)
    if m:
        return m.group(1).replace('name', '"name"').replace('data', '"data"').replace("'", '"')


class RamenDBSpider(BaseSpider):
    name = 'ramendb'
    allowed_domains = ['ramendb.supleks.jp']
    start_urls = [
        # 醤油 (soy sauce)
        # 'http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9',
        # 味噌 (miso)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%91%b3%e5%99%8c',
        # 塩 (salt)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%a1%a9',
        # 豚骨 (tonkotsu)
        # 'http://ramendb.supleks.jp/search/shop?q=%e8%b1%9a%e9%aa%a8',
        # 鶏白湯 (chicken paitan)
        # 'http://ramendb.supleks.jp/search/shop?q=%e9%b6%8f%e7%99%bd%e6%b9%af',
        # 担々麺 (tantanmen)
        # 'http://ramendb.supleks.jp/search/shop?q=%e6%8b%85%e3%80%85%e9%ba%ba',
        # カレー (curry)
        # 'http://ramendb.supleks.jp/search/shop?q=%e3%82%ab%e3%83%ac%e3%83%bc',
        # Ranking
        # 'http://ramendb.supleks.jp/rank',
        # Search
        'http://ramendb.supleks.jp/search',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        """
        for a in hxs.select('//ul[@class="searched"]//a[@class="name"]'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        for a in hxs.select('//span[@class="name"]/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        for a in hxs.select('//div[@class="name"]/h4/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        for a in hxs.select('//div[@class="menu"]/span/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_review)
            yield request
        """
        for page in hxs.select('//a[@class="page"]'):
            href = _find_next_link(page.select('@onclick').extract()[0])
            request = Request('http://ramendb.supleks.jp' + href, self.parse)
            yield request

    def parse_review(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//*[@id="waku"]')
        if len(alert) > 0:
            return
        style = RamenStyle()
        style['shop_id'] = _find_unique_id(hxs.select('//div[@class="props"]/span[1]/a[1]/@href').extract()[0])
        style['style'], style['soup'] = _find_style(hxs.select('//span[@class="style"]/text()').extract()[0])
        yield style
        review = RamenReview()
        review['shop_id'] = style['shop_id']
        review['review_id'] = _find_unique_id(hxs.select('//div[@class="menu"]/span/a/@href').extract()[0])
        review['url'] = response.url
        review['user_name'] = hxs.select('//div[@class="props"]/span[2]/a/text()').extract()[0]
        review['score'] = hxs.select('//div[@class="score"]/text()').extract()[0]
        review['title'] = hxs.select('//div[@class="menu"]/span/a/text()').extract()[0]
        review['comment'] = _remove_html_tags(_parse_break_tag(hxs.select('//div[@class="comment"]/p').extract()[0]))
        image = hxs.select('//div[@class="comment"]/p/a/img/@src').extract()
        if len(image) > 0:
            review['image'] = image[0]
        review['create_date'], review['modified_date'] = _find_create_modified_date(hxs.select('//div[@class="foot"]//div[@class="right"]/text()').extract()[0])
        yield review

    def parse_shop(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//div[@class="alertbox"]')
        if len(alert) > 0:
            return
        """
        for a in hxs.select('//a[@class="btn-more"]'):
            href = a.select('@href').extract()[0]
            if not href.endswith('/photo/review'):
                request = Request('http://ramendb.supleks.jp' + href, self.parse)
                yield request
        """
        shop = RamenShop()
        shop['retire'] = len(hxs.select('//span[@class="plate retire"]/text()')) > 0
        # shop['url'] = hxs.select('//meta[@property="og:url"]/@content').extract()[0]
        shop['url'] = response.url
        shop['shop_id'] = _find_unique_id(shop['url'])
        shop['image'] = hxs.select('//meta[@property="og:image"]/@content').extract()[0]
        shop['lat'], shop['lon'] = _find_lat_lon(hxs.select('//a[@id="minimap"]//img/@src').extract()[0])
        point = hxs.select('//div[@id="point"]')
        point_float = point.select('span[@class="float"]')
        if len(point) > 0 and len(point_float) > 0:
            shop['score'] = point.select('span[@class="int"]/text()').extract()[0] + point.select('span[@class="float"]/text()').extract()[0]
        for row in hxs.select('//table[@id="data-table"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'店名':  # shop name
                shop['name'] = row.select('td/strong/text()').extract()[0]
            elif th == u'住所':  # address
                shop['address'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'電話番号':  # phone number
                shop['tel'] = row.select('td/text()').extract()[0]
            elif th == u'営業時間':  # business hours
                shop['business_hours'] = row.select('td/text()').extract()[0]
            elif th == u'定休日':  # closed on
                shop['closed_on'] = row.select('td/text()').extract()[0]
            elif th == u'席数':  # seating capacity
                shop['capacity'] = row.select('td/text()').extract()[0]
            elif th == u'喫煙':  # smoking
                shop['smoking'] = row.select('td/text()').extract()[0]
            elif th == u'最寄り駅':  # nearest station
                shop['station'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'アクセス':  # access
                shop['access'] = row.select('td/text()').extract()[0]
            elif th == u'駐車場':  # parking
                shop['parking'] = row.select('td/text()').extract()[0]
            elif th == u'開店日':  # opening date
                shop['opening_date'] = row.select('td/text()').extract()[0]
            elif th == u'メニュー':  # menu
                shop['menu'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]').extract()[0])))
            elif th == u'備考':  # notes
                shop['note'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]/node()').extract()[0])))
            elif th == u'タグ':  # tags
                shop['tags'] = ','.join(row.select('td/a/text()').extract())
        for row in hxs.select('//table[@class="key-value"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'レビュー件数':  # review count
                shop['review_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'レビューユーザー数':  # review user count
                shop['review_user_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'平均点':  # average score
                shop['average'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'総合順位':  # overall ranking
                ranking = row.select('td/text()').extract()[0][:-1].replace(',', '')
                if len(ranking) > 0:
                    shop['ranking'] = ranking
            elif th == u'スキ':  # likes
                shop['like'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
        yield shop
        graph = hxs.select('//div[@id="graph"]/following-sibling::node()[name()="script"]/text()').extract()
        if len(graph) > 0:
            styles = []
            graph_array_style = _find_graph_array('all_0', graph[0])
            if graph_array_style is not None:
                json_styles = json.loads(graph_array_style)
                for json_style in json_styles:
                    if sum(json_style['data']) > 0:
                        styles.append(json_style['name'])
            soups = []
            graph_array_soup = _find_graph_array('all_1', graph[0])
            if graph_array_soup is not None:
                json_soups = json.loads(graph_array_soup)
                for json_soup in json_soups:
                    if sum(json_soup['data']) > 0:
                        soups.append(json_soup['name'])
            for style in styles:
                for soup in soups:
                    ramen_style = RamenStyle()
                    ramen_style['shop_id'] = shop['shop_id']
                    ramen_style['style'] = style
                    ramen_style['soup'] = soup
                    yield ramen_style
        if shop['image']:
            request = Request(
                headers={'shop_id': shop['shop_id']},
                url=shop['image'],
                callback=self.parse_shop_image)
            yield request

    def parse_shop_image(self, response):
        # The shop ID was smuggled through a request header in parse_shop so
        # the downloaded bytes can be associated with the shop.
        image = RamenShopImage()
        image['shop_id'] = response.request.headers.get('shop_id')
        image['blob'] = response.body
        # Also upload the raw image bytes to S3.
        key = Key(s3bucket)
        key.key = 'ramendb/img/shop/%s.jpg' % image['shop_id']
        key.set_contents_from_string(response.body, replace=True, headers={'Content-Type': 'image/jpeg'})
        key.set_acl('public-read')
        return image
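

# -*- coding: utf-8 -*-
# ramendb/spiders/__init__.py -- a minimal sketch, NOT part of the original
# gist. The spider does `from ramendb.spiders import s3bucket`, so this
# package module must expose an S3 bucket handle. This sketch uses the classic
# boto (pre-boto3) API, matching the `boto.s3.key.Key` import above; the
# bucket name is an assumption.
import boto

# connect_s3() picks up AWS credentials from the environment or ~/.boto.
_conn = boto.connect_s3()
s3bucket = _conn.get_bucket('your-ramendb-bucket')

# With the settings module and this package in place, the crawl would be
# started with:
#   scrapy crawl ramendb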