@shenyubao · Created October 31, 2014
DealerSpider: a Scrapy (Python 2) project that crawls bank branch listings from yhwdt.com and writes them to MySQL. The gist concatenates four project files: items.py, pipelines.py, the spider, and settings.py.
# items.py
from scrapy.item import Item, Field


class BanksiteItem(Item):
    # One scraped bank branch.
    id = Field()
    bank = Field()
    province = Field()
    city = Field()
    name = Field()
    address = Field()
    phone = Field()
# pipelines.py
import MySQLdb


class BanksitePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                                    db='person', port=3306, charset='utf8',
                                    use_unicode=True)

    def process_item(self, item, spider):
        try:
            cur = self.conn.cursor()
            sql = ("insert into banksite_new "
                   "(`id`, `bank`, `city`, `name`, `address`, `phone`) "
                   "values (%s, %s, %s, %s, %s, %s)")
            args = (item['id'], item['bank'], item['city'],
                    item['name'], item['address'], item['phone'])
            cur.execute(sql, args)
            self.conn.commit()
            cur.close()
            return item
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])

    def __del__(self):
        self.conn.close()
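# setup_table.py (hypothetical helper, not part of the original gist):
# the pipeline assumes a `banksite_new` table already exists in the `person`
# database; this one-off sketch creates it. Column types are guesses inferred
# from the item fields.
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                       db='person', port=3306, charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS banksite_new (
        `id`      INT PRIMARY KEY,
        `bank`    VARCHAR(64),
        `city`    VARCHAR(64),
        `name`    VARCHAR(128),
        `address` VARCHAR(256),
        `phone`   VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()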
# spider (lives under banksite/spiders/, per SPIDER_MODULES in settings.py)
import sys
import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from items import BanksiteItem

# Python 2 hack: force UTF-8 as the default encoding so str() works on
# the Chinese text scraped from the pages.
reload(sys)
sys.setdefaultencoding("utf-8")


class DealerSpider(CrawlSpider):
    name = "banksite"
    allowed_domains = ["yhwdt.com"]

    # Listing pages http://www.yhwdt.com/2-0-0-1/ through /145-0-0-1/
    start_urls = ["http://www.yhwdt.com/%d-0-0-1/" % i for i in range(2, 146)]

    rules = (
        # Branch detail pages (e.g. 12345.html) go to parse_item.
        Rule(SgmlLinkExtractor(allow=('\d+\.html$')), callback='parse_item'),
        # Listing pages (four dash-separated numbers) are followed for more links.
        Rule(SgmlLinkExtractor(allow=('\d+-\d+-\d+-\d+')), follow=True),
    )
    def parse_item(self, response):
        item = BanksiteItem()
        sel = Selector(response)

        # The numeric page id doubles as the primary key.
        page_id = str(response.url).replace("http://www.yhwdt.com/", "").replace(".html", "")
        item['id'] = int(page_id)

        item['city'] = sel.xpath("/html/body/div[1]/div[8]/div[2]/div[1]/div[1]/h4/font/text()").extract()[0]
        item['name'] = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[1]/text()").extract()[0]

        item['phone'] = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[3]/text()").extract()[0]
        if len(item['phone']) == 3:
            # Three-character values appear to be placeholders for a missing number.
            item['phone'] = ""

        # Strip the trailing "网点" ("branch/outlet") suffix from the bank name.
        item['bank'] = sel.xpath('/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[4]/a/text()').extract()[0]
        item['bank'] = str(item['bank']).replace("网点", "")

        # extract() returns a list; guard against a missing address node
        # before taking the first match and stripping any leftover markup.
        address = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[2]/text()").extract()
        item['address'] = filter_tags(address[0]) if address else ""

        return item
def filter_tags(htmlstr):
    """Strip HTML markup from a string, leaving only the text."""
    re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I)                 # CDATA sections
    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # <script> blocks
    re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)     # <style> blocks
    re_br = re.compile('<br\s*?/?>')                                         # line breaks
    re_h = re.compile('</?\w+[^>]*>')                                        # other HTML tags
    re_comment = re.compile('<!--[^>]*-->')                                  # HTML comments
    re_js_comment = re.compile('/\*.*\*/', re.S)                             # JS comments

    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('\n', s)   # turn <br> into newlines...
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    s = re_js_comment.sub('', s)

    # ...then collapse runs of newlines and drop them (and any tabs) entirely,
    # so the returned text is a single line.
    blank_line = re.compile('\n+')
    s = blank_line.sub('\n', s)
    s = blank_line.sub('\t', s)
    # s = replaceCharEntity(s)  # replace HTML character entities (not implemented here)
    s = s.replace("\t", "")
    return s
def printhxs(hxs):
    """Debug helper: print each extracted value as UTF-8."""
    for i in hxs:
        print i.encode('utf-8')
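# Quick sanity check for filter_tags (sample markup invented for illustration):
if __name__ == '__main__':
    sample = '<p>No. 1 Main St.<br/>2nd Floor</p><!-- note -->'
    print filter_tags(sample)  # -> No. 1 Main St.2nd Floor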
# settings.py
BOT_NAME = 'banksite'

SPIDER_MODULES = ['banksite.spiders']
NEWSPIDER_MODULE = 'banksite.spiders'

# List-style ITEM_PIPELINES matches the old (2014-era) Scrapy API;
# newer Scrapy expects a dict mapping class paths to order numbers.
ITEM_PIPELINES = ['banksite.pipelines.BanksitePipeline']

LOG_LEVEL = "INFO"
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
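With the files laid out as the settings imply (a banksite package containing items.py, pipelines.py, settings.py, and the spider under banksite/spiders/), the crawl is started from the project root with `scrapy crawl banksite`. Note that the spider imports `from items import BanksiteItem` rather than `from banksite.items import BanksiteItem`, so the import path may need adjusting to match that layout.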