Created
October 31, 2014 03:27
-
-
Save shenyubao/bace51bf028153855cde to your computer and use it in GitHub Desktop.
DealerSpider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.item import Item, Field | |
class BanksiteItem(Item):
    """Scraped record for one bank branch from www.yhwdt.com.

    Fields mirror the columns of the `banksite_new` MySQL table written by
    BanksitePipeline.  NOTE(review): `province` is collected here but never
    inserted by the pipeline's SQL -- confirm whether that is intentional.
    """
    id = Field()        # numeric id taken from the detail-page URL
    bank = Field()      # bank name with the trailing "网点" suffix stripped
    province = Field()  # province (not currently persisted by the pipeline)
    city = Field()      # city, from the detail-page header
    name = Field()      # branch name
    address = Field()   # street address, HTML-stripped via filter_tags()
    phone = Field()     # blanked when the scraped value is exactly 3 chars
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class BanksitePipeline(object):
    """Persist each scraped BanksiteItem into the MySQL `banksite_new` table."""

    conn = None

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider moving them to
        # the Scrapy settings module.
        self.conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                                    db='person', port=3306,
                                    charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        """Insert one item and return it so later pipelines still receive it."""
        try:
            cur = self.conn.cursor()
            # Parameterized query: values are escaped by the driver, never
            # interpolated into the SQL string.
            # NOTE(review): `province` is scraped but not stored here.
            sql = ("insert into banksite_new "
                   "(`id`, `bank`,`city`,`name`,`address`,`phone`) "
                   "values (%s,%s,%s,%s,%s,%s)")
            args = (item['id'], item['bank'], item['city'], item['name'],
                    item['address'], item['phone'])
            # BUG FIX: the original passed the undefined name `argsettis`,
            # which raised NameError on every item.
            cur.execute(sql, args)
            self.conn.commit()
            cur.close()
            return item
        except MySQLdb.Error as e:  # `as` form is valid on Python 2.6+ and 3
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

    def __del__(self):
        self.conn.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.spider import BaseSpider | |
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.selector import Selector | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from items import BanksiteItem | |
import sys | |
import re | |
class DealerSpider(CrawlSpider):
    """Crawl www.yhwdt.com bank-branch listings and yield BanksiteItem rows."""
    # HACK: force a utf-8 default encoding so later str() calls on Chinese
    # text do not raise UnicodeEncodeError.  Python 2 only; executes once at
    # class-definition time.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    name = "banksite"
    allowed_domains = ["yhwdt.com"]
    # Seed with listing pages 2..145; each "<n>-0-0-1/" page links on to
    # branch detail pages.
    start_urls = []
    for i in range(2, 146):
        start_urls.append("http://www.yhwdt.com/" + str(i) + "-0-0-1/")
    rules = (
        # Branch detail pages like /12345.html -> parse_item.
        Rule(SgmlLinkExtractor(allow=('\d+\.html$')), callback='parse_item'),
        # Paginated listing pages like /3-0-0-7 -> keep following.
        Rule(SgmlLinkExtractor(allow=('\d+-\d+-\d+-\d+')), follow=True),
    )

    def parse_item(self, response):
        """Extract one bank branch from a detail page into a BanksiteItem."""
        item = BanksiteItem()
        sel = Selector(response)
        # The numeric id is embedded in the URL: http://www.yhwdt.com/<id>.html
        id = str(response.url).replace("http://www.yhwdt.com/", "").replace(".html", "")
        item['id'] = int(id)
        # Absolute XPaths are brittle; they match the page layout as of 2014.
        item['city'] = sel.xpath("/html/body/div[1]/div[8]/div[2]/div[1]/div[1]/h4/font/text()").extract()[0]
        item['name'] = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[1]/text()").extract()[0]
        item['phone'] = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[3]/text()").extract()[0]
        # A 3-character value is treated as "no phone" -- presumably a label
        # or placeholder on the page; TODO confirm against the live markup.
        if (len(item['phone']) == 3):
            item['phone'] = ""
        item['bank'] = sel.xpath('/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[4]/a/text()').extract()[0]
        # Strip the trailing "网点" ("branch") suffix from the bank name.
        item['bank'] = str(item['bank']).replace("网点", "")
        item['address'] = sel.xpath("/html/body/div[1]/div[8]/div[1]/div/div[3]/div[2]/ul/li[2]/text()").extract()[0]
        # NOTE(review): extract()[0] yields a string, never a list, so this
        # comparison is always true; filter_tags runs on every address.
        if (item['address'] != []):
            item['address'] = filter_tags(item['address'])
        return item
        pass
def filter_tags(htmlstr):
    """Strip HTML markup from *htmlstr* and return the bare text.

    Removes CDATA sections, <script>/<style> blocks, remaining HTML tags,
    HTML comments and /* */ JS comments.  <br> tags are first turned into
    newlines, but the final cleanup deletes every newline and tab, so the
    result is always a single line.
    """
    # Raw strings for all patterns -- '\d', '\[' etc. in plain literals are
    # invalid escape sequences on modern Python.
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)                 # CDATA sections
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # <script>...</script>
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)     # <style>...</style>
    re_br = re.compile(r'<br\s*?/?>')               # <br> -> newline (temporarily)
    re_h = re.compile(r'</?\w+[^>]*>')              # any remaining HTML tag
    re_comment = re.compile(r'<!--[^>]*-->')        # HTML comments
    re_js_comment = re.compile(r'/\*.*\*/', re.S)   # /* ... */ JS comments

    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('\n', s)
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    s = re_js_comment.sub("", s)
    # Collapse runs of newlines, turn the survivors into tabs, then delete
    # all tabs -- the net effect is that every newline is removed.
    blank_line = re.compile(r'\n+')
    s = blank_line.sub('\n', s)
    s = blank_line.sub('\t', s)
    # s = self.replaceCharEntity(s)  # entity replacement (disabled)
    s = s.replace("\t", "")
    return s
def printhxs(self, hxs):
    """Debug helper: print each extracted value, utf-8 encoded.

    *self* is unused but kept so existing call sites are unaffected -- the
    function is written like a method although it sits at module level in
    this file (TODO confirm original indentation; the paste lost it).
    """
    for i in hxs:
        # Single-argument parenthesized print: identical output under
        # Python 2's print statement, and valid syntax under Python 3
        # (the original bare `print i.encode(...)` is Py2-only).
        print(i.encode('utf-8'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy project settings for the banksite crawler.
BOT_NAME = 'banksite'
SPIDER_MODULES = ['banksite.spiders']
NEWSPIDER_MODULE = 'banksite.spiders'
# List form is the pre-0.24 Scrapy convention; newer Scrapy versions expect
# a dict of {dotted_path: order} instead.
ITEM_PIPELINES = ['banksite.pipelines.BanksitePipeline']
LOG_LEVEL = "INFO"
# Impersonate a desktop Chrome browser so the site serves normal pages.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment