@haotian-liu
Created May 23, 2016 08:45
import scrapy
import pymysql.cursors
import datetime
from tutorial.items import CKCItem

class CKCSpider(scrapy.Spider):
    """Crawls notices from the ZJU CKC office site and yields CKCItem objects."""
    name = "ckc"
    allowed_domains = ["ckc.zju.edu.cn"]
    start_urls = [
        "http://ckc.zju.edu.cn/office/",
    ]

    def parse(self, response):
        # Follow every link in the news list to its detail page.
        for href in response.css("ul.cg-news-list > li > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # Extract the notice title and body from the detail page.
        item = CKCItem()
        item['title'] = response.css('h2.art-heading::text').extract_first()
        item['content'] = response.xpath('//div[@class="art-content article-content"]').extract_first()
        yield item
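
# The CKCItem used above lives in tutorial/items.py (not shown in this gist).
# A minimal sketch, assuming it only declares the two fields the spider fills in:
#
#     # tutorial/items.py
#     import scrapy
#
#     class CKCItem(scrapy.Item):
#         title = scrapy.Field()
#         content = scrapy.Field()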

class LTSpider(scrapy.Spider):
    """Crawls notices from lantian.zju.edu.cn and writes them straight into MySQL."""
    name = "lantian"
    allowed_domains = ["lantian.zju.edu.cn"]
    start_urls = [
        "http://lantian.zju.edu.cn/redir.php?catalog_id=18749",
    ]

    def parse(self, response):
        # Follow every link in the listing to its detail page.
        for href in response.css("div.zt > ul > li a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        createdate = datetime.datetime.now().strftime("%Y%m%d")
        title = response.css('h1::text').extract_first()
        content = response.xpath('//table[@width="950"]//tr//td[@height="250"]').extract_first()
        # Insert the scraped notice directly into the local MySQL database.
        connection = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            db='spider',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        try:
            with connection.cursor() as cursor:
                sql = "INSERT INTO `notice` (`title`,`content`,`date`,`createdate`) VALUES (%s,%s,%s,%s)"
                cursor.execute(sql, (title, content, createdate, createdate))
            connection.commit()
        finally:
            connection.close()
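
# The INSERT above assumes a `notice` table in the `spider` database. A minimal
# schema sketch (column names come from the query; the types are assumptions):
#
#     CREATE TABLE `notice` (
#         `id` INT AUTO_INCREMENT PRIMARY KEY,
#         `title` VARCHAR(255),
#         `content` TEXT,
#         `date` CHAR(8),
#         `createdate` CHAR(8)
#     ) DEFAULT CHARSET=utf8mb4;
#
# Run either spider from the Scrapy project root, e.g. `scrapy crawl ckc` or
# `scrapy crawl lantian`.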