@haotian-liu
Created May 23, 2016 08:45
import scrapy
import pymysql.cursors
import datetime
from tutorial.items import CKCItem

class CKCSpider(scrapy.Spider):
    """Crawls notices from the ZJU CKC office site and yields CKCItem objects."""
    name = "ckc"
    allowed_domains = ["ckc.zju.edu.cn"]
    start_urls = [
        "http://ckc.zju.edu.cn/office/",
    ]

    def parse(self, response):
        # Follow every link in the news list to its detail page.
        for href in response.css("ul.cg-news-list > li > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # Extract the notice title and body from the detail page.
        item = CKCItem()
        item['title'] = response.css('h2.art-heading::text').extract_first()
        item['content'] = response.xpath('//div[@class="art-content article-content"]').extract_first()
        yield item
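
# The CKCItem used above lives in tutorial/items.py (not shown in this gist).
# A minimal sketch, assuming it only declares the two fields the spider fills in:
#
#     # tutorial/items.py
#     import scrapy
#
#     class CKCItem(scrapy.Item):
#         title = scrapy.Field()
#         content = scrapy.Field()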

class LTSpider(scrapy.Spider):
    """Crawls notices from lantian.zju.edu.cn and writes them straight into MySQL."""
    name = "lantian"
    allowed_domains = ["lantian.zju.edu.cn"]
    start_urls = [
        "http://lantian.zju.edu.cn/redir.php?catalog_id=18749",
    ]

    def parse(self, response):
        # Follow every link in the listing to its detail page.
        for href in response.css("div.zt > ul > li a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        createdate = datetime.datetime.now().strftime("%Y%m%d")
        title = response.css('h1::text').extract_first()
        content = response.xpath('//table[@width="950"]//tr//td[@height="250"]').extract_first()
        # Insert the scraped notice directly into the local MySQL database.
        connection = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            db='spider',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        try:
            with connection.cursor() as cursor:
                sql = "INSERT INTO `notice` (`title`,`content`,`date`,`createdate`) VALUES (%s,%s,%s,%s)"
                cursor.execute(sql, (title, content, createdate, createdate))
            connection.commit()
        finally:
            connection.close()
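
# The INSERT above assumes a `notice` table in the `spider` database. A minimal
# schema sketch (column names come from the query; the types are assumptions):
#
#     CREATE TABLE `notice` (
#         `id` INT AUTO_INCREMENT PRIMARY KEY,
#         `title` VARCHAR(255),
#         `content` TEXT,
#         `date` CHAR(8),
#         `createdate` CHAR(8)
#     ) DEFAULT CHARSET=utf8mb4;
#
# Run either spider from the Scrapy project root, e.g. `scrapy crawl ckc` or
# `scrapy crawl lantian`.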