Skip to content

Instantly share code, notes, and snippets.

@Chestermozhao
Last active November 23, 2019 14:12
Show Gist options
  • Save Chestermozhao/27ce5dcf7f07aeaeb5f02964a9eb6d75 to your computer and use it in GitHub Desktop.
splash_scrapy crawler
# -*- coding: utf-8 -*-
import scrapy
class SplashTestItem(scrapy.Item):
    """Container for a single scraped live-score record."""

    # Home-team values pulled from the rendered page (list of raw
    # selector extractions; see SplashSpider.parse).
    hometeam = scrapy.Field()
# -*- coding: utf-8 -*-
# 添加相關的資料庫python擴展
# import pymongo
# import pymysql
class SplashTestPipeline(object):
    """Item pipeline that hands each scraped item to a DB writer.

    Hook this up via ITEM_PIPELINES in settings.py. The actual storage
    backend (MongoDB, MySQL, ...) is left as a stub in save_to_db.
    """

    def process_item(self, item, spider):
        """Persist *item* and return it so later pipelines still run.

        Args:
            item: the scraped item (dict-like).
            spider: the spider that produced the item (unused here).

        Returns:
            The same item, unchanged, per the Scrapy pipeline contract.
        """
        # BUG FIX: the original called save_to_db(item) as a bare name,
        # which raises NameError at runtime — it is an instance method.
        self.save_to_db(item)
        return item

    def save_to_db(self, item):
        # TODO: write code to save to db (MongoDB, MySQL, and others).
        pass
# -*- coding: utf-8 -*-
# Scrapy project settings for the splash_test crawler.

BOT_NAME = 'splash_test'

SPIDER_MODULES = ['splash_test.spiders']
NEWSPIDER_MODULE = 'splash_test.spiders'

# Output log file.
LOG_FILE = "20191121_splash.log"

# scrapy_splash configuration: route requests through the Splash
# renderer and decompress its responses.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Address of the local Splash rendering service.
SPLASH_URL = 'http://localhost:8050/'
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from items import SplashTestItem
class SplashSpider(Spider):
    """Spider that renders each start URL through Splash, then parses it."""

    name = "splash_test"
    start_urls = ["https://www.livescore.bet3000.com/"]

    def start_requests(self):
        """Issue one Splash-rendered request per start URL."""
        # wait=5 gives the page's JavaScript time to render before
        # Splash returns the HTML.
        for target in self.start_urls:
            yield SplashRequest(url=target, callback=self.parse, args={"wait": 5})

    def parse(self, response):
        """Yield one item carrying the home-team selections."""
        # NOTE(review): .extract() on ".hometeam" returns whole matching
        # elements; ".hometeam::text" may have been intended — confirm.
        names = response.css(".hometeam").extract()
        yield SplashTestItem(dict(hometeam=names))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment