Wikipedia zhTW Article Names Spider
# Spider that walks Special:AllPages on Chinese Wikipedia and yields the
# article names found on each listing page.
import scrapy


class ArticlesListSpider(scrapy.Spider):
    name = "articles_list"
    page_counter = 0

    def start_requests(self):
        urls = [
            # Special:AllPages on zh.wikipedia.org (title is percent-encoded).
            r'https://zh.wikipedia.org/wiki/Special:%E6%89%80%E6%9C%89%E9%A1%B5%E9%9D%A2'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        names = set()
        next_page = None
        # Collect every article name listed on the current page.
        for unordered_list in response.css('ul.mw-allpages-chunk'):
            for list_item in unordered_list.css('li'):
                names.add(list_item.css('::text').extract_first())
        self.page_counter += 1
        # Convert the set to a list so the item is JSON-serializable
        # by the feed exporter.
        yield {self.page_counter: list(names)}
        # Find the link whose text contains '下一页(' ("next page (")
        # in the navigation bar and follow it, if present.
        nav = response.css('div.mw-allpages-nav a')
        for link in nav:
            if '下一页(' in link.css('::text').extract_first():
                next_page = link.css('::attr(href)').extract_first()
                self.log('next page text is: ' + link.css('::text').extract_first())
                break
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
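# To run the spider and export the collected names, something like the
# following should work from the Scrapy project root (a minimal sketch;
# the output file name 'article_names.json' is arbitrary):
#
#   scrapy crawl articles_list -o article_names.json
#
# FEED_EXPORT_ENCODING = 'utf-8' in settings.py below keeps the Chinese
# titles readable in the exported file instead of escaped \uXXXX sequences.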
# -*- coding: utf-8 -*-
# Scrapy settings for wiki_articles_list project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wiki_articles_list'
SPIDER_MODULES = ['wiki_articles_list.spiders']
NEWSPIDER_MODULE = 'wiki_articles_list.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wiki_articles_list (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
FEED_EXPORT_ENCODING = 'utf-8'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wiki_articles_list.middlewares.WikiArticlesListSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'wiki_articles_list.middlewares.WikiArticlesListDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'wiki_articles_list.pipelines.WikiArticlesListPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'