Skip to content

Instantly share code, notes, and snippets.

@clemfromspace
Last active March 10, 2018 17:02
Show Gist options
  • Save clemfromspace/25ad63767d19355398d1 to your computer and use it in GitHub Desktop.
Save clemfromspace/25ad63767d19355398d1 to your computer and use it in GitHub Desktop.
Scrapy spider that collects external URLs from Wikipedia pages
import scrapy
class LinkItem(scrapy.Item):
    """Item carrying a single external link extracted by the spider."""

    # Raw href attribute value of the anchor tag (an absolute URL).
    href = scrapy.Field()
"""Spider for the wikipedia website"""
from urllib.parse import urljoin
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LinkItem
class WikipediaSpider(CrawlSpider):
    """Crawl English Wikipedia, yielding external links as items.

    Internal ``wiki`` links are followed recursively; any absolute
    ``http(s)://`` link is treated as external and emitted exactly once
    as a :class:`LinkItem`.
    """

    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']  # Add other domains maybe ?
    start_urls = [
        'https://en.wikipedia.org/wiki/Main_Page'
    ]

    # Class attribute shared by all instances; acceptable under Scrapy's
    # one-spider-instance-per-crawl model, used for URL de-duplication.
    links = set()

    def filter_duplicate_link(self, link):
        """Return True if *link* was already seen; otherwise record it.

        :param link: URL string to check.
        :returns: True for a duplicate, False for a first occurrence.
        """
        if link in self.links:
            return True
        self.links.add(link)
        return False

    def parse(self, response):
        """Yield items for external links and requests for wiki links.

        NOTE(review): overriding ``parse`` on a ``CrawlSpider`` disables
        its ``rules`` machinery; no rules are defined here so this works,
        but subclassing plain ``scrapy.Spider`` would be clearer.
        """
        for link in response.xpath('//a/@href').extract():
            # External link: does not belong to Wikipedia, yield an item.
            # BUGFIX: the original condition
            #   'http://' in link or 'https://' in link not in link
            # chained as ('https://' in link) and (link not in link),
            # whose right side is always False (a string contains itself),
            # so https-only links were never detected as external.
            if link.startswith(('http://', 'https://')):
                if not self.filter_duplicate_link(link):
                    yield LinkItem(
                        href=link
                    )
            # Internal link: follow "wiki" pages, skip "Special" pages.
            elif 'Special' not in link and 'wiki' in link:
                yield Request(
                    urljoin(response.url, link)
                )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment