Skip to content

Instantly share code, notes, and snippets.

@jluczak
Last active July 19, 2017 07:21
Show Gist options
  • Save jluczak/6c8cfedfe753c070bfa4bc116f29389d to your computer and use it in GitHub Desktop.
Save jluczak/6c8cfedfe753c070bfa4bc116f29389d to your computer and use it in GitHub Desktop.
scrapy sample - single & multilink
from scrapy.spiders import CrawlSpider, Rule
from mobile.items import MobileItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
class MySpider(CrawlSpider):
    """Crawl spider for the mobiletechcon.de speakers listing.

    Starts from the English speakers page, extracts the links matched by the
    rule's XPath restriction and passes each fetched page to ``parse_item``.
    """

    name = "mobile"
    allowed_domains = ["mobiletechcon.de"]
    start_urls = ["https://mobiletechcon.de/speakers-en/"]

    # One rule only: take links from the restricted region, hand every
    # response to ``parse_item`` and do not follow links any further.
    rules = (
        Rule(
            LxmlLinkExtractor(
                restrict_xpaths=(".//*[@id='content-section-1']//a"),
            ),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Build a ``MobileItem`` from one linked page.

        Yields a single item whose fields are:
          * ``name`` - first text match of the ``h1`` XPath (``None`` if no match)
          * ``bio``  - first ``p`` node matched by the bio XPath, as serialized
            by the selector (``None`` if no match)
          * ``link`` - URL of the response
        """
        page = Selector(response)

        speaker_name = page.xpath(
            '//*[@class="gdlr-speaker-content-wrapper"]//h1/text()'
        ).extract_first()
        speaker_bio = page.xpath(
            '//*[@class="gdlr-speaker-content"]//p'
        ).extract_first()

        yield MobileItem(
            name=speaker_name,
            bio=speaker_bio,
            link=response.url,
        )
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes the ``tr.talk`` rows of the VMSS16 event page.

    Each matching row is turned into a plain dict with ``name``, ``link``
    and ``title`` keys.
    """

    name = "vmss"
    start_urls = [
        'http://soft-dev.org/events/vmss16/'
    ]

    def parse(self, response):
        """Yield one dict per ``tr.talk`` row found in ``response``.

        Keys of each yielded dict:
          * ``name``  - list of all ``a::text`` matches in the row
          * ``link``  - list of all ``td a::attr(href)`` matches in the row
          * ``title`` - the second ``td::text`` match, or ``None`` when the
            row has fewer than two such matches
        """
        for talk_row in response.css('tr.talk'):
            cell_texts = talk_row.css('td::text').extract()
            # Guard the index: an unguarded [1] would raise IndexError on a
            # short row and stop the generator, dropping every later row.
            title = cell_texts[1] if len(cell_texts) > 1 else None
            yield {
                'name': talk_row.css('a::text').extract(),
                'link': talk_row.css('td a::attr(href)').extract(),
                'title': title,
            }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment