@lisitsky
Last active November 17, 2016 11:20
Simple Scrapy crawler to parse project listings from Weblancer
from urllib.parse import urljoin

from scrapy import Spider, Request


class WeblancerParser(Spider):
    name = 'wbl_prj'
    allowed_domains = ["www.weblancer.net"]
    _url = 'http://www.weblancer.net/projects/'

    # (item field, CSS selector of the element whose text holds the value)
    _extract_fields = (
        ('title', 'h2.title > a'),
        ('categories', 'a.text-muted'),
        ('price', 'div.amount.title'),
        ('apps', '.text-nowrap'),
    )

    def start_requests(self):
        # Start the crawl from the first page of the projects listing.
        yield Request(self._url, dont_filter=True)

    def parse(self, response):
        # Each project occupies one row of the listing table.
        for row in response.css('div.cols_table.container-fluid > div.row'):
            # For every field, join the text of all matching nodes with '|'.
            yield dict(
                (k, '|'.join(c.strip()
                             for c in row.css(v + '::text').extract()))
                for k, v in self._extract_fields
            )
        # Follow every pagination link so later pages are parsed as well.
        for link in response.css('ul.pagination > li > a::attr(href)'):
            yield Request(urljoin(self._url, link.extract()))
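
For reference, below is a minimal sketch of how the spider could be run outside a full Scrapy project, using Scrapy's CrawlerProcess API. It assumes the code above is saved as weblancer_spider.py; that module name, and the output file name in the command-line variant, are only assumptions for this example.

# Minimal standalone runner (sketch). Assumes the spider above lives in
# a module named weblancer_spider.py, which is just an assumption here.
from scrapy.crawler import CrawlerProcess

from weblancer_spider import WeblancerParser

process = CrawlerProcess()           # default settings; items go to the log
process.crawl(WeblancerParser)
process.start()                      # blocks until the crawl finishes

Alternatively, the same spider file can be run directly from the command line with scrapy runspider weblancer_spider.py, adding e.g. -o projects.csv to export the scraped items to a feed file.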