Skip to content

Instantly share code, notes, and snippets.

@vasmedvedev
Created October 16, 2016 17:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vasmedvedev/1b61f30a0679f5138113fd77a223cbb3 to your computer and use it in GitHub Desktop.
Save vasmedvedev/1b61f30a0679f5138113fd77a223cbb3 to your computer and use it in GitHub Desktop.
Scrape fragment
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapper.items import ModelItem, DjangoCarModel, DjangoCarBrand
class CarModelSpider(CrawlSpider):
    """Crawl auto.drom.ru brand pages and store the car models found on each.

    Starting from the site's front page, the single crawl rule follows the
    links inside the ``selectCars`` block (skipping URLs matching ``other``)
    and hands every fetched page to :meth:`parse_brand`.
    """

    name = 'car_models'
    allowed_domains = ['drom.ru']
    start_urls = ['http://auto.drom.ru']

    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths='//div[@class="selectCars"]',
                deny=('other',),
            ),
            callback='parse_brand',
        ),
    )

    def parse_brand(self, response):
        """Persist the brand behind ``response`` and the models listed on it.

        The brand name is read from ``response.meta['link_text']``, i.e. the
        text of the link that led to this page (CrawlSpider stores it in the
        request meta for links extracted by a rule). Pages reached without a
        link text are skipped.

        :param response: the downloaded brand page.
        :returns: ``None``; results are written to the database rather than
            yielded as items.
        """
        brand_name = response.meta.get('link_text')
        if not brand_name:
            return

        # Pass the text through unchanged: encoding it to UTF-8 bytes first
        # would store a bytes repr on Python 3, and Django accepts text on
        # both Python 2 and 3.
        # get_or_create() already returns a persisted row, so no extra save()
        # is needed afterwards.
        brand, _created = DjangoCarBrand.objects.get_or_create(name=brand_name)

        model_list = []
        for model_name in response.xpath('//td/h3//a/text()').extract():
            item = ModelItem()
            item['name'] = model_name
            item['brand'] = brand
            # NOTE(review): save(commit=False) presumably returns an unsaved
            # DjangoCarModel instance (DjangoItem convention) -- confirm
            # against scrapper.items.
            model_list.append(item.save(commit=False))

        # Single INSERT for all models scraped from this page.
        DjangoCarModel.objects.bulk_create(model_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment