Skip to content

Instantly share code, notes, and snippets.

@XeroxGH
Last active December 19, 2015 21:29
Show Gist options
  • Save XeroxGH/6020708 to your computer and use it in GitHub Desktop.
Save XeroxGH/6020708 to your computer and use it in GitHub Desktop.
extraction récursive
#!/usr/bin/python
#-*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ProjetVinNicolas2.items import Projetvinnicolas2Item
from scrapy.contrib.exporter import CsvItemExporter
import re
#<a href="javascript:val('/page.php/fr/18_409~80~10.htm')" class="glo_pagination_link_page">9</a>
# Compiled once at import time. Group 1 is the quoted path passed to the
# JavaScript ``val(...)`` call; it must end in ``~<digits>~10.htm``.
_PAGINATION_HREF_RE = re.compile(r"javascript:val\('(\S+~(\d+)~10\.htm).*'\)")


def strip_javascript(value):
    """Extract the page path from a ``javascript:val('...')`` pagination href.

    Used as the ``process_value`` callback of a link extractor, so it
    receives the raw attribute value of each candidate link.

    Args:
        value: The attribute value to inspect, e.g.
            ``"javascript:val('/page.php/fr/18_409~80~10.htm')"``.

    Returns:
        The path found inside the quotes (for the example above,
        ``"/page.php/fr/18_409~80~10.htm"``), or ``None`` when ``value``
        does not contain such a JavaScript call.
    """
    match = _PAGINATION_HREF_RE.search(value)
    if match is None:
        # Not a pagination link of the expected shape.
        return None
    return match.group(1)
class MySpider(CrawlSpider):
    """Spider that scrapes the nicolas.com Bordeaux wine listing.

    Starting from the listing page in ``start_urls``, the spider follows
    two kinds of links (see ``rules``): the pagination link, whose pages
    are handled by ``parse_items``, and the per-wine links in the listing
    table, whose pages are handled by ``parse_wine_page``.
    """

    name = "vinNico2"
    allowed_domains = ["nicolas.com"]
    start_urls = ["http://www.nicolas.com/fr/commander_bordeaux.html"]

    rules = (
        # Rule for following the links to the next listing pages. The href
        # is a javascript: call, so strip_javascript turns it into a path.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//div[@class="glo_pagination_droite"]/a[1]',
                process_value=strip_javascript,
                unique=True,
            ),
            callback='parse_items',
            follow=True,
        ),
        # Rule targeting the link to each bottle of wine: first anchor of
        # the third cell of every table row after the first one.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//table[@class="cpt_fav_table_commande"]/tr[position()>1]/td[3]/a[1]',
            ),
            callback='parse_wine_page',
            follow=True,
        ),
    )

    @staticmethod
    def _stripped_texts(selector, xpath):
        """Return the strings matched by ``xpath`` under ``selector``, stripped.

        Always returns a real list (possibly empty), so callers can test
        it for emptiness.
        """
        return [text.strip() for text in selector.select(xpath).extract()]

    def parse_items(self, response):
        """Extract one item per wine from the listing table.

        Args:
            response: The downloaded listing page.

        Returns:
            A list of ``Projetvinnicolas2Item`` objects, one per table row
            after the first, with the ``nomVin``, ``appelation``,
            ``millesime`` and ``prix`` fields set to lists of stripped
            strings.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="cpt_fav_table_commande"]/tr[position()>1]')
        items = []
        for row in rows:
            item = Projetvinnicolas2Item()
            item["nomVin"] = self._stripped_texts(row, 'td[3]/a/text()')
            item["appelation"] = self._stripped_texts(row, 'td[5]/text()')
            item["millesime"] = self._stripped_texts(row, 'td[7]/text()')
            # The price is read from a <b> element when one yields text,
            # otherwise from a <span>; query the <b> variant only once.
            price = self._stripped_texts(row, 'td[9]/b/text()')
            if not price:
                price = self._stripped_texts(row, 'td[9]/span/text()')
            item["prix"] = price
            items.append(item)
        return items

    def parse_wine_page(self, response):
        """Extract data from the page of a single bottle.

        Args:
            response: The downloaded wine page.

        Returns:
            A list of ``Projetvinnicolas2Item`` objects, one per element
            with id ``glo_right``, each with ``temperature_de_service``
            set to a list of stripped strings.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for block in hxs.select('//*[@id="glo_right"]'):
            item = Projetvinnicolas2Item()
            item["temperature_de_service"] = self._stripped_texts(
                block, 'form/div[6]/div[7]/div[9]/text()')
            items.append(item)
        return items

    # Handle the start URL responses with the same listing parser
    # (parse_start_url is the CrawlSpider hook for them, per Scrapy docs).
    parse_start_url = parse_items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment