Last active
December 19, 2015 21:29
-
-
Save XeroxGH/6020708 to your computer and use it in GitHub Desktop.
extraction récursive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.selector import HtmlXPathSelector | |
from ProjetVinNicolas2.items import Projetvinnicolas2Item | |
from scrapy.contrib.exporter import CsvItemExporter | |
import re | |
# Example of a pagination anchor whose href is rewritten by strip_javascript:
# <a href="javascript:val('/page.php/fr/18_409~80~10.htm')" class="glo_pagination_link_page">9</a>
# Matches hrefs of the form javascript:val('<path>~<digits>~10.htm...') and
# captures the path up to and including ".htm" as group 1.
_PAGINATION_HREF_RE = re.compile(r"javascript:val\('(\S+~(\d+)~10\.htm).*'\)")


def strip_javascript(value):
    """Extract the page path wrapped in a ``javascript:val('...')`` href.

    Args:
        value: Raw attribute value taken from a link, e.g.
            ``javascript:val('/page.php/fr/18_409~80~10.htm')``.

    Returns:
        The captured path (``/page.php/fr/18_409~80~10.htm`` for the example
        above), or ``None`` when ``value`` does not match the expected
        pagination pattern.
    """
    match = _PAGINATION_HREF_RE.search(value)
    if match is None:
        return None
    return match.group(1)
class MySpider(CrawlSpider):
    """Spider that scrapes the wine listing table and the linked wine pages.

    Crawling starts at the URL in ``start_urls``. Two rules drive the crawl:
    one follows pagination links (after unwrapping their ``javascript:``
    href) and sends each page to ``parse_items``; the other follows the
    link of every wine row and sends the page to ``parse_wine_page``.
    """

    name = "vinNico2"
    allowed_domains = ["nicolas.com"]
    start_urls = ["http://www.nicolas.com/fr/commander_bordeaux.html"]

    # Rule for following the links to the next pages.
    rules = (
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//div[@class="glo_pagination_droite"]/a[1]',
                # Pagination hrefs are javascript: calls; keep only the path.
                process_value=strip_javascript,
                unique=True
            ),
            callback='parse_items', follow=True,
        ),
        Rule(
            SgmlLinkExtractor(
                # Target the link to each bottle of wine.
                restrict_xpaths='//table[@class="cpt_fav_table_commande"]/tr[position()>1]/td[3]/a[1]',
            ),
            callback='parse_wine_page', follow=True,
        ),
    )

    @staticmethod
    def _stripped_texts(node, xpath):
        """Return the strings selected by ``xpath`` under ``node``, stripped.

        A real list is built (rather than a lazy ``map``) so callers can
        rely on its truthiness to detect "nothing extracted".
        """
        return [text.strip() for text in node.select(xpath).extract()]

    def parse_items(self, response):
        """Extract one item per data row of the listing table.

        Args:
            response: The response for a listing page.

        Returns:
            A list of ``Projetvinnicolas2Item`` with the ``nomVin``,
            ``appelation``, ``millesime`` and ``prix`` fields filled in;
            each field holds a list of stripped strings.
        """
        hxs = HtmlXPathSelector(response)
        # position()>1 skips the first row of the table.
        rows = hxs.select('//table[@class="cpt_fav_table_commande"]/tr[position()>1]')
        items = []
        for row in rows:
            item = Projetvinnicolas2Item()
            item["nomVin"] = self._stripped_texts(row, 'td[3]/a/text()')
            item["appelation"] = self._stripped_texts(row, 'td[5]/text()')
            item["millesime"] = self._stripped_texts(row, 'td[7]/text()')
            # The price is read from <b>; when that yields nothing, fall
            # back to the <span> in the same cell.
            prix = self._stripped_texts(row, 'td[9]/b/text()')
            if not prix:
                prix = self._stripped_texts(row, 'td[9]/span/text()')
            item["prix"] = prix
            items.append(item)
        return items

    def parse_wine_page(self, response):
        """Extract the serving temperature from a single wine page.

        Args:
            response: The response for the page of one bottle.

        Returns:
            A list of ``Projetvinnicolas2Item`` (one per element with id
            ``glo_right``) with only ``temperature_de_service`` filled in,
            as a list of stripped strings.
        """
        hxs = HtmlXPathSelector(response)
        containers = hxs.select('//*[@id="glo_right"]')
        items = []
        for container in containers:
            item = Projetvinnicolas2Item()
            item["temperature_de_service"] = self._stripped_texts(
                container, 'form/div[6]/div[7]/div[9]/text()')
            items.append(item)
        return items

    # Reuse the table parser for the responses of the start URLs.
    parse_start_url = parse_items
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment