public
Created

wine crawling from www.nicolas.com

  • Download Gist
WineSpider3.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
#!/usr/bin/python
#-*- coding: utf-8 -*-
from ProjetVinNicolas3.items import Projetvinnicolas3Item
from scrapy.contrib.exporter import CsvItemExporter
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
import lxml
import lxml.etree
import lxml.html
import re
import urlparse
#<a href="javascript:val('/page.php/fr/18_409~80~10.htm')" class="glo_pagination_link_page">9</a>
def strip_javascript(value):
    """Extract the page path from a ``javascript:val('...')`` pagination href.

    The site's pagination links look like
    ``javascript:val('/page.php/fr/18_409~80~10.htm')``; only the quoted
    path ending in ``~<digits>~10.htm`` is a real URL.

    This function is passed as ``process_value`` to the pagination link
    extractor of the spider in this file.

    :param value: raw ``href`` attribute value of a link.
    :returns: the path inside the ``val('...')`` call, or ``None`` when
        the value is not such a call (per Scrapy's ``process_value``
        contract, ``None`` makes the extractor ignore the link).
    """
    # Raw string: the pattern relies on regex escapes such as \( \S \d \.
    # which are invalid *string* escapes in a normal literal.
    m = re.search(r"javascript:val\('(\S+~(\d+)~10\.htm).*'\)", value)
    if m:
        return m.group(1)
    return None
 
class MySpider(CrawlSpider):
    """Crawl the Bordeaux listing on nicolas.com and scrape each wine page.

    Two rules drive the crawl: one follows the pagination link of the
    listing, the other follows the link to every bottle in the listing
    table and hands the resulting page to :meth:`parse_wine_page`.
    """

    name = "vinNico3"
    allowed_domains = ["nicolas.com"]
    start_urls = ["http://www.nicolas.com/fr/commander_bordeaux.html"]

    rules = (
        # Rule that follows the link to the next listing page. The href is a
        # javascript: call, so strip_javascript turns it into a real path.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//div[@class="glo_pagination_droite"]/a[1]',
                process_value=strip_javascript,
                unique=True
            ),
            follow=True,
        ),
        # Rule that targets the link to each individual bottle of wine
        # (third cell of every row after the header row).
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths='//table[@class="cpt_fav_table_commande"]/tr[position()>1]/td[3]/a[1]',
            ),
            callback='parse_wine_page', follow=True,
        ),
    )

    @staticmethod
    def _stripped(values):
        """Return a list with surrounding whitespace removed from each string."""
        return [value.strip() for value in values]

    @staticmethod
    def _first_lower_utf8(values):
        """Return the first string, stripped, UTF-8 encoded and lower-cased.

        Returns ``''`` when ``values`` is empty instead of raising
        ``IndexError``, so a page missing one field still yields an item.
        """
        if not values:
            return ''
        # NOTE: lower() runs on the encoded bytes, so only ASCII letters are
        # lower-cased; this matches the values the spider has always emitted.
        return values[0].strip().encode('utf-8').lower()

    def parse_wine_page(self, response):
        """Extract the data of one wine from its product page.

        Fields set on the item: ``temp_Conserv`` (only when found),
        ``millesime``, ``nomVin``, ``classement``, ``appelation``,
        ``couleur``, ``cepage_s``, ``temperature_de_service``, ``volume``,
        ``prix`` and ``image_urls``.

        :param response: the Scrapy response for a bottle page.
        :returns: a list with one ``Projetvinnicolas3Item`` per
            ``id="glo_right"`` node found in the page (empty if none).
        """
        items = []

        # lxml is used only for the storage temperature: the text sits right
        # after a <br>, which is read here through the element's text output.
        temp_conserv = None
        root = lxml.html.fromstring(response.body)
        for content in root.xpath('//*[@id="glo_right"]'):
            temperature = content.xpath('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/br')
            if temperature:
                temp_conserv = lxml.html.tostring(temperature[0], method="text", encoding=unicode).strip()

        # Back to Scrapy selectors for everything else.
        hxs = HtmlXPathSelector(response)
        for res in hxs.select('//*[@id="glo_right"]'):
            # A fresh item per node, so that appended items never alias the
            # same object.
            item = Projetvinnicolas3Item()
            if temp_conserv is not None:
                item["temp_Conserv"] = temp_conserv

            # The title is "<name> <vintage>"; the regex yields [name, vintage].
            # Titles without a trailing number yield nothing, so guard the
            # lookup instead of unpacking blindly.
            title_parts = res.select(
                './/div[@class="pro_detail_tit"]/div[@class="pro_titre"]/h1/text()').re(
                r'^(.+)\s+(\d+)\s*$')
            item["millesime"] = title_parts[1].strip() if len(title_parts) >= 2 else ''

            # NOTE(review): this XPath and several below start with '//', so
            # they search the whole document rather than only `res`.
            item["nomVin"] = self._first_lower_utf8(
                res.select('//div[@class="pro_detail_tit"]//div[@class="pro_titre"]/h1/text()').extract())

            # The block title holds classification, appellation and colour.
            item["classement"] = self._stripped(
                res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(r'^(\d\w+\s*\w+)'))

            # Unicode pattern: as a byte string the UTF-8 encoded "é" could
            # never match the unicode text returned by the selector.
            # With two groups the flattened result starts with the appellation.
            item["appelation"] = self._first_lower_utf8(
                res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(u'([\\w-]+)(?=\\W+(Rouge|Blanc|Ros\xe9))'))

            item["couleur"] = self._first_lower_utf8(
                res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(r'\w+\s*$'))

            # Grape varieties are only read when the first line of the mauve
            # block is the grape-variety label (its text contains "page").
            verifCepage = self._stripped(
                res.select('//div[@class="pro_col_right"]/div[@class="pro_blk_mauve"]/div[1]/text()').extract())
            if 'page' in str(verifCepage):
                item["cepage_s"] = self._stripped(
                    res.select('//div[@class="pro_col_right"]/div[@class="pro_blk_mauve"]/div[2]/text()').re(r'\d+%\s+.+'))
            else:
                item["cepage_s"] = ''

            item["temperature_de_service"] = self._stripped(
                res.select('//div[@class="pro_col_right"]//div[@class="pro_blk_mauve"]//div[@class="pro_lgn_contenu"]/text()').re(r'\d+[°]'))
            item["volume"] = self._stripped(
                res.select('//div[@class="pro_col_right"]/table[@class="pro_table_commande"]/tr/td[@class="pro_td_commande_1"]/div[@class="pro_commande_prix_3"]/text()').re(r'\d+\scl'))
            # NOTE(review): the '.' is unescaped, so any single character is
            # accepted as the decimal separator.
            item["prix"] = self._stripped(
                res.select('//div[@class="pro_col_right"]/table[@class="pro_table_commande"]/tr/td[@class="pro_td_commande_1"]/div[@class="pro_commande_prix"]/text()').re(r'\d+.\d*'))

            # Image sources may be relative; resolve them against the page URL.
            item['image_urls'] = [
                urlparse.urljoin(response.url, src)
                for src in res.select('./div[@class="pro_col_left"]/img/@src').extract()
            ]
            items.append(item)
        return items
#parse_start_url = parse_wine_page

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.