Skip to content

Instantly share code, notes, and snippets.

Created September 3, 2013 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/6424305 to your computer and use it in GitHub Desktop.
Save anonymous/6424305 to your computer and use it in GitHub Desktop.
wine extraction
#!/usr/bin/python
#-*- coding: utf-8 -*-
from ProjetVinNicolas3.items import Projetvinnicolas3Item
from scrapy.contrib.exporter import CsvItemExporter
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
import lxml
import lxml.etree
import lxml.html
import re
#<a href="javascript:val('/page.php/fr/18_409~80~10.htm')" class="glo_pagination_link_page">9</a>
def strip_javascript(value):
m = re.search("javascript:val\('(\S+~(\d+)~10\.htm).*'\)", value)
if m:
return m.group(1)
class MySpider(CrawlSpider):
name = "vinNico3"
allowed_domains = ["nicolas.com"]
start_urls = ["http://www.nicolas.com/fr/commander_bordeaux.html"]
# la regle pour suivre les liens des pages suivantes
rules = (
Rule(
SgmlLinkExtractor(
restrict_xpaths='//div[@class="glo_pagination_droite"]/a[1]',
process_value=strip_javascript,
unique=True
),
follow=True,
),
Rule(
SgmlLinkExtractor(
# fixer les liens vers chaque bouteille du vin
restrict_xpaths='//table[@class="cpt_fav_table_commande"]/tr[position()>1]/td[3]/a[1]',
),
callback='parse_wine_page', follow=True,
),
)
#Fonction qui extrait les donn�es � partir de chaque lien d'une bouteille du vin
def parse_wine_page(self, response):
items = []
item = Projetvinnicolas3Item()
# lxml only for temperature de conservation
root = lxml.html.fromstring(response.body)
for content in root.xpath('//*[@id="glo_right"]'):
temperature = content.xpath('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/br')
if temperature:
#print 'OK Température'
item ["temp_Conserv"] = lxml.html.tostring(temperature[0], method="text", encoding=unicode).strip()
# back to hxs
hxs = HtmlXPathSelector(response)
content = hxs.select('//*[@id="glo_right"]')
for res in content:
#Méthode 1
#title = map(unicode.strip,
# content.select('.//div[@class="pro_detail_tit"]/div[@class="pro_titre"]/h1/text()').extract())
#m = re.compile(r'^(?P<nomVin1>.+)\s+(?P<millesime>\d+)$').search(title[0])
#if m:
# item ["nomVin1"]=m.group("nomVin")
# item ["millesime"]=m.group("millesime")
#Méthode 2
nomVin1, millesime = res.select(
'.//div[@class="pro_detail_tit"]/div[@class="pro_titre"]/h1/text()').re(
r'^(.+)\s+(\d+)\s*$')
nomVin1=nomVin1.strip()
item["millesime"]=millesime.strip()
iNomVin= map(unicode.strip, res.select('//div[@class="pro_detail_tit"]//div[@class="pro_titre"]/h1/text()').extract())
#print type(x)
iNomVin=(str(iNomVin[0].encode('utf-8'))).lower()
item ["nomVin"]=iNomVin.strip()
#Methode 1
item ["classement"] = map(unicode.strip, res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(r'^(\d\w+\s*\w+)'))
#iClassement=(str(iClassement[0].encode('utf-8'))).lower()
#item ["classement"]=iClassement.strip()
iAppelation = map(unicode.strip, res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(r'([\w-]+)(?=\W+(Rouge|Blanc|Rosé))'))
iAppelation=(str(iAppelation[0].encode('utf-8'))).lower()
item ["appelation"] =iAppelation.strip()
iCouleur = map(unicode.strip, res.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').re(r'\w+\s*$'))
iCouleur =(str(iCouleur[0].encode('utf-8'))).lower()
item ["couleur"] =iCouleur.strip()
#Methode 2
#title = map(unicode.strip,
# content.select('.//div[@class="pro_col_right"]/div[@class="pro_blk_trans"]/div[@class="pro_blk_trans_titre"]/text()').extract())
#m = re.compile(r'^(?P<classement>\d\w+\s*\w+)\S\s+(?P<appelation>\w+\-\w+|\w+)\S\s+(?P<couleur>\w+)\s*$').search(title[0])
#if m:
# item ["classement"]=m.group("classement")
# item ["appelation"]=m.group("appelation")
# item ["couleur"]=m.group("couleur")
verifCepage=map(unicode.strip, res.select('//div[@class="pro_col_right"]/div[@class="pro_blk_mauve"]/div[1]/text()').extract())
if 'page' in str(verifCepage) :
item ["cepage_s"] = map(unicode.strip, res.select('//div[@class="pro_col_right"]/div[@class="pro_blk_mauve"]/div[2]/text()').re(r'\d+%\s+.+'))
#iCepage_s=(str(iCepage_s[0].encode('utf-8'))).lower()
#item ["cepage_s"] = iCepage_s.strip()
else:
item ["cepage_s"] = ''
item ["temperature_de_service"] = map(unicode.strip, res.select('//div[@class="pro_col_right"]//div[@class="pro_blk_mauve"]//div[@class="pro_lgn_contenu"]/text()').re(r'\d+[°]'))
item ["volume"]= map(unicode.strip, res.select('//div[@class="pro_col_right"]/table[@class="pro_table_commande"]/tr/td[@class="pro_td_commande_1"]/div[@class="pro_commande_prix_3"]/text()').re(r'\d+\scl'))
item ["prix"]= map(unicode.strip, res.select('//div[@class="pro_col_right"]/table[@class="pro_table_commande"]/tr/td[@class="pro_td_commande_1"]/div[@class="pro_commande_prix"]/text()').re(r'\d+.\d*'))
#item ["Image"]= map(unicode.strip, res.select('//div[@class="pro_detail_tit"]//div[@class="pro_titre"]/h1/text()').extract())
items.append(item)
return items
#parse_start_url = parse_wine_page
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment