# clizarralde/promiedos_spider.py

## promiedos_spider.py
#!/usr/bin/python
 # -*- coding: utf-8 -*-

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base

from sqlalchemy import Column, Integer, String, DateTime, Float, Text, Boolean

from sqlalchemy.engine.url import URL

import settings

# Database plumbing shared by the models and the pipeline in this module.
# Base is the declarative base the ORM models inherit from, engine is built
# from the connection parameters in settings.DATABASE, and Session is the
# factory used to open sessions bound to that engine.
Base = declarative_base()
engine = create_engine(URL(**settings.DATABASE))
Session = sessionmaker(bind=engine)

class PartidoModel(Base):
    """ORM mapping for one stored match (table ``vetvot_partido``).

    ``PartidoPipeline.process_item`` fills ``descripcion``, ``equipo1``,
    ``gol1``, ``equipo2`` and ``gol2``; the remaining columns are declared
    here but not populated anywhere in this module.
    """
    __tablename__ = 'vetvot_partido'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Label scraped with the match (the item's 'partido' field).
    descripcion = Column(String(200))
    # First team listed in the match block and the score shown next to it.
    equipo1 = Column(String(100))
    gol1 = Column(Integer)
    # Second team listed in the match block and the score shown next to it.
    equipo2 = Column(String(100))
    gol2 = Column(Integer)
    # Not set by the pipeline; presumably match date, year and round number
    # -- TODO confirm.
    fecha = Column(DateTime)
    anio = Column(Integer)
    nfecha = Column(Integer)

class PosicionModel(Base):
    """ORM mapping for the ``vetvot_posicion`` table.

    Nothing in the visible source reads or writes this model; its table is
    still created by ``Base.metadata.create_all`` because the class is
    declared on ``Base``. The column meanings noted below are inferred from
    their Spanish names -- TODO confirm.
    """
    __tablename__ = 'vetvot_posicion'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Presumably the year/season the standing belongs to.
    anio = Column(Integer)
    # Presumably the round number within that year.
    nfecha = Column(Integer)
    # Presumably the team name.
    equipo = Column(String(100))
    # Presumably the points the team has at that round.
    puntos = Column(Integer)
    # Presumably the team's rank in the table at that round.
    posicion = Column(Integer)


class PartidoItem(scrapy.Item):
    """Scraped data for a single match, as raw text taken from the page.

    The values are stored exactly as extracted by
    ``PromiedosSpider.parse_item``; stripping and integer conversion happen
    later in ``PartidoPipeline.process_item``.
    """
    # Text of the first <span> in the match block (the match label).
    partido = scrapy.Field()
    # Text node holding the first team's name.
    equipo1 = scrapy.Field()
    # Text node holding the second team's name.
    equipo2 = scrapy.Field()
    # Text of the second <span>: score shown next to the first team.
    gol1 = scrapy.Field()
    # Text of the third <span>: score shown next to the second team.
    gol2 = scrapy.Field()

class PartidoPipeline(object):
    """Item pipeline that stores each scraped match in the database once."""

    def __init__(self):
        """Create every table declared on ``Base`` that does not exist yet."""
        Base.metadata.create_all(engine)

    def process_item(self, item, spider):
        """Insert the match described by ``item`` unless it is already stored.

        A match counts as already stored when a row with the same stripped
        ``equipo1``, ``equipo2`` and ``partido`` label exists. The scores are
        converted with ``int`` only when a new row is actually written.

        :param item: mapping with the keys ``partido``, ``equipo1``,
            ``equipo2``, ``gol1`` and ``gol2``.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, unchanged.
        :raises: any error raised while reading the item or talking to the
            database; the session is rolled back before it propagates.
        """
        db = Session()

        try:
            first_team = item['equipo1'].strip()
            second_team = item['equipo2'].strip()
            label = item['partido'].strip()

            stored = db.query(PartidoModel).filter_by(
                equipo1=first_team,
                equipo2=second_team,
                descripcion=label,
            ).first()

            if not stored:
                db.add(PartidoModel(
                    equipo1=first_team,
                    equipo2=second_team,
                    gol1=int(item['gol1']),
                    gol2=int(item['gol2']),
                    descripcion=label,
                ))
                db.commit()
        except:
            # Deliberately bare: undo the pending transaction for any failure
            # and let the original error propagate unchanged.
            db.rollback()
            raise
        finally:
            db.close()

        return item


class PromiedosSpider(CrawlSpider):
    """Spider that downloads match results from promiedos.com.ar.

    Starting from the match-history page it follows the per-team links and
    parses every head-to-head page (two ``equipo`` query parameters) into
    ``PartidoItem`` objects, one per match listed on the page.
    """
    name = 'promiedos.com.ar'
    allowed_domains = ['promiedos.com.ar']
    start_urls = [
        'http://www.promiedos.com.ar/historialpartidos.php',
        # Example of a head-to-head page, handy as a start URL when debugging:
        # 'http://www.promiedos.com.ar/historialpartidos.php?equipo1=Atl%20Rafaela&equipo2=Belgrano'
    ]

    # ITEM_PIPELINES is a dict mapping the pipeline path to its order.
    custom_settings = {'ITEM_PIPELINES': {'promiedos.PartidoPipeline': 300}}

    rules = (
        # Follow the single-team history links (no callback, so they are only
        # followed, not parsed).
        Rule(LinkExtractor(allow=(r'historialpartidos\.php\?equipo=.*', ),
                           deny=(r'subsection\.php', ))),

        # Head-to-head pages carry two "equipo" parameters; parse them with
        # parse_item and do not follow their links any further. The "?" is
        # escaped so it matches the literal query-string separator.
        Rule(LinkExtractor(allow=(r'historialpartidos\.php\?equipo.*equipo.*', )),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract every match listed on a head-to-head history page.

        Each match is rendered as a block like this one:

        <div style="border: 1px solid #888; width: 360px">
            <table align="center">
            <tbody>
            <tr>
                <td colspan="5">
                <span class="diadelpart">
                Torneo Clausura 2012 - Promoción</span>
                </td>
            </tr>
            <tr style="background: #e5e5e5">
                <td width="140">
                <img src="images/cuadrados/sanlorenzo.gif" border="0"> San Lorenzo</td>
                <td width="20" style="text-align: center">
                <span class="datoequipo">1</span>
                </td>
                <td width="20" style="text-align: center">
                <span class="datoequipo">1</span>
                </td>
                <td width="140">
                <img src="images/cuadrados/instituto.gif" border="0"> Instituto</td>
            </tr>
            </tbody></table>
        </div>

        :param response: the downloaded head-to-head page.
        :returns: list of ``PartidoItem``, one per well-formed match block.
            Blocks that do not have the expected structure are logged and
            skipped so they do not discard the rest of the page.
        """
        self.logger.info('Acá esta el historia de dos equipos!: %s', response.url)

        matches = response.xpath('//div[@style="border: 1px solid #888; width: 360px"]')
        self.logger.info('Hay %i partidos entre ellos', len(matches))
        partidos = []

        for m in matches:
            # <span> texts, in order: match label, first score, second score.
            spans = m.xpath('table/tr/td/span/text()').extract()
            # Direct text nodes of every <td>. The whitespace-only nodes
            # around child elements count too, which is why, for the markup
            # shown above, the team names sit at positions 3 and 9.
            cells = m.xpath('table/tr/td/text()').extract()

            if len(spans) < 3 or len(cells) < 10:
                self.logger.warning(
                    'Skipping match block with unexpected markup on %s',
                    response.url)
                continue

            partidos.append(PartidoItem(
                partido=spans[0],
                equipo1=cells[3],
                gol1=spans[1],
                equipo2=cells[9],
                gol2=spans[2],
            ))
        return partidos

if __name__ == '__main__':
    # Running the module directly only creates the tables declared on Base.
    # A tab-flattened duplicate of this whole file had been pasted below this
    # statement, which made the module unparseable; it has been dropped.
    Base.metadata.create_all(engine)