Skip to content

Instantly share code, notes, and snippets.

@clizarralde
Created March 23, 2016 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clizarralde/a3dcf653d375d476a9c2 to your computer and use it in GitHub Desktop.
This is a spider to get Argentinian soccer stats from promiedos.com.ar
#!/usr/bin/python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, Boolean
from sqlalchemy.engine.url import URL
import settings
# Database plumbing: declarative base for the ORM models below, an engine
# built from the project's settings.DATABASE dict, and a session factory
# bound to that engine (used by PartidoPipeline for each item).
Base = declarative_base()
engine = create_engine(URL(**settings.DATABASE))
Session = sessionmaker(bind=engine)
class PartidoModel(Base):
    """ORM mapping for the ``vetvot_partido`` table (one stored match)."""

    __tablename__ = 'vetvot_partido'

    id = Column(Integer, primary_key=True)
    descripcion = Column(String(200))  # match description (tournament / round text)
    equipo1 = Column(String(100))      # first team's name
    gol1 = Column(Integer)             # goals scored by equipo1
    equipo2 = Column(String(100))      # second team's name
    gol2 = Column(Integer)             # goals scored by equipo2
    fecha = Column(DateTime)           # match date (not filled by the pipeline yet)
    anio = Column(Integer)             # season year
    nfecha = Column(Integer)           # round number within the season
class PosicionModel(Base):
    """ORM mapping for the ``vetvot_posicion`` table (league standings rows)."""

    __tablename__ = 'vetvot_posicion'

    id = Column(Integer, primary_key=True)
    anio = Column(Integer)        # season year
    nfecha = Column(Integer)      # round number the standing was taken at
    equipo = Column(String(100))  # team name
    puntos = Column(Integer)      # accumulated points
    posicion = Column(Integer)    # position in the table
class PartidoItem(scrapy.Item):
    """Scraped fields for a single match on a head-to-head history page."""

    partido = scrapy.Field()  # match description (tournament / round text)
    equipo1 = scrapy.Field()  # first team's name
    equipo2 = scrapy.Field()  # second team's name
    gol1 = scrapy.Field()     # goals scored by equipo1 (string as scraped)
    gol2 = scrapy.Field()     # goals scored by equipo2 (string as scraped)
class PartidoPipeline(object):
    """Item pipeline that persists scraped matches to the database.

    An item is skipped when a row with the same (equipo1, equipo2,
    descripcion) triple already exists, so re-running the spider does
    not create duplicate matches.
    """

    def __init__(self):
        """Initialize database tables (no-op when they already exist)."""
        Base.metadata.create_all(engine)

    def process_item(self, item, spider):
        """Save one scraped match in the database.

        Called by Scrapy for every item; returns the item unchanged so
        further pipeline stages can process it.
        """
        session = Session()
        try:
            # Normalize once; the same values are used for the dedup
            # lookup and, if needed, for the new row.
            equipo1 = item['equipo1'].strip()
            equipo2 = item['equipo2'].strip()
            descripcion = item['partido'].strip()
            exists = session.query(PartidoModel).filter_by(
                equipo1=equipo1,
                equipo2=equipo2,
                descripcion=descripcion,
            ).first()
            if not exists:
                partido = PartidoModel()
                partido.equipo1 = equipo1
                partido.equipo2 = equipo2
                partido.gol1 = int(item['gol1'])
                partido.gol2 = int(item['gol2'])
                partido.descripcion = descripcion
                session.add(partido)
                session.commit()
        except Exception:
            # Undo the failed transaction, then re-raise so Scrapy
            # reports the error (was a bare `except:`, which would also
            # trap KeyboardInterrupt/SystemExit).
            session.rollback()
            raise
        finally:
            session.close()
        return item
class PromiedosSpider(CrawlSpider):
    """Spider that downloads historical Argentinian soccer results from
    promiedos.com.ar, walking from the history index to each
    head-to-head page and emitting one PartidoItem per match box.
    """

    name = 'promiedos.com.ar'
    allowed_domains = ['promiedos.com.ar']
    start_urls = ['http://www.promiedos.com.ar/historialpartidos.php']
    custom_settings = {'ITEM_PIPELINES': ['promiedos.PartidoPipeline']}
    rules = (
        # Follow single-team history pages (?equipo=...). No callback,
        # so CrawlSpider keeps following links from them.
        Rule(LinkExtractor(allow=(r'historialpartidos\.php\?equipo=.*',),
                           deny=(r'subsection\.php',))),
        # Head-to-head pages carry two "equipo" query parameters; those
        # are the pages that contain the match tables. Regex fixed: the
        # original `php?.` made the literal `p` optional instead of
        # escaping the `?` that starts the query string.
        Rule(LinkExtractor(allow=(r'historialpartidos\.php\?.*equipo.*equipo.*',)),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        '''Extract every match shown on a head-to-head page.

        Each match is rendered inside a fixed-style <div> such as:

        <div style="border: 1px solid #888; width: 360px">
        <table align="center">
        <tbody>
        <tr>
        <td colspan="5">
        <span class="diadelpart">
        Torneo Clausura 2012 - Promoción</span>
        </td>
        </tr>
        <tr style="background: #e5e5e5">
        <td width="140">
        <img src="images/cuadrados/sanlorenzo.gif" border="0"> San Lorenzo</td>
        <td width="20" style="text-align: center">
        <span class="datoequipo">1</span>
        </td>
        <td width="20" style="text-align: center">
        <span class="datoequipo">1</span>
        </td>
        <td width="140">
        <img src="images/cuadrados/instituto.gif" border="0"> Instituto</td>
        </tr>
        </tbody></table>
        </div>

        Returns a list of PartidoItem.
        '''
        self.logger.info('Acá esta el historia de dos equipos!: %s', response.url)
        matches = response.xpath('//div[@style="border: 1px solid #888; width: 360px"]')
        self.logger.info('Hay %i partidos entre ellos', len(matches))
        partidos = []
        for m in matches:
            # Hoist the two node-set extractions (the original re-ran the
            # same xpath for every field) and index into the results:
            # spans = [description, goals1, goals2]; cells hold the team
            # names at fixed offsets of this table layout.
            spans = m.xpath('table/tr/td/span/text()').extract()
            cells = m.xpath('table/tr/td/text()').extract()
            item = PartidoItem()
            item['partido'] = spans[0]
            item['equipo1'] = cells[3]
            item['gol1'] = spans[1]
            item['equipo2'] = cells[9]
            item['gol2'] = spans[2]
            partidos.append(item)
        return partidos
if __name__ == '__main__':
    # Running the module directly only creates the database tables.
    Base.metadata.create_all(engine)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment