Skip to content

Instantly share code, notes, and snippets.

@a-castellano
Created September 30, 2016 17:16
Show Gist options
  • Save a-castellano/402b11f157fa486cd79420fb840739a6 to your computer and use it in GitHub Desktop.
Boxing Scrapper
[WEBSITE]
name = clarin.com
[SECTIONS]
section1_name = clarin_boxeo
section1_url = http://www.clarin.com/deportes/boxeo/
section1_slug = deportes/boxeo
[LOG]
pathOutFile=NewsScaper
pathErrFile=NewsScaper.error
[DB]
host = YOUR_HOST
port = PORT
user = YOUR_USER
password = USER's_PASSWORD
database = THE_DATABASE
[WP]
host = wordpress.site
user = admin
password = admin_password
[WEBSITES]
websites = clarin.com
#!/usr/bin/python3
# Alvaro Castellano Vela - 22/07/2016
# https://github.com/a-castellano
import sys
sys.path.append('../')
from lib.scrappers import clarin_boxeo
class ScrapperFactory( object ):
    """Factory mapping a scrapper type name to a concrete scrapper instance."""

    @staticmethod
    def factory( type, db, wpinfo, table, url, slug, log ):
        """Build and return the scrapper registered under *type*.

        The remaining parameters are forwarded verbatim to the scrapper
        constructor: database handle, WordPress credentials, DB table name,
        section URL, section slug and logger.

        Returns ``None`` when *type* is not a known scrapper.

        NOTE(review): the parameter name ``type`` shadows the builtin; it is
        kept unchanged for backward compatibility with existing callers.
        """
        if type == "clarin_boxeo":
            return clarin_boxeo.ScrapperClarinBoxeo( db, wpinfo, table, url, slug, log )
        # Unknown type: make the original implicit fall-through explicit.
        return None
#!/usr/bin/python3
# Alvaro Castellano Vela - 21/07/2016
# https://github.com/a-castellano
import sys
sys.path.append('../../')
from lib.scrapper import Scrapper
import codecs
import re
import time
import unicodedata
from bs4 import BeautifulSoup
from subprocess import call
from lxml import html
import requests
from pyvirtualdisplay import Display
from selenium import webdriver
class ScrapperClarinBoxeo( Scrapper ):
    """Scrapper for the boxing section of clarin.com.

    Downloads the section index with ``curl``, extracts previously unseen
    article links, then renders each article in a headless Chrome session
    (via pyvirtualdisplay + selenium) to pull title, description, image,
    video, content, slug and keywords into ``self.items``.
    """

    def __init__( self, db, wpinfo, table, url, slug, log ):
        # Delegate all state setup to the generic Scrapper base class.
        Scrapper.__init__( self, db, wpinfo, table, url, slug, log )

    def scrape( self ):
        """Scrape new boxing articles and append each one to ``self.items``.

        Side effects: writes a temporary ``.data_<table>.scrapper_data`` file,
        launches a virtual display and a Chrome driver, and logs progress.
        """
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Starting scrapper ]" )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Original URL -> {} ]".format( self.url ) )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Getting previous URL's ]" )
        # URLs already stored in the DB; each row's first column is the URL.
        currentItems = self.db.getURLs( self.table )
        storedItems = []
        itemsToScrappe = []
        for item in currentItems:
            storedItems.append( item[0] )
        page_filename = '.data_{}.scrapper_data'.format( self.table )
        # Fetch the section index page to a local scratch file.
        call( [ 'curl', self.url, '-o', page_filename ] )
        # FIX: context manager guarantees the handle is closed even on errors.
        with open( page_filename, "r" ) as data:
            dataReaded = data.read()
        # TODO: remove page_filename after reading (original left it behind).
        hrefObject = re.findall( r'<a href="/deportes/boxeo/(.*?)"', dataReaded, re.M | re.I | re.S )
        for link in hrefObject:
            # FIX: str.find returns -1 when there is no '#', and link[:-1]
            # silently dropped the last character, making fragment-free
            # ".html" links fail the suffix check below. Strip the fragment
            # only when one is actually present.
            fragment_pos = link.find( "#" )
            if fragment_pos != -1:
                link = link[ : fragment_pos ]
            realLink = self.url + link
            # Keep only unseen, not-yet-queued links that end in ".html".
            if realLink not in itemsToScrappe and realLink not in storedItems and realLink[ realLink.rfind( "." ) + 1 : ] == "html":
                itemsToScrappe.append( realLink )
        if itemsToScrappe:
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Items to scrappe ]" )
            for item in itemsToScrappe:
                self.log.info( "[\t\t\t{} ]".format( item ) )
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Running Web Browser ]" )
            display = Display( visible=0, size=(800, 600) )
            display.start()
            driver = webdriver.Chrome( "/usr/lib/chromium-browser/chromedriver" )
            # FIX: guarantee browser/display teardown even if scraping raises;
            # the original leaked both on any uncaught exception.
            try:
                for item in itemsToScrappe:
                    self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( item ) )
                    # NOTE(review): only NameError is caught, as in the
                    # original — any other scraping failure still propagates.
                    try:
                        newItem = {}
                        newItem['title'] = ""
                        newItem['description'] = ""
                        newItem['url'] = item
                        newItem['image_url'] = ""
                        newItem['video_url'] = ""
                        newItem['content'] = ""
                        newItem['slug'] = ""
                        newItem['keywords'] = ""
                        newItem['referer'] = "Clarin"
                        newItem['referer_url'] = "http://www.clarin.com"
                        # Render the page in Chrome so JS-generated markup is present.
                        driver.get( newItem['url'] )
                        dataReaded = driver.page_source
                        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( newItem['url'] ) )
                        matchTitle = re.findall( r'og:title" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['title'] = matchTitle[0].replace( '&#039;', "'" ).replace( '"', "'" ).replace( "&quot;", "'" )
                        self.log.info( "\t\t\t\t[ title: '{}' ]".format( newItem['title'] ) )
                        matchDescription = re.findall( r'<meta name="DESCRIPTION" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['description'] = matchDescription[0].replace( '&#039;', "'" ).replace( '"', "'" ).replace( "&quot;", "'" )
                        self.log.info( "\t\t\t\t[ description: '{}' ]".format( newItem['description'] ) )
                        matchImage = re.findall( r'<link rel="image_src" href="(.*?)"', dataReaded, re.M | re.I | re.S )
                        # Only accept absolute image URLs.
                        if len( matchImage ) > 0:
                            if 'http' in matchImage[0]:
                                newItem['image_url'] = matchImage[0]
                                self.log.info( "\t\t\t\t[ image_url: '{}' ]".format( newItem['image_url'] ) )
                        matchvVideo = re.findall( r'<iframe src="https://www.youtube.com/embed/(.*?)"', dataReaded, re.M | re.I | re.S )
                        if len( matchvVideo ) > 0:
                            # Convert the embed URL into a regular watch URL.
                            newItem['video_url'] = "https://www.youtube.com/watch?v=" + matchvVideo[0]
                            self.log.info( "\t\t\t\t[ video_url: '{}' ]".format( newItem['video_url'] ) )
                        soup = BeautifulSoup( dataReaded, "lxml" )
                        # Article body lives in the first <div class="nota">.
                        newItem['content'] = str( soup.select( 'div[class=nota]' )[0] ).replace( '"', "'" )
                        self.log.info( "\t\t\t\t[ content: '{}' ]".format( newItem['content'] ) )
                        # Slug: path segment after the section prefix, up to the
                        # first underscore (article-id suffix is dropped).
                        newItem['slug'] = item
                        newItem['slug'] = newItem['slug'].replace( "http://www.clarin.com/deportes/boxeo/", "" )
                        newItem['slug'] = newItem['slug'][ : newItem['slug'].find( "_" ) ]
                        self.log.info( "\t\t\t\t[ slug: '{}' ]".format( newItem['slug'] ) )
                        matchKeywords = re.findall( r'<meta name="KEYWORDS" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['keywords'] = matchKeywords[0]
                        self.log.info( "\t\t\t\t[ keywords: '{}' ]".format( newItem['keywords'] ) )
                        self.items.append( newItem )
                    except NameError:
                        print( "Oops!", sys.exc_info()[0] )
            finally:
                driver.close()
                display.stop()
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Finishing scrapper ]" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment