Skip to content

Instantly share code, notes, and snippets.

@a-castellano
Created September 30, 2016 17:16
Show Gist options
  • Save a-castellano/402b11f157fa486cd79420fb840739a6 to your computer and use it in GitHub Desktop.
Boxing Scrapper
[WEBSITE]
name = clarin.com
[SECTIONS]
section1_name = clarin_boxeo
section1_url = http://www.clarin.com/deportes/boxeo/
section1_slug = deportes/boxeo
[LOG]
pathOutFile=NewsScaper
pathErrFile=NewsScaper.error
[DB]
host = YOUR_HOST
port = PORT
user = YOUR_USER
password = USER's_PASSWORD
database = THE_DATABASE
[WP]
host = wordpress.site
user = admin
password = admin_password
[WEBSITES]
websites = clarin.com
#!/usr/bin/python3
# Alvaro Castellano Vela - 22/07/2016
# https://github.com/a-castellano
import sys
sys.path.append('../')
from lib.scrappers import clarin_boxeo
class ScrapperFactory( object ):
    """Factory mapping a scrapper type name to a concrete scrapper instance."""

    @staticmethod
    def factory( type, db, wpinfo, table, url, slug, log ):
        """Build and return the scrapper registered under *type*.

        The remaining parameters are forwarded verbatim to the scrapper
        constructor: database handle, WordPress credentials, DB table name,
        section URL, section slug and logger.

        Returns ``None`` when *type* is not a known scrapper.

        NOTE(review): the parameter name ``type`` shadows the builtin; it is
        kept unchanged for backward compatibility with existing callers.
        """
        if type == "clarin_boxeo":
            return clarin_boxeo.ScrapperClarinBoxeo( db, wpinfo, table, url, slug, log )
        # Unknown type: make the original implicit fall-through explicit.
        return None
#!/usr/bin/python3
# Alvaro Castellano Vela - 21/07/2016
# https://github.com/a-castellano
import sys
sys.path.append('../../')
from lib.scrapper import Scrapper
import codecs
import re
import time
import unicodedata
from bs4 import BeautifulSoup
from subprocess import call
from lxml import html
import requests
from pyvirtualdisplay import Display
from selenium import webdriver
class ScrapperClarinBoxeo( Scrapper ):
    """Scrapper for the boxing section of clarin.com.

    Downloads the section index with ``curl``, extracts previously unseen
    article links, then renders each article in a headless Chrome session
    (via pyvirtualdisplay + selenium) to pull title, description, image,
    video, content, slug and keywords into ``self.items``.
    """

    def __init__( self, db, wpinfo, table, url, slug, log ):
        # Delegate all state setup to the generic Scrapper base class.
        Scrapper.__init__( self, db, wpinfo, table, url, slug, log )

    def scrape( self ):
        """Scrape new boxing articles and append each one to ``self.items``.

        Side effects: writes a temporary ``.data_<table>.scrapper_data`` file,
        launches a virtual display and a Chrome driver, and logs progress.
        """
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Starting scrapper ]" )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Original URL -> {} ]".format( self.url ) )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Getting previous URL's ]" )
        # URLs already stored in the DB; each row's first column is the URL.
        currentItems = self.db.getURLs( self.table )
        storedItems = []
        itemsToScrappe = []
        for item in currentItems:
            storedItems.append( item[0] )
        page_filename = '.data_{}.scrapper_data'.format( self.table )
        # Fetch the section index page to a local scratch file.
        call( [ 'curl', self.url, '-o', page_filename ] )
        # FIX: context manager guarantees the handle is closed even on errors.
        with open( page_filename, "r" ) as data:
            dataReaded = data.read()
        # TODO: remove page_filename after reading (original left it behind).
        hrefObject = re.findall( r'<a href="/deportes/boxeo/(.*?)"', dataReaded, re.M | re.I | re.S )
        for link in hrefObject:
            # FIX: str.find returns -1 when there is no '#', and link[:-1]
            # silently dropped the last character, making fragment-free
            # ".html" links fail the suffix check below. Strip the fragment
            # only when one is actually present.
            fragment_pos = link.find( "#" )
            if fragment_pos != -1:
                link = link[ : fragment_pos ]
            realLink = self.url + link
            # Keep only unseen, not-yet-queued links that end in ".html".
            if realLink not in itemsToScrappe and realLink not in storedItems and realLink[ realLink.rfind( "." ) + 1 : ] == "html":
                itemsToScrappe.append( realLink )
        if itemsToScrappe:
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Items to scrappe ]" )
            for item in itemsToScrappe:
                self.log.info( "[\t\t\t{} ]".format( item ) )
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Running Web Browser ]" )
            display = Display( visible=0, size=(800, 600) )
            display.start()
            driver = webdriver.Chrome( "/usr/lib/chromium-browser/chromedriver" )
            # FIX: guarantee browser/display teardown even if scraping raises;
            # the original leaked both on any uncaught exception.
            try:
                for item in itemsToScrappe:
                    self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( item ) )
                    # NOTE(review): only NameError is caught, as in the
                    # original — any other scraping failure still propagates.
                    try:
                        newItem = {}
                        newItem['title'] = ""
                        newItem['description'] = ""
                        newItem['url'] = item
                        newItem['image_url'] = ""
                        newItem['video_url'] = ""
                        newItem['content'] = ""
                        newItem['slug'] = ""
                        newItem['keywords'] = ""
                        newItem['referer'] = "Clarin"
                        newItem['referer_url'] = "http://www.clarin.com"
                        # Render the page in Chrome so JS-generated markup is present.
                        driver.get( newItem['url'] )
                        dataReaded = driver.page_source
                        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( newItem['url'] ) )
                        matchTitle = re.findall( r'og:title" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['title'] = matchTitle[0].replace( '&#039;', "'" ).replace( '"', "'" ).replace( "&quot;", "'" )
                        self.log.info( "\t\t\t\t[ title: '{}' ]".format( newItem['title'] ) )
                        matchDescription = re.findall( r'<meta name="DESCRIPTION" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['description'] = matchDescription[0].replace( '&#039;', "'" ).replace( '"', "'" ).replace( "&quot;", "'" )
                        self.log.info( "\t\t\t\t[ description: '{}' ]".format( newItem['description'] ) )
                        matchImage = re.findall( r'<link rel="image_src" href="(.*?)"', dataReaded, re.M | re.I | re.S )
                        # Only accept absolute image URLs.
                        if len( matchImage ) > 0:
                            if 'http' in matchImage[0]:
                                newItem['image_url'] = matchImage[0]
                                self.log.info( "\t\t\t\t[ image_url: '{}' ]".format( newItem['image_url'] ) )
                        matchvVideo = re.findall( r'<iframe src="https://www.youtube.com/embed/(.*?)"', dataReaded, re.M | re.I | re.S )
                        if len( matchvVideo ) > 0:
                            # Convert the embed URL into a regular watch URL.
                            newItem['video_url'] = "https://www.youtube.com/watch?v=" + matchvVideo[0]
                            self.log.info( "\t\t\t\t[ video_url: '{}' ]".format( newItem['video_url'] ) )
                        soup = BeautifulSoup( dataReaded, "lxml" )
                        # Article body lives in the first <div class="nota">.
                        newItem['content'] = str( soup.select( 'div[class=nota]' )[0] ).replace( '"', "'" )
                        self.log.info( "\t\t\t\t[ content: '{}' ]".format( newItem['content'] ) )
                        # Slug: path segment after the section prefix, up to the
                        # first underscore (article-id suffix is dropped).
                        newItem['slug'] = item
                        newItem['slug'] = newItem['slug'].replace( "http://www.clarin.com/deportes/boxeo/", "" )
                        newItem['slug'] = newItem['slug'][ : newItem['slug'].find( "_" ) ]
                        self.log.info( "\t\t\t\t[ slug: '{}' ]".format( newItem['slug'] ) )
                        matchKeywords = re.findall( r'<meta name="KEYWORDS" content="(.*?)"', dataReaded, re.M | re.I | re.S )
                        newItem['keywords'] = matchKeywords[0]
                        self.log.info( "\t\t\t\t[ keywords: '{}' ]".format( newItem['keywords'] ) )
                        self.items.append( newItem )
                    except NameError:
                        print( "Oops!", sys.exc_info()[0] )
            finally:
                driver.close()
                display.stop()
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Finishing scrapper ]" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment