Created
September 30, 2016 17:16
-
-
Save a-castellano/402b11f157fa486cd79420fb840739a6 to your computer and use it in GitHub Desktop.
Boxing Scrapper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[WEBSITE]
name = clarin.com
[SECTIONS]
section1_name = clarin_boxeo
section1_url = http://www.clarin.com/deportes/boxeo/
section1_slug = deportes/boxeo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[LOG]
pathOutFile=NewsScaper
pathErrFile=NewsScaper.error
[DB]
host = YOUR_HOST
port = PORT
user = YOUR_USER
password = USER's_PASSWORD
database = THE_DATABASE
[WP]
host = wordpress.site
user = admin
password = admin_password
[WEBSITES]
websites = clarin.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Alvaro Castellano Vela - 22/07/2016 | |
# https://github.com/a-castellano | |
import sys | |
sys.path.append('../') | |
from lib.scrappers import clarin_boxeo | |
class ScrapperFactory( object ):
    """Factory that maps a scrapper type name to a concrete scrapper instance."""

    @staticmethod
    def factory( type, db, wpinfo, table, url, slug, log ):
        """Return the scrapper matching *type*, or None if the type is unknown.

        Parameters:
            type: scrapper identifier string (e.g. "clarin_boxeo"); the name
                shadows the builtin but is kept for backward compatibility.
            db, wpinfo, table, url, slug, log: forwarded verbatim to the
                concrete scrapper's constructor.
        """
        if type == "clarin_boxeo":
            return clarin_boxeo.ScrapperClarinBoxeo( db, wpinfo, table, url, slug, log )
        # Unknown type: keep the original implicit-None contract, but make it
        # explicit for readers.
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Alvaro Castellano Vela - 21/07/2016 | |
# https://github.com/a-castellano | |
import sys | |
sys.path.append('../../') | |
from lib.scrapper import Scrapper | |
import codecs | |
import re | |
import time | |
import unicodedata | |
from bs4 import BeautifulSoup | |
from subprocess import call | |
from lxml import html | |
import requests | |
from pyvirtualdisplay import Display | |
from selenium import webdriver | |
class ScrapperClarinBoxeo( Scrapper ):
    """Scrapper for the boxing section of clarin.com.

    Downloads the section index with curl, extracts article links that are
    not yet stored in the database, then renders each article in a headless
    Chrome browser and collects title, description, image, video, content,
    slug and keywords into ``self.items`` (provided by the ``Scrapper`` base).
    """

    def __init__( self, db, wpinfo, table, url, slug, log ):
        Scrapper.__init__( self, db, wpinfo, table, url, slug, log )

    def scrape( self ):
        """Scrape all new articles of the configured section into self.items."""
        import os  # local import: only needed here, to delete the temp download

        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Starting scrapper ]" )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Original URL -> {} ]".format( self.url ) )
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Getting previous URL's ]" )
        # First column of each DB row is the already-stored article URL.
        storedItems = [ row[0] for row in self.db.getURLs( self.table ) ]
        itemsToScrappe = []

        # Fetch the section index into a hidden temp file via curl.
        page_filename = '.data_{}.scrapper_data'.format( self.table )
        call( [ 'curl', self.url, '-o', page_filename ] )
        with open( page_filename, "r" ) as data:
            dataReaded = data.read()
        # FIX: the temp file used to be left behind — the old
        # "#remove page_filename" comment was never implemented.
        try:
            os.remove( page_filename )
        except OSError:
            pass

        hrefObject = re.findall(r'<a href="/deportes/boxeo/(.*?)"', dataReaded, re.M|re.I|re.S)
        for link in hrefObject:
            # Strip any '#fragment'. FIX: the original used
            # link[:link.find("#")], which chops the final character when no
            # '#' is present (str.find returns -1); split() is fragment-safe.
            realLink = self.url + link.split( "#" )[0]
            # Keep only new, unique links that point at .html articles.
            if realLink not in itemsToScrappe and realLink not in storedItems \
                    and realLink[ realLink.rfind( "." ) + 1 : ] == "html":
                itemsToScrappe.append( realLink )

        if itemsToScrappe:
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Items to scrappe ]" )
            for item in itemsToScrappe:
                self.log.info( "[\t\t\t{} ]".format( item ) )
            self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Running Web Browser ]" )
            display = Display(visible=0, size=(800,600))
            display.start()
            driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
            try:
                for item in itemsToScrappe:
                    self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( item ) )
                    try:
                        self.items.append( self._scrape_item( driver, item ) )
                    except Exception:
                        # FIX: the original only caught NameError, so a real
                        # failure (e.g. IndexError when a regex matched
                        # nothing) crashed the run and leaked the browser.
                        # Best-effort per article: report and continue.
                        print("Oops!",sys.exc_info()[0])
            finally:
                # FIX: quit() (not close()) tears down the whole browser, and
                # the finally block guarantees cleanup on unexpected errors.
                driver.quit()
                display.stop()
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Finishing scrapper ]" )

    def _scrape_item( self, driver, item ):
        """Render the article at *item* in *driver* and return its field dict.

        Raises IndexError if a mandatory element (title, description,
        content, keywords) is missing — handled best-effort by the caller.
        """
        newItem = {
            'title': "",
            'description': "",
            'url': item,
            'image_url': "",
            'video_url': "",
            'content': "",
            'slug': "",
            'keywords': "",
            'referer': "Clarin",
            'referer_url': "http://www.clarin.com",
        }
        driver.get( newItem['url'] )
        dataReaded = driver.page_source
        self.log.info( "\t[ Scrapper Clarin Boxeo ] - [ Scraping {} ]".format( newItem['url'] ) )

        # NOTE: the entity replacements below were garbled in the pasted
        # source (the entities rendered as the quote characters themselves);
        # reconstructed as HTML-entity → plain-quote normalization.
        matchTitle = re.findall( r'og:title" content="(.*?)"', dataReaded, re.M|re.I|re.S )
        newItem['title'] = matchTitle[0].replace('&#39;',"'").replace('&quot;',"'").replace('&#34;',"'")
        self.log.info( "\t\t\t\t[ title: '{}' ]".format( newItem['title'] ) )

        matchDescription = re.findall( r'<meta name="DESCRIPTION" content="(.*?)"', dataReaded, re.M|re.I|re.S )
        newItem['description'] = matchDescription[0].replace('&#39;',"'").replace('&quot;',"'").replace('&#34;',"'")
        self.log.info( "\t\t\t\t[ description: '{}' ]".format( newItem['description'] ) )

        # Image and video are optional; only absolute image URLs are kept.
        matchImage = re.findall( r'<link rel="image_src" href="(.*?)"', dataReaded, re.M|re.I|re.S )
        if matchImage and 'http' in matchImage[0]:
            newItem['image_url'] = matchImage[0]
            self.log.info( "\t\t\t\t[ image_url: '{}' ]".format( newItem['image_url'] ) )
        matchVideo = re.findall( r'<iframe src="https://www.youtube.com/embed/(.*?)"', dataReaded, re.M|re.I|re.S )
        if matchVideo:
            newItem['video_url'] = "https://www.youtube.com/watch?v=" + matchVideo[0]
            self.log.info( "\t\t\t\t[ video_url: '{}' ]".format( newItem['video_url'] ) )

        # Article body: the 'nota' div; single quotes so the markup can be
        # embedded inside double-quoted SQL/WordPress payloads downstream.
        soup = BeautifulSoup(dataReaded, "lxml")
        newItem['content'] = str( soup.select('div[class=nota]')[0] ).replace('"',"'")
        self.log.info( "\t\t\t\t[ content: '{}' ]".format( newItem['content'] ) )

        # Slug is the URL path minus the section prefix, truncated at the
        # first '_' (article IDs follow it, e.g. "titulo_rkX0.html").
        slug = item.replace("http://www.clarin.com/deportes/boxeo/", "")
        newItem['slug'] = slug[ : slug.find("_") ]
        self.log.info( "\t\t\t\t[ slug: '{}' ]".format( newItem['slug'] ) )

        matchKeywords = re.findall( r'<meta name="KEYWORDS" content="(.*?)"', dataReaded, re.M|re.I|re.S )
        newItem['keywords'] = matchKeywords[0]
        self.log.info( "\t\t\t\t[ keywords: '{}' ]".format( newItem['keywords'] ) )
        return newItem
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment