import re, logging, os, time, uuid, sys

from couchdb import ResourceConflict

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By


class Crawler(object):

    # Class-level logger: full detail goes to a timestamped log file,
    # INFO and above is mirrored to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)

    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # ensure the log directory exists
    file_handler = logging.FileHandler(os.path.join(os.getcwd(), "logs", "crawler_{}.log".format(int(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)

    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)

    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)

    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, driver, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} url(s).".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.driver = driver
        self.couch = couch
        # Default rule matches any path under the domain root
        self.rule = "^/" if rule is None else rule
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]  # [max pages to crawl (0 = unlimited), pages crawled so far]
        self.sleep = 1  # seconds to wait between page loads
        self.timeout = 20  # page load timeout in seconds

    def crawl(self):
        self.driver.set_page_load_timeout(self.timeout)

        # Process the url queue until it is empty or the crawl limit is hit
        while self.urls:
            if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
                self.logger.info("Crawl limit reached, stopping.")
                break

            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                time.sleep(self.sleep)
                try:
                    self.driver.get(self.urls[0])
                except TimeoutException:
                    self.logger.warning("Timeout reached after {} seconds.".format(self.timeout))

                self.logger.debug("Analyzing page structure...")
                elms = self.driver.find_elements(By.CSS_SELECTOR, "a")
                all_hrefs = [elm.get_attribute("href") for elm in elms]
                # Keep same-domain links only; skip empty hrefs, fragments and query strings
                hrefs = [x for x in all_hrefs if x and re.search(re.escape(self.domain), x) and not re.search(r"#|\?", x)]

                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    # get_attribute("href") returns absolute urls, so compare them directly
                    if href in self.crawled or href in self.urls:
                        self.logger.debug("{} already queued or crawled.".format(href))
                    else:
                        self.urls.append(href)

                self.crawled.append(self.urls[0])
                # Remove the crawled url from the front of the queue
                del self.urls[0]
                self.limit[1] += 1
self.logger.debug("Updating database {}".format(self.idn)) |
|
try: |
|
self.couch.database["{}".format(self.idn)] = { |
|
"idn" : self.idn, |
|
"rule" : self.rule, |
|
"domain" : self.domain, |
|
"urls" : self.urls, |
|
} |
|
except ResourceConflict: |
|
revno = self.couch.database["{}".format(self.idn)]["_rev"] |
|
self.couch.database["{}".format(self.idn)] = { |
|
"_rev" : revno, |
|
"idn" : self.idn, |
|
"rule" : self.rule, |
|
"domain" : self.domain, |
|
"urls" : self.urls, |
|
} |
|
self.logger.info("Reached count {}/{}".format(str(self.limit[1]), str(self.limit[0]))) |
|
            except Exception as e:
                # Log where the failure happened and move on to the next url
                _, _, exc_tb = sys.exc_info()
                lineno = exc_tb.tb_lineno
                filename = exc_tb.tb_frame.f_code.co_filename
                self.logger.error("Crawling raised an error at line {} in {}, skipping: {}".format(lineno, filename, e))

        if not self.urls:
            self.logger.info("No url left to crawl, returning.")
            return

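# Usage sketch for the Crawler class above (hypothetical wiring, not part of
# this module; see the __main__ block at the end of the file for a runnable
# version assuming a local chromedriver and CouchDB server):
#
#   crawler = Crawler("job", driver, couch, None, "https://example.com/")
#   crawler.limit = [50, 0]   # optional: stop after 50 pages
#   crawler.crawl()
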
class Scraper(object):

    # Class-level logger: full detail goes to a timestamped log file,
    # INFO and above is mirrored to the terminal.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)

    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # ensure the log directory exists
    file_handler = logging.FileHandler(os.path.join(os.getcwd(), "logs", "scraper_{}.log".format(int(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)

    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)

    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)

    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, driver):
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []  # (title, tag, attrs) tuples; attrs is used as a CSS selector
        self.data = {}  # url -> list of (rule, scraped texts) pairs
        self.driver = driver
        self.couch = couch
        self.sleep = 1  # seconds to wait between page loads

    def add_rule(self, title, tag, attrs):
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format((title, tag, attrs)))

    def select_rules(self, **where):
        # Return every rule whose title, tag or attrs matches the given keyword
        # filters; missing keywords are skipped instead of raising KeyError.
        selected = []
        for item in self.rules:
            if "title" in where and item[0] in where["title"]:
                selected.append(item)
            elif "tag" in where and item[1] in where["tag"]:
                selected.append(item)
            elif "attrs" in where and item[2] in where["attrs"]:
                selected.append(item)
        return selected

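    # Selection sketch (hypothetical rule values, not part of this module):
    # after
    #   scraper.add_rule("headline", "h1", "h1.article-title")
    # both scraper.select_rules(title="headline") and scraper.select_rules(tag="h1")
    # return [("headline", "h1", "h1.article-title")].
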
    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rule(s) removed.".format(rules))

    def scrap(self, *urls):
        count = 1
        for url in urls:
            self.logger.info("Connecting to {}... ({}/{})".format(url, count, len(urls)))
            time.sleep(self.sleep)
            try:
                self.driver.get(url)
            except TimeoutException:
                self.logger.warning("Timeout while loading {}.".format(url))
            count += 1

            self.logger.debug("Parsing page structure...")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(rule))
                elms = self.driver.find_elements(By.CSS_SELECTOR, rule[2])
                if not elms:
                    self.logger.debug("No elements matched rule {}.".format(rule))
                result = [x.text for x in elms]
                self.data[url].append((rule, result))

self.logger.info("Updating database...") |
|
try: |
|
self.couch.database["{}".format(self.idn)] = { |
|
"idn" : self.idn, |
|
"data" : self.data, |
|
"rules" : self.rules, |
|
"urls" : urls, |
|
} |
|
except ResourceConflict: |
|
revno = self.couch.database["{}".format(self.idn)]["_rev"] |
|
self.couch.database["{}".format(self.idn)] = { |
|
"_rev" : revno, |
|
"idn" : self.idn, |
|
"data" : self.data, |
|
"rules" : self.rules, |
|
"urls" : urls, |
|
} |
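

if __name__ == "__main__":
    # Minimal end-to-end sketch, not a definitive driver script. Assumptions
    # beyond this module: a chromedriver on PATH, a CouchDB server at
    # http://localhost:5984/, and a hypothetical wrapper exposing the
    # `.database` attribute that both classes above rely on.
    import couchdb
    from selenium import webdriver

    class CouchWrapper(object):
        """Hypothetical holder matching the `couch.database[...]` usage above."""
        def __init__(self, url, name):
            server = couchdb.Server(url)
            self.database = server[name] if name in server else server.create(name)

    driver = webdriver.Chrome()
    couch = CouchWrapper("http://localhost:5984/", "crawler_demo")

    crawler = Crawler("demo", driver, couch, None, "https://example.com/")
    crawler.limit = [5, 0]  # stop after five pages
    crawler.crawl()

    scraper = Scraper("demo", couch, driver)
    scraper.add_rule("title", "h1", "h1")  # hypothetical rule: page headings
    scraper.scrap(*crawler.crawled)

    driver.quit()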