Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
Uses / features:
- BeautifulSoup4
- CouchDB
- Logging
- Dumping
- Static Page Parsing
A Simple Crawling and Scraping Algorithm Using Python
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
import re, requests, logging, os, time, uuid, sys | |
from bs4 import BeautifulSoup as bs | |
from couchdb import ResourceConflict | |
#from selenium import webdriver | |
#from selenium.webdriver.common.keys import Keys | |
class Crawler(object):
    """Breadth-first web crawler.

    Starting from one or more seed urls, it repeatedly fetches the head of
    the url queue, collects every ``<a href>`` whose href matches ``rule``,
    queues the unseen ones, and mirrors its state (idn/rule/domain/urls)
    into CouchDB after each page.
    """

    # Class-level logger shared by every Crawler instance: full DEBUG stream
    # to a millisecond-timestamped file under ./logs, INFO+ to the console.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    # Robustness fix: logging.FileHandler raises if ./logs does not exist yet.
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/crawler_{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, rule, *args):
        """Set up the crawl state.

        idn   -- base identifier; a uuid1 suffix makes the CouchDB doc id unique.
        couch -- object exposing a ``database`` mapping (see the Couch wrapper).
        rule  -- regex matched against href attributes; None falls back to
                 "^/" (follow site-relative links only).
        *args -- seed urls; args[0] doubles as the domain prefix, so at least
                 one url is required.
        """
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], str(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.couch = couch
        # PEP 8: compare against None with `is`, not `==`.
        self.rule = "^/" if rule is None else rule
        self.domain = args[0]
        self.urls = list(args)   # frontier: FIFO via index 0
        self.crawled = []        # urls already fetched
        self.limit = [0, 0]      # [max pages (0 = unlimited), pages done]
        self.sleep = 1           # politeness delay between requests (seconds)

    def crawl(self, proxies=None):
        """Crawl until the frontier is empty or ``limit[0]`` pages are done.

        proxies -- optional proxy mapping handed to requests.get.
        """
        while self.urls:
            if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
                self.logger.info("Limit reached, returns.")
                break
            current = self.urls[0]
            try:
                self.logger.info("Connecting to {}...".format(current))
                # Bug fix: requests.get's second positional argument is
                # `params`, so the proxy mapping must be passed by keyword.
                response = requests.get(current, proxies=proxies)
                time.sleep(self.sleep)
                response.encoding = "utf-8"
                self.logger.debug("Analyzing to structures...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile(self.rule)})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Links are checked if they are crawled...")
                for href in hrefs:
                    absolute = self.domain[0:-1] + href
                    if absolute in self.crawled:
                        # warning() replaces the deprecated warn() alias.
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(absolute)
                self.crawled.append(current)
                del self.urls[0]
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                doc = {
                    "idn": self.idn,
                    "rule": self.rule,
                    "domain": self.domain,
                    "urls": self.urls,
                }
                try:
                    self.couch.database["{}".format(self.idn)] = doc
                except ResourceConflict:
                    # Document already exists: retry with its current revision.
                    doc["_rev"] = self.couch.database["{}".format(self.idn)]["_rev"]
                    self.couch.database["{}".format(self.idn)] = doc
                self.logger.info("Reached count {}/{}".format(str(self.limit[1]), str(self.limit[0])))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                f = exc_tb.tb_frame
                lineno = exc_tb.tb_lineno
                filename = f.f_code.co_filename
                self.logger.error("Crawling function raised an error in line {} @ {}, passing: {}".format(lineno, filename, str(e)))
                # Bug fix: drop the url that failed (if it is still at the
                # head), otherwise a persistently failing url spins forever.
                if self.urls and self.urls[0] == current:
                    del self.urls[0]
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returns.")
            return
class Scraper(object):
    """Rule-driven scraper.

    Fetches pages, extracts the text of every element matching each stored
    (title, tag, attrs) rule, and mirrors the accumulated results into
    CouchDB.
    """

    # Class-level logger shared by every Scraper instance: full DEBUG stream
    # to a millisecond-timestamped file under ./logs, INFO+ to the console.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    # Robustness fix: logging.FileHandler raises if ./logs does not exist yet.
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/scraper_{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch):
        """idn: base identifier (uuid1 suffix appended); couch: see Couch."""
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []   # list of (title, tag, attrs) tuples
        self.data = {}    # url -> [(rule, [extracted text, ...]), ...]
        self.couch = couch
        self.sleep = 1    # politeness delay between requests (seconds)

    def add_rule(self, title, tag, attrs):
        """Register an extraction rule: soup.find_all(tag, attrs), labeled title."""
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format(str((title,tag,attrs))))

    def select_rules(self, **where):
        """Return the stored rules matched by any supplied criterion.

        A rule (title, tag, attrs) is selected when its title is contained
        in where["title"], its tag in where["tag"], or its attrs in
        where["attrs"].  Criteria that are not supplied are skipped (bug
        fix: previously a missing "title" keyword raised KeyError for every
        rule, making the tag/attrs criteria unreachable).
        """
        selected = []
        for item in self.rules:
            try:
                if "title" in where and item[0] in where["title"]:
                    selected.append(item)
                elif "tag" in where and item[1] in where["tag"]:
                    selected.append(item)
                elif "attrs" in where and item[2] in where["attrs"]:
                    selected.append(item)
            except Exception:
                # A criterion value that does not support `in`: skip the rule.
                continue
        return selected

    def remove_rules(self, *rules):
        """Remove each given rule tuple; ValueError if a rule is not stored."""
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(str(rules)))

    def scrap(self, proxies=None, *urls):
        """Fetch each url, apply all rules, and persist self.data to CouchDB.

        Note: proxies is the first positional parameter, so urls must follow
        it positionally (or proxies must be given by keyword).
        """
        for url in urls:
            self.logger.info("Connecting to {}...".format(url))
            # Bug fix: requests.get's second positional argument is
            # `params`, so the proxy mapping must be passed by keyword.
            response = requests.get(url, proxies=proxies)
            time.sleep(self.sleep)
            response.encoding = "utf-8"
            self.logger.debug("Parsing to the structure...")
            soup = bs(response.text, "html.parser")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(str(rule)))
                rule_search = soup.find_all(rule[1], rule[2])
                if not rule_search:
                    # An empty match still records an empty result list.
                    self.logger.debug("Rule {} matched nothing.".format(str(rule)))
                result = [x.get_text() for x in rule_search]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            doc = {
                "idn": self.idn,
                "data": self.data,
                "rules": self.rules,
                "urls": urls,
            }
            try:
                self.couch.database["{}".format(self.idn)] = doc
            except ResourceConflict:
                # Document already exists: retry with its current revision.
                doc["_rev"] = self.couch.database["{}".format(self.idn)]["_rev"]
                self.couch.database["{}".format(self.idn)] = doc
import couchdb | |
class Couch(object):
    """Thin wrapper holding a couchdb.Server handle plus the credentials
    used to reach it and the currently selected database."""

    def __init__(self, username, password):
        # Keep the raw credentials around for later inspection.
        self.username, self.password = username, password
        server = couchdb.Server()
        server.resource.credentials = (username, password)
        self.server = server
        # No database selected until select_database() is called.
        self.database = None

    def select_database(self, dbname):
        """Create *dbname* if possible, otherwise open the existing one."""
        try:
            created = self.server.create(dbname)
        except Exception:
            # Creation failed (typically the db already exists): open it.
            self.database = self.server[dbname]
        else:
            self.database = created