A Simple Dynamic Crawling and Scraping Algorithm in Python
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

A Simple Crawling and Scraping Algorithm Using Python

Created for the blog AlreadyCoded.

License: Do What The Fuck You Want To Public License

Requirements

  • Python packages
    • selenium
    • couchdb (CouchDB-Python)
    • termcolor (used by the runner script)
  • System
    • Firefox
    • A running CouchDB server

Features

  • Logging (per-run log files plus terminal output)
  • Dumping results to CouchDB
  • Dynamic (JavaScript-based) page parsing (see the quick-start sketch below)
  • DOM manipulation
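
A minimal quick-start sketch, condensed from the runner script at the bottom of this gist. The domain, credentials, database name, page limit, and CSS rule here are placeholders rather than values the project prescribes.

from selenium import webdriver

from api import Crawler, Scraper
from api.db import Couch

db = Couch("username", "password")          # placeholder CouchDB credentials
db.select_database("example_links")         # placeholder database name
driver = webdriver.Firefox()

# Crawl up to 10 pages, starting from the domain root (trailing slash required).
crawler = Crawler("example", driver, db, None, "http://www.example.com/")
crawler.limit[0] = 10
crawler.crawl()

# Scrape the collected urls with a single placeholder CSS rule.
scraper = Scraper("example", db, driver)
scraper.add_rule("entry", "div", "div.entry")
scraper.scrap(*crawler.urls)

driver.quit()
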
# api module: the Crawler and Scraper classes (imported below as `api`)
import re, logging, os, time, uuid, sys

from couchdb import ResourceConflict
from selenium.common.exceptions import TimeoutException


class Crawler(object):
    # Class-level logger: everything goes to a timestamped file, INFO and above
    # is echoed to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        os.getcwd() + "/logs/crawler_{}.log".format(int(round(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, driver, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.driver = driver
        self.couch = couch
        self.rule = rule if rule is not None else "^/"
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]  # [maximum page count (0 means unlimited), pages crawled so far]
        self.sleep = 1
        self.timeout = 20

    def crawl(self):
        self.driver.set_page_load_timeout(self.timeout)
        while self.urls:
            if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
                self.logger.info("Limit reached, returning.")
                break
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                time.sleep(self.sleep)
                try:
                    self.driver.get(self.urls[0])
                except TimeoutException:
                    self.logger.warning("Timeout reached after {} seconds.".format(self.timeout))
                self.logger.debug("Analyzing page structure...")
                # Collect every link on the page, then keep only in-domain links
                # that are not anchors or query urls.
                elms = self.driver.find_elements_by_css_selector("a")
                all_hrefs = [elm.get_attribute("href") for elm in elms]
                hrefs = [x for x in all_hrefs
                         if x and re.search(re.escape(self.domain), x) and not re.search(r"#|\?", x)]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(href))
                    else:
                        self.urls.append(href)
                self.crawled.append(self.urls[0])
                del self.urls[0]  # the current url has been processed
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                try:
                    self.couch.database[self.idn] = {
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                except ResourceConflict:
                    # The document already exists; resend it with its current revision.
                    revno = self.couch.database[self.idn]["_rev"]
                    self.couch.database[self.idn] = {
                        "_rev": revno,
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                lineno = exc_tb.tb_lineno
                filename = exc_tb.tb_frame.f_code.co_filename
                self.logger.error("Crawling raised an error on line {} @ {}, skipping: {}".format(lineno, filename, e))
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")

class Scraper(object):
    # Same logging setup as the Crawler, written to a separate scraper log file.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        os.getcwd() + "/logs/scraper_{}.log".format(int(round(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, driver):
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []  # (title, tag, css selector) triples
        self.data = {}   # url -> list of (rule, scraped texts)
        self.driver = driver
        self.couch = couch
        self.sleep = 1

    def add_rule(self, title, tag, attrs):
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format((title, tag, attrs)))

    def select_rules(self, **where):
        # Return the rules whose title, tag or selector occurs in the given filter string.
        selected = []
        for item in self.rules:
            if item[0] in where.get("title", ""):
                selected.append(item)
            elif item[1] in where.get("tag", ""):
                selected.append(item)
            elif item[2] in where.get("attrs", ""):
                selected.append(item)
        return selected

    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(rules))

    def scrap(self, *urls):
        count = 1
        for url in urls:
            self.logger.info("Connecting to {}... ({}/{})".format(url, count, len(urls)))
            time.sleep(self.sleep)
            try:
                self.driver.get(url)
            except TimeoutException:
                pass
            count += 1
            self.logger.debug("Parsing the page structure...")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(rule))
                elms = self.driver.find_elements_by_css_selector(rule[2])
                if len(elms) == 0:
                    continue
                result = [x.text for x in elms]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            try:
                self.couch.database[self.idn] = {
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
            except ResourceConflict:
                revno = self.couch.database[self.idn]["_rev"]
                self.couch.database[self.idn] = {
                    "_rev": revno,
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
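
A brief sketch of the rule-management helpers above. The rule values are placeholders; the couch and driver arguments are not touched by these methods, so None stands in for both here.

sc = Scraper("rules_demo", None, None)          # placeholder idn; no db or driver needed for rule handling
sc.add_rule("title", "h1", "h1.entry-title")    # placeholder rules
sc.add_rule("body", "div", "div.entry")
matched = sc.select_rules(title="title")        # rules whose title occurs in the filter string
sc.remove_rules(*matched)                       # drop the matched rules before the next scrape
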
# Runner script: crawls each configured domain, then scrapes the collected urls.
from api import Crawler, Scraper
from api.db import Couch
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from urllib.parse import urlparse
from termcolor import colored as term
import re

proxy = {  # currently unused proxy placeholder
    "http": "",
    "https": ""
}

domains = {
    "http://www.awebsite.com/": ("div", ".", "entry"),  # Don't forget the / at the end of the URL.
}

if __name__ == "__main__":
    # QuickJava preferences: disable images, CSS, Flash, Java and Silverlight so
    # pages load faster; JavaScript itself stays enabled for dynamic content.
    profile = FirefoxProfile()
    profile.add_extension("env/firefox/quickjava-2.0.6-fx.xpi")
    profile.set_preference("thatoneguydotnet.QuickJava.curVersion", "2.0.6.1")
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Images", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.AnimatedImage", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.CSS", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Flash", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Java", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Silverlight", 2)

    db = Couch("erayerdin", "885874")
    dr = webdriver.Firefox(profile)

    for domain, rule in domains.items():
        print(term("Playing on {}".format(domain), "green"))
        dr.set_window_size(1280, 720)

        print(term("Crawling started.", "red"))
        db.select_database("turkce_erotik_hikayeler_baglantilar")
        # netloc[4:] strips the leading "www." from the domain name.
        cr = Crawler(urlparse(domain).netloc[4:], dr, db, None, domain)
        cr.limit[0] = 100
        cr.crawl()
        # Drop category, pagination and contact pages before scraping.
        urls = [x for x in cr.urls
                if not re.search(r"/category/|/page/[0-9]|/iletisim/|/hepsi/|/kategori/|/Kategori/", x)]

        print(term("Scraping started.", "red"))
        db.select_database("turkce_erotik_hikayeler_butunce")
        sc = Scraper(urlparse(domain).netloc[4:], db, dr)
        sc.add_rule("metin", rule[0], rule[1] + rule[2])
        sc.scrap(*urls)

        cr = None
        sc = None
        urls = None

    dr.quit()
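
For reference, each Scraper instance stores a single CouchDB document shaped roughly as below. The field names come from the Scraper class above; the values shown are illustrative only.

{
    "idn": "awebsite.com_<uuid>",
    "rules": [["metin", "div", ".entry"]],
    "urls": ["http://www.awebsite.com/some-page/"],
    "data": {
        "http://www.awebsite.com/some-page/": [
            [["metin", "div", ".entry"], ["text of the first match", "text of the second match"]]
        ]
    }
}
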
# api.db module: a thin wrapper around a CouchDB server connection.
import couchdb


class Couch(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.server = couchdb.Server()
        self.server.resource.credentials = (username, password)
        self.database = None

    def select_database(self, dbname):
        # Create the database, or fall back to the existing one of the same name.
        try:
            self.database = self.server.create(dbname)
        except Exception:
            self.database = self.server[dbname]
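
A minimal usage sketch for the wrapper above, assuming a CouchDB server on the default local port; the credentials, database name and document are placeholders.

db = Couch("admin", "secret")                 # placeholder credentials
db.select_database("crawl_results")           # created on first use, reused afterwards
db.database["example_doc"] = {"urls": []}     # store a document under an explicit id
print(db.database["example_doc"])             # read it back, including _id and _rev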