Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
Uses / features:
- BeautifulSoup4
- CouchDB
- Logging
- Dumping
- Static Page Parsing
A Simple Crawling and Scraping Algorithm Using Python
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
Created for the blog AlreadyCoded. Check out the thread.
License: Do What The Fuck You Want To Public License
import re, requests, logging, os, time, uuid, sys | |
from bs4 import BeautifulSoup as bs | |
from couchdb import ResourceConflict | |
#from selenium import webdriver | |
#from selenium.webdriver.common.keys import Keys | |
class Crawler(object):
    """Breadth-first web crawler.

    Starting from one or more seed urls, it repeatedly fetches the head of
    the url queue, collects every ``<a href>`` whose href matches ``rule``,
    queues the unseen ones, and mirrors its state (idn/rule/domain/urls)
    into CouchDB after each page.
    """

    # Class-level logger shared by every Crawler instance: full DEBUG stream
    # to a millisecond-timestamped file under ./logs, INFO+ to the console.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    # Robustness fix: logging.FileHandler raises if ./logs does not exist yet.
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/crawler_{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, rule, *args):
        """Set up the crawl state.

        idn   -- base identifier; a uuid1 suffix makes the CouchDB doc id unique.
        couch -- object exposing a ``database`` mapping (see the Couch wrapper).
        rule  -- regex matched against href attributes; None falls back to
                 "^/" (follow site-relative links only).
        *args -- seed urls; args[0] doubles as the domain prefix, so at least
                 one url is required.
        """
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], str(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.couch = couch
        # PEP 8: compare against None with `is`, not `==`.
        self.rule = "^/" if rule is None else rule
        self.domain = args[0]
        self.urls = list(args)   # frontier: FIFO via index 0
        self.crawled = []        # urls already fetched
        self.limit = [0, 0]      # [max pages (0 = unlimited), pages done]
        self.sleep = 1           # politeness delay between requests (seconds)

    def crawl(self, proxies=None):
        """Crawl until the frontier is empty or ``limit[0]`` pages are done.

        proxies -- optional proxy mapping handed to requests.get.
        """
        while self.urls:
            if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
                self.logger.info("Limit reached, returns.")
                break
            current = self.urls[0]
            try:
                self.logger.info("Connecting to {}...".format(current))
                # Bug fix: requests.get's second positional argument is
                # `params`, so the proxy mapping must be passed by keyword.
                response = requests.get(current, proxies=proxies)
                time.sleep(self.sleep)
                response.encoding = "utf-8"
                self.logger.debug("Analyzing to structures...")
                soup = bs(response.text, "html.parser")
                links = soup.find_all("a", {"href": re.compile(self.rule)})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Links are checked if they are crawled...")
                for href in hrefs:
                    absolute = self.domain[0:-1] + href
                    if absolute in self.crawled:
                        # warning() replaces the deprecated warn() alias.
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(absolute)
                self.crawled.append(current)
                del self.urls[0]
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                doc = {
                    "idn": self.idn,
                    "rule": self.rule,
                    "domain": self.domain,
                    "urls": self.urls,
                }
                try:
                    self.couch.database["{}".format(self.idn)] = doc
                except ResourceConflict:
                    # Document already exists: retry with its current revision.
                    doc["_rev"] = self.couch.database["{}".format(self.idn)]["_rev"]
                    self.couch.database["{}".format(self.idn)] = doc
                self.logger.info("Reached count {}/{}".format(str(self.limit[1]), str(self.limit[0])))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                f = exc_tb.tb_frame
                lineno = exc_tb.tb_lineno
                filename = f.f_code.co_filename
                self.logger.error("Crawling function raised an error in line {} @ {}, passing: {}".format(lineno, filename, str(e)))
                # Bug fix: drop the url that failed (if it is still at the
                # head), otherwise a persistently failing url spins forever.
                if self.urls and self.urls[0] == current:
                    del self.urls[0]
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returns.")
            return
class Scraper(object):
    """Rule-driven scraper.

    Fetches pages, extracts the text of every element matching each stored
    (title, tag, attrs) rule, and mirrors the accumulated results into
    CouchDB.
    """

    # Class-level logger shared by every Scraper instance: full DEBUG stream
    # to a millisecond-timestamped file under ./logs, INFO+ to the console.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    # Robustness fix: logging.FileHandler raises if ./logs does not exist yet.
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)
    file_handler = logging.FileHandler(os.getcwd()+"/logs/scraper_{}.log".format( str(int(round(time.time()*1000))) ))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch):
        """idn: base identifier (uuid1 suffix appended); couch: see Couch."""
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []   # list of (title, tag, attrs) tuples
        self.data = {}    # url -> [(rule, [extracted text, ...]), ...]
        self.couch = couch
        self.sleep = 1    # politeness delay between requests (seconds)

    def add_rule(self, title, tag, attrs):
        """Register an extraction rule: soup.find_all(tag, attrs), labeled title."""
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format(str((title,tag,attrs))))

    def select_rules(self, **where):
        """Return the stored rules matched by any supplied criterion.

        A rule (title, tag, attrs) is selected when its title is contained
        in where["title"], its tag in where["tag"], or its attrs in
        where["attrs"].  Criteria that are not supplied are skipped (bug
        fix: previously a missing "title" keyword raised KeyError for every
        rule, making the tag/attrs criteria unreachable).
        """
        selected = []
        for item in self.rules:
            try:
                if "title" in where and item[0] in where["title"]:
                    selected.append(item)
                elif "tag" in where and item[1] in where["tag"]:
                    selected.append(item)
                elif "attrs" in where and item[2] in where["attrs"]:
                    selected.append(item)
            except Exception:
                # A criterion value that does not support `in`: skip the rule.
                continue
        return selected

    def remove_rules(self, *rules):
        """Remove each given rule tuple; ValueError if a rule is not stored."""
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(str(rules)))

    def scrap(self, proxies=None, *urls):
        """Fetch each url, apply all rules, and persist self.data to CouchDB.

        Note: proxies is the first positional parameter, so urls must follow
        it positionally (or proxies must be given by keyword).
        """
        for url in urls:
            self.logger.info("Connecting to {}...".format(url))
            # Bug fix: requests.get's second positional argument is
            # `params`, so the proxy mapping must be passed by keyword.
            response = requests.get(url, proxies=proxies)
            time.sleep(self.sleep)
            response.encoding = "utf-8"
            self.logger.debug("Parsing to the structure...")
            soup = bs(response.text, "html.parser")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(str(rule)))
                rule_search = soup.find_all(rule[1], rule[2])
                if not rule_search:
                    # An empty match still records an empty result list.
                    self.logger.debug("Rule {} matched nothing.".format(str(rule)))
                result = [x.get_text() for x in rule_search]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            doc = {
                "idn": self.idn,
                "data": self.data,
                "rules": self.rules,
                "urls": urls,
            }
            try:
                self.couch.database["{}".format(self.idn)] = doc
            except ResourceConflict:
                # Document already exists: retry with its current revision.
                doc["_rev"] = self.couch.database["{}".format(self.idn)]["_rev"]
                self.couch.database["{}".format(self.idn)] = doc
import couchdb | |
class Couch(object):
    """Thin wrapper holding a couchdb.Server handle plus the credentials
    used to reach it and the currently selected database."""

    def __init__(self, username, password):
        # Keep the raw credentials around for later inspection.
        self.username, self.password = username, password
        server = couchdb.Server()
        server.resource.credentials = (username, password)
        self.server = server
        # No database selected until select_database() is called.
        self.database = None

    def select_database(self, dbname):
        """Create *dbname* if possible, otherwise open the existing one."""
        try:
            created = self.server.create(dbname)
        except Exception:
            # Creation failed (typically the db already exists): open it.
            self.database = self.server[dbname]
        else:
            self.database = created