A Simple Crawling and Scraping Algorithm Using Python
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
import re, requests, logging, os, time, uuid, sys
from bs4 import BeautifulSoup as bs
from couchdb import ResourceConflict
#from selenium import webdriver
#from selenium.webdriver.common.keys import Keys

# Make sure the logs/ directory exists so that the FileHandlers below do not fail.
if not os.path.exists(os.path.join(os.getcwd(), "logs")):
    os.makedirs(os.path.join(os.getcwd(), "logs"))

class Crawler(object):
    # Class-level logger: DEBUG and above goes to a timestamped file,
    # INFO and above goes to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(os.getcwd() + "/logs/crawler_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    def __init__(self, idn, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())  # unique identifier for the CouchDB document
        self.couch = couch
        # Default rule matches any relative link starting with "/".
        if rule is None:
            self.rule = "^/"
        else:
            self.rule = rule
        self.domain = args[0]       # first positional argument is the domain root
        self.urls = list(args)      # queue of urls waiting to be crawled
        self.crawled = []           # urls already visited
        self.limit = [0, 0]         # [maximum page count, pages crawled so far]; 0 means no limit
        self.sleep = 1              # seconds to wait between requests
    def crawl(self, proxies=None):
        # Crawl while there are urls left in the queue.
        while self.urls:
            if self.limit[0] != 0:
                if self.limit[0] == self.limit[1]:
                    self.logger.info("Limit reached, returning.")
                    break
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0], proxies=proxies)
                time.sleep(self.sleep)
                response.encoding = "utf-8"
                self.logger.debug("Analyzing the structure...")
                soup = bs(response.text, "html.parser")
                # Collect every link whose href matches the crawling rule.
                links = soup.find_all("a", {"href": re.compile(self.rule)})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links have already been crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(str(href)))
                    else:
                        self.urls.append(self.domain[0:-1] + href)
                self.crawled.append(self.urls[0])
                # Remove the url we just crawled from the front of the queue.
                #self.urls[::-1].pop()
                del self.urls[0]
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                try:
                    self.couch.database["{}".format(self.idn)] = {
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                except ResourceConflict:
                    # The document already exists; supply its revision to update it.
                    revno = self.couch.database["{}".format(self.idn)]["_rev"]
                    self.couch.database["{}".format(self.idn)] = {
                        "_rev": revno,
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                f = exc_tb.tb_frame
                lineno = exc_tb.tb_lineno
                filename = f.f_code.co_filename
                self.logger.error("Crawling function raised an error in line {} @ {}, passing: {}".format(lineno, filename, str(e)))
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")
            return
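A minimal usage sketch, assuming a CouchDB server running locally and using the Couch helper defined at the end of the gist; the credentials, database name and domain below are placeholders, not part of the gist:

# Minimal sketch (assumptions: local CouchDB, placeholder credentials and domain).
couch = Couch("admin", "secret")            # hypothetical credentials
couch.select_database("crawl_results")      # hypothetical database name

# Passing None as the rule falls back to "^/", i.e. follow every relative link.
crawler = Crawler("example", couch, None, "http://example.com/")
crawler.limit = [50, 0]    # stop after 50 pages; [0, 0] means no limit
crawler.sleep = 2          # wait 2 seconds between requests
crawler.crawl()            # optionally pass proxies={"http": "http://proxy:8080"}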
class Scraper(object):
    # Class-level logger, same setup as Crawler but with its own log file.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(os.getcwd() + "/logs/scraper_{}.log".format(str(int(round(time.time() * 1000)))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch):
        self.idn = idn + "_" + str(uuid.uuid1())  # unique identifier for the CouchDB document
        self.rules = []    # list of (title, tag, attrs) scraping rules
        self.data = {}     # url -> list of (rule, extracted texts)
        self.couch = couch
        self.sleep = 1     # seconds to wait between requests
    def add_rule(self, title, tag, attrs):
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format(str((title, tag, attrs))))

    def select_rules(self, **where):
        # Return the rules whose title, tag or attrs match the given keyword arguments.
        selected = []
        for item in self.rules:
            try:
                if item[0] in where["title"]:
                    selected.append(item)
                elif item[1] in where["tag"]:
                    selected.append(item)
                elif item[2] in where["attrs"]:
                    selected.append(item)
            except Exception:
                continue
        return selected

    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(str(rules)))
    def scrap(self, proxies=None, *urls):
        for url in urls:
            self.logger.info("Connecting to {}...".format(url))
            response = requests.get(url, proxies=proxies)
            time.sleep(self.sleep)
            response.encoding = "utf-8"
            self.logger.debug("Parsing the structure...")
            soup = bs(response.text, "html.parser")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(str(rule)))
                rule_search = soup.find_all(rule[1], rule[2])
                if len(rule_search) == 0:
                    # Nothing matched this rule on this page; skip it.
                    continue
                result = [x.get_text() for x in rule_search]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            try:
                self.couch.database["{}".format(self.idn)] = {
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
            except ResourceConflict:
                # The document already exists; supply its revision to update it.
                revno = self.couch.database["{}".format(self.idn)]["_rev"]
                self.couch.database["{}".format(self.idn)] = {
                    "_rev": revno,
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
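A similar sketch for the Scraper, reusing the couch object from the previous example; the rules and urls below are made up for illustration:

# Hypothetical rules: collect <h1 class="title"> texts and all <p> texts.
scraper = Scraper("example", couch)
scraper.add_rule("titles", "h1", {"class": "title"})
scraper.add_rule("paragraphs", "p", {})

# Note: the first positional argument of scrap() is proxies, so pass None
# (or a proxies dict) before the urls.
scraper.scrap(None, "http://example.com/page1", "http://example.com/page2")
print(scraper.data)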
import couchdb

class Couch(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.server = couchdb.Server()
        self.server.resource.credentials = (username, password)
        self.database = None

    def select_database(self, dbname):
        # Create the database if it does not exist yet, otherwise open it.
        try:
            self.database = self.server.create(dbname)
        except Exception:
            self.database = self.server[dbname]
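For completeness, a short sketch of the Couch helper on its own, assuming a CouchDB server on the default http://localhost:5984 and placeholder credentials:

# Placeholder credentials; couchdb.Server() connects to http://localhost:5984 by default.
couch = Couch("admin", "secret")
couch.select_database("scraping_playground")   # created if it does not exist yet
# couch.database is now a couchdb.Database object; pass the Couch instance to
# Crawler or Scraper as shown in the sketches above.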