A Simple Dynamic Crawling and Scraping Algorithm in Python
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

A Simple Crawling and Scraping Algorithm Using Python

Created for the blog AlreadyCoded.

License: Do What The Fuck You Want To Public License

Requirements

  • Python packages
    • selenium
    • couchdb (CouchDB-Python)
    • termcolor (used by the runner script)
  • System
    • Firefox
    • A running CouchDB server

Features

  • Logging (per-run log files plus terminal output)
  • Dumping results to CouchDB
  • Dynamic (JavaScript-based) page parsing (see the quick-start sketch below)
  • DOM manipulation
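
A minimal quick-start sketch, condensed from the runner script at the bottom of this gist. The domain, credentials, database name, page limit, and CSS rule here are placeholders rather than values the project prescribes.

from selenium import webdriver

from api import Crawler, Scraper
from api.db import Couch

db = Couch("username", "password")          # placeholder CouchDB credentials
db.select_database("example_links")         # placeholder database name
driver = webdriver.Firefox()

# Crawl up to 10 pages, starting from the domain root (trailing slash required).
crawler = Crawler("example", driver, db, None, "http://www.example.com/")
crawler.limit[0] = 10
crawler.crawl()

# Scrape the collected urls with a single placeholder CSS rule.
scraper = Scraper("example", db, driver)
scraper.add_rule("entry", "div", "div.entry")
scraper.scrap(*crawler.urls)

driver.quit()
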
# api module: the Crawler and Scraper classes (imported below as `api`)
import re, logging, os, time, uuid, sys

from couchdb import ResourceConflict
from selenium.common.exceptions import TimeoutException


class Crawler(object):
    # Class-level logger: everything goes to a timestamped file, INFO and above
    # is echoed to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        os.getcwd() + "/logs/crawler_{}.log".format(int(round(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, driver, couch, rule, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} urls.".format(args[0], len(args)))
        self.idn = idn + "_" + str(uuid.uuid1())
        self.driver = driver
        self.couch = couch
        self.rule = rule if rule is not None else "^/"
        self.domain = args[0]
        self.urls = list(args)
        self.crawled = []
        self.limit = [0, 0]  # [maximum page count (0 means unlimited), pages crawled so far]
        self.sleep = 1
        self.timeout = 20

    def crawl(self):
        self.driver.set_page_load_timeout(self.timeout)
        while self.urls:
            if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
                self.logger.info("Limit reached, returning.")
                break
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                time.sleep(self.sleep)
                try:
                    self.driver.get(self.urls[0])
                except TimeoutException:
                    self.logger.warning("Timeout reached after {} seconds.".format(self.timeout))
                self.logger.debug("Analyzing page structure...")
                # Collect every link on the page, then keep only in-domain links
                # that are not anchors or query urls.
                elms = self.driver.find_elements_by_css_selector("a")
                all_hrefs = [elm.get_attribute("href") for elm in elms]
                hrefs = [x for x in all_hrefs
                         if x and re.search(re.escape(self.domain), x) and not re.search(r"#|\?", x)]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(href))
                    else:
                        self.urls.append(href)
                self.crawled.append(self.urls[0])
                del self.urls[0]  # the current url has been processed
                self.limit[1] += 1
                self.logger.debug("Updating database {}".format(self.idn))
                try:
                    self.couch.database[self.idn] = {
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                except ResourceConflict:
                    # The document already exists; resend it with its current revision.
                    revno = self.couch.database[self.idn]["_rev"]
                    self.couch.database[self.idn] = {
                        "_rev": revno,
                        "idn": self.idn,
                        "rule": self.rule,
                        "domain": self.domain,
                        "urls": self.urls,
                    }
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                lineno = exc_tb.tb_lineno
                filename = exc_tb.tb_frame.f_code.co_filename
                self.logger.error("Crawling raised an error on line {} @ {}, skipping: {}".format(lineno, filename, e))
        if len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")

class Scraper(object):
    # Same logging setup as the Crawler, written to a separate scraper log file.
    logger = logging.getLogger("scraper_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        os.getcwd() + "/logs/scraper_{}.log".format(int(round(time.time() * 1000))))
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, idn, couch, driver):
        self.idn = idn + "_" + str(uuid.uuid1())
        self.rules = []  # (title, tag, css selector) triples
        self.data = {}   # url -> list of (rule, scraped texts)
        self.driver = driver
        self.couch = couch
        self.sleep = 1

    def add_rule(self, title, tag, attrs):
        self.rules.append((title, tag, attrs))
        self.logger.info("{} rule added.".format((title, tag, attrs)))

    def select_rules(self, **where):
        # Return the rules whose title, tag or selector occurs in the given filter string.
        selected = []
        for item in self.rules:
            if item[0] in where.get("title", ""):
                selected.append(item)
            elif item[1] in where.get("tag", ""):
                selected.append(item)
            elif item[2] in where.get("attrs", ""):
                selected.append(item)
        return selected

    def remove_rules(self, *rules):
        for item in rules:
            self.rules.remove(item)
        self.logger.info("{} rules removed.".format(rules))

    def scrap(self, *urls):
        count = 1
        for url in urls:
            self.logger.info("Connecting to {}... ({}/{})".format(url, count, len(urls)))
            time.sleep(self.sleep)
            try:
                self.driver.get(url)
            except TimeoutException:
                pass
            count += 1
            self.logger.debug("Parsing the page structure...")
            self.data[url] = []
            for rule in self.rules:
                self.logger.debug("Scraping rule {}".format(rule))
                elms = self.driver.find_elements_by_css_selector(rule[2])
                if len(elms) == 0:
                    continue
                result = [x.text for x in elms]
                self.data[url].append((rule, result))
            self.logger.info("Updating database...")
            try:
                self.couch.database[self.idn] = {
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
            except ResourceConflict:
                revno = self.couch.database[self.idn]["_rev"]
                self.couch.database[self.idn] = {
                    "_rev": revno,
                    "idn": self.idn,
                    "data": self.data,
                    "rules": self.rules,
                    "urls": urls,
                }
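
A brief sketch of the rule-management helpers above. The rule values are placeholders; the couch and driver arguments are not touched by these methods, so None stands in for both here.

sc = Scraper("rules_demo", None, None)          # placeholder idn; no db or driver needed for rule handling
sc.add_rule("title", "h1", "h1.entry-title")    # placeholder rules
sc.add_rule("body", "div", "div.entry")
matched = sc.select_rules(title="title")        # rules whose title occurs in the filter string
sc.remove_rules(*matched)                       # drop the matched rules before the next scrape
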
# Runner script: crawls each configured domain, then scrapes the collected urls.
from api import Crawler, Scraper
from api.db import Couch
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from urllib.parse import urlparse
from termcolor import colored as term
import re

proxy = {  # currently unused proxy placeholder
    "http": "",
    "https": ""
}

domains = {
    "http://www.awebsite.com/": ("div", ".", "entry"),  # Don't forget the / at the end of the URL.
}

if __name__ == "__main__":
    # QuickJava preferences: disable images, CSS, Flash, Java and Silverlight so
    # pages load faster; JavaScript itself stays enabled for dynamic content.
    profile = FirefoxProfile()
    profile.add_extension("env/firefox/quickjava-2.0.6-fx.xpi")
    profile.set_preference("thatoneguydotnet.QuickJava.curVersion", "2.0.6.1")
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Images", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.AnimatedImage", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.CSS", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Flash", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Java", 2)
    profile.set_preference("thatoneguydotnet.QuickJava.startupStatus.Silverlight", 2)

    db = Couch("erayerdin", "885874")
    dr = webdriver.Firefox(profile)

    for domain, rule in domains.items():
        print(term("Playing on {}".format(domain), "green"))
        dr.set_window_size(1280, 720)

        print(term("Crawling started.", "red"))
        db.select_database("turkce_erotik_hikayeler_baglantilar")
        # netloc[4:] strips the leading "www." from the domain name.
        cr = Crawler(urlparse(domain).netloc[4:], dr, db, None, domain)
        cr.limit[0] = 100
        cr.crawl()
        # Drop category, pagination and contact pages before scraping.
        urls = [x for x in cr.urls
                if not re.search(r"/category/|/page/[0-9]|/iletisim/|/hepsi/|/kategori/|/Kategori/", x)]

        print(term("Scraping started.", "red"))
        db.select_database("turkce_erotik_hikayeler_butunce")
        sc = Scraper(urlparse(domain).netloc[4:], db, dr)
        sc.add_rule("metin", rule[0], rule[1] + rule[2])
        sc.scrap(*urls)

        cr = None
        sc = None
        urls = None

    dr.quit()
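
For reference, each Scraper instance stores a single CouchDB document shaped roughly as below. The field names come from the Scraper class above; the values shown are illustrative only.

{
    "idn": "awebsite.com_<uuid>",
    "rules": [["metin", "div", ".entry"]],
    "urls": ["http://www.awebsite.com/some-page/"],
    "data": {
        "http://www.awebsite.com/some-page/": [
            [["metin", "div", ".entry"], ["text of the first match", "text of the second match"]]
        ]
    }
}
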
# api.db module: a thin wrapper around a CouchDB server connection.
import couchdb


class Couch(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.server = couchdb.Server()
        self.server.resource.credentials = (username, password)
        self.database = None

    def select_database(self, dbname):
        # Create the database, or fall back to the existing one of the same name.
        try:
            self.database = self.server.create(dbname)
        except Exception:
            self.database = self.server[dbname]
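
A minimal usage sketch for the wrapper above, assuming a CouchDB server on the default local port; the credentials, database name and document are placeholders.

db = Couch("admin", "secret")                 # placeholder credentials
db.select_database("crawl_results")           # created on first use, reused afterwards
db.database["example_doc"] = {"urls": []}     # store a document under an explicit id
print(db.database["example_doc"])             # read it back, including _id and _rev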