@erayerdin
Created October 18, 2015 13:19
import re, requests, logging, os, time, json
from bs4 import BeautifulSoup as bs
class Crawler(object):
    # Class-level logging setup: INFO and above goes to the terminal, DEBUG and
    # above to a timestamped file under ./logs (created here if it is missing).
    if not os.path.isdir(os.path.join(os.getcwd(), "logs")):
        os.makedirs(os.path.join(os.getcwd(), "logs"))
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        os.path.join(os.getcwd(), "logs", "{}.log".format(int(round(time.time() * 1000))))
    )
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    def __init__(self, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} url(s).".format(args[0], len(args)))
        self.domain = args[0]          # base domain, expected to end with "/"
        self.urls = list(args)         # queue of urls waiting to be crawled
        self.crawled = []              # urls that have already been visited
        self.limit = [0, 0]            # [max pages to crawl (0 = unlimited), pages crawled so far]
        self.dump_file = "urls.json"   # file the url queue is dumped to
    def crawl(self):
        # Stop when the crawl limit has been reached.
        if self.limit[0] != 0 and self.limit[0] == self.limit[1]:
            self.logger.info("Limit reached, writing to file and returning.")
            with open(self.dump_file, "w") as dump_file:
                dump_file.write(json.dumps(self.urls))
            return
        # Stop when there is nothing left to crawl.
        elif len(self.urls) == 0:
            self.logger.info("No url left to crawl, returning.")
            with open(self.dump_file, "w") as dump_file:
                dump_file.write(json.dumps(self.urls))
            return
        else:
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0])
                response.encoding = "utf-8"
                self.logger.info("Parsing the page...")
                soup = bs(response.text, "html.parser")
                # Collect every link that points to a relative path on the same domain.
                links = soup.find_all("a", {"href": re.compile("^/")})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether the links were already crawled...")
                for href in hrefs:
                    if self.domain[0:-1] + href in self.crawled:
                        self.logger.warning("{} already crawled.".format(href))
                    else:
                        self.urls.append(self.domain[0:-1] + href)
                # Mark the current url as crawled and move on to the next one.
                self.crawled.append(self.urls[0])
                self.urls.pop(0)
                self.limit[1] += 1
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                self.logger.error("Crawling function raised an error, skipping {}: {}".format(self.urls[0], e))
                # Drop the failing url so it is not retried forever.
                self.crawled.append(self.urls[0])
                self.urls.pop(0)
        # Tail-recursive call; fine for a few hundred pages, but a loop would be
        # needed for limits approaching Python's recursion limit.
        self.crawl()
# Driver script (a separate file; the import expects the Crawler class above
# to be saved as api.py).
from api import Crawler

if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.limit[0] = 500  # crawl at most 500 pages
    mycrawler.crawl()
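
A minimal sketch (not part of the original gist) of reading the dump back: crawl() writes the remaining url queue to urls.json as a plain JSON list, so the file can be loaded with the standard json module to inspect or reuse the queue. The file name matches self.dump_file above; everything else here is illustrative.

import json

# Load the url queue that Crawler.crawl() dumped to disk.
with open("urls.json") as dump_file:
    pending_urls = json.load(dump_file)

print("{} url(s) left in the queue".format(len(pending_urls)))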