Created October 18, 2015 13:19
# api.py -- the filename is implied by "from api import Crawler" in the runner script below.
import json
import logging
import os
import re
import time

import requests
from bs4 import BeautifulSoup as bs


class Crawler(object):
    # Class-level logger: everything goes to a timestamped log file, INFO and up to the terminal.
    logger = logging.getLogger("crawler_logger")
    logger.setLevel(logging.DEBUG)
    os.makedirs(os.path.join(os.getcwd(), "logs"), exist_ok=True)  # FileHandler fails if the directory is missing
    file_handler = logging.FileHandler(
        os.path.join(os.getcwd(), "logs", "{}.log".format(int(round(time.time() * 1000))))
    )
    file_handler.setLevel(logging.DEBUG)
    terminal_handler = logging.StreamHandler()
    terminal_handler.setLevel(logging.INFO)
    log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(log_format)
    terminal_handler.setFormatter(log_format)
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)

    def __init__(self, *args):
        self.logger.info("Initializing Crawler object on '{}' domain with {} url(s).".format(args[0], len(args)))
        self.domain = args[0]
        self.urls = list(args)   # queue of URLs still to crawl; the domain itself is the first entry
        self.crawled = []        # URLs already visited
        self.limit = [0, 0]      # [page limit, pages crawled so far]; a limit of 0 means unlimited
        self.dump_file = "urls.json"

    def crawl(self):
        # Note the "and": in the original, a non-zero but unreached limit fell
        # through the nested if and crawl() silently did nothing.
        if self.limit[0] != 0 and self.limit[1] >= self.limit[0]:
            self.logger.info("Limit reached, writing to file and returning.")
            with open(self.dump_file, "w") as dump_file:
                dump_file.write(json.dumps(self.urls))
            return
        elif len(self.urls) == 0:
            self.logger.info("No URL left to crawl, returning.")
            with open(self.dump_file, "w") as dump_file:
                dump_file.write(json.dumps(self.urls))
            return
        else:
            try:
                self.logger.info("Connecting to {}...".format(self.urls[0]))
                response = requests.get(self.urls[0], timeout=10)
                response.encoding = "utf-8"
                self.logger.info("Analyzing page structure...")
                soup = bs(response.text, "html.parser")
                # Collect only relative links (href starting with "/").
                links = soup.find_all("a", {"href": re.compile("^/")})
                hrefs = [x.attrs["href"] for x in links]
                self.logger.info("Checking whether links were already crawled...")
                for href in hrefs:
                    absolute = self.domain[0:-1] + href  # domain ends with "/", href starts with "/"
                    if absolute in self.crawled:
                        self.logger.warning("{} already crawled.".format(href))
                    elif absolute not in self.urls:  # also skip URLs already queued, to avoid duplicates
                        self.urls.append(absolute)
                self.crawled.append(self.urls[0])
                self.urls.pop(0)
                self.limit[1] += 1
                self.logger.info("Reached count {}/{}".format(self.limit[1], self.limit[0]))
            except Exception as e:
                self.logger.error("Crawling function raised an error, skipping: {}".format(e))
            self.crawl()
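One caveat worth flagging: crawl() calls itself once per page and the frames only unwind when the run ends, so a page limit approaching Python's default recursion limit (about 1000) will raise RecursionError. Below is a minimal iterative sketch of the same loop; crawl_iteratively is a hypothetical helper for illustration, not part of the original gist:

import json
import re

import requests
from bs4 import BeautifulSoup as bs

def crawl_iteratively(crawler):
    # Hypothetical driver: one fetch/parse/enqueue step per loop iteration,
    # mirroring Crawler.crawl() but without growing the call stack.
    while crawler.urls and (crawler.limit[0] == 0 or crawler.limit[1] < crawler.limit[0]):
        url = crawler.urls.pop(0)
        try:
            response = requests.get(url, timeout=10)
            response.encoding = "utf-8"
            soup = bs(response.text, "html.parser")
            for link in soup.find_all("a", {"href": re.compile("^/")}):
                absolute = crawler.domain[0:-1] + link.attrs["href"]
                if absolute not in crawler.crawled and absolute not in crawler.urls:
                    crawler.urls.append(absolute)
            crawler.crawled.append(url)
            crawler.limit[1] += 1
        except Exception as e:
            crawler.logger.error("Skipping {}: {}".format(url, e))
    # Dump whatever is still queued, matching the recursive version's behavior.
    with open(crawler.dump_file, "w") as dump_file:
        dump_file.write(json.dumps(crawler.urls))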
# Runner script (filename assumed); it only needs to sit next to api.py.
from api import Crawler

if __name__ == "__main__":
    mycrawler = Crawler("http://metasozluk.com/")
    mycrawler.limit[0] = 500  # stop after 500 pages
    mycrawler.crawl()
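When a run stops (limit reached or queue exhausted), the crawler dumps the remaining queue, i.e. the URLs discovered but not yet visited, to urls.json. A minimal sketch of reading that dump back, assuming the run has completed and the file sits in the working directory:

import json

# Load the queue the crawler wrote on exit; these are the URLs that were
# discovered but not yet crawled when the run stopped.
with open("urls.json") as dump_file:
    pending = json.load(dump_file)

print("{} URLs still pending".format(len(pending)))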