Created
November 14, 2017 07:46
-
-
Save iamkhush/f8f9d7f14e8f58d809b483c7759c2703 to your computer and use it in GitHub Desktop.
Simple Crawler which outputs sitemap for the given website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Simple Crawler which outputs sitemap.""" | |
import re
import sys
from html import unescape
from urllib.parse import urlparse
from xml.sax.saxutils import escape

import requests
class Crawler:
    """Crawl one website and render the pages found as a sitemap.

    Starting from the URL passed to the constructor, ``crawl`` downloads
    pages with ``requests``, extracts ``<a href=...>`` targets with
    ``link_regex`` and follows those that stay on the start URL's network
    location. ``get_sitemap`` renders the successfully fetched URLs as
    sitemap XML.

    Attributes:
        parsed_link: ``urlparse`` result of the start URL.
        to_crawl: URLs queued for fetching.
        crawl_done: URLs that were fetched with an OK status.
    """

    # Path suffixes that are never queued for fetching.
    avoid_extentions = ('.exe', '.img', '.pdf', '.mp4', '.png', '.gif')
    link_regex = re.compile(r'<a [^>]*href=[\'"](.*?)[\'"][^>]*?>')
    # Only these schemes are fetched; anything else is dropped up front.
    allowed_schemes = ('http', 'https')
    # Seconds before a request is abandoned, so one dead host cannot
    # stall the whole crawl.
    request_timeout = 10

    def __init__(self, url):
        """Remember the start URL and queue it for crawling."""
        self.parsed_link = urlparse(url)
        # Per-instance state: class-level sets would be shared by every
        # Crawler created in the same process.
        self.to_crawl = {url}
        self.crawl_done = set()
        # URLs that could not be fetched; kept so they are neither retried
        # nor listed in the sitemap.
        self._failed = set()

    def check_url_and_return_completed(self, url):
        """Return the absolute URL to crawl for *url*, or None to skip it.

        A link is skipped when its path ends with one of
        ``avoid_extentions`` (case-insensitive), when it points to another
        network location, when it is a relative path not starting with
        ``/``, when its scheme is not in ``allowed_schemes``, or when it
        is already in ``crawl_done``.

        >>> crawl = Crawler('http://example.com')
        >>> crawl.check_url_and_return_completed('http://example.com/example.png')
        >>> crawl.check_url_and_return_completed('/xyz')
        'http://example.com/xyz'
        >>> crawl.check_url_and_return_completed('/xyz.png')
        >>> crawl.check_url_and_return_completed('//example.com/xyz')
        'http://example.com/xyz'
        >>> crawl.check_url_and_return_completed('//elsewhere.org/xyz')
        """
        url = url.strip()
        link = urlparse(url)

        # Test the path only, so query strings and fragments cannot hide
        # an extension that should be avoided.
        if link.path.lower().endswith(self.avoid_extentions):
            return None

        if link.netloc:
            if link.netloc != self.parsed_link.netloc:
                return None
            if link.scheme:
                completed = url
            else:
                # Scheme-relative link ('//host/path'): borrow our scheme.
                completed = '%s:%s' % (self.parsed_link.scheme, url)
        elif url.startswith('/'):
            completed = '%s://%s%s' % (
                self.parsed_link.scheme, self.parsed_link.netloc, url)
        else:
            return None

        if urlparse(completed).scheme not in self.allowed_schemes:
            return None
        if completed in self.crawl_done:
            return None
        return completed

    def crawl(self):
        """Fetch queued URLs until ``to_crawl`` is empty.

        Pages answered with an OK status are added to ``crawl_done`` and
        scanned for further links. URLs that fail with a network error or
        a non-OK status are remembered and skipped from then on.

        Raises:
            requests.RequestException: only for errors that are also
                ``ValueError`` (malformed URLs such as a missing scheme),
                since those indicate bad input rather than a flaky network.
        """
        while self.to_crawl:
            url = self.to_crawl.pop()
            try:
                resp = requests.get(url, timeout=self.request_timeout)
            except requests.RequestException as exc:
                if isinstance(exc, ValueError):
                    raise
                self._failed.add(url)
                continue
            if resp.status_code != requests.codes.ok:
                self._failed.add(url)
                continue

            # Record the page before scanning it so a link back to itself
            # is not queued again.
            self.crawl_done.add(url)
            for link in self.link_regex.findall(resp.text):
                # href values are HTML-encoded in the page source
                # (e.g. '&amp;'); decode them before building the URL.
                complete_url = self.check_url_and_return_completed(
                    unescape(link))
                if complete_url and complete_url not in self._failed:
                    self.to_crawl.add(complete_url)

    def get_sitemap(self):
        """Return the crawled URLs as a sitemap XML string.

        URLs are emitted in sorted order and entity-escaped so characters
        such as ``&`` do not produce invalid XML.
        """
        template = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">%s</urlset>'
        quote_entities = {"'": '&apos;', '"': '&quot;'}
        sub_template = [
            '<url><loc>%s</loc></url>' % escape(url, quote_entities)
            for url in sorted(self.crawl_done)
        ]
        return template % ''.join(sub_template)
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the address
    # of the website to crawl, and prints the resulting sitemap to stdout.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        sys.exit('Please provide the website address')

    site_crawler = Crawler(cli_args[0])
    site_crawler.crawl()
    print(site_crawler.get_sitemap())

    # To run the doctests instead, uncomment the two lines below.
    # import doctest
    # doctest.testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment