Skip to content

Instantly share code, notes, and snippets.

@iamkhush
Created November 14, 2017 07:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iamkhush/f8f9d7f14e8f58d809b483c7759c2703 to your computer and use it in GitHub Desktop.
Save iamkhush/f8f9d7f14e8f58d809b483c7759c2703 to your computer and use it in GitHub Desktop.
Simple Crawler which outputs sitemap for the given website
"""Simple Crawler which outputs sitemap."""
import re
import sys
from urllib.parse import urlparse
import requests
class Crawler:
    """Breadth-first crawler for a single domain.

    Starting from a root URL, fetches pages, follows same-domain links
    (absolute or root-relative), and records every visited URL so it can
    be emitted as a sitemaps.org-style XML sitemap.
    """

    # Extensions we never fetch (binary / media content).
    # Fix: original had 'mp4' without the leading dot, which also matched
    # any URL merely *ending* in the letters "mp4".
    avoid_extentions = ('.exe', '.img', '.pdf', '.mp4', '.png', '.gif')
    # Non-greedy capture of the href value inside an anchor tag.
    link_regex = re.compile(r'<a [^>]*href=[\'"](.*?)[\'"][^>]*?>')

    def __init__(self, url):
        # Fix: to_crawl / crawl_done were class attributes, so every
        # Crawler instance shared (and polluted) the same mutable sets.
        # They must be per-instance state.
        self.to_crawl = set()     # URLs discovered but not yet fetched
        self.crawl_done = set()   # URLs already fetched (or attempted)
        self.parsed_link = urlparse(url)
        self.to_crawl.add(url)

    def check_url_and_return_completed(self, url):
        """Return an absolute same-domain URL worth crawling, else None.

        Absolute URLs are accepted only when their domain matches the
        root; root-relative paths ('/...') are completed with the root's
        scheme and domain. URLs with avoided extensions return None.

        >>> crawl = Crawler('http://example.com')
        >>> crawl.check_url_and_return_completed('http://example.com/example.png')
        >>> crawl.check_url_and_return_completed('/xyz')
        'http://example.com/xyz'
        >>> crawl.check_url_and_return_completed('/xyz.png')
        """
        # Guard clause: skip URLs with extensions to be avoided.
        if url.endswith(self.avoid_extentions):
            return None
        link = urlparse(url)
        if link.netloc == self.parsed_link.netloc:
            # Absolute URL on the same domain; skip if already visited.
            if url not in self.crawl_done:
                return url
        elif url.startswith('/'):
            # Root-relative path: prefix with the root scheme and domain.
            return '%s://%s%s' % (
                self.parsed_link.scheme, self.parsed_link.netloc, url)
        return None

    def crawl(self):
        """Fetch queued URLs until no unvisited same-domain links remain."""
        while self.to_crawl:
            url = self.to_crawl.pop()
            # Fix: a timeout so one unresponsive host cannot hang the
            # whole crawl indefinitely.
            resp = requests.get(url, timeout=10)
            if resp.status_code == requests.codes.ok:
                for link in self.link_regex.findall(resp.text):
                    complete_url = self.check_url_and_return_completed(link)
                    if complete_url and complete_url not in self.crawl_done:
                        self.to_crawl.add(complete_url)
            # Mark done even on a non-200 response so we never retry it.
            self.crawl_done.add(url)

    def get_sitemap(self):
        """Return all crawled URLs as a sitemaps.org XML document."""
        template = ('<?xml version="1.0" encoding="UTF-8"?>'
                    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                    '%s</urlset>')
        entries = ['<url><loc>%s</loc></url>' % url for url in self.crawl_done]
        return template % ''.join(entries)
if __name__ == '__main__':
    # Expect exactly one command-line argument: the site's root URL.
    if len(sys.argv) != 2:
        sys.exit('Please provide the website address')
    start_url = sys.argv[1]
    site_crawler = Crawler(start_url)
    site_crawler.crawl()
    print(site_crawler.get_sitemap())
    # To run the embedded doctests instead, uncomment:
    # import doctest
    # doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment