Skip to content

Instantly share code, notes, and snippets.

@ssaurel
Created May 1, 2023 16:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ssaurel/6550b536cc83c6107736bd631a2c9314 to your computer and use it in GitHub Desktop.
Save ssaurel/6550b536cc83c6107736bd631a2c9314 to your computer and use it in GitHub Desktop.
SitemapGenerator program in Python for SSaurel's blog
from pip._vendor import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import urllib
import xml.etree.ElementTree as ET
class SitemapGenerator:
    """Crawl a single website and export the pages found as an XML sitemap.

    Pages are discovered by following ``<a href>`` links that stay on the
    same host as ``root``. Each page is stored with its "level" (the number
    of link hops from the starting page), which is later turned into the
    sitemap ``<priority>`` value.
    """

    # Seconds to wait for a server response before treating a page as
    # unreachable; without a timeout a single stalled host blocks the crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self, root, filename):
        """Create a generator for one site.

        Args:
            root: Base URL of the site, e.g. ``"https://www.example.com"``.
            filename: Path of the sitemap file written by ``generatefile``.
        """
        self.filename = filename
        # Maps each reachable page URL (without fragment) to its level.
        self.urls = {}
        self.root = root
        # Only links whose host equals this one are followed.
        self.hostname = urlparse(root).hostname

    def crawl(self, url, level):
        """Explore the site starting from ``url`` and record reachable pages.

        The site is walked level by level (breadth-first) with an explicit
        work list instead of recursion, so the crawl cannot exhaust the
        interpreter's recursion limit and every page is recorded with the
        smallest number of hops found from ``url``. Each URL is requested at
        most once per call.

        Args:
            url: URL of the page to start from.
            level: Level assigned to ``url``; pages linked from it get
                ``level + 1``, and so on.
        """
        attempted = set()  # URLs already requested during this call
        frontier = [url]
        while frontier:
            discovered = []
            for candidate in frontier:
                # Fragments point inside a page, not to a distinct page.
                page_url = urllib.parse.urldefrag(candidate)[0]
                if page_url in self.urls:
                    # Already recorded (possibly by an earlier crawl call):
                    # keep only the shallowest level, do not download again.
                    if self.urls[page_url] > level:
                        self.urls[page_url] = level
                    continue
                if page_url in attempted:
                    continue
                attempted.add(page_url)
                discovered.extend(self._fetch_links(page_url, level))
            frontier = discovered
            level += 1

    def _fetch_links(self, page_url, level):
        """Download ``page_url``, record it, and return its same-host links.

        Returns an empty list when the page cannot be fetched or does not
        answer with HTTP 200; such pages are not added to ``self.urls``.
        """
        print("Level: " + str(level) + "/ Explore " + page_url)
        try:
            page = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # One broken link must not abort the whole crawl.
            print(page_url + " unreachable (" + str(err) + ")")
            return []
        if page.status_code != 200:
            print(page_url + " unreachable")
            return []

        self.urls[page_url] = level
        soup = BeautifulSoup(page.content, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            target = self._resolve_link(page_url, anchor.get("href"))
            if target is not None:
                links.append(target)
        return links

    def _resolve_link(self, page_url, href):
        """Return the absolute same-host URL for ``href``, or ``None`` to skip.

        ``href`` is resolved against the page it appears on, so relative
        paths, root-relative paths and scheme-relative links are all handled.
        Links that are missing, not http(s) (``mailto:``, ``javascript:``,
        ``tel:`` ...), malformed, or pointing at another host are skipped.
        """
        if not href:
            return None
        try:
            absolute = urllib.parse.urljoin(page_url, href.strip())
            parts = urlparse(absolute)
            if parts.scheme not in ("http", "https"):
                return None
            if parts.hostname != self.hostname:
                return None
        except ValueError:
            # urllib raises ValueError on malformed URLs (e.g. bad IPv6 host).
            print("Error for link:" + str(href))
            return None
        return urllib.parse.urldefrag(absolute)[0]

    def generatefile(self):
        """Write the crawled URLs to ``self.filename`` as an XML sitemap.

        The priority decreases linearly with the level: 1.0 for level 0 down
        to 0.5 for the deepest level found. When every page is at level 0
        (or nothing was crawled) all priorities are 1.0.
        """
        deepest = max(self.urls.values(), default=0)
        # Guard against a zero divisor when no page is deeper than level 0.
        step = 1 / (deepest * 2) if deepest > 0 else 0.0

        urlset = ET.Element(
            "urlset", {"xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        )
        # Stable sort: shallowest pages first, discovery order within a level.
        for page_url, level in sorted(self.urls.items(), key=lambda item: item[1]):
            # The sitemap protocol limits priority to the range 0.0-1.0.
            priority = min(1.0, round(1 - step * level, 2))
            entry = ET.SubElement(urlset, "url")
            ET.SubElement(entry, "loc").text = page_url
            ET.SubElement(entry, "priority").text = str(priority)

        tree = ET.ElementTree(urlset)
        ET.indent(tree, ' ')
        # writing xml
        tree.write(self.filename, encoding="utf-8", xml_declaration=True)
# Crawl the blog from its home page (level 0), then write the sitemap file.
sitemapGenerator = SitemapGenerator(
    root="https://www.toutsurlebitcoin.fr",
    filename="sitemap.xml",
)
sitemapGenerator.crawl(url=sitemapGenerator.root, level=0)
sitemapGenerator.generatefile()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment