@ssaurel
Created May 1, 2023 16:00
Crawl method of the SitemapGenerator class, for a tutorial on SSaurel's Blog
# Required imports (assumed at the top of the module):
import requests
from urllib.parse import urlparse, urldefrag
from bs4 import BeautifulSoup

def crawl(self, url, level):
    print("Level: " + str(level) + " / Explore " + url)
    page = requests.get(url)
    if page.status_code == 200:
        # drop the fragment part so the same page is not stored twice
        url = urldefrag(url)[0]
        if url not in self.urls:
            self.urls[url] = level
            soup = BeautifulSoup(page.content, "html.parser")
            # follow every link found on the page
            for link in soup.find_all('a'):
                try:
                    href = link.get('href')
                    result = urlparse(href)
                    newurl = None
                    if result.hostname is None and href is not None:
                        # relative link: resolve it against the root of the site
                        newurl = self.root + ("" if href.startswith("/") else "/") + href
                    elif result.hostname == self.hostname:
                        # absolute link on the same domain
                        newurl = href
                    if newurl is not None:
                        self.crawl(newurl, level + 1)
                except TypeError:
                    print("Error for link: " + str(link.get('href')))
        else:
            # already visited: keep the shallowest level at which the URL was found
            if self.urls[url] > level:
                self.urls[url] = level
    else:
        print(url + " unreachable")