Created
May 1, 2023 16:00
-
-
Save ssaurel/3e32e9eb44a537904201a74e584f68b3 to your computer and use it in GitHub Desktop.
The crawl method of the SitemapGenerator class, written for a tutorial on the SSaurel's Blog.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def crawl(self, url, level):
    """Recursively explore *url* and record reachable same-domain pages.

    Fetches the page; on HTTP 200, strips any ``#fragment`` from the URL and
    stores the shallowest recursion depth at which each URL was reached in
    ``self.urls`` (a dict mapping url -> level). Every anchor on the page
    that resolves to the same domain is crawled in turn at ``level + 1``.

    Args:
        url: absolute URL to fetch.
        level: recursion depth of this page (the root is typically 0).
    """
    print("Level: " + str(level) + "/ Explore " + url)
    page = requests.get(url)
    if page.status_code == 200:
        # Drop the fragment: '#...'-only variants are the same document.
        url = urllib.parse.urldefrag(url)[0]
        if url not in self.urls:
            self.urls[url] = level
            soup = BeautifulSoup(page.content, "html.parser")
            # find_all is the modern bs4 name (findAll is a legacy alias).
            for link in soup.find_all('a'):
                try:
                    href = link.get('href')
                    result = urlparse(href)
                    newurl = None
                    if result.hostname is None and href is not None:
                        # Relative link: resolve against the site root, adding
                        # a '/' only when href does not already start with one.
                        newurl = self.root + ("/", "")[href.startswith("/")] + href
                    elif result.hostname == self.hostname:
                        # Absolute link on the same domain.
                        newurl = href
                    if newurl is not None:
                        self.crawl(newurl, level + 1)
                except TypeError:
                    # str() guards against href being None — the original
                    # handler concatenated None and raised a second TypeError.
                    print("Error for link:" + str(link.get('href')))
        elif self.urls[url] > level:
            # Already visited: keep the shallowest depth seen for this URL.
            self.urls[url] = level
    else:
        print(url + " unreachable")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment