This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This function is implemented according to the Sitemaps protocol schema:
# https://www.sitemaps.org/protocol.html
import xml.etree.ElementTree as ET | |
import requests,re | |
def get_urls_from_sitemap(url): | |
sitemaps = [line[9:] for line in requests.get(f"{url}/robots.txt").text.splitlines() if re.match('sitemap:',line.lower())] # python has a robotsparser module but it is too strict, so we build our own | |
urls = [] | |
while len(sitemaps): | |
cur = sitemaps.pop() | |
text = requests.get(cur).text |