robots.txt parser in Python
import requests

HEADERS = {
    "User-agent": "YOUR USER AGENT"
}

SITE_URL = "jamesg.blog"


def find_robots_directives():
    """
    Fetch a site's robots.txt file, parse it, and return the disallowed
    path prefixes and any sitemap URLs it declares.
    """
    read_robots = requests.get("https://{}/robots.txt".format(SITE_URL), headers=HEADERS)

    namespaces_to_ignore = []
    next_line_is_to_be_read = False
    sitemap_urls = []

    for processed_line in read_robots.content.decode("utf-8").split("\n"):
        # Only read directives aimed at all user agents or at my crawler's user agent
        if "User-agent: *" in processed_line or "User-agent: USER-AGENT" in processed_line:
            next_line_is_to_be_read = True

        if "Disallow:" in processed_line and next_line_is_to_be_read:
            if processed_line == "Disallow: /*" or processed_line == "Disallow: *":
                print("All URLs disallowed. Crawl complete")
                return namespaces_to_ignore, sitemap_urls

            # Record the disallowed path prefix (everything after "Disallow:")
            namespaces_to_ignore.append(processed_line.split(":", 1)[1].strip())
        elif "Sitemap:" in processed_line:
            # Split only on the first ":" so the "https://" in the sitemap URL is preserved
            sitemap_url = processed_line.split(":", 1)[1]
            sitemap_urls.append(sitemap_url.strip())
        elif len(processed_line) == 0:
            # A blank line ends the current user-agent block
            next_line_is_to_be_read = False

    # Fall back to the conventional sitemap location if none was declared
    if sitemap_urls == []:
        sitemap_urls.append("https://{}/sitemap.xml".format(SITE_URL))

    return namespaces_to_ignore, sitemap_urls
namespaces_to_ignore, sitemap_urls = find_robots_directives()
print(namespaces_to_ignore, sitemap_urls)
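
# --- Usage sketch (not part of the original gist) ---
# A minimal, hypothetical example of how the returned values might feed a crawl:
# skip any URL whose path starts with a Disallow prefix, and seed the crawl
# queue from the discovered sitemaps. The is_allowed helper and the regex-based
# <loc> extraction are illustrative assumptions, not the author's crawler; a
# real crawler would use a proper XML parser for the sitemap.
import re
from urllib.parse import urlparse


def is_allowed(url, namespaces_to_ignore):
    # Compare the URL path against each Disallow prefix from robots.txt
    path = urlparse(url).path
    return not any(path.startswith(namespace) for namespace in namespaces_to_ignore)


for sitemap_url in sitemap_urls:
    sitemap = requests.get(sitemap_url, headers=HEADERS)
    # <loc> elements in a sitemap hold the page URLs
    for url in re.findall(r"<loc>(.*?)</loc>", sitemap.text):
        if is_allowed(url, namespaces_to_ignore):
            print("Would crawl:", url)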