Last active
August 10, 2021 10:06
-
-
Save capjamesg/1865e188ab13542b6a5eb140d0a0b210 to your computer and use it in GitHub Desktop.
robots.txt parser in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests

# HTTP headers sent with every request. Replace the placeholder with a
# descriptive User-agent so site owners can identify this crawler.
HEADERS = {
    "User-agent": "YOUR USER AGENT"
}

# Domain (no scheme, no trailing slash) whose robots.txt will be fetched.
SITE_URL = "jamesg.blog"
def find_robots_directives(site_url=None, headers=None):
    """
    Fetch https://<site_url>/robots.txt, parse it, and return its directives.

    Only directive groups addressed to all user agents ("User-agent: *") or
    to this crawler's own user agent are honoured.

    :param site_url: domain (no scheme) whose robots.txt is read; defaults
        to the module-level SITE_URL.
    :param headers: HTTP headers to send with the request; defaults to the
        module-level HEADERS.
    :return: (namespaces_to_ignore, sitemap_urls) — a list of disallowed
        path prefixes and a list of sitemap URLs. If robots.txt declares no
        sitemap, the conventional /sitemap.xml location is used as fallback.
    """
    # NOTE: the original referenced config.SITE_URL / config.HEADERS, but no
    # `config` module is imported in this file — use the module constants.
    site_url = SITE_URL if site_url is None else site_url
    headers = HEADERS if headers is None else headers

    read_robots = requests.get(
        "https://{}/robots.txt".format(site_url), headers=headers
    )

    namespaces_to_ignore = []
    sitemap_urls = []
    # True while we are inside a User-agent group that applies to us.
    next_line_is_to_be_read = False

    for processed_line in read_robots.content.decode("utf-8").split("\n"):
        # Only read directives pointed at all user agents or my crawler user agent
        if "User-agent: *" in processed_line or "User-agent: USER-AGENT" in processed_line:
            next_line_is_to_be_read = True

        if "Disallow:" in processed_line and next_line_is_to_be_read:
            if processed_line == "Disallow: /*" or processed_line == "Disallow: *":
                print("All URLs disallowed. Crawl complete")
                return namespaces_to_ignore, sitemap_urls

            # maxsplit=1 keeps any ":" that appears inside the path itself.
            namespaces_to_ignore.append(processed_line.split(":", 1)[1].strip())
        elif "Sitemap:" in processed_line:
            # maxsplit=1 preserves "://" (and any port) in the sitemap URL;
            # the original two-element rejoin dropped anything past a second ":".
            sitemap_urls.append(processed_line.split(":", 1)[1].strip())
        elif len(processed_line) == 0:
            # A blank line terminates the current User-agent group.
            next_line_is_to_be_read = False

    if not sitemap_urls:
        # No sitemap declared; fall back to the conventional location.
        sitemap_urls.append("https://{}/sitemap.xml".format(site_url))

    return namespaces_to_ignore, sitemap_urls
# Resolve the crawl policy for the configured site and report what was found.
disallowed_namespaces, discovered_sitemaps = find_robots_directives()
print(disallowed_namespaces, discovered_sitemaps)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment