robots.txt parser in Python
import requests

HEADERS = {
    "User-agent": "YOUR USER AGENT"
}

SITE_URL = "jamesg.blog"


def find_robots_directives():
    """
    Fetch a site's robots.txt file, parse it, and return the disallowed
    path prefixes and any sitemap URLs it declares.
    """
    read_robots = requests.get("https://{}/robots.txt".format(SITE_URL), headers=HEADERS)

    namespaces_to_ignore = []
    next_line_is_to_be_read = False
    sitemap_urls = []

    for processed_line in read_robots.content.decode("utf-8").split("\n"):
        # Only read directives aimed at all user agents or at my crawler's user agent
        if "User-agent: *" in processed_line or "User-agent: USER-AGENT" in processed_line:
            next_line_is_to_be_read = True

        if "Disallow:" in processed_line and next_line_is_to_be_read:
            if processed_line == "Disallow: /*" or processed_line == "Disallow: *":
                print("All URLs disallowed. Crawl complete")
                return namespaces_to_ignore, sitemap_urls

            # Record the disallowed path prefix (everything after "Disallow:")
            namespaces_to_ignore.append(processed_line.split(":", 1)[1].strip())
        elif "Sitemap:" in processed_line:
            # Split only on the first ":" so the "https://" in the sitemap URL is preserved
            sitemap_url = processed_line.split(":", 1)[1]
            sitemap_urls.append(sitemap_url.strip())
        elif len(processed_line) == 0:
            # A blank line ends the current user-agent block
            next_line_is_to_be_read = False

    # Fall back to the conventional sitemap location if none was declared
    if sitemap_urls == []:
        sitemap_urls.append("https://{}/sitemap.xml".format(SITE_URL))

    return namespaces_to_ignore, sitemap_urls
namespaces_to_ignore, sitemap_urls = find_robots_directives()
print(namespaces_to_ignore, sitemap_urls)
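
# --- Usage sketch (not part of the original gist) ---
# A minimal, hypothetical example of how the returned values might feed a crawl:
# skip any URL whose path starts with a Disallow prefix, and seed the crawl
# queue from the discovered sitemaps. The is_allowed helper and the regex-based
# <loc> extraction are illustrative assumptions, not the author's crawler; a
# real crawler would use a proper XML parser for the sitemap.
import re
from urllib.parse import urlparse


def is_allowed(url, namespaces_to_ignore):
    # Compare the URL path against each Disallow prefix from robots.txt
    path = urlparse(url).path
    return not any(path.startswith(namespace) for namespace in namespaces_to_ignore)


for sitemap_url in sitemap_urls:
    sitemap = requests.get(sitemap_url, headers=HEADERS)
    # <loc> elements in a sitemap hold the page URLs
    for url in re.findall(r"<loc>(.*?)</loc>", sitemap.text):
        if is_allowed(url, namespaces_to_ignore):
            print("Would crawl:", url)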