@shanehh
Created April 15, 2022 08:47
"""
Test if site allow to be crawled with respect to `robots.txt`
ref: https://stackoverflow.com/a/64495913/11487798
"""
from urllib.parse import urlparse

import requests
from protego import Protego


def base(url: str) -> str:
    """Return the scheme and host of a URL, e.g. "https://example.com"."""
    result = urlparse(url)
    return result.scheme + "://" + result.netloc


def robots_txt(url: str) -> str:
    """Build the robots.txt URL for the site hosting `url`."""
    return base(url) + "/robots.txt"


def allow_crawl(url: str) -> bool:
    """Fetch the site's robots.txt and check whether `url` may be crawled."""
    resp = requests.get(robots_txt(url))
    # Reuse the User-Agent header that requests actually sent (e.g. "python-requests/x.y")
    user_agent = resp.request.headers["User-Agent"]
    rp = Protego.parse(resp.text)
    # can_fetch() selects the matching rule set for this user agent internally,
    # so no explicit call to the private _get_matching_rule_set() is needed.
    return rp.can_fetch(url, user_agent)
url = "https://stackoverflow.com/questions/29980798/where-does-pip-install-its-packages"
test = allow_crawl(url)
breakpoint()
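

# --- Optional extension (a minimal sketch, not part of the original gist) ---
# robots.txt rules are often user-agent specific, and the check above uses
# whatever User-Agent requests sends by default. The sketch below assumes
# Protego's public can_fetch()/crawl_delay()/sitemaps API and a hypothetical
# crawler name, "my-crawler/1.0".
def crawl_policy(url: str, user_agent: str = "my-crawler/1.0") -> dict:
    """Fetch robots.txt as `user_agent` and summarize the policy for `url`."""
    resp = requests.get(robots_txt(url), headers={"User-Agent": user_agent})
    rp = Protego.parse(resp.text)
    return {
        "can_fetch": rp.can_fetch(url, user_agent),
        "crawl_delay": rp.crawl_delay(user_agent),  # None if not specified
        "sitemaps": list(rp.sitemaps),
    }

# Example usage:
# print(crawl_policy(url))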