Skip to content

Instantly share code, notes, and snippets.

@socrateslee
Last active September 29, 2023 13:16
Show Gist options
  • Save socrateslee/3441d52e90ffd629421a1e6f9a051505 to your computer and use it in GitHub Desktop.
Save socrateslee/3441d52e90ffd629421a1e6f9a051505 to your computer and use it in GitHub Desktop.
A monkey patch for urllib.robotparser to support * and $ in robots.txt
'''
A monkey patch for urllib.robotparser to support * and $ in robots.txt.
'''
import re
import urllib.parse
from urllib.robotparser import RobotFileParser, Entry, RuleLine
def get_robots_pattern(path):
    """Translate a robots.txt rule path into regular-expression source text.

    Each ``*`` in *path* becomes a non-greedy "any characters" wildcard. A
    trailing ``$`` anchors the rule at the end of the matched string; without
    it the pattern ends in a wildcard, i.e. it behaves as a prefix match when
    used with ``re.match``.

    The literal text between wildcards is percent-quoted first and
    regex-escaped second, so the resulting pattern matches the percent-quoted
    form of that text literally.

    Args:
        path: Rule path taken from an ``Allow``/``Disallow`` line.

    Returns:
        The pattern source as a string, suitable for ``re.compile``.
    """
    if path.endswith('$'):
        # Only a trailing '$' is special; strip it and anchor the pattern.
        path = path[:-1]
        ending = '$'
    else:
        ending = '.*?'
    # Quote before escaping: doing it the other way round lets quote() turn
    # the backslashes inserted by re.escape() into a literal "%5C", which
    # breaks every rule containing '.', '-', spaces, etc.
    literals = (re.escape(urllib.parse.quote(part)) for part in path.split('*'))
    return '.*?'.join(literals) + ending
def _rule_line__init__(self, path, allowance):
    """Initialise a rule line, compiling *path* into a match pattern.

    Args:
        path: Rule path; it is round-tripped through ``urlparse`` /
            ``urlunparse`` before being stored and compiled.
        allowance: Whether the rule allows access. A falsy value combined
            with an empty *path* is stored as ``True``.
    """
    # An empty value on a disallow rule means "allow all".
    effective_allowance = True if (path == '' and not allowance) else allowance
    parsed = urllib.parse.urlparse(path)
    normalized = urllib.parse.urlunparse(parsed)
    # Compile before touching the instance so a bad pattern leaves it unset.
    compiled = re.compile(get_robots_pattern(normalized))
    self.pattern = compiled
    self.path = normalized
    self.allowance = effective_allowance
def _rule_line_applies_to(self, filename):
    """Report whether this rule's compiled pattern matches *filename*.

    The pattern is applied with ``match``, so it is tested from the start of
    *filename*. Always returns a plain ``bool``.
    """
    return self.pattern.match(filename) is not None
# Monkey patch: replace RuleLine's constructor and matcher with the
# pattern-based versions defined in this module. This takes effect as soon
# as the module is imported.
RuleLine.__init__ = _rule_line__init__
RuleLine.applies_to = _rule_line_applies_to
# Only RobotFileParser is exported from this module.
__all__ = ['RobotFileParser']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment