Skip to content

Instantly share code, notes, and snippets.

@eliasdabbas
Created December 19, 2021 19:21
Show Gist options
  • Save eliasdabbas/fc21e016e281b26ae52d427ab385008a to your computer and use it in GitHub Desktop.
Save eliasdabbas/fc21e016e281b26ae52d427ab385008a to your computer and use it in GitHub Desktop.
Check if a given URL will be crawled or not given a set of conditions.
from urllib.parse import urlsplit, parse_qs
import re
def crawl_or_not(url,
                 exclude_url_params=None,
                 include_url_params=None,
                 include_url_pattern=None,
                 exclude_url_pattern=None):
    """Check if ``url`` will be crawled or not given the supplied conditions.

    Every supplied condition must hold for the URL to be crawled; conditions
    left as ``None`` are ignored. With no conditions at all the URL is crawled.

    Note that these conditions only apply when discovering URLs by following
    links on pages. URLs provided to the ``url_list`` parameter of the
    ``crawl`` function will be crawled without taking into consideration any
    of these conditions.

    :param str url: The URL to check.
    :param list exclude_url_params: A list of URL parameters to exclude while
        crawling. URLs containing any of these parameters will not be crawled.
    :param list include_url_params: A list of URL parameters to include while
        crawling. Only URLs containing one or more of these parameters will be
        crawled.
    :param str include_url_pattern: A regular expression to include URLs. Only
        URLs matched by this regex will be crawled.
    :param str exclude_url_pattern: A regular expression to exclude URLs. If
        this regex matches the given URL it will not be crawled.
    :returns: ``True`` if all supplied conditions allow crawling ``url``,
        ``False`` otherwise.
    :rtype: bool
    :raises ValueError: If the same parameter is listed in both
        ``exclude_url_params`` and ``include_url_params``, or if
        ``include_url_pattern`` and ``exclude_url_pattern`` are the same
        pattern.
    """
    if exclude_url_params is not None and include_url_params is not None:
        same_params = set(exclude_url_params).intersection(include_url_params)
        if same_params:
            # Sorted so the error message is deterministic.
            raise ValueError("Please make sure you dont have the same parameters to exclude and include.\n"
                             f"Common parameters entered: {','.join(sorted(same_params))}")

    # Compare the patterns themselves, not what they happen to match in this
    # URL: two different patterns that both fail to match would otherwise
    # compare equal (``[] == []``) and trigger a spurious error.
    if include_url_pattern is not None and include_url_pattern == exclude_url_pattern:
        raise ValueError("Please make sure you don't include and exclud the same pattern.\n"
                         f"You entered '{include_url_pattern}'")

    # keep_blank_values=True so parameters present without a value
    # (``?utm_source=`` or ``?flag``) still count as being in the URL.
    qs = parse_qs(urlsplit(url).query, keep_blank_values=True)

    supplied_conditions = []
    if exclude_url_params is not None:
        supplied_conditions.append(not set(exclude_url_params).intersection(qs))
    if include_url_params is not None:
        supplied_conditions.append(bool(set(include_url_params).intersection(qs)))
    if exclude_url_pattern is not None:
        supplied_conditions.append(re.search(exclude_url_pattern, url) is None)
    if include_url_pattern is not None:
        supplied_conditions.append(re.search(include_url_pattern, url) is not None)
    return all(supplied_conditions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment