bash/url_extractor.py

## url_extractor.py
import re
from typing import Generator, Optional, Tuple
from urllib.parse import urlparse

# Pairs of (opener, closer) characters that have to be balanced inside a URL.
BRACKETS = [tuple(pair) for pair in ('()', '[]', '{}', '<>')]

# Scheme prefix that marks the beginning of a URL candidate.
PROTOCOL_REGEX = re.compile('(http|https)://')
# A run of one or more whitespace characters.
WHITESPACE_REGEX = re.compile('[\\s]+')

def find_first_unbalanced(
    input: str, opener: str, closer: str
) -> Optional[int]:
    """
    Finds the first index of an unbalanced bracket pair.

    The string is scanned left to right while the positions of the openers
    that have not been closed yet are kept on a stack.

    Returns the index of the first closer that has no open opener to match.
    If there is no such closer, returns the index of the earliest opener
    that is never closed. Returns None when all brackets are balanced.
    """
    stack = []

    for i, char in enumerate(input):
        if char == opener:
            stack.append(i)

        if char == closer:
            if not stack:
                # a closer without a preceding unmatched opener
                return i
            stack.pop()

    if stack:
        # everything left on the stack was never closed;
        # the bottom entry is the earliest of those openers
        return stack[0]

    return None

def find_urls(input: str) -> Generator[Tuple[int, int], None, None]:
    """
    Finds and yields the start and ending position of URLs in a given string.

    Each yielded tuple is ``(start, end)``. Both positions are relative to
    the beginning of ``input`` and ``end`` is exclusive, so
    ``input[start:end]`` is the text of the URL.
    """
    pos = 0

    while True:
        # find the next occurrence of http:// or https://
        # which makes it a potential url candidate
        next_url = PROTOCOL_REGEX.search(input, pos)

        if next_url is None:
            return

        start = next_url.start()

        # urls are not allowed to contain whitespaces which
        # makes whitespaces a pretty good upper limit
        whitespace = WHITESPACE_REGEX.search(input, start)
        end = len(input) if whitespace is None else whitespace.start()

        url = input[start:end]

        # urls are allowed to contain all sorts of brackets.
        # to prevent brackets that are part of the text to be mistaken as
        # part of the url, we require urls to have balanced brackets and cut
        # the url "candidate" at the first unbalanced bracket.
        cut = len(url)
        for opener, closer in BRACKETS:
            unbalanced_index = find_first_unbalanced(url, opener, closer)

            if unbalanced_index is not None:
                cut = min(cut, unbalanced_index)

        # dots are also valid inside a URL even at trailing position.
        # this is inconvenient because dots also end sentences.
        # that's why we remove trailing dots.
        url = url[:cut].rstrip('.')

        # the candidate always keeps its scheme prefix, so end > start
        # and the scan is guaranteed to move forward
        end = start + len(url)

        # only allow http and https schemes and require a hostname
        # which is a pretty good indicator that we have a valid URL
        try:
            parsed = urlparse(url)
            is_url = (
                parsed.scheme in ('http', 'https')
                and parsed.hostname is not None
            )
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. unmatched square
            # brackets in the network location); skip such a candidate
            # instead of aborting the whole scan
            is_url = False

        if is_url:
            yield (start, end)

        pos = end
	import re
	from urllib.parse import urlparse
	from typing import Generator, Tuple

	# Pairs of (opener, closer) characters that have to be balanced inside a URL.
	BRACKETS = [
	    ('(', ')'),
	    ('[', ']'),
	    ('{', '}'),
	    ('<', '>'),
	]

	# Scheme prefix that marks the beginning of a URL candidate.
	# The alternation must be an unescaped "|"; an escaped one would only
	# match the literal text "http|https://".
	PROTOCOL_REGEX = re.compile('(http|https)://')
	# A run of one or more whitespace characters.
	WHITESPACE_REGEX = re.compile('[\\s]+')

	def find_first_unbalanced(
	    input: str, opener: str, closer: str
	) -> Optional[int]:
	    """
	    Finds the first index of an unbalanced bracket pair.

	    The string is scanned left to right while the positions of the openers
	    that have not been closed yet are kept on a stack.

	    Returns the index of the first closer that has no open opener to match.
	    If there is no such closer, returns the index of the earliest opener
	    that is never closed. Returns None when all brackets are balanced.
	    """
	    stack = []

	    for i, char in enumerate(input):
	        if char == opener:
	            stack.append(i)

	        if char == closer:
	            if not stack:
	                # a closer without a preceding unmatched opener
	                return i
	            stack.pop()

	    if stack:
	        # everything left on the stack was never closed;
	        # the bottom entry is the earliest of those openers
	        return stack[0]

	    return None

	def find_urls(input: str) -> Generator[Tuple[int, int], None, None]:
	    """
	    Finds and yields the start and ending position of URLs in a given string.

	    Each yielded tuple is ``(start, end)``. Both positions are relative to
	    the beginning of ``input`` and ``end`` is exclusive, so
	    ``input[start:end]`` is the text of the URL.
	    """
	    pos = 0

	    while True:
	        # find the next occurrence of http:// or https://
	        # which makes it a potential url candidate
	        next_url = PROTOCOL_REGEX.search(input, pos)

	        if next_url is None:
	            return

	        start = next_url.start()

	        # urls are not allowed to contain whitespaces which
	        # makes whitespaces a pretty good upper limit
	        whitespace = WHITESPACE_REGEX.search(input, start)
	        end = len(input) if whitespace is None else whitespace.start()

	        url = input[start:end]

	        # urls are allowed to contain all sorts of brackets.
	        # to prevent brackets that are part of the text to be mistaken as
	        # part of the url, we require urls to have balanced brackets and cut
	        # the url "candidate" at the first unbalanced bracket.
	        cut = len(url)
	        for opener, closer in BRACKETS:
	            unbalanced_index = find_first_unbalanced(url, opener, closer)

	            if unbalanced_index is not None:
	                cut = min(cut, unbalanced_index)

	        # dots are also valid inside a URL even at trailing position.
	        # this is inconvenient because dots also end sentences.
	        # that's why we remove trailing dots.
	        url = url[:cut].rstrip('.')

	        # the candidate always keeps its scheme prefix, so end > start
	        # and the scan is guaranteed to move forward
	        end = start + len(url)

	        # only allow http and https schemes and require a hostname
	        # which is a pretty good indicator that we have a valid URL
	        try:
	            parsed = urlparse(url)
	            is_url = (
	                parsed.scheme in ('http', 'https')
	                and parsed.hostname is not None
	            )
	        except ValueError:
	            # urlparse rejects some malformed hosts (e.g. unmatched square
	            # brackets in the network location); skip such a candidate
	            # instead of aborting the whole scan
	            is_url = False

	        if is_url:
	            yield (start, end)

	        pos = end