Skip to content

Instantly share code, notes, and snippets.

@bash
Created February 7, 2018 19:19
Show Gist options
  • Save bash/015bc272fe46b7b8850cc3ba05d4ef19 to your computer and use it in GitHub Desktop.
Save bash/015bc272fe46b7b8850cc3ba05d4ef19 to your computer and use it in GitHub Desktop.
URL Extractor
import re
from typing import Generator, Optional, Tuple
from urllib.parse import urlparse
# Opening/closing bracket pairs whose balance is checked inside URL candidates.
BRACKETS = [tuple(pair) for pair in ('()', '[]', '{}', '<>')]

# Matches the scheme prefix that marks the start of a URL candidate.
PROTOCOL_REGEX = re.compile('(http|https)://')

# Matches a run of one or more whitespace characters.
WHITESPACE_REGEX = re.compile('[\\s]+')
def find_first_unbalanced(input: str, opener: str, closer: str) -> Optional[int]:
    """
    Finds the first index of an unbalanced bracket pair.

    Scans ``input`` once, tracking the positions of openers that have not
    been closed yet.

    Returns the index of the first ``closer`` that has no matching
    ``opener`` before it. If no such closer exists but some openers are
    never closed, returns the index of the earliest unclosed opener.
    Returns ``None`` when every bracket is matched.
    """
    unclosed = []

    for index, char in enumerate(input):
        if char == opener:
            unclosed.append(index)
        # deliberately not ``elif``: keeps the behaviour unchanged when
        # ``opener`` and ``closer`` are the same character.
        if char == closer:
            if not unclosed:
                return index
            unclosed.pop()

    return unclosed[0] if unclosed else None
def find_urls(input: str) -> Generator[Tuple[int, int], None, None]:
    """
    Finds and yields the start and ending position of URLs in a given string.

    Each yielded ``(start, end)`` tuple holds offsets into ``input`` such
    that ``input[start:end]`` is the URL. Candidates are yielded in the
    order in which they appear.
    """
    pos = 0

    while True:
        # find the next occurrence of http:// or https://
        # which makes it a potential url candidate
        next_url = PROTOCOL_REGEX.search(input, pos)
        if next_url is None:
            return
        start = next_url.start()

        # urls are not allowed to contain whitespaces which
        # makes whitespaces a pretty good upper limit
        whitespace = WHITESPACE_REGEX.search(input, start)
        end = len(input) if whitespace is None else whitespace.start()
        url = input[start:end]

        # urls are allowed to contain all sorts of brackets.
        # to prevent brackets that are part of the text to be mistaken as
        # part of the url, we require urls to have balanced brackets and cut
        # the url candidate at its first unbalanced bracket.
        cut = len(url)
        for opener, closer in BRACKETS:
            unbalanced_index = find_first_unbalanced(url, opener, closer)
            if unbalanced_index is not None:
                cut = min(cut, unbalanced_index)

        # dots are also valid inside a URL even at trailing position.
        # this is inconvenient because dots also end sentences.
        # that's why we remove trailing dots.
        url = url[:cut].rstrip('.')
        end = start + len(url)

        # only allow http and https schemes and require a hostname
        # which is a pretty good indicator that we have a valid URL.
        # urlparse raises ValueError for some malformed network locations;
        # such a candidate is simply not a URL.
        try:
            parsed = urlparse(url)
            is_url = (
                parsed.scheme in ('http', 'https')
                and parsed.hostname is not None
            )
        except ValueError:
            is_url = False

        if is_url:
            yield (start, end)

        # the matched protocol prefix contains no whitespace, brackets or
        # dots, so it is never trimmed away: end is always greater than
        # start and the scan always moves forward.
        pos = end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment