
@kupp1
Last active August 21, 2018 11:18
python3 url resolver: extracts URLs from free text and resolves each one to its HTML title (or, for images, the format and dimensions)
import re
import ssl
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer
from PIL import Image
url = re.compile(r"""
    (?:                                  # protocol (captured) or bare www.
        (?:
            (https?|ftp):\/\/
        )|www\.
    )
    (?:                                  # optional user:pass@ credentials
        \S+
        (?:
            :\S*
        )?@
    )?
    (?:
        (?:                              # dotted-quad IP address ...
            [1-9]\d?|1\d\d|2[01]\d|22[0-3]
        )
        (?:
            \.(?:
                1?\d{1,2}|2[0-4]\d|25[0-5]
            )
        ){2}
        (?:
            \.(?:
                [1-9]\d?|1\d\d|2[0-4]\d|25[0-4]
            )
        )
        |(?:                             # ... or the first host-name label
            (?:
                [a-z\u00a1-\uffff0-9]-*
            )*
            [a-z\u00a1-\uffff0-9]+
        )
        (?:                              # further dot-separated labels
            \.(?:
                [a-z\u00a1-\uffff0-9]-*
            )*
            [a-z\u00a1-\uffff0-9]+
        )*
        (?:                              # TLD, optionally with trailing dot
            \.(?:
                [a-z\u00a1-\uffff]{2,}
            )
        )\.?
    )
    (?:                                  # optional port
        :\d{2,5}
    )?
    (?:                                  # optional path / query / fragment
        [\/?#]\S*
    )?""", re.X)
def get_urls(text: str):
    """Return a list of every URL-like substring found in `text`."""
    return [m.group(0) for m in url.finditer(text)]
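
# Illustrative call: get_urls('see https://example.com and www.test.org')
# returns ['https://example.com', 'www.test.org'].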
# Per-domain User-Agent overrides: these domains are requested with an empty
# User-Agent header; every other domain gets the Chrome UA set in resolve().
urls_agents = {
    'youtube.com': '',
    'youtu.be': '',
}
def get_domain(url: str):
    """Strip the protocol, a leading 'www.' and any path, leaving the bare host."""
    without_protocol = url[url.find('://') + 3:]
    if without_protocol[:4] == 'www.':
        without_protocol = without_protocol[4:]
    slash = without_protocol.find('/')
    return without_protocol[:slash if slash != -1 else None]
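
# Example: get_domain('https://www.example.com/page') -> 'example.com'.
# The function assumes '://' is present; resolve() prepends a protocol to
# bare 'www.' matches before it gets here.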
def resolve(url: str):
    """Fetch `url` and return its HTML <title>, an image summary, or None."""
    if url[:4] == 'www.':
        # Match had no protocol: try https first, then fall back to http.
        title = resolve('https://%s' % url[4:])
        if not title:
            title = resolve('http://%s' % url[4:])
        return title
    try:
        domain = get_domain(url)
        if domain in urls_agents:
            user_agent = urls_agents[domain]
        else:
            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                         'Chrome/68.0.3440.75 Safari/537.36'
        req = urllib.request.Request(
            urllib.parse.quote(url, safe='$-_.+!*\'(),;/?:@=&"<>#%{}|\\^~[]'),
            data=None,
            headers={'User-Agent': user_agent}
        )
        with urllib.request.urlopen(req, timeout=5) as content:
            type_ = content.info().get_content_type()
            if type_ == 'text/html':
                # Parse only the <title> tag to keep memory use down.
                only_title = SoupStrainer('title')
                soup = BeautifulSoup(content, 'lxml', parse_only=only_title)
                if not soup.find('title'):
                    soup.decompose()
                    return None
                title = soup.title.string
                soup.decompose()
                if title:
                    return title.strip().replace('\r\n', ' ')
                return None
            elif type_.startswith('image/'):
                image = Image.open(content)
                width, height = image.size
                return '%s image: %d x %d' % (image.format.lower(), width, height)
            return None
    except (urllib.error.URLError, ssl.CertificateError):
        # HTTPError is a subclass of URLError, so one clause covers both.
        return None
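
# resolve() returns None whenever the fetch fails or the resource is neither
# HTML nor an image. A hedged example (results depend on the live site):
#   resolve('https://www.python.org/')       # -> 'Welcome to Python.org'
#   resolve('http://unreachable.invalid/')   # -> None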
if __name__ == '__main__':
    text = input('Enter text with urls: ')
    print('Urls in text:')
    for u in get_urls(text):  # 'u', not 'url', to avoid shadowing the regex
        print('URL: %s TITLE: %s' % (u, resolve(u)))
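
# Sample session (illustrative; the real title depends on the live page):
#   Enter text with urls: see https://example.com for a demo
#   Urls in text:
#   URL: https://example.com TITLE: Example Domain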
# CoffeeScript port of the same matcher; /// ... /// heregexes ignore
# whitespace, much like Python's re.X.
UrlMatcher = ///
  (?:
    (?:
      (https?|ftp):\/\/
    )|www\.
  )
  (?:
    \S+
    (?:
      :\S*
    )?@
  )?
  (?:
    (?:
      [1-9]\d?|1\d\d|2[01]\d|22[0-3]
    )
    (?:
      \.(?:
        1?\d{1,2}|2[0-4]\d|25[0-5]
      )
    ){2}
    (?:
      \.(?:
        [1-9]\d?|1\d\d|2[0-4]\d|25[0-4]
      )
    )
    |(?:
      (?:
        [a-z\u00a1-\uffff0-9]-*
      )*
      [a-z\u00a1-\uffff0-9]+
    )
    (?:
      \.(?:
        [a-z\u00a1-\uffff0-9]-*
      )*
      [a-z\u00a1-\uffff0-9]+
    )*
    (?:
      \.(?:
        [a-z\u00a1-\uffff]{2,}
      )
    )\.?
  )
  (?:
    :\d{2,5}
  )?
  (?:
    [\/?#]\S*
  )?
///

parse = (line) ->
  UrlMatcher.test line
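
# Usage sketch (assumed inputs): the pattern is unanchored, so `parse`
# reports whether a line contains a URL, not that the whole line is one.
#   parse 'read https://example.com today'   # -> true
#   parse 'no links here'                    # -> false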