
@kupp1
Last active August 21, 2018 11:18
python3 url resolver: extracts URLs from free text and resolves each one to its HTML title (or, for images, the format and dimensions)
import re
import ssl
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer
from PIL import Image
url = re.compile(r"""
    (?:                                  # protocol (captured) or bare www.
        (?:
            (https?|ftp):\/\/
        )|www\.
    )
    (?:                                  # optional user:pass@ credentials
        \S+
        (?:
            :\S*
        )?@
    )?
    (?:
        (?:                              # dotted-quad IP address ...
            [1-9]\d?|1\d\d|2[01]\d|22[0-3]
        )
        (?:
            \.(?:
                1?\d{1,2}|2[0-4]\d|25[0-5]
            )
        ){2}
        (?:
            \.(?:
                [1-9]\d?|1\d\d|2[0-4]\d|25[0-4]
            )
        )
        |(?:                             # ... or the first host-name label
            (?:
                [a-z\u00a1-\uffff0-9]-*
            )*
            [a-z\u00a1-\uffff0-9]+
        )
        (?:                              # further dot-separated labels
            \.(?:
                [a-z\u00a1-\uffff0-9]-*
            )*
            [a-z\u00a1-\uffff0-9]+
        )*
        (?:                              # TLD, optionally with trailing dot
            \.(?:
                [a-z\u00a1-\uffff]{2,}
            )
        )\.?
    )
    (?:                                  # optional port
        :\d{2,5}
    )?
    (?:                                  # optional path / query / fragment
        [\/?#]\S*
    )?""", re.X)
def get_urls(text: str):
    """Return a list of every URL-like substring found in `text`."""
    return [m.group(0) for m in url.finditer(text)]
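
# Illustrative call: get_urls('see https://example.com and www.test.org')
# returns ['https://example.com', 'www.test.org'].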
# Per-domain User-Agent overrides: these domains are requested with an empty
# User-Agent header; every other domain gets the Chrome UA set in resolve().
urls_agents = {
    'youtube.com': '',
    'youtu.be': '',
}
def get_domain(url: str):
    """Strip the protocol, a leading 'www.' and any path, leaving the bare host."""
    without_protocol = url[url.find('://') + 3:]
    if without_protocol[:4] == 'www.':
        without_protocol = without_protocol[4:]
    slash = without_protocol.find('/')
    return without_protocol[:slash if slash != -1 else None]
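
# Example: get_domain('https://www.example.com/page') -> 'example.com'.
# The function assumes '://' is present; resolve() prepends a protocol to
# bare 'www.' matches before it gets here.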
def resolve(url: str):
    """Fetch `url` and return its HTML <title>, an image summary, or None."""
    if url[:4] == 'www.':
        # Match had no protocol: try https first, then fall back to http.
        title = resolve('https://%s' % url[4:])
        if not title:
            title = resolve('http://%s' % url[4:])
        return title
    try:
        domain = get_domain(url)
        if domain in urls_agents:
            user_agent = urls_agents[domain]
        else:
            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                         'Chrome/68.0.3440.75 Safari/537.36'
        req = urllib.request.Request(
            urllib.parse.quote(url, safe='$-_.+!*\'(),;/?:@=&"<>#%{}|\\^~[]'),
            data=None,
            headers={'User-Agent': user_agent}
        )
        with urllib.request.urlopen(req, timeout=5) as content:
            type_ = content.info().get_content_type()
            if type_ == 'text/html':
                # Parse only the <title> tag to keep memory use down.
                only_title = SoupStrainer('title')
                soup = BeautifulSoup(content, 'lxml', parse_only=only_title)
                if not soup.find('title'):
                    soup.decompose()
                    return None
                title = soup.title.string
                soup.decompose()
                if title:
                    return title.strip().replace('\r\n', ' ')
                return None
            elif type_.startswith('image/'):
                image = Image.open(content)
                width, height = image.size
                return '%s image: %d x %d' % (image.format.lower(), width, height)
            return None
    except (urllib.error.URLError, ssl.CertificateError):
        # HTTPError is a subclass of URLError, so one clause covers both.
        return None
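
# resolve() returns None whenever the fetch fails or the resource is neither
# HTML nor an image. A hedged example (results depend on the live site):
#   resolve('https://www.python.org/')       # -> 'Welcome to Python.org'
#   resolve('http://unreachable.invalid/')   # -> None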
if __name__ == '__main__':
    text = input('Enter text with urls: ')
    print('Urls in text:')
    for u in get_urls(text):  # 'u', not 'url', to avoid shadowing the regex
        print('URL: %s TITLE: %s' % (u, resolve(u)))
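
# Sample session (illustrative; the real title depends on the live page):
#   Enter text with urls: see https://example.com for a demo
#   Urls in text:
#   URL: https://example.com TITLE: Example Domain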
# CoffeeScript port of the same matcher; /// ... /// heregexes ignore
# whitespace, much like Python's re.X.
UrlMatcher = ///
  (?:
    (?:
      (https?|ftp):\/\/
    )|www\.
  )
  (?:
    \S+
    (?:
      :\S*
    )?@
  )?
  (?:
    (?:
      [1-9]\d?|1\d\d|2[01]\d|22[0-3]
    )
    (?:
      \.(?:
        1?\d{1,2}|2[0-4]\d|25[0-5]
      )
    ){2}
    (?:
      \.(?:
        [1-9]\d?|1\d\d|2[0-4]\d|25[0-4]
      )
    )
    |(?:
      (?:
        [a-z\u00a1-\uffff0-9]-*
      )*
      [a-z\u00a1-\uffff0-9]+
    )
    (?:
      \.(?:
        [a-z\u00a1-\uffff0-9]-*
      )*
      [a-z\u00a1-\uffff0-9]+
    )*
    (?:
      \.(?:
        [a-z\u00a1-\uffff]{2,}
      )
    )\.?
  )
  (?:
    :\d{2,5}
  )?
  (?:
    [\/?#]\S*
  )?
///

parse = (line) ->
  UrlMatcher.test line
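
# Usage sketch (assumed inputs): the pattern is unanchored, so `parse`
# reports whether a line contains a URL, not that the whole line is one.
#   parse 'read https://example.com today'   # -> true
#   parse 'no links here'                    # -> false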