Last active
September 20, 2019 14:44
-
-
Save luminousmen/20265c69888febd346d301c96f0380c5 to your computer and use it in GitHub Desktop.
Check links on a page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import typing | |
import argparse | |
import urllib.parse | |
from bs4 import BeautifulSoup | |
from concurrent.futures import ThreadPoolExecutor as PoolExecutor | |
import requests | |
from termcolor import colored, cprint | |
def get_response(link):
    """Fetch *link* with a HEAD request, following redirects.

    :param link: URL to request
    :returns: the ``requests.Response``, or ``None`` when the request
        could not be completed (network error, bad URL, timeout)
    """
    try:
        # HEAD is enough to learn the status code without downloading the body.
        # A timeout is mandatory: requests has no default and would hang forever
        # on an unresponsive host.
        return requests.head(link, allow_redirects=True, timeout=10)
    except requests.RequestException:
        # Only swallow requests' own errors — a bare except would also hide
        # KeyboardInterrupt and genuine programming bugs.
        return None
def get_status(response):
    """Translate a response into a printable status value.

    :param response: ``requests.Response`` or ``None``
    :returns: the integer HTTP status code, or ``'Err'`` when there is
        no response (the request itself failed)
    """
    # NOTE: must compare against None explicitly. requests.Response.__bool__
    # is `self.ok`, so a 404/500 response is falsy and a plain truthiness
    # test would wrongly report 'Err' instead of the real status code.
    if response is not None and response.status_code:
        return response.status_code
    return 'Err'
def check_link_status(link: str, failed: typing.List) -> str:
    """Check a single link and format a colored report line.

    :param link: URL to check
    :param failed: list collecting URLs that did not answer with a
        2xx/3xx status (mutated in place)
    :returns: ``"<link> - <colored status>"``
    """
    response = get_response(link)
    code = get_status(response)
    ok = isinstance(code, int) and code // 100 in {2, 3}
    if not ok:
        # Record the failure so check_url's caller can report it;
        # previously `failed` was accepted but never written to.
        failed.append(link)
    code = colored(code, 'green' if ok else 'red')
    return "{} - {}".format(link, code)
def construct_url(href: str, original_page: str) -> str:
    """Resolve an anchor's ``href`` into an absolute URL.

    :param href: raw href attribute from the page
    :param original_page: URL of the page the href was found on
    :returns: an absolute URL, or ``None`` for pure fragment links
        (``#section``), which point at the same page and need no check
    """
    if href.startswith("#"):
        return None
    if href.startswith("//"):
        # Protocol-relative URL: inherit a scheme.
        return urllib.parse.urljoin("http:", href)
    # urljoin resolves root-relative ("/x"), relative ("page.html") and
    # leaves already-absolute URLs ("http://...") untouched — the original
    # code returned bare relative hrefs unresolved and thus unreachable.
    return urllib.parse.urljoin(original_page, href)
def worker(link: str, original_page: str, failed: typing.List) -> None:
    """Check one anchor tag's target and print the result.

    :param link: a BeautifulSoup ``<a>`` tag (anything exposing ``.get``)
    :param original_page: URL of the page being checked, used to resolve
        relative hrefs
    :param failed: list of failed resources, passed through to the checker
    """
    href = link.get("href")
    if not href:
        # <a> without an href (or with an empty one) — nothing to check.
        return
    url = construct_url(href, original_page)
    if url is None:
        # Fragment-only link (#section): same page, skip it. The original
        # code passed the None on and issued an HTTP request against it.
        return
    print(check_link_status(url, failed))
def check_url(url: str):
    """Scan every anchor on *url*'s page concurrently.

    :param url: page to scan; ``http://`` is prepended when no scheme
        is present
    :returns: list of resources that could not be reached
    """
    if not url.startswith("http"):
        url = "http://" + url
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    anchors = soup.find_all("a")
    failed = []
    with PoolExecutor() as pool:
        # Drain the map iterator so any exception raised inside a worker
        # thread is re-raised here instead of being silently dropped.
        for _ in pool.map(lambda tag: worker(tag, url, failed), anchors):
            pass
    return failed
if __name__ == "__main__":
    # Fill in the previously-empty CLI description/help so `--help` is useful.
    parser = argparse.ArgumentParser(
        description="Check that every link on a web page is reachable.")
    parser.add_argument(
        'url', type=str,
        help="URL of the page whose links should be checked")
    args = parser.parse_args()
    failed = check_url(args.url)
    if failed:
        print("Can't get {}".format(failed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment