Last active
September 20, 2019 14:44
-
-
Save luminousmen/20265c69888febd346d301c96f0380c5 to your computer and use it in GitHub Desktop.
Check links on a page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import typing | |
import argparse | |
import urllib.parse | |
from bs4 import BeautifulSoup | |
from concurrent.futures import ThreadPoolExecutor as PoolExecutor | |
import requests | |
from termcolor import colored, cprint | |
def get_response(link):
    """Fetch *link* with a HEAD request, following redirects.

    :param link: URL to request
    :returns: the ``requests.Response``, or ``None`` when the request
        could not be completed (network error, bad URL, timeout)
    """
    try:
        # HEAD is enough to learn the status code without downloading the body.
        # A timeout is mandatory: requests has no default and would hang forever
        # on an unresponsive host.
        return requests.head(link, allow_redirects=True, timeout=10)
    except requests.RequestException:
        # Only swallow requests' own errors — a bare except would also hide
        # KeyboardInterrupt and genuine programming bugs.
        return None
def get_status(response):
    """Translate a response into a printable status value.

    :param response: ``requests.Response`` or ``None``
    :returns: the integer HTTP status code, or ``'Err'`` when there is
        no response (the request itself failed)
    """
    # NOTE: must compare against None explicitly. requests.Response.__bool__
    # is `self.ok`, so a 404/500 response is falsy and a plain truthiness
    # test would wrongly report 'Err' instead of the real status code.
    if response is not None and response.status_code:
        return response.status_code
    return 'Err'
def check_link_status(link: str, failed: typing.List) -> str:
    """Check a single link and format a colored report line.

    :param link: URL to check
    :param failed: list collecting URLs that did not answer with a
        2xx/3xx status (mutated in place)
    :returns: ``"<link> - <colored status>"``
    """
    response = get_response(link)
    code = get_status(response)
    ok = isinstance(code, int) and code // 100 in {2, 3}
    if not ok:
        # Record the failure so check_url's caller can report it;
        # previously `failed` was accepted but never written to.
        failed.append(link)
    code = colored(code, 'green' if ok else 'red')
    return "{} - {}".format(link, code)
def construct_url(href: str, original_page: str) -> str:
    """Resolve an anchor's ``href`` into an absolute URL.

    :param href: raw href attribute from the page
    :param original_page: URL of the page the href was found on
    :returns: an absolute URL, or ``None`` for pure fragment links
        (``#section``), which point at the same page and need no check
    """
    if href.startswith("#"):
        return None
    if href.startswith("//"):
        # Protocol-relative URL: inherit a scheme.
        return urllib.parse.urljoin("http:", href)
    # urljoin resolves root-relative ("/x"), relative ("page.html") and
    # leaves already-absolute URLs ("http://...") untouched — the original
    # code returned bare relative hrefs unresolved and thus unreachable.
    return urllib.parse.urljoin(original_page, href)
def worker(link: str, original_page: str, failed: typing.List) -> None:
    """Check one anchor tag's target and print the result.

    :param link: a BeautifulSoup ``<a>`` tag (anything exposing ``.get``)
    :param original_page: URL of the page being checked, used to resolve
        relative hrefs
    :param failed: list of failed resources, passed through to the checker
    """
    href = link.get("href")
    if not href:
        # <a> without an href (or with an empty one) — nothing to check.
        return
    url = construct_url(href, original_page)
    if url is None:
        # Fragment-only link (#section): same page, skip it. The original
        # code passed the None on and issued an HTTP request against it.
        return
    print(check_link_status(url, failed))
def check_url(url: str):
    """Scan every anchor on *url*'s page concurrently.

    :param url: page to scan; ``http://`` is prepended when no scheme
        is present
    :returns: list of resources that could not be reached
    """
    if not url.startswith("http"):
        url = "http://" + url
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    anchors = soup.find_all("a")
    failed = []
    with PoolExecutor() as pool:
        # Drain the map iterator so any exception raised inside a worker
        # thread is re-raised here instead of being silently dropped.
        for _ in pool.map(lambda tag: worker(tag, url, failed), anchors):
            pass
    return failed
if __name__ == "__main__":
    # Fill in the previously-empty CLI description/help so `--help` is useful.
    parser = argparse.ArgumentParser(
        description="Check that every link on a web page is reachable.")
    parser.add_argument(
        'url', type=str,
        help="URL of the page whose links should be checked")
    args = parser.parse_args()
    failed = check_url(args.url)
    if failed:
        print("Can't get {}".format(failed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment