Site Crawl - Domain Search
# Script: Site Crawl - Domain Search
# Author: Frank Corso
# Date Created: 09/14/2020
# Last Modified: 09/14/2020
# Python Version: 3.6.5
# Crawls a supplied URL (hopefully your own) and searches for instances of links to a different supplied domain,
# i.e. crawls the site at https://example-one.com looking for any links to example-two.com.
# Avoid searching for a link that appears in your header or footer, as this will capture every occurrence of it.
# To run, use `python domain-search.py`, then supply the site to be crawled in `https://example-one.com` format.
# Then supply the domain to search for in `example-two.com` format. Note: leave off HTTP(S) for the domain to
# search for in order to catch either instance.
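#
# Example session (example-one.com and example-two.com are placeholders, matching the script's prompts):
#   $ python domain-search.py
#   What URL should this crawl? Include http or https: https://example-one.com
#   What domain should we look for? Do not include http or https: example-two.com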
import requests
from urllib.parse import urljoin, urlparse
from time import sleep
from bs4 import BeautifulSoup
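
# Note: requests and beautifulsoup4 are third-party packages:
#   pip install requests beautifulsoup4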


def crawl(site_url, search_for):
    """Prepares to crawl a site looking for instances of search_for."""
    if site_url.endswith('/'):
        site_url = site_url[:-1]
    if search_for.endswith('/'):
        search_for = search_for[:-1]
    instances = {}
    checked = []
    # Crawl up to three levels deep from the starting page.
    instances = crawl_page(site_url, site_url, search_for, checked, instances, 3)
    if len(instances) > 0:
        for page, link_texts in instances.items():
            for link_text in link_texts:
                print("Found instance of {} on page {}. The link was around '{}'".format(
                    search_for, page, link_text.strip()))
    else:
        print("No instances found of {} on {}".format(search_for, site_url))


def crawl_page(domain, page, search_for, checked, instances, depth):
    """Crawls a page looking for links to the searched-for domain."""
    # Adds small delay between each page load to reduce load on servers.
    sleep(0.300)
    depth = depth - 1
    # If we are farther down the sitemap than desired, cancel now.
    if depth < 0:
        return instances
    # If this was a link to another site, cancel now.
    if not page.startswith(domain):
        return instances
    # Cancel if we have already checked this page.
    if page in checked:
        return instances
    print('Checking page: {}'.format(page))
    # Adds to the checked list.
    checked.append(page)
    # Adds small delay to crawler after every 8 pages.
    if len(checked) % 8 == 0:
        sleep(2)
    try:
        r = requests.get(page, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Exception found when checking {}: Unsuccessful status code of {}.".format(page, r.status_code))
        raise ValueError("Page Down")
    except requests.exceptions.Timeout:
        print("Exception found when checking {}: Timed out.".format(page))
        raise ValueError("Page Down")
    except requests.exceptions.ConnectionError as error:
        print("Exception found when checking {}: Connection error of {}.".format(page, error))
        raise ValueError("Page Down")
    except requests.exceptions.RequestException as error:
        print("Exception found when checking {}: Unknown requests error of {}.".format(page, error))
        raise ValueError("Page Down")
    # If the page is a valid HTML page, crawl it.
    if 'html' in r.headers.get('Content-Type', ''):
        soup = BeautifulSoup(r.text, 'html.parser')
        tags_checked = 0
        tags = soup.find_all('a')
        # Go through all <a> elements.
        for tag in tags:
            tags_checked += 1
            if tags_checked % 10 == 0:
                print("Checked {} links of {} on {}".format(tags_checked, len(tags), page))
            # If the link does not have an href attribute, skip it.
            if not tag.has_attr('href'):
                continue
            # If the URL we are searching for is found...
            if search_for in tag['href']:
                # Try to determine the text of the link.
                try:
                    broken_text = tag.text
                except AttributeError:
                    broken_text = ''
                else:
                    # If the link has no visible text, describe its contents instead.
                    if broken_text == '' or broken_text.isspace():
                        broken_text = 'HTML'
                        for element in tag.descendants:
                            if element != '\n' and element.name is not None:
                                if element.name == 'img':
                                    if element.has_attr('alt') and element['alt'] != '':
                                        broken_text = 'Image with alt text of: {}'.format(element['alt'])
                                    else:
                                        broken_text = 'Image with no alt text'
                                    break
                                else:
                                    broken_text = 'HTML with <{}> element'.format(element.name)
                # Add to our instances, only if it's not already in there.
                if page in instances:
                    if broken_text in instances[page]:
                        continue
                    else:
                        instances[page].append(broken_text)
                else:
                    instances[page] = [broken_text]
                continue
            # If it's a link to an element on the same page, let's ignore it.
            if tag['href'].startswith('#'):
                continue
            # If it's an email link, let's ignore it.
            if tag['href'].startswith('mailto:'):
                continue
            # If it's a fax link, let's ignore it.
            if tag['href'].startswith('fax:'):
                continue
            # If it's a phone link, let's ignore it.
            if tag['href'].startswith('tel:'):
                continue
            # If it's some weird JavaScript, let's ignore it.
            if tag['href'].startswith('javascript:'):
                continue
            link = create_url(domain, page, tag['href'])
            # Tests the link.
            try:
                # Crawl the linked page.
                instances = crawl_page(domain, link, search_for, checked, instances, depth)
            except Exception:
                # Link was broken or some other failure. Continue on.
                continue
    return instances
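
# Note: the returned instances dict maps each page URL to the link descriptions
# found on it, e.g. (hypothetical):
#   {'https://example-one.com/about': ['Read more', 'Image with no alt text']}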


def create_url(domain, page, link):
    """Creates a full URL based on the passed link and the domain and page the link was on."""
    # Specifies the protocol if the link is protocol-relative.
    if link.startswith('//'):
        parts = urlparse(domain)
        link = parts.scheme + ':' + link
    # Combines domain with link if it is a relative URL.
    link_parts = urlparse(link)
    if len(link_parts.scheme) < 1:
        link = urljoin(domain, link)
    # Removes fragment identifiers (e.g. #section).
    ipos = link.find('#')
    if ipos > -1:
        link = link[:ipos]
    # Removes trailing slash.
    if link.endswith('/'):
        link = link[:-1]
    return link
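
# Illustrative results (example domains, not from a real run):
#   create_url('https://example-one.com', 'https://example-one.com/blog', '//cdn.example-one.com/logo.png')
#       -> 'https://cdn.example-one.com/logo.png'
#   create_url('https://example-one.com', 'https://example-one.com/blog', '/about/')
#       -> 'https://example-one.com/about'
#   create_url('https://example-one.com', 'https://example-one.com/blog', '/about#team')
#       -> 'https://example-one.com/about'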


if __name__ == '__main__':
    domain_1 = input('What URL should this crawl? Include http or https: ')
    domain_2 = input('What domain should we look for? Do not include http or https: ')
    crawl(domain_1, domain_2)