Site Crawl - Domain Search
# Script: Site Crawl - Domain Search
# Author: Frank Corso
# Date Created: 09/14/2020
# Last Modified: 09/14/2020
# Python Version: 3.6.5
# Crawls a supplied URL (hopefully your own) and searches for instances of links to a different supplied domain,
# i.e. crawls the site at https://example-one.com looking for any links to example-two.com.
# Avoid searching for a link that appears in your header or footer, as this will capture every occurrence of it.
# To run, use `python domain-search.py`, then supply the site to be crawled in `https://example-one.com` format.
# Then supply the domain to search for in `example-two.com` format. Note: leave off HTTP(S) for the domain to
# search for in order to catch either instance.
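#
# Example session (example-one.com and example-two.com are placeholders, matching the script's prompts):
#   $ python domain-search.py
#   What URL should this crawl? Include http or https: https://example-one.com
#   What domain should we look for? Do not include http or https: example-two.com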
import requests
from urllib.parse import urljoin, urlparse
from time import sleep
from bs4 import BeautifulSoup
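
# Note: requests and beautifulsoup4 are third-party packages:
#   pip install requests beautifulsoup4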


def crawl(site_url, search_for):
    """Prepares to crawl a site looking for instances of search_for."""
    if site_url.endswith('/'):
        site_url = site_url[:-1]
    if search_for.endswith('/'):
        search_for = search_for[:-1]
    instances = {}
    checked = []
    # Crawl up to three levels deep from the starting page.
    instances = crawl_page(site_url, site_url, search_for, checked, instances, 3)
    if len(instances) > 0:
        for page, link_texts in instances.items():
            for link_text in link_texts:
                print("Found instance of {} on page {}. The link was around '{}'".format(
                    search_for, page, link_text.strip()))
    else:
        print("No instances found of {} on {}".format(search_for, site_url))


def crawl_page(domain, page, search_for, checked, instances, depth):
    """Crawls a page looking for links to the searched-for domain."""
    # Adds small delay between each page load to reduce load on servers.
    sleep(0.300)
    depth = depth - 1
    # If we are farther down the sitemap than desired, cancel now.
    if depth < 0:
        return instances
    # If this was a link to another site, cancel now.
    if not page.startswith(domain):
        return instances
    # Cancel if we have already checked this page.
    if page in checked:
        return instances
    print('Checking page: {}'.format(page))
    # Adds to the checked list.
    checked.append(page)
    # Adds small delay to crawler after every 8 pages.
    if len(checked) % 8 == 0:
        sleep(2)
    try:
        r = requests.get(page, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Exception found when checking {}: Unsuccessful status code of {}.".format(page, r.status_code))
        raise ValueError("Page Down")
    except requests.exceptions.Timeout:
        print("Exception found when checking {}: Timed out.".format(page))
        raise ValueError("Page Down")
    except requests.exceptions.ConnectionError as error:
        print("Exception found when checking {}: Connection error of {}.".format(page, error))
        raise ValueError("Page Down")
    except requests.exceptions.RequestException as error:
        print("Exception found when checking {}: Unknown requests error of {}.".format(page, error))
        raise ValueError("Page Down")
    # If the page is a valid HTML page, crawl it.
    if 'html' in r.headers.get('Content-Type', ''):
        soup = BeautifulSoup(r.text, 'html.parser')
        tags_checked = 0
        tags = soup.find_all('a')
        # Go through all <a> elements.
        for tag in tags:
            tags_checked += 1
            if tags_checked % 10 == 0:
                print("Checked {} links of {} on {}".format(tags_checked, len(tags), page))
            # If the link does not have an href attribute, skip it.
            if not tag.has_attr('href'):
                continue
            # If the URL we are searching for is found...
            if search_for in tag['href']:
                # Try to determine the text of the link.
                try:
                    broken_text = tag.text
                except AttributeError:
                    broken_text = ''
                else:
                    # If the link has no visible text, describe its contents instead.
                    if broken_text == '' or broken_text.isspace():
                        broken_text = 'HTML'
                        for element in tag.descendants:
                            if element != '\n' and element.name is not None:
                                if element.name == 'img':
                                    if element.has_attr('alt') and element['alt'] != '':
                                        broken_text = 'Image with alt text of: {}'.format(element['alt'])
                                    else:
                                        broken_text = 'Image with no alt text'
                                    break
                                else:
                                    broken_text = 'HTML with <{}> element'.format(element.name)
                # Add to our instances, only if it's not already in there.
                if page in instances:
                    if broken_text in instances[page]:
                        continue
                    else:
                        instances[page].append(broken_text)
                else:
                    instances[page] = [broken_text]
                continue
            # If it's a link to an element on the same page, let's ignore it.
            if tag['href'].startswith('#'):
                continue
            # If it's an email link, let's ignore it.
            if tag['href'].startswith('mailto:'):
                continue
            # If it's a fax link, let's ignore it.
            if tag['href'].startswith('fax:'):
                continue
            # If it's a phone link, let's ignore it.
            if tag['href'].startswith('tel:'):
                continue
            # If it's some weird JavaScript, let's ignore it.
            if tag['href'].startswith('javascript:'):
                continue
            link = create_url(domain, page, tag['href'])
            # Tests the link.
            try:
                # Crawl the linked page.
                instances = crawl_page(domain, link, search_for, checked, instances, depth)
            except Exception:
                # Link was broken or some other failure. Continue on.
                continue
    return instances
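
# Note: the returned instances dict maps each page URL to the link descriptions
# found on it, e.g. (hypothetical):
#   {'https://example-one.com/about': ['Read more', 'Image with no alt text']}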


def create_url(domain, page, link):
    """Creates a full URL based on the passed link and the domain and page the link was on."""
    # Specifies the protocol if the link is protocol-relative.
    if link.startswith('//'):
        parts = urlparse(domain)
        link = parts.scheme + ':' + link
    # Combines domain with link if it is a relative URL.
    link_parts = urlparse(link)
    if len(link_parts.scheme) < 1:
        link = urljoin(domain, link)
    # Removes fragment identifiers (e.g. #section).
    ipos = link.find('#')
    if ipos > -1:
        link = link[:ipos]
    # Removes trailing slash.
    if link.endswith('/'):
        link = link[:-1]
    return link
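
# Illustrative results (example domains, not from a real run):
#   create_url('https://example-one.com', 'https://example-one.com/blog', '//cdn.example-one.com/logo.png')
#       -> 'https://cdn.example-one.com/logo.png'
#   create_url('https://example-one.com', 'https://example-one.com/blog', '/about/')
#       -> 'https://example-one.com/about'
#   create_url('https://example-one.com', 'https://example-one.com/blog', '/about#team')
#       -> 'https://example-one.com/about'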


if __name__ == '__main__':
    domain_1 = input('What URL should this crawl? Include http or https: ')
    domain_2 = input('What domain should we look for? Do not include http or https: ')
    crawl(domain_1, domain_2)