# Script: Site Crawl - Domain Search
# Author: Frank Corso
# Date Created: 09/14/2020
# Last Modified: 09/14/2020
# Python Version: 3.6.5
# Crawls a supplied URL (hopefully your own) and searches for instances of links to a different supplied domain.
# i.e. Crawls the site at https://example-one.com looking for any links to example-two.com.
# Avoid searching for a domain that is linked in your header or footer, as this will report every page on the site.
# To run, use `python domain-search.py` and then supply the site to be crawled in `https://example-one.com` format.
# Then supply the domain to search for in `example-two.com` format. Note: leave off http(s) for the domain to
# search for so that both http and https links are caught.
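#
# A sample session might look like the following (the domains and matched link text are illustrative only,
# not output from a real crawl; the prompts and messages match the input() and print() calls below):
#
#     $ python domain-search.py
#     What URL should this crawl? Include http or https: https://example-one.com
#     What domain should we look for? Do not include http or https: example-two.com
#     Checking page: https://example-one.com
#     Checking page: https://example-one.com/about
#     Found instance of example-two.com on page https://example-one.com/about. The link was around 'Our partner site'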
import requests
from urllib.parse import urljoin
from urllib.parse import urlparse
from time import sleep
from bs4 import BeautifulSoup


def crawl(site_url, search_for):
    """Prepares to crawl the site at site_url, looking for links to search_for."""
    if site_url.endswith('/'):
        site_url = site_url[:-1]
    if search_for.endswith('/'):
        search_for = search_for[:-1]
    instances = {}
    checked = []
    instances = crawl_page(site_url, site_url, search_for, checked, instances, 3)
    if len(instances) > 0:
        for page, link_texts in instances.items():
            for link_text in link_texts:
                print("Found instance of {} on page {}. The link was around '{}'".format(search_for, page,
                                                                                         link_text.strip()))
    else:
        print("No instances found of {} on {}".format(search_for, site_url))


def crawl_page(domain, page, search_for, checked, instances, depth):
    """Crawls a single page looking for links that contain search_for, then recurses into same-site links."""
    # Adds small delay between each page load to reduce load on servers.
    sleep(0.300)
    depth = depth - 1
    # If we are farther down the site than desired, cancel now.
    if depth < 0:
        return instances
    # If this was a link to another site, cancel now.
    if not page.startswith(domain):
        return instances
    # Cancel if we have already checked this page.
    if page in checked:
        return instances
    print('Checking page: {}'.format(page))
    # Adds to the checked list.
    checked.append(page)
    # Adds small delay to crawler after every 8 pages.
    if len(checked) % 8 == 0:
        sleep(2)
    try:
        r = requests.get(page, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Exception found when checking {}: Unsuccessful status code of {}.".format(page, r.status_code))
        raise ValueError("Page Down")
    except requests.exceptions.Timeout:
        print("Exception found when checking {}: Timed out.".format(page))
        raise ValueError("Page Down")
    except requests.exceptions.ConnectionError as error:
        print("Exception found when checking {}: Connection error of {}.".format(page, error))
        raise ValueError("Page Down")
    except requests.exceptions.RequestException as error:
        print("Exception found when checking {}: Unknown requests error of {}.".format(page, error))
        raise ValueError("Page Down")
    # If the page is a valid HTML page, crawl it.
    if 'html' in r.headers.get('Content-Type', ''):
        soup = BeautifulSoup(r.text, 'html.parser')
        tags_checked = 0
        tags = soup.find_all('a')
        # Go through all <a> elements.
        for tag in tags:
            tags_checked += 1
            if tags_checked % 10 == 0:
                print("Checked {} links of {} on {}".format(tags_checked, len(tags), page))
            # If the link does not have an href attribute, skip it.
            if not tag.has_attr('href'):
                continue
            # If the URL we are searching for is found...
            if search_for in tag['href']:
                # Try to determine the text of the link.
                try:
                    link_text = tag.text
                except Exception:
                    link_text = ''
                else:
                    if link_text == '' or link_text.isspace():
                        link_text = 'HTML'
                        for element in tag.descendants:
                            if element != '\n':
                                if element.name == 'img':
                                    if element.has_attr('alt') and element['alt'] != '':
                                        link_text = 'Image with alt text of: {}'.format(element['alt'])
                                    else:
                                        link_text = 'Image with no alt text'
                                    break
                                else:
                                    link_text = 'HTML with <{}> element'.format(element.name)
                # Add to our instances, only if it's not already in there.
                if page in instances:
                    if link_text in instances[page]:
                        continue
                    else:
                        instances[page].append(link_text)
                else:
                    instances[page] = [link_text]
                continue
            # If it's a link to an element on the same page, let's ignore it.
            if tag['href'].startswith('#'):
                continue
            # If it's an email link, let's ignore it.
            if tag['href'].startswith('mailto:'):
                continue
            # If it's a fax link, let's ignore it.
            if tag['href'].startswith('fax:'):
                continue
            # If it's a phone link, let's ignore it.
            if tag['href'].startswith('tel:'):
                continue
            # If it's inline JavaScript, let's ignore it.
            if tag['href'].startswith('javascript:'):
                continue
            link = create_url(domain, page, tag['href'])
            # Crawl the linked page.
            try:
                instances = crawl_page(domain, link, search_for, checked, instances, depth)
            except Exception:
                # Link was broken or some other failure. Continue on.
                continue
    return instances


def create_url(domain, page, link):
    """Creates a full URL based on the passed link and the domain and page the link was on."""
    # Specifies protocol if generic
    if link.startswith('//'):
        parts = urlparse(domain)
        link = parts.scheme + ':' + link
    # Combines domain with link if it is a relative URL
    link_parts = urlparse(link)
    if len(link_parts.scheme) < 1:
        link = urljoin(domain, link)
    # Removes ID selectors
    ipos = link.find('#')
    if ipos > 1:
        link = link[:ipos]
    # Removes trailing slash
    if link.endswith('/'):
        link = link[:-1]
    return link


if __name__ == '__main__':
    domain_1 = input('What URL should this crawl? Include http or https: ')
    domain_2 = input('What domain should we look for? Do not include http or https: ')
    crawl(domain_1, domain_2)