FindBrokenLinks.ps1
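<#
.SYNOPSIS
    Recursively crawls a site starting at -CrawlUrl and reports any links
    that return an HTTP status code of 400 or higher to BrokenLinks.csv.
.NOTES
    Relies on Invoke-WebRequest's ParsedHtml property, so it requires
    Windows PowerShell 5.1 (ParsedHtml is not available in PowerShell 6+).
#>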
param (
    [Parameter(Mandatory=$True)]
    [ValidatePattern("^https?://")]
    [System.Uri]$CrawlUrl,
    # Only check for broken links internal to the site (ignore things like https://ya_badlink.com)
    [Boolean]$OnlyCheckRelativeLinks = $True
)
# Keep a tab on what we've already crawled
$already_crawled = New-Object Collections.Generic.List[String]
# Broken links grouped by the page they appear on, plus a flat list
$brokenLinks = @{}
$brokenLinkList = New-Object Collections.Generic.List[String]
function Crawl {
    param (
        [ValidatePattern("^https?://")]
        [System.Uri[]]$Urls,
        # The page the urls were found on; a string so the "<root>" marker is accepted
        [String]$ParentUrl
    )
    # Loop through each url in the urls list
    foreach ($url in $Urls) {
        Write-Host "Starting to process $url"
        # Skip if the url has already been crawled
        if ($already_crawled.Contains($url)) {
            Write-Host "Already crawled $url"
            # If it was crawled earlier and found broken, record it against this parent too
            if ($brokenLinkList.Contains($url)) {
                if (!$brokenLinks[$ParentUrl]) {
                    $brokenLinks[$ParentUrl] = New-Object Collections.Generic.List[String]
                }
                $brokenLinks[$ParentUrl].Add($url)
                Write-Host " Adding $url to broken links for $ParentUrl"
            }
            continue
        }
        # Load the html from the url. Invoke-WebRequest throws on 4xx/5xx, so
        # catch the exception and keep its response to read the status code from.
        $res = try {
            Invoke-WebRequest -Uri $url -ErrorAction Stop
        } catch [System.Net.WebException] {
            Write-Verbose "An exception was caught: $($_.Exception.Message)"
            $_.Exception.Response
        }
        # Convert the status code enum to an int
        $statusCodeInt = [int]$res.StatusCode
        # Record the link as broken if there was no response at all
        # (e.g. a DNS failure) or it returned an error status code
        if ($statusCodeInt -gt 399 -or -not $res) {
            # Keep track of broken links grouped by the page the links appear on
            if (!$brokenLinks[$ParentUrl]) {
                $brokenLinks[$ParentUrl] = New-Object Collections.Generic.List[String]
            }
            $brokenLinks[$ParentUrl].Add($url)
            # Keep track of a flat list of broken links
            $brokenLinkList.Add($url)
            Write-Host "Adding $url to broken links for $ParentUrl"
            continue
        }
        # Tag that this url has already been crawled
        $already_crawled.Add($url)
        # Use the built-in DOM object to grab all <a> tags
        # (ParsedHtml is Windows PowerShell 5.1 only)
        $aTags = $res.ParsedHtml.getElementsByTagName('a')
        # A list to stash a flat list of urls in
        $links = New-Object Collections.Generic.List[String]
        # Generate a flat list of links from the collection of DOM anchor tags
        foreach ($a in $aTags) {
            if ($a.href -match "^https?://") {
                # Escape the host so dots in it aren't treated as regex wildcards
                if ($OnlyCheckRelativeLinks -and $a.href -notmatch ("^https?://" + [regex]::Escape($CrawlUrl.Host))) {
                    continue
                }
                $links.Add($a.href)
            }
        }
        # Be polite between requests; 'sleep .5' rounds to 0 seconds in
        # Windows PowerShell, so pause in milliseconds instead
        Start-Sleep -Milliseconds 500
        # Recursively call the crawl function with the new list of links
        Crawl -Urls $links -ParentUrl $url
    }
}
Crawl -Urls @("$CrawlUrl") -ParentUrl "<root>"
# Expand the page -> broken-links map into one CSV row per broken link
$brokenLinks.GetEnumerator() | ForEach-Object {
    foreach ($link in $_.Value) { [PSCustomObject]@{ Page = $_.Key; BrokenLink = $link } }
} | Export-Csv -NoTypeInformation BrokenLinks.csv
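# Example invocation (a sketch; https://example.com is a placeholder site):
#   .\FindBrokenLinks.ps1 -CrawlUrl "https://example.com" -OnlyCheckRelativeLinks $True
# Then inspect the results:
#   Import-Csv .\BrokenLinks.csv | Format-Table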