Crawl your website links and images to find broken links/images using PowerShell
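The script below takes the start URL and a page limit as mandatory parameters; the remaining options are booleans that default to $true. As a rough usage sketch, assuming the script is saved as Find-BrokenLinks.ps1 (the file name and the example URL are placeholders, not part of the gist):

# Hypothetical invocation: crawl up to 100 pages starting from the home page.
.\Find-BrokenLinks.ps1 -Url "https://example.com/" -MaxPages 100

# The boolean options can be overridden explicitly, e.g. to skip images:
.\Find-BrokenLinks.ps1 -Url "https://example.com/" -MaxPages 100 -IncludeImages $false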
Param(
    [Parameter(Mandatory=$true)]
    [string] $Url,
    [Parameter(Mandatory=$true)]
    [int] $MaxPages,
    [bool] $IncludeImages = $true,
    [bool] $StayOnDomain = $true,
    [bool] $IgnoreFragments = $true)
Add-Type -AssemblyName System.Web
$Domain = [Uri]::new($Url).Host;

# Turn relative URLs into absolute URLs, using the page they were found on as the base.
Function Get-AbsoluteUrl([string]$pageUrl) {
    Begin {
        [Uri]$baseUri = [Uri]::new($pageUrl);
    }
    Process {
        # Undo the HTML encoding of ampersands in href/src attributes.
        $DecodedUrl = $_.Replace('&amp;', '&');
        If ([System.Uri]::IsWellFormedUriString($DecodedUrl, [System.UriKind]::Absolute)) {
            $DecodedUrl
        } Else {
            [Uri]::new($baseUri, [string]$DecodedUrl).AbsoluteUri;
        }
    }
}

# Strip the #fragment portion of a URL so the same page isn't crawled multiple times.
Function Remove-Fragments() {
    Process {
        $Uri = [Uri]::new([string]$_);
        If ($Uri.Fragment -ne $null -and $Uri.Fragment -ne '') {
            $Uri.AbsoluteUri.Replace($Uri.Fragment, '');
        } Else {
            $_;
        }
    }
}

[System.Collections.ArrayList]$UrlsToCrawl = [System.Collections.ArrayList]@($Url);
$CrawlIndex = 0;
Do
{
    $Url = $UrlsToCrawl[$CrawlIndex];
    $CrawlIndex++;
    Try {
        $Response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 30;
        Write-Host "($CrawlIndex): $($Response.StatusCode) - $Url";
        $UrlsToCrawl = ($UrlsToCrawl + ($Response.Links.href | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url)) | Select-Object -Unique;
        if ($IncludeImages) {
            $UrlsToCrawl = ($UrlsToCrawl + ($Response.Images.src | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url)) | Select-Object -Unique;
        }
        if ($StayOnDomain) {
            $UrlsToCrawl = $UrlsToCrawl | Where-Object { [Uri]::new($_).Host -eq $Domain };
        }
        if ($IgnoreFragments) {
            $UrlsToCrawl = $UrlsToCrawl | Remove-Fragments | Select-Object -Unique
        }
    } Catch [System.Net.WebException] {
        Write-Warning ($CrawlIndex.ToString() + ": " + ([int]$_.Exception.Response.StatusCode).ToString() + " - " + $Url);
    } Catch {
        Write-Warning ($CrawlIndex.ToString() + ": Unknown error occurred - Url: " + $Url);
        Write-Error $_.Exception;
    }
} While ($CrawlIndex -lt $MaxPages -and $UrlsToCrawl.Count -gt $CrawlIndex)
This is great. On the line Write-Host "($CrawlIndex): $($Response.StatusCode) - $Url"; I would like to also output what page the link was found on. How would you add that?

We have to keep track of where each URL was found, and that can be multiple locations, so we'll have to crawl the entire site first and then report on it. It took me a while, but here's an updated version:
Param(
    [Parameter(Mandatory=$true)]
    [string] $Url,
    [Parameter(Mandatory=$true)]
    [int] $MaxPages,
    [bool] $IncludeImages = $true,
    [bool] $StayOnDomain = $true,
    [bool] $IgnoreFragments = $true)
Add-Type -AssemblyName System.Web
$Domain = [Uri]::new($Url).Host;

# Turn relative URLs into absolute URLs, using the page they were found on as the base.
Function Get-AbsoluteUrl([string]$pageUrl) {
    Begin {
        [Uri]$baseUri = [Uri]::new($pageUrl);
    }
    Process {
        # Undo the HTML encoding of ampersands in href/src attributes.
        $DecodedUrl = $_.Replace('&amp;', '&');
        If ([System.Uri]::IsWellFormedUriString($DecodedUrl, [System.UriKind]::Absolute)) {
            $DecodedUrl
        } Else {
            [Uri]::new($baseUri, [string]$DecodedUrl).AbsoluteUri;
        }
    }
}

# Strip the #fragment portion of a URL so the same page isn't crawled multiple times.
Function Remove-Fragments() {
    Process {
        $Uri = [Uri]::new([string]$_);
        If ($Uri.Fragment -ne $null -and $Uri.Fragment -ne '') {
            $Uri.AbsoluteUri.Replace($Uri.Fragment, '');
        } Else {
            $_;
        }
    }
}

[System.Collections.ArrayList]$UrlsToCrawl = [System.Collections.ArrayList]@($Url);
# Per-URL result record: the crawl message, any error, and the set of pages the URL appeared on.
$UrlsToCrawlResults = @{};
$UrlsToCrawlResults[$Url] = @{
    Message = $Null;
    ErrorMessage = $Null;
    AppearedOn = [System.Collections.Generic.HashSet[string]]::new();
};
$CrawlIndex = 0;
Do
{
    $Url = $UrlsToCrawl[$CrawlIndex];
    $CrawlIndex++;
    Try {
        $Response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 30;
        $UrlsToCrawlResults[$Url]["Message"] = "($CrawlIndex): $($Response.StatusCode) - $Url";
        [System.Collections.ArrayList]$NewUrlsToCrawl = [System.Collections.ArrayList]$Response.Links.href | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url;
        if ($IncludeImages) {
            $NewUrlsToCrawl = $NewUrlsToCrawl + ($Response.Images.src | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url);
        }
        if ($StayOnDomain) {
            $NewUrlsToCrawl = $NewUrlsToCrawl | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url | Where-Object { [Uri]::new($_).Host -eq $Domain };
        }
        if ($IgnoreFragments) {
            $NewUrlsToCrawl = $NewUrlsToCrawl | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url | Remove-Fragments;
        }
        ForEach ($NewUrlToCrawl in $NewUrlsToCrawl)
        {
            If ([String]::IsNullOrEmpty($NewUrlToCrawl))
            {
                Continue;
            }
            If ($UrlsToCrawlResults.ContainsKey($NewUrlToCrawl) -eq $False)
            {
                $UrlsToCrawl.Add($NewUrlToCrawl) | Out-Null;
                $UrlsToCrawlResults[$NewUrlToCrawl] = @{
                    Message = $Null;
                    ErrorMessage = $Null;
                    AppearedOn = [System.Collections.Generic.HashSet[string]]::new();
                };
            }
            # Remember which page this URL was found on; a URL can appear on many pages.
            $UrlsToCrawlResults[$NewUrlToCrawl]["AppearedOn"].Add($Url) | Out-Null;
        }
    } Catch [System.Net.WebException] {
        $UrlsToCrawlResults[$Url]["ErrorMessage"] = ($CrawlIndex.ToString() + ": " + ([int]$_.Exception.Response.StatusCode).ToString() + " - " + $Url);
    } Catch {
        Write-Warning ($CrawlIndex.ToString() + ": Unknown error occurred - Url: " + $Url);
        Write-Error $_.Exception;
    }
} While ($CrawlIndex -lt $MaxPages -and $UrlsToCrawl.Count -gt $CrawlIndex)

# Report every URL, its crawl result, and the pages it appeared on.
ForEach ($Url in $UrlsToCrawl)
{
    $Result = $UrlsToCrawlResults[$Url];
    $AppearedOn = "";
    If ($Result["AppearedOn"].Count -gt 0)
    {
        $AppearedOn = [String]::Join(', ', $Result["AppearedOn"]);
    }
    If ($Result["Message"] -ne $Null)
    {
        Write-Host ($Result["Message"] + " (Source: $AppearedOn)");
    }
    ElseIf ($Result["ErrorMessage"])
    {
        Write-Warning ($Result["ErrorMessage"] + " (Source: $AppearedOn)");
    }
}
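Because every result ends up in the $UrlsToCrawlResults hashtable, the final report can also be narrowed down to just the broken URLs. A minimal sketch that could be appended after the report loop above, reusing only variables the script already defines:

# Report only the URLs whose request failed, together with the pages that referenced them.
$BrokenUrls = $UrlsToCrawl | Where-Object { $UrlsToCrawlResults[$_]["ErrorMessage"] -ne $Null };
ForEach ($BrokenUrl in $BrokenUrls)
{
    $Sources = [String]::Join(', ', $UrlsToCrawlResults[$BrokenUrl]["AppearedOn"]);
    Write-Warning ($UrlsToCrawlResults[$BrokenUrl]["ErrorMessage"] + " (Source: $Sources)");
}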