Skip to content

Instantly share code, notes, and snippets.

@Swimburger
Last active June 27, 2023 22:24
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save Swimburger/c2def1ea0dcb53d3d23030296c6e1b6c to your computer and use it in GitHub Desktop.
Crawl your website links and images to find broken links/images using PowerShell
# Script parameters:
#   Url             - page the crawl starts from (required).
#   MaxPages        - upper bound on the number of URLs requested (required).
#   IncludeImages   - when true, image sources are queued alongside link targets.
#   StayOnDomain    - when true, only URLs whose host equals the start URL's host are kept.
#   IgnoreFragments - when true, the '#fragment' part is stripped before de-duplicating.
Param(
    [Parameter(Mandatory = $true)]
    [string] $Url,

    [Parameter(Mandatory = $true)]
    [int] $MaxPages,

    [bool] $IncludeImages = $true,

    [bool] $StayOnDomain = $true,

    [bool] $IgnoreFragments = $true
)

Add-Type -AssemblyName System.Web

# Host name of the start URL; the crawl loop compares discovered URLs against it.
$Domain = [System.Uri]::new($Url).Host
# Converts each piped href/src value into an absolute URL string.
#   -pageUrl : URL of the page the value was found on; relative values are resolved against it.
#   Input    : one string per pipeline item (callers filter out $null beforehand).
#   Output   : the value itself when it is already a well-formed absolute URI,
#              otherwise the AbsoluteUri of the value resolved against -pageUrl.
Function Get-AbsoluteUrl([string]$pageUrl) {
    Begin {
        # Parsed once per pipeline, not once per item.
        [Uri]$baseUri = [Uri]::new($pageUrl);
    }
    Process {
        # Attribute values scraped from markup may still carry HTML-encoded
        # ampersands; decode them so query strings are requested as intended.
        $DecodedUrl = $_.Replace('&amp;', '&');
        If ([System.Uri]::IsWellFormedUriString($DecodedUrl, [System.UriKind]::Absolute)) {
            $DecodedUrl
        }
        Else {
            [Uri]::new($baseUri, [string]$DecodedUrl).AbsoluteUri;
        }
    }
}
# Strips the '#fragment' part from each piped absolute URL.
# URLs without a fragment are passed through exactly as received.
Function Remove-Fragments() {
    Process {
        $Parsed = [Uri]::new([string]$_);
        If ([string]::IsNullOrEmpty($Parsed.Fragment)) {
            # Nothing to strip: emit the input untouched.
            $_;
        }
        Else {
            $Parsed.AbsoluteUri.Replace($Parsed.Fragment, '');
        }
    }
}
# Crawl loop: requests one queued URL per iteration, prints its status, and
# appends every newly discovered link (and image, if enabled) to the queue.
# Stops after $MaxPages requests or when the queue is exhausted.
[System.Collections.ArrayList]$UrlsToCrawl = [System.Collections.ArrayList]@($Url);
$CrawlIndex = 0;
Do
{
    $Url = $UrlsToCrawl[$CrawlIndex];
    $CrawlIndex++;
    Try {
        $Response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 30;
        Write-Host ($CrawlIndex): $Response.StatusCode - $Url;

        # Every pipeline result is wrapped in @() so it is always an array:
        # an empty result then adds nothing (instead of a $null entry) and a
        # single-item result is not collapsed to a scalar before it reaches
        # the ArrayList-typed variable.
        $UrlsToCrawl = @(($UrlsToCrawl + @($Response.Links.href | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url)) | Select-Object -Unique);
        If ($IncludeImages) {
            $UrlsToCrawl = @(($UrlsToCrawl + @($Response.Images.src | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url)) | Select-Object -Unique);
        }
        If ($StayOnDomain) {
            # Applied to the whole queue, as before; already-queued URLs are on-domain so order is stable.
            $UrlsToCrawl = @($UrlsToCrawl | Where-Object { [Uri]::new($_).Host -eq $Domain });
        }
        If ($IgnoreFragments) {
            $UrlsToCrawl = @($UrlsToCrawl | Remove-Fragments | Select-Object -Unique);
        }
    }
    Catch [System.Net.WebException] {
        # Connection-level failures (DNS, timeout, refused) carry no HTTP response;
        # report the transport status in that case rather than a misleading "0".
        $FailedResponse = $_.Exception.Response;
        $Status = If ($null -ne $FailedResponse) { ([int]$FailedResponse.StatusCode).ToString() } Else { $_.Exception.Status.ToString() };
        Write-Warning ($CrawlIndex.ToString() + ": " + $Status + " - " + $Url);
    }
    Catch {
        Write-Warning ($CrawlIndex.ToString() + ": Unknown error occurred - Url: " + $Url);
        Write-Error $_.Exception;
    }
} While ($CrawlIndex -lt $MaxPages -and $UrlsToCrawl.Count -gt $CrawlIndex)
@nicklaposky1
Copy link

This is great.
On the line Write-Host ($CrawlIndex): $Response.StatusCode - $Url;
I would like to output what page the link was found on.

How would you add that?

@Swimburger
Copy link
Author

We have to keep track of where each URL was found, and a URL can appear on multiple pages. That means we have to crawl the entire site first and report afterwards. It took me a while, but here's an updated version:

# Script parameters:
#   Url             - page the crawl starts from (required).
#   MaxPages        - upper bound on the number of URLs requested (required).
#   IncludeImages   - when true, image sources are collected alongside link targets.
#   StayOnDomain    - when true, only URLs whose host equals the start URL's host are kept.
#   IgnoreFragments - when true, the '#fragment' part is stripped from discovered URLs.
Param(
    [Parameter(Mandatory = $true)]
    [string] $Url,

    [Parameter(Mandatory = $true)]
    [int] $MaxPages,

    [bool] $IncludeImages = $true,

    [bool] $StayOnDomain = $true,

    [bool] $IgnoreFragments = $true
)

Add-Type -AssemblyName System.Web

# Host name of the start URL; the crawl loop compares discovered URLs against it.
$Domain = [System.Uri]::new($Url).Host

# Converts each piped href/src value into an absolute URL string.
#   -pageUrl : URL of the page the value was found on; relative values are resolved against it.
#   Input    : one string per pipeline item (callers filter out $null beforehand).
#   Output   : the value itself when it is already a well-formed absolute URI,
#              otherwise the AbsoluteUri of the value resolved against -pageUrl.
Function Get-AbsoluteUrl([string]$pageUrl) {
    Begin {
        # Parsed once per pipeline, not once per item.
        [Uri]$baseUri = [Uri]::new($pageUrl);
    }
    Process {
        # Attribute values scraped from markup may still carry HTML-encoded
        # ampersands; decode them so query strings are requested as intended.
        $DecodedUrl = $_.Replace('&amp;', '&');
        If ([System.Uri]::IsWellFormedUriString($DecodedUrl, [System.UriKind]::Absolute)) {
            $DecodedUrl
        }
        Else {
            [Uri]::new($baseUri, [string]$DecodedUrl).AbsoluteUri;
        }
    }
}

# Strips the '#fragment' part from each piped absolute URL.
# URLs without a fragment are passed through exactly as received.
Function Remove-Fragments() {
    Process {
        $Parsed = [Uri]::new([string]$_);
        $HasFragment = -not [string]::IsNullOrEmpty($Parsed.Fragment);
        If ($HasFragment) {
            $Parsed.AbsoluteUri.Replace($Parsed.Fragment, '');
        }
        Else {
            # Nothing to strip: emit the input untouched.
            $_;
        }
    }
}

# Crawl loop (reporting variant): requests one queued URL per iteration and
# records the outcome in $UrlsToCrawlResults instead of printing it right away.
# $UrlsToCrawlResults maps each URL to a table with:
#   Message      - success line, set once the URL was fetched
#   ErrorMessage - failure line, set when the request raised a WebException
#   AppearedOn   - set of page URLs on which this URL was discovered
# Stops after $MaxPages requests or when the queue is exhausted.
[System.Collections.ArrayList]$UrlsToCrawl = [System.Collections.ArrayList]@($Url);
$UrlsToCrawlResults = @{};
$UrlsToCrawlResults[$Url] = @{
    Message = $Null;
    ErrorMessage = $Null;
    AppearedOn = [System.Collections.Generic.HashSet[string]]::new();
};
$CrawlIndex = 0;
Do
{
    $Url = $UrlsToCrawl[$CrawlIndex];
    $CrawlIndex++;
    Try{
        $Response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 30;
        $UrlsToCrawlResults[$Url]["Message"] = "($CrawlIndex): $($Response.StatusCode) - $Url";

        # @() keeps every intermediate result an array, so pages yielding zero
        # or one URL need no special handling and no ArrayList cast is required.
        $NewUrlsToCrawl = @($Response.Links.href | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url);

        if($IncludeImages){
            $NewUrlsToCrawl = $NewUrlsToCrawl + @($Response.Images.src | Where-Object { $_ -ne $null } | Get-AbsoluteUrl -pageUrl $Url);
        }

        # The URLs are already absolute at this point, so they are filtered
        # directly rather than being passed through Get-AbsoluteUrl again.
        if($StayOnDomain){
            $NewUrlsToCrawl = @($NewUrlsToCrawl | Where-Object { [Uri]::new($_).Host -eq $Domain });
        }

        if($IgnoreFragments){
            $NewUrlsToCrawl = @($NewUrlsToCrawl | Remove-Fragments);
        }

        ForEach($NewUrlToCrawl in $NewUrlsToCrawl)
        {
            If([String]::IsNullOrEmpty($NewUrlToCrawl))
            {
                Continue;
            }

            # First sighting: queue the URL and create its result record.
            If(-not $UrlsToCrawlResults.ContainsKey($NewUrlToCrawl))
            {
                $UrlsToCrawl.Add($NewUrlToCrawl) | Out-Null;
                $UrlsToCrawlResults[$NewUrlToCrawl] = @{
                    Message = $Null;
                    ErrorMessage = $Null;
                    AppearedOn = [System.Collections.Generic.HashSet[string]]::new();
                };
            }
            # Every sighting (first or repeated) records the referring page.
            $UrlsToCrawlResults[$NewUrlToCrawl]["AppearedOn"].Add($Url) | Out-Null;
        }
    }Catch [System.Net.WebException] {
        # Connection-level failures (DNS, timeout, refused) carry no HTTP response;
        # report the transport status in that case rather than a misleading "0".
        $FailedResponse = $_.Exception.Response;
        $Status = If ($null -ne $FailedResponse) { ([int]$FailedResponse.StatusCode).ToString() } Else { $_.Exception.Status.ToString() };
        $UrlsToCrawlResults[$Url]["ErrorMessage"] = ($CrawlIndex.ToString() + ": " + $Status + " - " + $Url);
    }Catch {
        Write-Warning ($CrawlIndex.ToString() + ": Unknown error occurred - Url: " + $Url);
        Write-Error $_.Exception;
    }
}While ($CrawlIndex -lt $MaxPages -and $UrlsToCrawl.Count -gt $CrawlIndex)


# Report: prints one line per crawled URL together with the pages it was found on.
# Successful requests go to the host, failed ones are emitted as warnings.
# URLs that were queued but never requested have neither message and are skipped.
ForEach($Url in $UrlsToCrawl)
{
    $Result = $UrlsToCrawlResults[$Url];
    $AppearedOn = "";
    # AppearedOn is a HashSet[string]; its element count is exposed as Count (it has no Length).
    If($Result["AppearedOn"].Count -gt 0)
    {
        $AppearedOn = [String]::Join(', ', $Result["AppearedOn"]);
    }
    If($Result["Message"] -ne $Null)
    {
        Write-Host ($Result["Message"] + " (Source: $AppearedOn)");
    }
    ElseIf($Result["ErrorMessage"])
    {
        Write-Warning ($Result["ErrorMessage"] + " (Source: $AppearedOn)");
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment