Skip to content

Instantly share code, notes, and snippets.

@Swimburger Swimburger/CrawlSite.ps1
Last active Jan 25, 2019

Embed
What would you like to do?
Crawl your website links and images to find broken links/images using PowerShell
# Script parameters:
#   Url             - page the crawl starts from (required).
#   MaxPages        - upper bound on the number of URLs requested before the crawl loop stops (required).
#   IncludeImages   - when $true, image sources found on each page are queued as well.
#   StayOnDomain    - when $true, only URLs whose host equals the host of Url are queued.
#   IgnoreFragments - when $true, the '#fragment' part is removed from URLs before they are queued.
Param(
    [Parameter(Mandatory = $true)]
    [string] $Url,

    [Parameter(Mandatory = $true)]
    [int] $MaxPages,

    [bool] $IncludeImages = $true,

    [bool] $StayOnDomain = $true,

    [bool] $IgnoreFragments = $true
)

# Host of the start URL; the crawl loop compares discovered URLs against it when StayOnDomain is set.
$Domain = [System.Uri]::new($Url).Host;
Function Get-AbsoluteUrl([string]$pageUrl) {
    <#
    .SYNOPSIS
    Turns each piped HREF/SRC value into an absolute URL.
    .DESCRIPTION
    Values are HTML-decoded first (e.g. '&amp;' becomes '&'), because attribute values
    scraped from raw markup still carry HTML entities. Values that are already
    well-formed absolute URIs are emitted unchanged; anything else is resolved
    against $pageUrl.
    .PARAMETER pageUrl
    Absolute URL of the page the piped values were found on; used as the base
    when resolving relative values.
    .OUTPUTS
    One absolute URL string per usable input. Null/empty inputs produce no output.
    Inputs starting with 'www' produce no output and are reported with Write-Warning.
    #>
    Begin {
        [Uri]$baseUri = [Uri]::new($pageUrl);
    }
    Process {
        If(![string]::IsNullOrEmpty($_)){
            If($_.StartsWith('www')){
                # A scheme-less host name would be resolved as a relative path, so report it instead of crawling it.
                Write-Warning "Invalid HREF: $_ found on $pageurl";
            }Else{
                # Decode HTML entities so query strings such as '?a=1&amp;b=2' are requested as '?a=1&b=2'.
                $DecodedUrl = [System.Net.WebUtility]::HtmlDecode([string]$_);
                If ([system.uri]::IsWellFormedUriString($DecodedUrl, [System.UriKind]::Absolute)) {
                    $DecodedUrl;
                }Else{
                    [Uri]::new($baseUri, [string]$DecodedUrl).AbsoluteUri;
                }
            }
        }
    }
}
Function Remove-Fragments(){
    <#
    .SYNOPSIS
    Strips the '#fragment' part from each piped URL.
    .OUTPUTS
    For a URL with a fragment: its AbsoluteUri with the fragment text removed.
    For a URL without a fragment: the piped value itself, unchanged.
    #>
    Process {
        $ParsedUrl = [Uri]::new([string]$_);
        $FragmentText = $ParsedUrl.Fragment;
        If([string]::IsNullOrEmpty($FragmentText)){
            # Nothing to strip: pass the input through exactly as received.
            $_;
        }Else{
            $ParsedUrl.AbsoluteUri.Replace($FragmentText, '');
        }
    }
}
Function UrlFilter([System.Collections.ArrayList]$UrlsToCrawl, [System.Collections.ArrayList]$UrlsCrawled){
    <#
    .SYNOPSIS
    Passes through only the piped URLs that are worth queueing.
    .DESCRIPTION
    A piped value is dropped when it is null/empty, starts with 'tel:', 'mailto:'
    or 'javascript:', or is already present (as-is or with trailing slashes
    trimmed) in either of the two lists.
    .PARAMETER UrlsToCrawl
    URLs already waiting to be requested.
    .PARAMETER UrlsCrawled
    URLs that have already been requested.
    .OUTPUTS
    Each piped value that survives the checks, unchanged.
    #>
    Process {
        $Candidate = $_;
        If (-not [string]::IsNullOrEmpty($Candidate)){
            $IsNonWebScheme = $Candidate.StartsWith("tel:") -or $Candidate.StartsWith("mailto:") -or $Candidate.StartsWith("javascript:");

            $IsKnownAsIs = ($UrlsToCrawl -contains $Candidate) -or ($UrlsCrawled -contains $Candidate);

            # 'http://host/page/' counts as a duplicate of an already known 'http://host/page'.
            $WithoutTrailingSlash = $Candidate.TrimEnd('/');
            $IsKnownTrimmed = ($UrlsToCrawl -contains $WithoutTrailingSlash) -or ($UrlsCrawled -contains $WithoutTrailingSlash);

            If (-not ($IsNonWebScheme -or $IsKnownAsIs -or $IsKnownTrimmed)){
                $Candidate;
            }
        }
    }
}
# Crawl loop: requests queued URLs one at a time, reports each status, and queues
# newly discovered links (and optionally images) until MaxPages requests have been
# made or the queue is empty.
[System.Collections.ArrayList]$UrlsToCrawl = [System.Collections.ArrayList]@($Url);
[System.Collections.ArrayList]$UrlsCrawled = [System.Collections.ArrayList]@();
$CrawlIndex = 0;
Do
{
    $Url = $UrlsToCrawl[0];
    $CrawlIndex++;
    # Move the URL from the queue to the visited list before requesting it, so the
    # bookkeeping happens exactly once whether the request succeeds or fails.
    $UrlsToCrawl.Remove($Url);
    $UrlsCrawled.Add($Url) | Out-Null;
    Try{
        $Response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 30;
        Write-Host ($CrawlIndex): $Response.StatusCode - $Url;

        # Links: make absolute, drop unusable/known URLs, then apply the optional domain and fragment rules.
        $NewHrefs = $Response.Links.href | Get-AbsoluteUrl -pageUrl $Url | UrlFilter -UrlsToCrawl $UrlsToCrawl -UrlsCrawled $UrlsCrawled;
        if($StayOnDomain){
            $NewHrefs = $NewHrefs | Where-Object { [Uri]::new($_).Host -eq $Domain };
        }
        if($IgnoreFragments){
            # Filter again: stripping the fragment can turn a new URL into an already known one.
            $NewHrefs = $NewHrefs | Remove-Fragments | UrlFilter -UrlsToCrawl $UrlsToCrawl -UrlsCrawled $UrlsCrawled | Sort-Object;
        }
        $UrlsToCrawl = @(($UrlsToCrawl + $NewHrefs) | Sort-Object | Get-Unique);

        if($IncludeImages){
            # Images go through the same steps as links.
            $NewImgs = ($Response.Images.src | Get-AbsoluteUrl -pageUrl $Url | UrlFilter -UrlsToCrawl $UrlsToCrawl -UrlsCrawled $UrlsCrawled);
            if($StayOnDomain){
                $NewImgs = $NewImgs | Where-Object { [Uri]::new($_).Host -eq $Domain };
            }
            if($IgnoreFragments){
                $NewImgs = $NewImgs | Remove-Fragments | UrlFilter -UrlsToCrawl $UrlsToCrawl -UrlsCrawled $UrlsCrawled | Sort-Object;
            }
            $UrlsToCrawl = @(($UrlsToCrawl + $NewImgs) | Sort-Object | Get-Unique);
        }
    }Catch{
        $Failure = $_;
        # HTTP error responses surface as different exception types depending on the
        # PowerShell edition, so look for a 'Response' property instead of catching one type.
        $ErrorResponse = $null;
        $ResponseProperty = $Failure.Exception.PSObject.Properties['Response'];
        If($null -ne $ResponseProperty){
            $ErrorResponse = $ResponseProperty.Value;
        }
        If($null -ne $ErrorResponse){
            # The server answered with an error status: report it as a broken link/image.
            Write-Warning ($CrawlIndex.ToString() + ": " + ([int]$ErrorResponse.StatusCode).ToString() + " - " + $Url);
        }Else{
            # No HTTP response at all (timeout, DNS failure, invalid URL, ...): surface the underlying error.
            Write-Warning (($CrawlIndex.ToString()) + ": Unknown error occurred - Url: " + $Url);
            Write-Error $Failure.Exception;
        }
    }
}While ($CrawlIndex -lt $MaxPages -and $UrlsToCrawl.Count -gt 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.