Last active
February 2, 2020 21:36
-
-
Save thedavecarroll/8e0de90d498308381ac38c3b2003c9ac to your computer and use it in GitHub Desktop.
Test Links in Markdown Files with PowerShell (and Regex)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function Test-MarkdownFileLinks {
    <#
    .SYNOPSIS
    Finds the links in one or more markdown files and reports the HTTP status of each.

    .DESCRIPTION
    The whole file is matched against a single regex that combines the Inline,
    InlineWithTitle, Reference, AngleBracket and Relative link patterns. One
    object is written to the pipeline per link found.

    Relative links, links whose host is 'localhost' and links listed in -SkipUri
    are not requested; they are reported with a StatusCode of 'Skipped'. All other
    links are requested with Invoke-WebRequest; on failure the exception message
    is reported in place of the status code.

    .PARAMETER MarkdownFile
    Path(s) of the markdown file(s) to check. Each path must exist.

    .PARAMETER SkipUri
    Uri(s) that must not be requested. A link is skipped when its text equals
    (case-insensitively) either the string supplied here or its normalized
    absolute form.

    .PARAMETER ShowProgress
    Display a progress bar naming the file currently being checked.

    .OUTPUTS
    PSCustomObject with the properties Name, FullName, LineNumber, Line, LinkType,
    Url and StatusCode.
    #>
    [CmdLetBinding()]
    param(
        [ValidateScript({Test-Path -Path $_})]
        [string[]]$MarkdownFile,
        [uri[]]$SkipUri,
        [switch]$ShowProgress
    )
    begin {
        $InlineLink = '(?:]\s*\(\s*)(?<Inline>(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*))(?:\s*\)\s*)'
        $InlineWithTitleLink = '(?:]\s*\(\s*)(?<InlineWithTitle>\S+)(?:\s+(?:"|'').*\)\s*)'
        $AngleBracketLink = '(<(?<AngleBracket>((http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)))>)'
        $RelativeLink = '(?:]\s*\((?<Relative>\/\S+|\.+\/\S+|\w+\/\S+)\))'
        $ReferenceLink = '(?:\S+\]:\s*<?\s*)(?<Reference>(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*))(?<!>)'
        $EmailAddress = '^[a-zA-Z0-9.!#$%&''*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$'

        # The combined regex is the same for every file, so build it once.
        $LinkRegex = [regex]::new("$InlineLink|$InlineWithTitleLink|$ReferenceLink|$AngleBracketLink|$RelativeLink")

        # Named groups that identify a link type; numbered groups are ignored.
        $LinkTypeNames = 'Inline', 'InlineWithTitle', 'AngleBracket', 'Reference', 'Relative'

        # Case-insensitive lookup of every url the caller asked us not to request.
        $SkipList = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
        foreach ($Uri in $SkipUri) {
            if ($null -eq $Uri) {
                continue
            }
            [void]$SkipList.Add($Uri.OriginalString)
            # AbsoluteUri is only defined for absolute uris.
            if ($Uri.IsAbsoluteUri) {
                [void]$SkipList.Add($Uri.AbsoluteUri)
            }
        }

        $FileCount = 1
    }
    process {
        foreach ($Markdown in $MarkdownFile) {
            $LinkCount = $InlineCount = $InlineWithTitleCount = $AngleBracketCount = $ReferenceCount = $RelativeCount = 0
            $File = Get-ChildItem -Path $Markdown

            if ($ShowProgress) {
                $ProgressParams = @{
                    Activity = 'Checking Links'
                    Status   = '{0} - {1}/{2}' -f $File.Name,$FileCount,$MarkdownFile.Count
                }
                Write-Progress @ProgressParams
            }

            # Read the file once. The [string] constraint turns the $null returned
            # for an empty file into '', and -split always yields an array, so
            # indexing a single-line file returns the line rather than a character.
            [string]$Content = Get-Content -Path $File.FullName -Raw
            $ContentLines = $Content -split '\r?\n'

            # Match the regex against the entire file
            $LinkMatches = $LinkRegex.Matches($Content)

            # Matches come back in document order, so the line number is tracked
            # by counting the newlines that precede each match, resuming from
            # where the previous match left off.
            $LineNumber = 1
            $ScanPosition = 0

            foreach ($Link in $LinkMatches) {
                while ($true) {
                    $NextNewline = $Content.IndexOf([char]10, $ScanPosition)
                    if ($NextNewline -lt 0 -or $NextNewline -ge $Link.Index) {
                        break
                    }
                    $LineNumber++
                    $ScanPosition = $NextNewline + 1
                }

                $StatusCode = $null

                # Only continue processing for named groups from regex for each link type
                $Found = $Link.Groups.Where({ $_.Success -and $_.Name -in $LinkTypeNames }, 'First')
                if ($Found.Count -eq 0) {
                    continue
                }
                $LinkMatch = $Found[0]

                # The AngleBracket regex also grabs email addresses, so we need to ignore those.
                if ($LinkMatch.Name -eq 'AngleBracket' -and $LinkMatch.Value -match $EmailAddress) {
                    continue
                }

                if ($LinkMatch.Name -eq 'Relative') {
                    # Relative links cannot be validated.
                    $StatusCode = 'Skipped'
                }
                elseif ($SkipList.Contains($LinkMatch.Value)) {
                    # The caller asked for this url not to be requested.
                    $StatusCode = 'Skipped'
                }
                else {
                    # Skip any Url with host of localhost. Host is only read for
                    # absolute uris; it is not defined for relative ones.
                    $Target = $LinkMatch.Value -as [uri]
                    if ($null -ne $Target -and $Target.IsAbsoluteUri -and $Target.Host -eq 'localhost') {
                        $StatusCode = 'Skipped'
                    }
                }

                # Test the Url
                if ($StatusCode -ne 'Skipped') {
                    $OriginalProgress = $ProgressPreference
                    try {
                        # Hide Invoke-WebRequest's own progress bar.
                        $ProgressPreference = 'SilentlyContinue'
                        $StatusCode = (Invoke-WebRequest -Uri $LinkMatch.Value -Verbose:$false).StatusCode
                    }
                    catch {
                        $StatusCode = $_.Exception.Message
                    }
                    finally {
                        # Restore even when the request fails; otherwise this
                        # function's own Write-Progress stays silenced.
                        $ProgressPreference = $OriginalProgress
                    }
                }

                # Output the result to the pipeline
                [PsCustomObject]@{
                    Name       = $File.Name
                    FullName   = $File.FullName
                    LineNumber = $LineNumber
                    Line       = $ContentLines[$LineNumber - 1]
                    LinkType   = $LinkMatch.Name
                    Url        = $LinkMatch.Value
                    StatusCode = $StatusCode
                }

                # Increment counters
                $LinkCount++
                switch ($LinkMatch.Name) {
                    'Inline'          { $InlineCount++ }
                    'InlineWithTitle' { $InlineWithTitleCount++ }
                    'AngleBracket'    { $AngleBracketCount++ }
                    'Reference'       { $ReferenceCount++ }
                    'Relative'        { $RelativeCount++ }
                }
            }

            'Found {0} links in {1}' -f $LinkCount,$File.FullName | Write-Verbose
            $LinkTypeCount = 'Inline({0}) : InlineWithTitle({1}) : AngleBracket({2}) : Reference({3}) : Relative({4})' -f $InlineCount,$InlineWithTitleCount,$AngleBracketCount,$ReferenceCount,$RelativeCount
            $LinkTypeCount | Write-Verbose
            $FileCount++
        }
    }
    end {
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Introduction
I recently needed to verify links in several markdown files so I wrote this function.
Knowing that regex would be the way to go, I started building my regex statement for each link type. I'm using named groups in each statement which helps with processing. If you haven't used named groups in regex, you should check it out.
Originally, I was checking each line for each of the link types but, during some of the testing, I eventually switched to matching against the entire file. Then, for each match, I loop through each line to find the line number where it starts. I did look into using regex to generate the line numbers, but it's not an easy thing to do (for a regex novice). I already had a headache and that would have compounded it, I'm sure.
Output
[PsCustomObject]
Invoke-WebRequest
-Verbose
Link Types
Here are the link types that can now be captured and verified, with sample output.
Inline
InlineWithTitle
Reference
Relative
AngleBracket
RawUrl
I had to remove the RawUrl checking from the function as it was producing too many false positives.
HTML Comments
When I added the AngleBracket link type, I failed to consider HTML comments. They are now ignored with an updated Regex for this link type.
Example
Summary
If you find some links that are not caught or false positives, please leave a comment with an example of a full line that contains the link. If I have time, I will work on correcting that case.
I hope you found this function useful, and if so, please leave a comment.
Notes