-
-
Save thedavecarroll/8e0de90d498308381ac38c3b2003c9ac to your computer and use it in GitHub Desktop.
function Test-MarkdownFileLinks {
    <#
    .SYNOPSIS
        Finds the links in one or more markdown files and tests whether they respond.

    .DESCRIPTION
        Each file is read in full and matched against a combined regular expression
        covering five link styles: Inline, InlineWithTitle, AngleBracket, Reference
        and Relative. Every link found is emitted as a [PsCustomObject]. Links that
        are not skipped are requested with Invoke-WebRequest and the resulting
        status code (or the error message, when the request fails) is reported.

        Relative links, links whose host is 'localhost', and links listed in
        -SkipUri are reported with a StatusCode of 'Skipped' and are not requested.

    .PARAMETER MarkdownFile
        One or more paths to markdown files. Each path must exist.

    .PARAMETER SkipUri
        Links that should be reported as 'Skipped' instead of being requested.
        The comparison is case-insensitive and ignores a trailing '/'.

    .PARAMETER ShowProgress
        Displays a progress record naming the file currently being checked.

    .OUTPUTS
        [PsCustomObject] with the properties Name, FullName, LineNumber, Line,
        LinkType, Url and StatusCode.
    #>
    [CmdLetBinding()]
    param(
        [ValidateScript({Test-Path -Path $_})]
        [string[]]$MarkdownFile,
        [uri[]]$SkipUri,
        [switch]$ShowProgress
    )
    begin {
        # One pattern per link style; the named group in each holds the link itself.
        $InlineLink = '(?:]\s*\(\s*)(?<Inline>(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*))(?:\s*\)\s*)'
        $InlineWithTitleLink = '(?:]\s*\(\s*)(?<InlineWithTitle>\S+)(?:\s+(?:"|'').*\)\s*)'
        $AngleBracketLink = '(<(?<AngleBracket>((http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)))>)'
        $RelativeLink = '(?:]\s*\((?<Relative>\/\S+|\.+\/\S+|\w+\/\S+)\))'
        $ReferenceLink = '(?:\S+\]:\s*<?\s*)(?<Reference>(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*))(?<!>)'
        $EmailAddress = '^[a-zA-Z0-9.!#$%&''*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$'

        # The combined regex does not depend on the file, so build it only once.
        $LinkRegex = [regex]::new("$InlineLink|$InlineWithTitleLink|$ReferenceLink|$AngleBracketLink|$RelativeLink")

        # Normalise the skip list to strings without a trailing slash so that
        # 'https://example.com' and 'https://example.com/' are treated alike.
        $SkipList = @(
            foreach ($Uri in $SkipUri) {
                if ($Uri) {
                    $Uri.OriginalString.TrimEnd('/')
                }
            }
        )

        $FileCount = 1
    }
    process {
        foreach ($Markdown in $MarkdownFile) {
            $LinkCount = $InlineCount = $InlineWithTitleCount = $AngleBracketCount = $ReferenceCount = $RelativeCount = 0
            $File = Get-ChildItem -Path $Markdown

            if ($ShowProgress) {
                $ProgressParams = @{
                    Activity = 'Checking Links'
                    Status   = '{0} - {1}/{2}' -f $File.Name,$FileCount,$MarkdownFile.Count
                }
                Write-Progress @ProgressParams
            }

            $Content = Get-Content -Path $File.FullName -Raw
            # @() keeps a one-line file as an array; indexing a bare string
            # would return a single character instead of the line.
            $ContentLines = @(Get-Content -Path $File.FullName)

            # An empty file yields $null from Get-Content -Raw, which Regex.Matches rejects.
            $LinkMatches = @()
            if (-not [string]::IsNullOrEmpty($Content)) {
                $LinkMatches = $LinkRegex.Matches($Content)
            }

            foreach ($Link in $LinkMatches) {
                $StatusCode = $null

                # Only one alternative of the combined regex can match, so at most
                # one of the named link-type groups is successful.
                $LinkMatch = $Link.Groups |
                    Where-Object { $_.Success -and $_.Name -match 'Inline|Angle|Rel|Ref' } |
                    Select-Object -First 1

                # Only continue processing for named groups from regex for each link type
                if ($null -eq $LinkMatch) {
                    continue
                }

                # The AngleBracket regex also grabs email addresses, so we need to ignore those.
                if ($LinkMatch.Name -eq 'AngleBracket' -and $LinkMatch.Value -match $EmailAddress) {
                    continue
                }

                # The line number is one more than the number of line feeds that
                # precede the start of the match in the raw file content.
                $LineNumber = ($Content.Substring(0, $Link.Index) -split "`n").Count

                # Relative links cannot be validated.
                if ($LinkMatch.Name -eq 'Relative') {
                    $StatusCode = 'Skipped'
                }

                # Skip links the caller asked to ignore.
                if ($StatusCode -ne 'Skipped' -and $SkipList -contains $LinkMatch.Value.TrimEnd('/')) {
                    $StatusCode = 'Skipped'
                }

                # Skip any Url with host of localhost. Host is only meaningful for
                # absolute URIs, so relative values are never inspected.
                if ($StatusCode -ne 'Skipped') {
                    $ParsedUri = $LinkMatch.Value -as [uri]
                    if ($ParsedUri -and $ParsedUri.IsAbsoluteUri -and $ParsedUri.Host -eq 'localhost') {
                        $StatusCode = 'Skipped'
                    }
                }

                # Test the Url
                if ($StatusCode -ne 'Skipped') {
                    $OriginalProgress = $ProgressPreference
                    try {
                        # Silence Invoke-WebRequest's own progress bar for the request.
                        $ProgressPreference = 'SilentlyContinue'
                        # -UseBasicParsing avoids the Internet Explorer dependency in Windows PowerShell 5.1.
                        $StatusCode = (Invoke-WebRequest -Uri $LinkMatch.Value -UseBasicParsing -Verbose:$false).StatusCode
                    }
                    catch {
                        $StatusCode = $_.Exception.Message
                    }
                    finally {
                        # Restore even after a failed request so -ShowProgress keeps working.
                        $ProgressPreference = $OriginalProgress
                    }
                }

                # Output the result to the pipeline
                [PsCustomObject]@{
                    Name       = $File.Name
                    FullName   = $File.FullName
                    LineNumber = $LineNumber
                    Line       = $ContentLines[$LineNumber-1]
                    LinkType   = $LinkMatch.Name
                    Url        = $LinkMatch.Value
                    StatusCode = $StatusCode
                }

                # Increment counters
                $LinkCount++
                switch ($LinkMatch.Name) {
                    'Inline'          { $InlineCount++ }
                    'InlineWithTitle' { $InlineWithTitleCount++ }
                    'AngleBracket'    { $AngleBracketCount++ }
                    'Reference'       { $ReferenceCount++ }
                    'Relative'        { $RelativeCount++ }
                }
            }

            'Found {0} links in {1}' -f $LinkCount,$File.FullName | Write-Verbose
            $LinkTypeCount = 'Inline({0}) : InlineWithTitle({1}) : AngleBracket({2}) : Reference({3}) : Relative({4})' -f $InlineCount,$InlineWithTitleCount,$AngleBracketCount,$ReferenceCount,$RelativeCount
            $LinkTypeCount | Write-Verbose
            $FileCount++
        }
    }
    end {
    }
}
Introduction
I recently needed to verify links in several markdown files so I wrote this function.
Knowing that regex would be the way to go, I started building my regex statement for each link type. I'm using named groups in each statement, which helps with processing. If you haven't used named groups in regex, you should check them out.
Originally, I was checking each line for each of the link types but, during some of the testing, I eventually switched to matching against the entire file. Then for each match, I'm looping through each line to find the line number where it starts. I did look into using regex to generate the numbers, but it's not an easy thing to do (for a regex novice). I already had a headache and that would have compounded it, I'm sure.
Output
[PsCustomObject]
Property | Description |
---|---|
Name | name of the file |
FullName | fullname of the file |
LineNumber | the line number where the link was found |
Line | the full line in the file where the link was found |
LinkType | the link type (see below) |
Url | the url or link |
StatusCode | Skipped, the status code returned from Invoke-WebRequest, or the error message if the request failed |
-Verbose
VERBOSE: Found # links in <full file name>
VERBOSE: Inline(#) : InlineWithTitle(#) : AngleBracket(#) : Reference(#) : Relative(#)
Link Types
Here are the link types that can now be captured and verified, with sample output.
Inline
[I'm an inline-style link](https://www.google.com)
Name : MarkdownTest.md
FullName : D:\Development\PowerShell\MarkdownTest.md
LineNumber : 1
Line : [I'm an inline-style link](https://www.google.com)
Url : https://www.google.com
LinkType : Inline
StatusCode : 200
InlineWithTitle
[I'm an inline-style link with title](https://www.google.com "Google's Homepage")
Name : MarkdownTest.md
FullName : D:\Development\PowerShell\MarkdownTest.md
LineNumber : 3
Line : [I'm an inline-style link with title](https://www.google.com "Google's Homepage")
Url : https://www.google.com
LinkType : InlineWithTitle
StatusCode : 200
Reference
[arbitrary case-insensitive reference text]: https://www.mozilla.org
[1]: http://slashdot.org
[link text itself]: http://www.reddit.com
Name : MarkdownTest.md
FullName : D:\Development\PowerShell\MarkdownTest.md
LineNumber : 20
Line : [1]: http://slashdot.org
Url : http://slashdot.org
LinkType : Reference
StatusCode : 200
Relative
[I'm a relative reference to a repository file](../blob/master/LICENSE)
Name : MarkdownTest.md
FullName : D:\Development\PowerShell\MarkdownTest.md
LineNumber : 7
Line : [I'm a relative reference to a repository file](../blob/master/LICENSE)
Url : ../blob/master/LICENSE
LinkType : Relative
StatusCode : Skipped
AngleBracket
http://www.example.com or <http://www.example.com> and sometimes
Name : MarkdownTest.md
FullName : D:\Development\PowerShell\MarkdownTest.md
LineNumber : 16
Line : http://www.example.com or <http://www.example.com> and sometimes
Url : http://www.example.com
LinkType : AngleBracket
StatusCode : 200
RawUrl
I had to remove the RawUrl checking from the function as it was producing too many false positives.
HTML Comments
When I added the AngleBracket link type, I failed to consider HTML comments. They are now ignored with an updated Regex for this link type.
Example
<!-- vale Microsoft.We = YES -->
Summary
If you find some links that are not caught or false positives, please leave a comment with an example of a full line that contains the link. If I have time, I will work on correcting that case.
I hope you found this function useful, and if so, please leave a comment.
Notes
- This function has been tested with Windows PowerShell 5.1 and PowerShell 7 RC2.
- Performance issues could arise while scanning a large number of files and content.
- As with most freely shared code, this comes as-is and without warranty or guarantee.
Updated to properly handle inline links without http/https at the beginning of the link.
Both of these should work now.
And this shouldn't.