Skip to content

Instantly share code, notes, and snippets.

@russcam
Created January 30, 2019 00:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save russcam/6ec15f171d299898903ef9da78791f12 to your computer and use it in GitHub Desktop.
Save russcam/6ec15f171d299898903ef9da78791f12 to your computer and use it in GitHub Desktop.
Checks for broken links in HTML pages
<Query Kind="Program">
<Reference>&lt;RuntimeDirectory&gt;\System.Net.Http.dll</Reference>
<NuGetReference>HtmlAgilityPack</NuGetReference>
<Namespace>HtmlAgilityPack</Namespace>
<Namespace>System.Net.Http</Namespace>
<Namespace>System.Threading.Tasks</Namespace>
<Namespace>System.Collections.Concurrent</Namespace>
</Query>
/// <summary>
/// LINQPad entry point. Scans every <c>*.html</c> file under <c>directory</c>
/// (recursively), issues an HTTP HEAD request for each absolute <c>http</c>/<c>https</c>
/// anchor href, and dumps a map of file path -> set of links that look broken.
/// A link is treated as broken when the server answers 404 Not Found or when the
/// request fails with any exception (invalid URI, DNS failure, timeout, ...).
/// </summary>
void Main()
{
    var directory = "INPUT DIRECTORY HERE";

    // Links listed here are skipped entirely (exact string match on the href).
    var excludeLinks = new HashSet<string>();

    var badLinks = new ConcurrentDictionary<string, HashSet<string>>();
    var files = Directory.EnumerateFiles(directory, "*.html", SearchOption.AllDirectories).ToList();

    // One shared client for all requests; disposed once every file has been processed.
    using (var client = new HttpClient())
    {
        Parallel.ForEach(files, html =>
        {
            var document = new HtmlDocument();
            document.Load(html);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so a page without any <a> element must be skipped explicitly.
            var anchors = document.DocumentNode.SelectNodes("//a");
            if (anchors == null)
            {
                return;
            }

            foreach (var anchor in anchors)
            {
                var href = anchor.GetAttributeValue("href", string.Empty);

                // Only absolute http(s) links are checked; relative links, mailto:, etc. are ignored.
                if (string.IsNullOrEmpty(href)
                    || !href.StartsWith("http", StringComparison.Ordinal)
                    || excludeLinks.Contains(href))
                {
                    continue;
                }

                bool broken;
                try
                {
                    // Parallel.ForEach bodies are synchronous, so the request is blocked on here.
                    using (var request = new HttpRequestMessage(HttpMethod.Head, new Uri(href)))
                    using (var response = client.SendAsync(request).Result)
                    {
                        broken = response.StatusCode == System.Net.HttpStatusCode.NotFound;
                    }
                }
                catch (Exception)
                {
                    // Any failure to build or complete the request counts as a bad link.
                    broken = true;
                }

                if (broken)
                {
                    // Each file is handled by exactly one loop iteration, so the per-file
                    // HashSet is only ever mutated by the thread processing that file.
                    badLinks.GetOrAdd(html, _ => new HashSet<string>()).Add(href);
                }
            }
        });
    }

    badLinks.Dump();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment