Created
February 24, 2022 17:29
-
-
Save SteveDesmond-ca/781d6adf94c0353dbc2eedb43daf2ff1 to your computer and use it in GitHub Desktop.
SpeedCrawl PoC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Concurrent; | |
using System.IO.Abstractions; | |
using System.Text.RegularExpressions; | |
using AngleSharp.Dom; | |
using AngleSharp.Html.Parser; | |
namespace SpeedCrawl.App; | |
/// <summary>
/// Proof-of-concept site crawler. Starting from the root of a base URL it downloads every
/// root-relative page it can reach through anchor links, saves each page as
/// <c>index.html</c> under a directory named after its path, and saves the images, scripts
/// and linked resources those pages reference.
/// </summary>
public static class Program
{
    private static readonly HttpClient http = new();

    // Key: site path without the leading "/" and without query/fragment.
    // Value: null while a page is discovered but not yet downloaded; non-null once the
    // entry has been downloaded (or claimed by an in-progress asset download).
    private static readonly ConcurrentDictionary<string, HttpContent?> pages = new();

    private static readonly IFileSystem fs = new FileSystem();

    // Non-null placeholder used to claim an asset key before its download completes, so the
    // entry is never mistaken for an outstanding page and is never downloaded twice.
    private static readonly HttpContent inFlight = new ByteArrayContent(Array.Empty<byte>());

    // Characters that end the path portion of a URL.
    private static readonly char[] queryOrFragment = { '?', '#' };

    /// <summary>Snapshot of the pages that have been discovered but not yet downloaded.</summary>
    private static IReadOnlyCollection<KeyValuePair<string, HttpContent?>> Outstanding
        => pages.Where(p => p.Value is null).ToArray();

    /// <summary>
    /// Crawls the site at <c>args[0]</c> and writes the result into directory <c>args[1]</c>.
    /// </summary>
    /// <param name="args">Base URL followed by the output directory.</param>
    /// <returns>0 on success; 1 when the required arguments are missing.</returns>
    public static async Task<int> Main(string[] args)
    {
        if (args.Length < 2)
        {
            Console.Error.WriteLine("Usage: SpeedCrawl <base-url> <output-directory>");
            return 1;
        }

        var url = args[0];
        http.BaseAddress = new Uri(url);
        var dir = args[1];
        fs.Directory.CreateDirectory(dir);

        // Seed the crawl with the site root.
        pages.TryAdd("", null);

        // Each pass processes the pages known at its start; pages discovered during the
        // pass are picked up by the next snapshot.
        for (var batch = Outstanding; batch.Count > 0; batch = Outstanding)
        {
            Console.WriteLine(batch.Count + " pages remaining");
            foreach (var page in batch)
            {
                await ParsePage(page.Key, dir);
            }
        }

        Console.WriteLine($"Wrote {pages.Count} files");
        return 0;
    }

    /// <summary>
    /// Downloads the page at <paramref name="key"/>, saves it as
    /// <c>{dir}/{key}/index.html</c>, queues the links it contains and saves its assets.
    /// </summary>
    /// <param name="key">Page path relative to the base address.</param>
    /// <param name="dir">Root output directory.</param>
    private static async Task ParsePage(string key, string dir)
    {
        var parser = new HtmlParser();

        // Disposed on exit so the buffered body is released; the dictionary entry is only
        // ever compared against null afterwards.
        using var response = await http.GetAsync(key);
        pages[key] = response.Content;

        var subdir = fs.Path.Combine(dir, key);
        fs.Directory.CreateDirectory(subdir);
        var file = fs.Path.Combine(subdir, "index.html");

        var bytes = await response.Content.ReadAsByteArrayAsync();
        var writer = fs.File.WriteAllBytesAsync(file, bytes);

        var stream = await response.Content.ReadAsStreamAsync();
        var document = await parser.ParseDocumentAsync(stream);
        AddLinks(document);

        // The page write is awaited together with the asset downloads so that a failure in
        // any one of them cannot leave the write task unobserved.
        await Task.WhenAll(
            writer,
            SaveFiles(document, "img", "src", dir),
            SaveFiles(document, "script", "src", dir),
            SaveFiles(document, "link", "href", dir));

        Console.WriteLine($"Wrote {key}");
    }

    /// <summary>
    /// Queues every not-yet-seen, root-relative anchor target in <paramref name="document"/>
    /// as an outstanding page.
    /// </summary>
    /// <param name="document">Parsed page to scan for anchors with an href attribute.</param>
    private static void AddLinks(IParentNode document)
    {
        foreach (var link in document.QuerySelectorAll("a[href]"))
        {
            var href = link.Attributes["href"]?.Value ?? string.Empty;

            // TryAdd returns false when the path is already known, so each page is queued once.
            if (!TryGetLocalPath(href, out var filename) || !pages.TryAdd(filename, null))
            {
                continue;
            }

            Console.WriteLine($"Found {href}");
        }
    }

    /// <summary>
    /// Downloads and saves every not-yet-seen, root-relative resource referenced by
    /// <paramref name="attribute"/> on <paramref name="tagSelector"/> elements.
    /// </summary>
    /// <param name="document">Parsed page to scan.</param>
    /// <param name="tagSelector">Element name to look for, e.g. <c>img</c>.</param>
    /// <param name="attribute">Attribute holding the resource URL, e.g. <c>src</c>.</param>
    /// <param name="dir">Root output directory.</param>
    private static async Task SaveFiles(IParentNode document, string tagSelector, string attribute, string dir)
    {
        var tags = document.QuerySelectorAll($"{tagSelector}[{attribute}]");
        foreach (var tag in tags)
        {
            var href = tag.Attributes[attribute]?.Value ?? string.Empty;

            // Claim the key atomically BEFORE the download: several SaveFiles calls run
            // concurrently, and a separate ContainsKey check followed by Add could let two
            // of them fetch the same asset and then throw on the duplicate Add.
            if (!TryGetLocalPath(href, out var filename) || !pages.TryAdd(filename, inFlight))
            {
                continue;
            }

            Console.WriteLine($"Found {href}");

            using var response = await http.GetAsync(href);
            pages[filename] = response.Content;
            var bytes = await response.Content.ReadAsByteArrayAsync();

            var file = fs.Path.Combine(dir, filename);

            // Assets usually live in sub-folders (e.g. "css/site.css"); make sure the
            // folder exists before writing.
            var parent = fs.Path.GetDirectoryName(file);
            if (!string.IsNullOrEmpty(parent))
            {
                fs.Directory.CreateDirectory(parent);
            }

            await fs.File.WriteAllBytesAsync(file, bytes);
            Console.WriteLine($"Wrote {href}");
        }
    }

    /// <summary>
    /// Converts a root-relative URL into the relative path used as dictionary key and file
    /// location: the leading "/" is dropped and any query string or fragment is removed.
    /// </summary>
    /// <param name="href">Raw attribute value taken from the document.</param>
    /// <param name="path">The resulting relative path, or an empty string on failure.</param>
    /// <returns>
    /// <c>false</c> for empty values, URLs that are not root-relative, protocol-relative
    /// URLs ("//host/...") and paths containing ".." segments; otherwise <c>true</c>.
    /// </returns>
    private static bool TryGetLocalPath(string href, out string path)
    {
        path = string.Empty;

        // "//host/x" points at another host, and its remainder "/host/x" is a rooted path
        // that would make Path.Combine ignore the output directory entirely.
        if (href.Length == 0 || href[0] != '/' || href.StartsWith("//", StringComparison.Ordinal))
        {
            return false;
        }

        var end = href.IndexOfAny(queryOrFragment);
        var candidate = end < 0 ? href[1..] : href[1..end];

        // ".." segments could climb out of the output directory.
        if (candidate.Split('/').Contains(".."))
        {
            return false;
        }

        path = candidate;
        return true;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment