Skip to content

Instantly share code, notes, and snippets.

@SteveDesmond-ca
Created February 24, 2022 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SteveDesmond-ca/781d6adf94c0353dbc2eedb43daf2ff1 to your computer and use it in GitHub Desktop.
Save SteveDesmond-ca/781d6adf94c0353dbc2eedb43daf2ff1 to your computer and use it in GitHub Desktop.
SpeedCrawl PoC
using System.Collections.Concurrent;
using System.IO.Abstractions;
using System.Text.RegularExpressions;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
namespace SpeedCrawl.App;
/// <summary>
/// Proof-of-concept site crawler. Starting at the site root, it downloads every page reachable
/// through root-relative <c>a[href]</c> links, plus the root-relative <c>img</c>, <c>script</c>
/// and <c>link</c> resources those pages reference, into a local directory.
/// </summary>
public static class Program
{
    private static readonly HttpClient http = new();

    // Crawl state keyed by site-relative path (no leading slash, no query string or fragment).
    // false = discovered but not downloaded yet; true = downloaded.
    // Only a flag is stored so that responses can be disposed as soon as they are written.
    private static readonly ConcurrentDictionary<string, bool> pages = new();

    private static readonly IFileSystem fs = new FileSystem();

    /// <summary>Snapshot of the keys that have been discovered but not downloaded yet.</summary>
    private static IReadOnlyCollection<string> Outstanding
        => pages.Where(p => !p.Value).Select(p => p.Key).ToArray();

    /// <summary>Entry point.</summary>
    /// <param name="args"><c>args[0]</c> is the base URL of the site, <c>args[1]</c> the output directory.</param>
    /// <returns>0 when the crawl finished, 1 when the arguments are missing.</returns>
    public static async Task<int> Main(string[] args)
    {
        if (args.Length < 2)
        {
            Console.Error.WriteLine("Usage: SpeedCrawl <base-url> <output-directory>");
            return 1;
        }

        var url = args[0];
        http.BaseAddress = new Uri(url);
        var dir = args[1];
        fs.Directory.CreateDirectory(dir);

        // The empty key is the site root.
        pages.TryAdd("", false);

        // Take one snapshot per pass; pages discovered during a pass are picked up by the next one.
        var outstanding = Outstanding;
        while (outstanding.Count > 0)
        {
            Console.WriteLine(outstanding.Count + " pages remaining");
            foreach (var key in outstanding)
            {
                await ParsePage(key, dir);
            }

            outstanding = Outstanding;
        }

        Console.WriteLine($"Wrote {pages.Count} files");
        return 0;
    }

    /// <summary>
    /// Downloads one page, stores it as <c>index.html</c> in a directory named after its key,
    /// queues the links it contains and downloads the resources it references.
    /// </summary>
    /// <param name="key">Site-relative path of the page (empty for the root).</param>
    /// <param name="dir">Output directory.</param>
    private static async Task ParsePage(string key, string dir)
    {
        using var response = await http.GetAsync(key);
        pages[key] = true;

        var subdir = fs.Path.Combine(dir, key);
        fs.Directory.CreateDirectory(subdir);
        var file = fs.Path.Combine(subdir, "index.html");

        var bytes = await response.Content.ReadAsByteArrayAsync();

        using var stream = await response.Content.ReadAsStreamAsync();
        var document = await new HtmlParser().ParseDocumentAsync(stream);
        AddLinks(document);

        // The page write is started only after parsing succeeded so that no task is left
        // unobserved if parsing throws; it then runs alongside the resource downloads.
        await Task.WhenAll(
            fs.File.WriteAllBytesAsync(file, bytes),
            SaveFiles(document, "img", "src", dir),
            SaveFiles(document, "script", "src", dir),
            SaveFiles(document, "link", "href", dir));

        Console.WriteLine($"Wrote {key}");
    }

    /// <summary>Queues every not-yet-seen root-relative link of <paramref name="document"/> for crawling.</summary>
    private static void AddLinks(IParentNode document)
    {
        foreach (var link in document.QuerySelectorAll("a[href]"))
        {
            var href = link.Attributes["href"]?.Value ?? string.Empty;
            if (!TryGetLocalPath(href, out var filename))
            {
                continue;
            }

            // TryAdd is a single atomic check-and-insert; it returns false for known keys.
            if (pages.TryAdd(filename, false))
            {
                Console.WriteLine($"Found {href}");
            }
        }
    }

    /// <summary>
    /// Downloads every not-yet-seen root-relative resource referenced by the given tag/attribute
    /// pair and writes it below <paramref name="dir"/>, mirroring the URL path.
    /// </summary>
    /// <param name="document">Parsed page to scan.</param>
    /// <param name="tagSelector">Element name to look for, e.g. <c>img</c>.</param>
    /// <param name="attribute">Attribute holding the URL, e.g. <c>src</c>.</param>
    /// <param name="dir">Output directory.</param>
    private static async Task SaveFiles(IParentNode document, string tagSelector, string attribute, string dir)
    {
        var tags = document.QuerySelectorAll($"{tagSelector}[{attribute}]");
        foreach (var tag in tags)
        {
            var href = tag.Attributes[attribute]?.Value ?? string.Empty;
            if (!TryGetLocalPath(href, out var filename) || pages.ContainsKey(filename))
            {
                continue;
            }

            Console.WriteLine($"Found {href}");
            using var response = await http.GetAsync(href);
            var bytes = await response.Content.ReadAsByteArrayAsync();

            // Several SaveFiles calls run concurrently and may have raced on the same resource
            // while this one was downloading; only the call that wins the TryAdd writes the file.
            if (!pages.TryAdd(filename, true))
            {
                continue;
            }

            var file = fs.Path.Combine(dir, filename);

            // Resources usually live in subfolders (css/, js/, ...) that do not exist yet.
            var parent = fs.Path.GetDirectoryName(file);
            if (!string.IsNullOrEmpty(parent))
            {
                fs.Directory.CreateDirectory(parent);
            }

            await fs.File.WriteAllBytesAsync(file, bytes);
            Console.WriteLine($"Wrote {href}");
        }
    }

    /// <summary>
    /// Converts a root-relative URL (<c>/a/b?x=1#y</c>) into the relative path used both as the
    /// crawl key and as the location below the output directory (<c>a/b</c>).
    /// </summary>
    /// <param name="href">Raw attribute value taken from the page; may be empty.</param>
    /// <param name="path">The relative path when the method returns <c>true</c>; otherwise empty.</param>
    /// <returns>
    /// <c>false</c> for anything that is not a root-relative URL of this site or that could
    /// resolve outside the output directory.
    /// </returns>
    private static bool TryGetLocalPath(string href, out string path)
    {
        path = string.Empty;

        // Checked before slicing so an empty value cannot throw. "//host/..." is
        // protocol-relative, i.e. it points at another host, so it is not crawled.
        if (!href.StartsWith('/') || href.StartsWith("//", StringComparison.Ordinal))
        {
            return false;
        }

        // Drop the leading slash, then the query string and fragment.
        var candidate = href[1..].Split('?', '#')[0];

        // The value comes from untrusted HTML: a rooted path makes Path.Combine discard the
        // output directory, and ".." segments walk out of it.
        if (fs.Path.IsPathRooted(candidate) || candidate.Split('/', '\\').Contains(".."))
        {
            return false;
        }

        path = candidate;
        return true;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment