Skip to content

Instantly share code, notes, and snippets.

@deanebarker
Last active April 1, 2022 11:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deanebarker/4815459ca3cd7940247da925caa404da to your computer and use it in GitHub Desktop.
Save deanebarker/4815459ca3cd7940247da925caa404da to your computer and use it in GitHub Desktop.
Replace the body of a response with HTML extracted from that same body

This is a .NET 5 middleware component that extracts HTML elements matching a CSS-style selector passed in a querystring argument and returns them as the body of the response.

/my-page?extract=article

Extracts the outer HTML (the default) of all ARTICLE tags and concatenates them.

/my-page?extract=article&scope=inner

The same, except the inner HTML.

If the selector finds multiple matching tags, it will concatenate them all. If you only need the first, you need to narrow your CSS selector so that it only matches the first.

Uses the AngleSharp library, installable via NuGet. The extract argument will take any selector syntax that AngleSharp supports, which is most CSS selectors.

https://github.com/AngleSharp/

This needs to be configured as middleware in Startup.cs, like this:

/// <summary>
/// Startup hook that adds <c>ExtractFragmentMiddleware</c> to the request pipeline.
/// If the application registers other middleware, add this one ahead of them;
/// it was only tried as the first entry in the pipeline.
/// </summary>
public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
  => app.UseMiddleware<ExtractFragmentMiddleware>();

Note: it needs to be placed early in the pipeline. Honestly, I don't know how early, but I configured it as the first middleware in the pipeline, and it worked consistently.

using AngleSharp.Html.Parser;
using Microsoft.AspNetCore.Http;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
namespace DeaneBarker.Optimizely.Middleware
{
    /// <summary>
    /// Middleware that replaces the response body with the HTML of the elements
    /// matched by a CSS selector supplied in the <c>extract</c> querystring argument.
    /// The optional <c>scope</c> argument selects inner HTML (<c>scope=inner</c>);
    /// any other value, or no value, yields outer HTML.
    /// </summary>
    public class ExtractFragmentMiddleware
    {
        // Querystring argument names and the one recognised scope value.
        private const string ExtractionPathKey = "extract";
        private const string ScopeKey = "scope";
        private const string InnerScope = "inner";

        // A single parser instance is reused for every request, as in the original code.
        private static readonly HtmlParser _parser = new HtmlParser();

        private readonly RequestDelegate _next;

        /// <summary>Creates the middleware.</summary>
        /// <param name="next">The next delegate in the request pipeline.</param>
        public ExtractFragmentMiddleware(RequestDelegate next)
        {
            _next = next;
        }

        /// <summary>
        /// Runs the rest of the pipeline. When a usable <c>extract</c> argument is
        /// present, the downstream response body is captured, reduced to the matching
        /// elements, and written to the real response stream as <c>text/html</c>.
        /// </summary>
        /// <param name="context">The current HTTP context.</param>
        public async Task Invoke(HttpContext context)
        {
            // Nothing to extract: behave as a transparent pass-through.
            if (!HasExtractionPath(context))
            {
                await _next.Invoke(context);
                return;
            }

            var originalBody = context.Response.Body;
            using (var buffer = new MemoryStream())
            {
                // Start capturing the body
                context.Response.Body = buffer;
                try
                {
                    // Run the rest of the pipeline
                    await _next.Invoke(context);

                    // Process the body
                    buffer.Seek(0, SeekOrigin.Begin);
                    using (var bufferReader = new StreamReader(buffer))
                    {
                        string body = await bufferReader.ReadToEndAsync();
                        body = ModifyBody(body, GetExtractionPath(context), GetScope(context)); // This is where the work gets done

                        // Response.Clear() discards what downstream set on the response
                        // (see the ASP.NET Core docs for exactly what is reset), so stale
                        // values describing the full page are not sent with the fragment.
                        context.Response.Clear();
                        context.Response.ContentType = "text/html"; // A fair assumption?

                        // The response still points at the buffer here, so the fragment is
                        // written into it and then copied to the real response stream.
                        await context.Response.WriteAsync(body);
                        buffer.Seek(0, SeekOrigin.Begin);
                        await buffer.CopyToAsync(originalBody);
                    }
                }
                finally
                {
                    // Always hand the real stream back, even if downstream threw; otherwise
                    // the response would be left pointing at the disposed buffer.
                    context.Response.Body = originalBody;
                }
            }
        }

        /// <summary>
        /// Returns true when the first <c>extract</c> value is non-blank. A blank value
        /// (for example <c>?extract=</c>) is treated as "no extraction requested" rather
        /// than being passed on as an empty selector.
        /// </summary>
        private static bool HasExtractionPath(HttpContext context)
        {
            return !string.IsNullOrWhiteSpace(context.Request.Query[ExtractionPathKey].FirstOrDefault());
        }

        /// <summary>Returns the first <c>extract</c> value; call only after <see cref="HasExtractionPath"/> returned true.</summary>
        private static string GetExtractionPath(HttpContext context)
        {
            return context.Request.Query[ExtractionPathKey].First();
        }

        /// <summary>
        /// Returns the first <c>scope</c> value, trimmed and lower-cased, or null when absent.
        /// Lower-casing is culture-invariant so the result does not depend on the server's culture.
        /// </summary>
        private static string GetScope(HttpContext context)
        {
            return context.Request.Query[ScopeKey].FirstOrDefault()?.Trim().ToLowerInvariant();
        }

        /// <summary>
        /// Parses <paramref name="html"/> and returns the concatenated HTML of every element
        /// matching <paramref name="path"/>.
        /// </summary>
        /// <param name="html">The captured response body.</param>
        /// <param name="path">The CSS selector to evaluate.</param>
        /// <param name="scope">"inner" for inner HTML; anything else (including null) for outer HTML.</param>
        /// <returns>
        /// The input unchanged when it contains no '&lt;' character, an empty string when
        /// nothing matches, otherwise the joined HTML of the matches.
        /// </returns>
        private static string ModifyBody(string html, string path, string scope)
        {
            // This isn't parsable HTML...
            if (!html.Contains("<"))
            {
                return html;
            }

            using (var doc = _parser.ParseDocument(html))
            {
                var elements = doc.QuerySelectorAll(path);

                // Sanity check; the null test must come first to be of any use.
                if (elements == null || !elements.Any())
                {
                    return string.Empty;
                }

                // string.Join enumerates immediately, so the result is complete before the document is disposed.
                return string.Join(string.Empty, elements.Select(e => scope == InnerScope ? e.InnerHtml : e.OuterHtml));
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment