deanebarker/ExtractFragmentMiddleware.cs

## README.md

      
    Raw
  

              README.md
            
          
    This is a .NET 5 middleware component that extracts HTML elements based
on a CSS-style selector in a querystring argument and returns them as the body of the response.
/my-page?extract=article
Extracts the outer HTML (the default) of all ARTICLE tags and concatenates them.
/my-page?extract=article&scope=inner
The same, except the inner HTML.
If the selector finds multiple matching tags, it will concatenate them all. If you only need
the first, then you need to narrow your CSS selector so that it only matches the first.
Uses the AngleSharp library, installable via NuGet. The extract argument accepts any selector syntax that AngleSharp supports,
which covers most CSS selectors.
https://github.com/AngleSharp/
This needs to be configured as middleware in Startup.cs, like this:
public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
{
  app.UseMiddleware<ExtractFragmentMiddleware>();
}

Note: it needs to be placed early in the pipeline. Honestly, I don't know how early, but I configured it as the first middleware in the pipeline, and it worked consistently.

  
## ExtractFragmentMiddleware.cs
using AngleSharp.Html.Parser;
using Microsoft.AspNetCore.Http;
using System.IO;
using System.Linq;
using System.Threading.Tasks;

namespace DeaneBarker.Optimizely.Middleware
{
    /// <summary>
    /// ASP.NET Core middleware that, when the request carries an <c>extract</c> query argument,
    /// buffers the response produced by the rest of the pipeline, selects the elements matching
    /// that CSS selector and sends only their markup back as the response body.
    /// An optional <c>scope=inner</c> argument returns the inner HTML of each match instead of
    /// the outer HTML.
    /// </summary>
    public class ExtractFragmentMiddleware
    {
        private static readonly string EXTRACTION_PATH_KEY = "extract";
        private static readonly string SCOPE_KEY = "scope";

        // A single parser instance is shared by every request handled by this middleware.
        private static readonly HtmlParser _parser = new HtmlParser();

        private readonly RequestDelegate _next;

        /// <summary>Creates the middleware.</summary>
        /// <param name="next">The next delegate in the request pipeline.</param>
        public ExtractFragmentMiddleware(RequestDelegate next)
        {
            _next = next;
        }

        /// <summary>
        /// Runs the rest of the pipeline and, if an extraction selector was supplied, replaces the
        /// response body with the extracted fragment.
        /// </summary>
        /// <param name="context">The current HTTP context.</param>
        public async Task Invoke(HttpContext context)
        {
            // If we don't have an extraction path, abandon
            if (!HasExtractionPath(context))
            {
                await _next.Invoke(context);
                return;
            }

            var originalBody = context.Response.Body;

            using (var buffer = new MemoryStream())
            {
                // Start capturing the body
                context.Response.Body = buffer;

                try
                {
                    // Run the rest of the pipeline
                    await _next.Invoke(context);

                    // Process the body
                    buffer.Seek(0, SeekOrigin.Begin);
                    using (var bufferReader = new StreamReader(buffer))
                    {
                        string body = await bufferReader.ReadToEndAsync();
                        body = ModifyBody(body, GetExtractionPath(context), GetScope(context)); // This is where the work gets done

                        // Reset the response (headers, status code, buffered body) before writing the fragment.
                        context.Response.Clear();
                        context.Response.ContentType = "text/html"; // A fair assumption?
                        await context.Response.WriteAsync(body);

                        // Replay the rewritten buffer onto the real response stream.
                        buffer.Seek(0, SeekOrigin.Begin);
                        await buffer.CopyToAsync(originalBody);
                    }
                }
                finally
                {
                    // Always hand the real stream back, even when the pipeline throws; otherwise
                    // callers further up would be left with the disposed capture buffer.
                    context.Response.Body = originalBody;
                }
            }
        }

        /// <summary>
        /// Returns true when the query string carries a non-blank <c>extract</c> value.
        /// A blank value is treated as "no selector" so the request passes through untouched.
        /// </summary>
        private static bool HasExtractionPath(HttpContext context)
        {
            return !string.IsNullOrWhiteSpace(context.Request.Query[EXTRACTION_PATH_KEY].FirstOrDefault());
        }

        /// <summary>Returns the first <c>extract</c> query value (the CSS selector).</summary>
        private static string GetExtractionPath(HttpContext context)
        {
            return context.Request.Query[EXTRACTION_PATH_KEY].First();
        }

        /// <summary>
        /// Returns the first <c>scope</c> query value, trimmed and lower-cased, or null when absent.
        /// </summary>
        private static string GetScope(HttpContext context)
        {
            // Invariant casing: the value is a protocol token, not user-facing text.
            return context.Request.Query[SCOPE_KEY].FirstOrDefault()?.Trim().ToLowerInvariant();
        }

        /// <summary>
        /// Parses <paramref name="html"/> and returns the concatenated markup of every element
        /// matching <paramref name="path"/>.
        /// </summary>
        /// <param name="html">The captured response body.</param>
        /// <param name="path">The CSS selector to query with.</param>
        /// <param name="scope">"inner" for inner HTML; any other value (or null) yields outer HTML.</param>
        /// <returns>
        /// The input unchanged when it contains no '&lt;' character, an empty string when nothing
        /// matches, otherwise the joined markup of the matches.
        /// </returns>
        private static string ModifyBody(string html, string path, string scope)
        {
            if (!html.Contains("<"))
            {
                return html; // This isn't parsable HTML...
            }

            var doc = _parser.ParseDocument(html);
            var elements = doc.QuerySelectorAll(path);

            // Null must be tested before the collection is enumerated.
            if (elements == null || !elements.Any())
            {
                return string.Empty;
            }

            return string.Join(string.Empty, elements.Select(e => scope == "inner" ? e.InnerHtml : e.OuterHtml));
        }
    }
}
	using AngleSharp.Html.Parser;
	using Microsoft.AspNetCore.Http;
	using System.IO;
	using System.Linq;
	using System.Threading.Tasks;

	namespace DeaneBarker.Optimizely.Middleware
	{
	/// <summary>
	/// ASP.NET Core middleware that, when the request carries an <c>extract</c> query argument,
	/// buffers the response produced by the rest of the pipeline, selects the elements matching
	/// that CSS selector and sends only their markup back as the response body.
	/// An optional <c>scope=inner</c> argument returns the inner HTML of each match instead of
	/// the outer HTML.
	/// </summary>
	public class ExtractFragmentMiddleware
	{
		private static readonly string EXTRACTION_PATH_KEY = "extract";
		private static readonly string SCOPE_KEY = "scope";

		// A single parser instance is shared by every request handled by this middleware.
		private static readonly HtmlParser _parser = new HtmlParser();

		private readonly RequestDelegate _next;

		/// <summary>Creates the middleware.</summary>
		/// <param name="next">The next delegate in the request pipeline.</param>
		public ExtractFragmentMiddleware(RequestDelegate next)
		{
			_next = next;
		}

		/// <summary>
		/// Runs the rest of the pipeline and, if an extraction selector was supplied, replaces the
		/// response body with the extracted fragment.
		/// </summary>
		/// <param name="context">The current HTTP context.</param>
		public async Task Invoke(HttpContext context)
		{
			// If we don't have an extraction path, abandon
			if (!HasExtractionPath(context))
			{
				await _next.Invoke(context);
				return;
			}

			var originalBody = context.Response.Body;

			using (var buffer = new MemoryStream())
			{
				// Start capturing the body
				context.Response.Body = buffer;

				try
				{
					// Run the rest of the pipeline
					await _next.Invoke(context);

					// Process the body
					buffer.Seek(0, SeekOrigin.Begin);
					using (var bufferReader = new StreamReader(buffer))
					{
						string body = await bufferReader.ReadToEndAsync();
						body = ModifyBody(body, GetExtractionPath(context), GetScope(context)); // This is where the work gets done

						// Reset the response (headers, status code, buffered body) before writing the fragment.
						context.Response.Clear();
						context.Response.ContentType = "text/html"; // A fair assumption?
						await context.Response.WriteAsync(body);

						// Replay the rewritten buffer onto the real response stream.
						buffer.Seek(0, SeekOrigin.Begin);
						await buffer.CopyToAsync(originalBody);
					}
				}
				finally
				{
					// Always hand the real stream back, even when the pipeline throws; otherwise
					// callers further up would be left with the disposed capture buffer.
					context.Response.Body = originalBody;
				}
			}
		}

		/// <summary>
		/// Returns true when the query string carries a non-blank <c>extract</c> value.
		/// A blank value is treated as "no selector" so the request passes through untouched.
		/// </summary>
		private static bool HasExtractionPath(HttpContext context)
		{
			return !string.IsNullOrWhiteSpace(context.Request.Query[EXTRACTION_PATH_KEY].FirstOrDefault());
		}

		/// <summary>Returns the first <c>extract</c> query value (the CSS selector).</summary>
		private static string GetExtractionPath(HttpContext context)
		{
			return context.Request.Query[EXTRACTION_PATH_KEY].First();
		}

		/// <summary>
		/// Returns the first <c>scope</c> query value, trimmed and lower-cased, or null when absent.
		/// </summary>
		private static string GetScope(HttpContext context)
		{
			// Invariant casing: the value is a protocol token, not user-facing text.
			return context.Request.Query[SCOPE_KEY].FirstOrDefault()?.Trim().ToLowerInvariant();
		}

		/// <summary>
		/// Parses <paramref name="html"/> and returns the concatenated markup of every element
		/// matching <paramref name="path"/>.
		/// </summary>
		/// <param name="html">The captured response body.</param>
		/// <param name="path">The CSS selector to query with.</param>
		/// <param name="scope">"inner" for inner HTML; any other value (or null) yields outer HTML.</param>
		/// <returns>
		/// The input unchanged when it contains no '&lt;' character, an empty string when nothing
		/// matches, otherwise the joined markup of the matches.
		/// </returns>
		private static string ModifyBody(string html, string path, string scope)
		{
			if (!html.Contains("<"))
			{
				return html; // This isn't parsable HTML...
			}

			var doc = _parser.ParseDocument(html);
			var elements = doc.QuerySelectorAll(path);

			// Null must be tested before the collection is enumerated.
			if (elements == null || !elements.Any())
			{
				return string.Empty;
			}

			return string.Join(string.Empty, elements.Select(e => scope == "inner" ? e.InnerHtml : e.OuterHtml));
		}
	}
	}