Skip to content

Instantly share code, notes, and snippets.

@klinkby
Created June 4, 2024 19:28
Show Gist options
  • Save klinkby/b3b9d2265153c2bfdd2e280bb9acd838 to your computer and use it in GitHub Desktop.
Save klinkby/b3b9d2265153c2bfdd2e280bb9acd838 to your computer and use it in GitHub Desktop.
Generate an extract of an XML document, reducing size by skipping elements with given name after a given number has been seen.
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Xml;
using System.Diagnostics.CodeAnalysis;
/// <summary>
/// Generate an extract of an XML document, reducing size by skipping elements with given name
/// after a given number has been seen. Elements that do not match the name pattern are copied,
/// so the overall structure of the original document is retained.
/// </summary>
public sealed class XmlReducer : IAsyncDisposable, IDisposable
{
    private const string Xmlns = "xmlns";

    /// <summary>Sentinel for <see cref="_skipDepth" /> meaning "no subtree is being skipped".</summary>
    private const int NotSkipping = int.MaxValue;

    private static readonly XmlReaderSettings ReaderSettings = new()
    {
        Async = true,
        CloseInput = false,
        IgnoreComments = true,
        IgnoreProcessingInstructions = true,
        IgnoreWhitespace = true,
        ConformanceLevel = ConformanceLevel.Document,
        ValidationType = ValidationType.None
    };

    private static readonly XmlWriterSettings WriterSettings = new()
    {
        Async = true,
        Encoding = new UTF8Encoding(false),
        CheckCharacters = false,
        CloseOutput = false,
        ConformanceLevel = ConformanceLevel.Document,
        WriteEndDocumentOnClose = false
    };

    private static readonly RegexOptions TriggerRegexOptions = RegexOptions.Compiled
                                                               | RegexOptions.CultureInvariant
                                                               | RegexOptions.ExplicitCapture
                                                               | RegexOptions.NonBacktracking;

    private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromSeconds(1);

    private readonly XmlReader _reader;
    private readonly XmlWriter _writer;
    private readonly Regex _triggerRegex;

    // Depth of the element whose subtree is currently being skipped, or NotSkipping.
    private int _skipDepth = NotSkipping;

    // Number of matching elements that may still be copied before matches are skipped.
    private int _triggerCount;

    /// <summary>
    /// Create a parser context for reducing an XML document.
    /// </summary>
    /// <param name="largeXmlDocument">Stream to read from</param>
    /// <param name="reducedDocument">Stream to write to</param>
    /// <param name="triggerRegexPattern">Element name pattern to look for</param>
    /// <param name="maxTriggerCount">Max occurrences of the element before skipping</param>
    /// <exception cref="ArgumentNullException">Thrown if a stream is <c>null</c></exception>
    /// <exception cref="ArgumentException">
    /// Thrown if both streams are the same instance, the input is not readable or the output is not writable
    /// </exception>
    public XmlReducer(
        Stream largeXmlDocument,
        Stream reducedDocument,
        [StringSyntax(StringSyntaxAttribute.Regex)]
        string triggerRegexPattern,
        int maxTriggerCount = 1)
        : this(
            // Arguments are evaluated left to right, so the streams are validated
            // before any reader or writer is created on top of them.
            XmlReader.Create(ValidateStreams(largeXmlDocument, reducedDocument), ReaderSettings),
            XmlWriter.Create(reducedDocument, WriterSettings),
            new Regex(triggerRegexPattern, TriggerRegexOptions, RegexMatchTimeout),
            maxTriggerCount)
    {
    }

    /// <summary>
    /// Create a parser context for reducing an XML document.
    /// </summary>
    /// <param name="reader">Read large document</param>
    /// <param name="writer">Written reduced document</param>
    /// <param name="triggerRegex">Element name pattern to look for</param>
    /// <param name="maxTriggerCount">Max occurrences of the element before skipping</param>
    /// <exception cref="ArgumentNullException">Thrown if an argument is <c>null</c></exception>
    /// <exception cref="ArgumentException">Thrown if the reader is not in its initial state</exception>
    public XmlReducer(XmlReader reader, XmlWriter writer, Regex triggerRegex, int maxTriggerCount = 1)
    {
        ArgumentNullException.ThrowIfNull(reader);
        ArgumentNullException.ThrowIfNull(writer);
        ArgumentNullException.ThrowIfNull(triggerRegex);
        if (ReadState.Initial != reader.ReadState)
        {
            throw new ArgumentException("Reader must be in initial state", nameof(reader));
        }

        _reader = reader;
        _writer = writer;
        _triggerRegex = triggerRegex;
        _triggerCount = maxTriggerCount;
    }

    /// <summary>True while the reader is positioned inside a subtree that is being skipped.</summary>
    private bool Skip => _reader.Depth >= _skipDepth;

    /// <summary>
    /// Read all of input Xml document, copying part of it to the output.
    /// If cancellation is requested, reading stops before the next node, what has been
    /// written so far is flushed, and the method returns without throwing.
    /// </summary>
    /// <param name="cancellationToken">A <see cref="CancellationToken" /> used to cancel the operation</param>
    /// <exception cref="XmlException">Thrown if input is not XML, e.g. if elements are not closed</exception>
    public async Task ParseAsync(CancellationToken cancellationToken = default)
    {
        while (!cancellationToken.IsCancellationRequested
               && await _reader.ReadAsync().ConfigureAwait(false))
        {
            if (Skip)
            {
                // The skipped subtree ends at the end tag found at the depth where skipping
                // started; nested end tags are deeper and must not end the skip.
                if (_reader.NodeType == XmlNodeType.EndElement && _reader.Depth == _skipDepth)
                {
                    _skipDepth = NotSkipping;
                }

                continue;
            }

            var nodeParser = _reader.NodeType switch
            {
                XmlNodeType.Element => ParseElement(),
                XmlNodeType.Text => ParseText(),
                XmlNodeType.CDATA => ParseCData(),
                XmlNodeType.EndElement => ParseEndElement(),
                _ => Task.CompletedTask
            };
            await nodeParser.ConfigureAwait(false);
        }

        await _writer.FlushAsync().ConfigureAwait(false);
    }

    /// <summary>Dispose the underlying reader and writer.</summary>
    public void Dispose()
    {
        _reader.Dispose();
        _writer.Dispose();
    }

    /// <summary>Asynchronously dispose the writer, then dispose the reader.</summary>
    public async ValueTask DisposeAsync()
    {
        await _writer.DisposeAsync().ConfigureAwait(false);
        _reader.Dispose();
    }

    /// <summary>
    /// Validate the stream pair and return the input stream, so the checks can run inside
    /// the constructor initializer before the reader and writer are created.
    /// </summary>
    private static Stream ValidateStreams(Stream largeXmlDocument, Stream reducedDocument)
    {
        ArgumentNullException.ThrowIfNull(largeXmlDocument);
        ArgumentNullException.ThrowIfNull(reducedDocument);
        if (ReferenceEquals(largeXmlDocument, reducedDocument))
        {
            throw new ArgumentException("Streams must be different", nameof(reducedDocument));
        }

        if (!largeXmlDocument.CanRead)
        {
            throw new ArgumentException("Stream must be readable", nameof(largeXmlDocument));
        }

        if (!reducedDocument.CanWrite)
        {
            throw new ArgumentException("Stream must be writable", nameof(reducedDocument));
        }

        return largeXmlDocument;
    }

    /// <summary>
    /// Decide whether the current element must be skipped: its local name matches the pattern
    /// and the allowed number of matching elements has already been used up.
    /// </summary>
    private bool IsTrigger()
    {
        if (!_triggerRegex.IsMatch(_reader.LocalName))
        {
            return false;
        }

        if (_triggerCount > 0)
        {
            // Still within the allowance: consume one and copy the element.
            _triggerCount--;
            return false;
        }

        // Allowance exhausted; the counter is left alone so it cannot wrap around.
        return true;
    }

    /// <summary>
    /// Copy the current start tag with its attributes, or begin skipping its subtree
    /// when it is a trigger element.
    /// </summary>
    private async Task ParseElement()
    {
        // Must be captured before moving to attributes; the reader produces no
        // EndElement node for an empty element such as <a/>.
        var isEmpty = _reader.IsEmptyElement;

        if (IsTrigger())
        {
            // An empty element has no subtree and no end tag, so there is nothing more to skip.
            if (!isEmpty)
            {
                _skipDepth = _reader.Depth;
            }

            return;
        }

        await _writer.WriteStartElementAsync(_reader.Prefix, _reader.LocalName, _reader.NamespaceURI)
            .ConfigureAwait(false);

        for (var attInd = 0; attInd < _reader.AttributeCount; attInd++)
        {
            _reader.MoveToAttribute(attInd);
            if (Xmlns == _reader.Prefix)
            {
                // Namespace prefix declaration, e.g. xmlns:x="..."
                await _writer.WriteAttributeStringAsync(Xmlns, _reader.LocalName, null, _reader.Value)
                    .ConfigureAwait(false);
            }
            else if (Xmlns != _reader.Name)
            {
                // Regular attribute. Prefix, local name and namespace are passed separately so
                // that prefixed attributes (e.g. x:k) go through the writer's namespace handling.
                await _writer.WriteAttributeStringAsync(
                        _reader.Prefix, _reader.LocalName, _reader.NamespaceURI, _reader.Value)
                    .ConfigureAwait(false);
            }

            // A default namespace declaration (xmlns="...") is not copied here; the namespace
            // is passed to WriteStartElementAsync above.
        }

        if (_reader.AttributeCount > 0)
        {
            _reader.MoveToElement();
        }

        if (isEmpty)
        {
            // Close the element here, otherwise following nodes would be nested inside it.
            await _writer.WriteEndElementAsync().ConfigureAwait(false);
        }
    }

    /// <summary>Close the element that is currently open in the writer.</summary>
    private Task ParseEndElement() => _writer.WriteEndElementAsync();

    /// <summary>Copy the current text node.</summary>
    private Task ParseText() => _writer.WriteStringAsync(_reader.Value);

    /// <summary>Copy the current CDATA section.</summary>
    private Task ParseCData() => _writer.WriteCDataAsync(_reader.Value);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment