Skip to content

Instantly share code, notes, and snippets.

@alex-davies
Created August 10, 2016 18:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alex-davies/4accf28d40682231473eff289379aeed to your computer and use it in GitHub Desktop.
Save alex-davies/4accf28d40682231473eff289379aeed to your computer and use it in GitHub Desktop.
Forward only stream to read multipart/form-data
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
namespace MultipartParser
{
/// <summary>
/// Forward-only stream over a single part of a multipart/form-data message.
/// The underlying stream is consumed incrementally, so the whole request is never
/// buffered in memory. Reading this stream yields the content of the current part;
/// once the part is exhausted the following part can be obtained through
/// <see cref="ReadUntilNextPart"/> or <see cref="ReadForNextPart"/>.
/// </summary>
public class MultipartPartParser : Stream
{
    /// <summary>
    /// Stream the multipart message is being read from
    /// </summary>
    public Stream MultipartStream { get; private set; }

    /// <summary>
    /// Encoding of the multipart stream header data
    /// </summary>
    public Encoding Encoding { get; private set; }

    /// <summary>
    /// The raw header text of the part (null for the end part)
    /// </summary>
    public string Header { get; private set; }

    /// <summary>
    /// The content disposition of the part, or null if the header is absent
    /// </summary>
    public string ContentDisposition { get; private set; }

    /// <summary>
    /// The content type of the part, or null if the header is absent
    /// </summary>
    public string ContentType { get; private set; }

    /// <summary>
    /// The name of the form field that submitted this part
    /// </summary>
    public string Name { get; private set; }

    /// <summary>
    /// The filename if the submitted part was a file, otherwise null
    /// </summary>
    public string Filename { get; private set; }

    /// <summary>
    /// Determines if this is a full part or just a stub to indicate the
    /// end of the stream
    /// </summary>
    public bool IsEndPart { get; private set; }

    /// <summary>
    /// The next part in the multipart message
    /// </summary>
    protected MultipartPartParser NextPart { get; private set; }

    /// <summary>
    /// Buffer to store data extracted from the multipart stream but not yet returned
    /// </summary>
    protected MemoryStream LocalBuffer { get; private set; }

    /// <summary>
    /// The boundary between parts prepended with the newline element
    /// </summary>
    protected byte[] BoundaryWithNewLinePrepend { get; private set; }

    /// <summary>
    /// The bytes that represent a new line
    /// </summary>
    protected byte[] NewLine { get; private set; }

    /// <summary>
    /// Set once the closing boundary of this part has been seen. Needed because
    /// <see cref="NextPart"/> stays null when the following part is the end marker.
    /// </summary>
    private bool _isFinished;

    /// <summary>
    /// Creates a parser positioned on the first part of <paramref name="multipartStream"/>,
    /// decoding headers as UTF-8.
    /// </summary>
    /// <param name="multipartStream">Stream positioned at the first boundary line</param>
    public MultipartPartParser(Stream multipartStream) : this(multipartStream, Encoding.UTF8)
    {
    }

    /// <summary>
    /// Creates a parser for the part that starts at the current position.
    /// </summary>
    /// <param name="multipartStream">Stream the multipart message is read from</param>
    /// <param name="encoding">Encoding used for the boundary and header text</param>
    /// <param name="buffer">
    /// Optional bytes that were already pulled from <paramref name="multipartStream"/>;
    /// everything from its current position onward is consumed before the stream itself
    /// </param>
    /// <exception cref="ArgumentNullException">A required argument is null</exception>
    /// <exception cref="InvalidDataException">
    /// The data does not start with a boundary line, or the part header does not
    /// fit in the first 1024 bytes
    /// </exception>
    public MultipartPartParser(Stream multipartStream, Encoding encoding, MemoryStream buffer = null)
    {
        if (multipartStream == null)
        {
            throw new ArgumentNullException("multipartStream");
        }
        if (encoding == null)
        {
            throw new ArgumentNullException("encoding");
        }

        MultipartStream = multipartStream;
        Encoding = encoding;

        LocalBuffer = new MemoryStream();
        if (buffer != null)
        {
            buffer.CopyTo(LocalBuffer);
        }
        LocalBuffer.Position = 0;

        NewLine = Encoding.GetBytes("\r\n");
        byte[] doubleNewLine = Encoding.GetBytes("\r\n\r\n");

        //the boundary is unknown until the first line has been read; while it is empty
        //ReadForNextPart performs a plain pass-through read
        BoundaryWithNewLinePrepend = new byte[0];

        //call the non-virtual reader directly so a subclass overriding Read is not
        //invoked before its own constructor has run
        byte[] headerBytes = new byte[1024];
        MultipartPartParser ignored;
        int headerBytesRead = ReadForNextPart(headerBytes, 0, headerBytes.Length, out ignored);

        //the first line is the boundary; an empty first line is not a boundary
        int boundaryEnd = IndexOfPattern(NewLine, headerBytes, 0, headerBytesRead);
        if (boundaryEnd <= 0)
        {
            throw new InvalidDataException("No multipart boundary found. Data must begin with a content boundary");
        }

        //inside the message every boundary is preceded by a newline, so store it that way
        BoundaryWithNewLinePrepend = new byte[NewLine.Length + boundaryEnd];
        Buffer.BlockCopy(NewLine, 0, BoundaryWithNewLinePrepend, 0, NewLine.Length);
        Buffer.BlockCopy(headerBytes, 0, BoundaryWithNewLinePrepend, NewLine.Length, boundaryEnd);

        //if the data ends right after the boundary line this is the terminating
        //boundary, so this instance is only an end-of-message stub
        if (headerBytesRead == boundaryEnd + NewLine.Length)
        {
            IsEndPart = true;
            return;
        }

        //the header block ends at the first blank line after the boundary
        int headerEnd = IndexOfPattern(doubleNewLine, headerBytes, boundaryEnd, headerBytesRead);
        if (headerEnd < 0)
        {
            //the header did not fit in the initial block of bytes we read
            throw new InvalidDataException("Content header is too large to process");
        }
        headerEnd += doubleNewLine.Length;

        //get the header and header derived fields
        Header = encoding.GetString(headerBytes, boundaryEnd, headerEnd - boundaryEnd).Trim();
        ContentDisposition = RegexFirstGroup(Header, "^Content-Disposition:(.*)$");
        ContentType = RegexFirstGroup(Header, "^Content-Type:(.*)$");
        Filename = RegexFirstGroup(ContentDisposition, @"filename=""(.*?)""");
        //\b stops the pattern from matching the tail of filename="..."
        Name = RegexFirstGroup(ContentDisposition, @"\bname=""(.*?)""");

        //put back the bytes read past the header so they are streamed out as content
        ReinsertIntoLocalBuffer(headerBytes, headerEnd, headerBytesRead - headerEnd);
    }

    /// <summary>
    /// Re-buffers bytes that were read but must not be returned to the caller yet.
    /// Must be called directly after a read that filled <paramref name="source"/>.
    /// </summary>
    /// <param name="source">Buffer the bytes were read into</param>
    /// <param name="offset">Index in <paramref name="source"/> of the first byte to keep</param>
    /// <param name="count">Number of bytes to keep</param>
    protected void ReinsertIntoLocalBuffer(byte[] source, int offset, int count)
    {
        //two cases:
        //1. the local buffer was exhausted, so (some of) the bytes came from the
        //   multipart stream: replace the local buffer content with the leftover bytes
        //2. the local buffer was not exhausted, so every byte read came from it and the
        //   leftover bytes are still in there: just rewind so the next read sees them
        if (LocalBuffer.Position == LocalBuffer.Length)
        {
            LocalBuffer.Position = 0;
            LocalBuffer.SetLength(0);
            LocalBuffer.Write(source, offset, count);
            LocalBuffer.Position = 0;
        }
        else
        {
            LocalBuffer.Position -= count;
        }
    }

    /// <summary>
    /// Returns the trimmed first capture group of <paramref name="pattern"/> in
    /// <paramref name="input"/>, matched per line and case-insensitively.
    /// </summary>
    /// <param name="input">Text to search; may be null</param>
    /// <param name="pattern">Regular expression with at least one group</param>
    /// <returns>The captured text, or null when the input is null or nothing matches</returns>
    private string RegexFirstGroup(string input, string pattern)
    {
        //a missing header (e.g. no Content-Disposition) simply yields no derived values
        if (input == null)
        {
            return null;
        }

        var match = Regex.Match(input, pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        return match.Success ? match.Groups[1].Value.Trim() : null;
    }

    /// <summary>
    /// Reads content bytes of the current part.
    /// </summary>
    /// <param name="buffer">Destination buffer</param>
    /// <param name="offset">Index in <paramref name="buffer"/> to start writing at</param>
    /// <param name="count">Maximum number of bytes to read</param>
    /// <returns>Number of bytes written; 0 once the part (or the stream) has ended</returns>
    public override int Read(byte[] buffer, int offset, int count)
    {
        MultipartPartParser nextPart;
        return ReadForNextPart(buffer, offset, count, out nextPart);
    }

    /// <summary>
    /// Discards the remaining content of this part and moves on to the next one.
    /// </summary>
    /// <param name="bufferSize">Size of the scratch buffer used while skipping; must be positive</param>
    /// <returns>The next part, or null when this was the last part of the message</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is not positive</exception>
    public MultipartPartParser ReadUntilNextPart(int bufferSize = 4096)
    {
        if (bufferSize <= 0)
        {
            throw new ArgumentOutOfRangeException("bufferSize");
        }

        byte[] throwawayBuffer = new byte[bufferSize];
        MultipartPartParser nextpart;
        while (ReadForNextPart(throwawayBuffer, 0, bufferSize, out nextpart) > 0)
        {
        }
        return nextpart;
    }

    /// <summary>
    /// Reads content bytes of the current part; when the closing boundary of the part
    /// is reached the following part is created and handed back.
    /// </summary>
    /// <param name="buffer">Destination buffer</param>
    /// <param name="offset">Index in <paramref name="buffer"/> to start writing at</param>
    /// <param name="count">Maximum number of bytes to read</param>
    /// <param name="nextpart">
    /// The following part once this part has ended; null while this part still has
    /// content or when it was the last part
    /// </param>
    /// <returns>Number of bytes written to <paramref name="buffer"/></returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">A negative offset or count was given</exception>
    /// <exception cref="ArgumentException">The requested range does not fit in the buffer</exception>
    public int ReadForNextPart(byte[] buffer, int offset, int count, out MultipartPartParser nextpart)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException("buffer");
        }
        if (offset < 0)
        {
            throw new ArgumentOutOfRangeException("offset");
        }
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException("count");
        }
        if (buffer.Length - offset < count)
        {
            throw new ArgumentException("offset and count exceed the length of the buffer");
        }

        //once the closing boundary has been seen this part never yields data again
        if (_isFinished || NextPart != null || IsEndPart)
        {
            nextpart = NextPart;
            return 0;
        }

        //the search buffer is where we scan for the boundary. It is one boundary length
        //bigger than requested so that a boundary starting inside the requested range
        //is always completely visible and never handed out as content by accident
        int boundaryLength = BoundaryWithNewLinePrepend.Length;
        byte[] searchBuffer = new byte[count + boundaryLength];
        int bytesReadThisCall = 0;

        //previously buffered bytes go first
        int bytesToReadFromLocalBuffer =
            (int)Math.Min(LocalBuffer.Length - LocalBuffer.Position, (long)searchBuffer.Length);
        if (bytesToReadFromLocalBuffer > 0)
        {
            bytesReadThisCall += LocalBuffer.Read(searchBuffer, 0, bytesToReadFromLocalBuffer);
        }

        //then top up from the multipart stream. Stream.Read may return fewer bytes than
        //requested without being at the end, so keep reading until the buffer is full or
        //the stream reports its end. Length/Position are deliberately not used: they are
        //unsupported on non-seekable streams
        while (bytesReadThisCall < searchBuffer.Length)
        {
            int bytesFromStream = MultipartStream.Read(
                searchBuffer, bytesReadThisCall, searchBuffer.Length - bytesReadThisCall);
            if (bytesFromStream <= 0)
            {
                break;
            }
            bytesReadThisCall += bytesFromStream;
        }

        //only the bytes actually read are searched
        int boundaryIndex = IndexOfPattern(BoundaryWithNewLinePrepend, searchBuffer, 0, bytesReadThisCall);
        bool isEndOfPart = boundaryIndex >= 0;

        //with a boundary in view everything in front of it is content. Otherwise the last
        //boundaryLength bytes are held back because they may be the start of a boundary
        //that continues in data not read yet
        int bytesReturned = isEndOfPart
            ? boundaryIndex
            : Math.Max(0, bytesReadThisCall - boundaryLength);
        Buffer.BlockCopy(searchBuffer, 0, buffer, offset, bytesReturned);

        //whatever was read but not returned goes back into the local buffer
        ReinsertIntoLocalBuffer(searchBuffer, bytesReturned, bytesReadThisCall - bytesReturned);

        nextpart = null;
        if (isEndOfPart)
        {
            _isFinished = true;

            //the stored boundary starts with a newline that belongs to this part;
            //skip it so the next part starts on its boundary line
            LocalBuffer.Position += NewLine.Length;
            NextPart = new MultipartPartParser(MultipartStream, Encoding, LocalBuffer);

            //the next part may just be the end indicator, in which case it is not exposed
            if (NextPart.IsEndPart)
            {
                NextPart = null;
            }
            nextpart = NextPart;
        }
        return bytesReturned;
    }

    /// <summary>
    /// Searches for a byte pattern in a block of bytes
    /// </summary>
    /// <param name="pattern">Bytes to look for</param>
    /// <param name="bytes">Bytes to search in</param>
    /// <param name="matchStartIndex">Index of the first match, or -1 if there is none</param>
    /// <returns>True when the pattern was found</returns>
    protected bool SearchBytePattern(byte[] pattern, byte[] bytes, out int matchStartIndex)
    {
        return SearchBytePattern(pattern, bytes, 0, out matchStartIndex);
    }

    /// <summary>
    /// Searches for a byte pattern in a block of bytes, starting at a given index
    /// </summary>
    /// <param name="pattern">Bytes to look for</param>
    /// <param name="bytes">Bytes to search in</param>
    /// <param name="searchOffset">Index in <paramref name="bytes"/> to start searching at</param>
    /// <param name="matchStartIndex">Index of the first match, or -1 if there is none</param>
    /// <returns>True when the pattern was found</returns>
    protected bool SearchBytePattern(byte[] pattern, byte[] bytes, int searchOffset, out int matchStartIndex)
    {
        if (pattern == null || bytes == null)
        {
            matchStartIndex = -1;
            return false;
        }

        matchStartIndex = IndexOfPattern(pattern, bytes, searchOffset, bytes.Length);
        return matchStartIndex >= 0;
    }

    /// <summary>
    /// Finds the first occurrence of <paramref name="pattern"/> that lies completely
    /// inside <c>bytes[start..end)</c>.
    /// </summary>
    /// <param name="pattern">Bytes to look for; an empty pattern never matches</param>
    /// <param name="bytes">Bytes to search in</param>
    /// <param name="start">Inclusive start index; negative values are treated as 0</param>
    /// <param name="end">Exclusive end index; clamped to the array length</param>
    /// <returns>Index of the match, or -1 if there is none</returns>
    private static int IndexOfPattern(byte[] pattern, byte[] bytes, int start, int end)
    {
        if (pattern.Length == 0)
        {
            return -1;
        }

        int candidate = Math.Max(0, start);
        //last index at which a full pattern still fits; a match ending exactly at
        //'end' is valid, hence the inclusive comparison below
        int lastCandidate = Math.Min(end, bytes.Length) - pattern.Length;

        while (candidate <= lastCandidate)
        {
            candidate = Array.IndexOf<byte>(bytes, pattern[0], candidate, lastCandidate - candidate + 1);
            if (candidate < 0)
            {
                return -1;
            }

            int matched = 1;
            while (matched < pattern.Length && bytes[candidate + matched] == pattern[matched])
            {
                matched++;
            }
            if (matched == pattern.Length)
            {
                return candidate;
            }

            candidate++;
        }
        return -1;
    }

    /// <summary>Always true: the part content can be read</summary>
    public override bool CanRead
    {
        get { return true; }
    }

    /// <summary>Always false: the stream is forward-only</summary>
    public override bool CanSeek
    {
        get { return false; }
    }

    /// <summary>Always false: the stream is read-only</summary>
    public override bool CanWrite
    {
        get { return false; }
    }

    /// <summary>No-op; nothing is ever written</summary>
    public override void Flush()
    {
    }

    /// <summary>Not supported; the length of a part is unknown until it has been read</summary>
    public override long Length
    {
        get { throw new NotSupportedException(); }
    }

    /// <summary>Not supported; the stream is forward-only</summary>
    public override long Position
    {
        get
        {
            throw new NotSupportedException();
        }
        set
        {
            throw new NotSupportedException();
        }
    }

    /// <summary>Not supported; the stream is forward-only</summary>
    public override long Seek(long offset, SeekOrigin origin)
    {
        throw new NotSupportedException();
    }

    /// <summary>Not supported; the stream is read-only</summary>
    public override void SetLength(long value)
    {
        throw new NotSupportedException();
    }

    /// <summary>Not supported; the stream is read-only</summary>
    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotSupportedException();
    }
}
}
@ropuls
Copy link

ropuls commented Dec 6, 2017

Hi Alex,

virtually the first multipart parser I found that is not trying to read the full stream into memory in one shot. I am coming from c++, and sometimes really wonder about the mindlessness of resource usage of c# programmers.

I need to do basically the same, but with the ability to run it asynchronously.

Thanks anyways!
Roman

@mihafreenode
Copy link

Great job with the parser!!!

In the method SearchBytePattern(byte[] pattern, byte[] bytes, int searchOffset, out int matchStartIndex), the condition in the while loop should be >= 0, to cover the rare cases where pattern[0], such as \r, is found at bytes[0]. Array.IndexOf returns -1 when the value is not found.

/// <summary>
/// Searches for a byte pattern in a block of bytes, starting at a given index.
/// </summary>
/// <param name="pattern">Bytes to look for; null or empty never matches</param>
/// <param name="bytes">Bytes to search in</param>
/// <param name="searchOffset">Index in bytes to start searching at; negative values are treated as 0</param>
/// <param name="matchStartIndex">Index of the first match, or -1 if there is none</param>
/// <returns>True when the pattern was found</returns>
protected bool SearchBytePattern(byte[] pattern, byte[] bytes, int searchOffset, out int matchStartIndex)
        {
            matchStartIndex = -1;
            if (pattern == null || pattern.Length == 0 || bytes == null || bytes.Length == 0)
            {
                return false;
            }

            // start at the requested offset instead of always scanning from index 0
            int candidate = Math.Max(0, searchOffset);
            // last index at which the whole pattern still fits; a match that ends exactly
            // at the end of the array is valid, hence the inclusive comparison
            int lastCandidate = bytes.Length - pattern.Length;

            while (candidate <= lastCandidate)
            {
                candidate = Array.IndexOf<byte>(bytes, pattern[0], candidate, lastCandidate - candidate + 1);
                if (candidate < 0)
                {
                    return false;
                }

                bool ismatch = true;
                for (int j = 1; j < pattern.Length && ismatch; j++)
                {
                    if (bytes[candidate + j] != pattern[j])
                        ismatch = false;
                }
                if (ismatch)
                {
                    matchStartIndex = candidate;
                    return true;
                }

                candidate++;
            }

            return false;
        }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment