Darkyenus/HeaderParser.kt

## HeaderParser.kt

/*
The MIT-Zero License

Copyright (c) 2021 Jan Polák

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
 */

import org.slf4j.LoggerFactory
import java.nio.ByteBuffer
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.nio.charset.CharsetDecoder
import java.nio.charset.CodingErrorAction
import kotlin.math.min

/*
 * Implements basic parsing primitives for headers as defined by various RFCs:
 * https://tools.ietf.org/html/rfc2045 (various tokens)
 * https://tools.ietf.org/html/rfc822 (various tokens, BNF notation used in other RFCs, see section 2)
 * https://tools.ietf.org/html/rfc2183 (disposition type)
 * https://tools.ietf.org/html/rfc2231 (not supported)
 * https://tools.ietf.org/html/rfc5987 (supported)
 */

// File-level logger; the parser uses it to report failures while decoding percent-encoded bytes.
private val LOG = LoggerFactory.getLogger("HeaderParser")

/**
 * Cursor over [content] offering the parsing primitives needed for `Content-Disposition`-like headers.
 *
 * Rules that return a nullable value return `null` when they do not match and then leave [position]
 * where it was, so alternatives can be chained with `?:`.
 * [parseCharset] and [parseHEXDIG] report "no match" as an empty string and `-1` respectively.
 */
private class Parser(val content:CharSequence) {
	/** Index of the next unread character of [content]. */
	var position = 0

	/** @return true when every character of [content] has been consumed */
	fun eof():Boolean = position >= content.length

	/** @return the next unread character; must not be called when [eof] is true */
	fun peek():Char = content[position]

	/** Skips any run of spaces and horizontal tabs (RFC822 linear white space without folding). */
	fun eatWhitespace() {
		while (!eof()) {
			val c = peek()
			if (c == ' ' || c == '\t') {
				position++
			} else {
				break
			}
		}
	}

	/**
	 * RFC2045 token: one or more ASCII characters that are not controls, space or tspecials.
	 * Leading whitespace is skipped.
	 *
	 * @return the token, or null (with [position] restored) when no token character follows
	 */
	fun parseToken():String? {
		val rollback = position
		eatWhitespace()
		val start = position
		while (!eof()) {
			val c = peek()
			if (c <= ' ' || c >= 0x7F.toChar() || c in /* RFC2045 tspecials */ "()<>@,;:\\\"/[]?=") {
				break
			}
			position++
		}
		if (start == position) {
			// At least one character is required; undo the whitespace skip like every other rule does
			position = rollback
			return null
		}
		return content.substring(start, position)
	}

	/**
	 * RFC822 quoted-string. Leading whitespace is skipped, quoted-pairs (`\x`) are unescaped.
	 *
	 * @return the content between the quotes, or null (with [position] restored) when the input does not
	 * start with `"`, the closing quote is missing, a bare CR is found or a character >= 0x7F is found
	 */
	fun parseQuotedString():String? {
		val rollback = position
		eatWhitespace()
		if (eof() || peek() != '"') {
			position = rollback
			return null
		}
		position++
		val sb = StringBuilder()
		while (!eof()) {
			var c = peek()
			if (c == '"') {
				// Closing quote, done
				position++
				return sb.toString()
			}

			// Parse qtext/quoted-pair
			if (c == '\\') {
				position++
				if (eof()) {
					// A quoted-pair needs a character after the backslash
					break
				}
				c = peek()
			} else if (c == '\r') {
				// Unquoted CR is explicitly disallowed in qtext
				break
			}

			// Plain or quoted character, accepted only if it is ASCII
			if (c >= 0x7F.toChar()) {
				break
			}
			sb.append(c)
			position++
		}

		// Failed, rollback
		position = rollback
		return null
	}

	/**
	 * BNF utility: matches [literal] exactly (optionally ignoring case) after skipping leading whitespace.
	 * An empty literal always matches and consumes nothing.
	 *
	 * @return [literal] on match, null (with [position] restored) otherwise
	 */
	fun parseLiteral(literal:String, ignoreCase:Boolean):String? {
		if (literal.isEmpty()) {
			return literal
		}

		val rollback = position
		eatWhitespace()

		for (expected in literal) {
			if (eof() || !peek().equals(expected, ignoreCase = ignoreCase)) {
				position = rollback
				return null
			}
			position++
		}

		return literal
	}

	/** RFC2045 value: a token or, failing that, a quoted-string. */
	fun parseValue():String? {
		return parseToken() ?: parseQuotedString()
	}

	/** RFC2183 disposition-type: `inline`, `attachment` or an extension token. */
	fun parseDispositionType():String? {
		return parseLiteral("inline", true)
			?: parseLiteral("attachment", true)
			?:
			// RFC2045 extension-token
			// (since that calls for knowledge of all existing RFCs, let's just parse token)
			parseToken()
	}

	/**
	 * RFC5987 mime-charset: consumes the longest run of mime-charsetc characters.
	 *
	 * @return the consumed run, possibly empty; never fails
	 */
	fun parseCharset():String {
		val start = position
		while (!eof()) {
			val c = peek()
			if (c !in 'a'..'z' && c !in 'A'..'Z' && c !in '0'..'9' && c !in "!#$%&+-^_`{}~") {
				break
			}
			position++
		}
		return content.substring(start, position)
	}

	/**
	 * RFC3986 2.1 pct-encoded single HEXDIG (either case).
	 *
	 * @return the digit value 0..15 (and the digit is consumed), or -1 when at [eof] or when the next
	 * character is not a hexadecimal digit (nothing is consumed)
	 */
	fun parseHEXDIG():Int {
		if (eof()) {
			return -1
		}
		val result = when (val c = peek()) {
			in '0'..'9' -> c - '0'
			// Only a-f are hexadecimal digits; accepting g-z would silently decode garbage like "%0z"
			in 'a'..'f' -> c - 'a' + 10
			in 'A'..'F' -> c - 'A' + 10
			else -> return -1
		}
		position++
		return result
	}

	/**
	 * RFC5987 ext-value: `charset "'" [ language ] "'" value-chars`.
	 *
	 * The language tag is consumed but ignored. Percent-encoded octets are decoded with the named charset;
	 * UTF-8 and ISO-8859-1 are matched directly, any other name is looked up via [Charset.forName] and falls
	 * back to US-ASCII when the lookup fails. Undecodable bytes become `?`.
	 * Parsing stops (successfully) at the first character that is neither an attr-char nor `%`.
	 *
	 * @return the decoded value, or null (with [position] restored) when either `'` delimiter is missing
	 * or a `%` is not followed by two hexadecimal digits
	 */
	fun parseExtendedValue():String? {
		val rollback = position
		eatWhitespace()
		val charsetName = parseCharset()
		if (parseLiteral("'", false) == null) {
			position = rollback
			return null
		}

		val charsetDecoder = when {
			charsetName.equals("UTF-8", ignoreCase = true) -> Charsets.UTF_8
			charsetName.equals("ISO-8859-1", ignoreCase = true) -> Charsets.ISO_8859_1
			else -> try {
				// Technically, this is not allowed, but we will allow it anyway
				Charset.forName(charsetName)
			} catch (e:Exception) {
				// No such charset! Append stuff directly as ascii and hope for the best.
				Charsets.US_ASCII
			}
		}.newDecoder()
			.onMalformedInput(CodingErrorAction.REPLACE)
			.onUnmappableCharacter(CodingErrorAction.REPLACE)
			.replaceWith("?")

		// Parse language
		// Language-Tag from https://tools.ietf.org/html/rfc5646#section-2.1
		// We only eat it and don't care if it is 100% correct
		while (true) {
			if (eof()) {
				// The second "'" can no longer follow
				position = rollback
				return null
			}

			val c = peek()
			if (c !in 'a'..'z' && c !in 'A'..'Z' && c !in '0'..'9' && c != '-') {
				break
			}

			position++
		}

		if (parseLiteral("'", false) == null) {
			position = rollback
			return null
		}

		// value-chars
		val sb = StringBuilder()
		// Every encoded octet takes three input characters, so this bounds the longest possible run
		val byteBuffer = ByteBuffer.allocate((content.length - position)/3)
		val charBuffer = CharBuffer.allocate(byteBuffer.capacity() * 2)// Most pessimistic estimate
		// True while byteBuffer holds octets which were not decoded into sb yet
		var byteBufferDirty = false

		while (!eof()) {
			val c = peek()
			if (c in 'a'..'z' || c in 'A'..'Z' || c in '0'..'9' || c in "!#\$&+-.^_`|~") {
				// A literal attr-char ends the current run of encoded octets, decode them first to keep order
				if (byteBufferDirty) {
					byteBufferDirty = false
					pumpDecoder(byteBuffer, charBuffer, charsetDecoder, sb)
				}

				sb.append(c)
				position++
			} else if (c == '%') {
				position++
				val firstHexDig = parseHEXDIG()
				val secondHexDig = parseHEXDIG()
				if (firstHexDig < 0 || secondHexDig < 0) {
					// 2 HEXDIG expected, got EOF or a different character
					position = rollback
					return null
				}
				byteBuffer.put(((firstHexDig shl 4) or secondHexDig).toByte())
				byteBufferDirty = true
			} else {
				// Invalid value-char, end of string!
				break
			}
		}
		if (byteBufferDirty) {
			pumpDecoder(byteBuffer, charBuffer, charsetDecoder, sb)
		}

		return sb.toString()
	}

	/**
	 * Decodes everything written into [bytes] as one complete input and appends the result to [out].
	 * [bytes] is always left cleared and ready for writing, [chars] is used as scratch space.
	 * If decoding throws, the error is logged and up to three `?` are appended instead.
	 */
	private fun pumpDecoder(bytes:ByteBuffer, chars:CharBuffer, decoder:CharsetDecoder, out:java.lang.StringBuilder) {
		try {
			bytes.flip()
			chars.clear()
			val byteCount = bytes.limit()
			try {
				// The decoder is reused for every run of octets, so follow the documented
				// reset -> decode(endOfInput = true) -> flush sequence each time
				decoder.reset()
				decoder.decode(bytes, chars, true)
				decoder.flush(chars)
				chars.flip()
			} catch (e: Exception) {
				LOG.error("Decoding {} bytes to {} failed", byteCount, decoder.charset(), e)
				for (i in 0 until min(byteCount, 3)) {
					out.append('?')
				}
				return
			}
			while (chars.hasRemaining()) {
				out.append(chars.get())
			}
		} finally {
			bytes.clear()
		}
	}

	/**
	 * RFC2045 parameter (`attribute "=" value`).
	 * Also matches RFC2183 disposition-parm subrules all into one.
	 * When the attribute name ends with `*`, the value is first tried as an RFC5987 ext-value
	 * and only then as an ordinary value.
	 *
	 * @return attribute name and parsed value, or null (with [position] restored) when any part is missing
	 */
	fun parseParameter():Pair<String, String>? {
		val rollback = position
		val attribute = parseToken() ?: return null
		if (parseLiteral("=", true) == null) {
			position = rollback
			return null
		}

		val value = if (attribute.endsWith('*')) {
			parseExtendedValue() ?: parseValue()
		} else {
			parseValue()
		}
		if (value == null) {
			position = rollback
			return null
		}

		return attribute to value
	}
}

/**
 * Extracts the file name carried by the value of a Content-Disposition header.
 * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
 *
 * The first `filename*` parameter wins as soon as it is seen; otherwise the last `filename` parameter is used.
 *
 * @param contentDispositionContent header value (without the header name), may be null
 * @return the file name, or null when the input is null, the disposition type or any parameter
 * reached while scanning can not be parsed, or no filename parameter is present
 */
fun parseContentDispositionFilename(contentDispositionContent:CharSequence?):String? {
	if (contentDispositionContent == null) {
		return null
	}
	val headerParser = Parser(contentDispositionContent)
	if (headerParser.parseDispositionType() == null) {
		return null
	}

	var plainFilename:String? = null
	while (true) {
		// Parameters are separated by ';', stop at the first thing that is not a separator
		if (headerParser.parseLiteral(";", true) == null) {
			break
		}
		val parameter = headerParser.parseParameter() ?: return null
		val name = parameter.first
		when {
			// The extended form takes precedence, no need to look any further
			name.equals("filename*", true) -> return parameter.second
			name.equals("filename", true) -> plainFilename = parameter.second
		}
	}
	return plainFilename
}

## HeaderParserTest.kt

import org.junit.Assert
import org.junit.Test

class HeaderParserTest {

	/** Runs [parseContentDispositionFilename] over a table of header values paired with the expected file name. */
	@Test
	fun testParseContentDispositionFilename() {
		// expected file name to header value, checked in this order
		val cases:List<Pair<String?, String>> = listOf(
			null to "form-data; name=\"fieldName\"",
			"filename.jpg" to "form-data; name=\"fieldName\"; filename=\"filename.jpg\"",
			"cool.html" to " attachment; filename=\"cool.html\"",
			"example.txt" to "form-data; name=\"field2\"; filename=\"example.txt\"",
			null to "bar; title=Economy",
			"Economy" to "bar; filename=Economy",
			"US-$ rates" to "bar; filename=\"US-\$ rates\"",
			"£ rates" to "bar; filename*=iso-8859-1'en'%A3%20rates",
			"£ and € rates" to "bar; filename*=UTF-8''%c2%a3%20and%20%e2%82%ac%20rates",
			"€ exchange rates" to "bar; filename=\"EURO exchange rates\";    filename*=utf-8''%e2%82%ac%20exchange%20rates",
			"EURO exchange rates" to "bar; filename=\"EURO exchange rates\";    filenameNOT*=utf-8''%e2%82%ac%20exchange%20rates"
		)

		for ((expectedFilename, headerValue) in cases) {
			Assert.assertEquals(expectedFilename, parseContentDispositionFilename(headerValue))
		}
	}

}

	/*
	The MIT-Zero License

	Copyright (c) 2021 Jan Polák

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	THE SOFTWARE.
	*/

	import org.slf4j.LoggerFactory
	import java.nio.ByteBuffer
	import java.nio.CharBuffer
	import java.nio.charset.Charset
	import java.nio.charset.CharsetDecoder
	import java.nio.charset.CodingErrorAction
	import kotlin.math.min

	/*
	* Implements basic parsing primitives for headers as defined by various RFCs:
	* https://tools.ietf.org/html/rfc2045 (various tokens)
	* https://tools.ietf.org/html/rfc822 (various tokens, BNF notation used in other RFCs, see section 2)
	* https://tools.ietf.org/html/rfc2183 (disposition type)
	* https://tools.ietf.org/html/rfc2231 (not supported)
	* https://tools.ietf.org/html/rfc5987 (supported)
	*/

	// File-level logger; the parser uses it to report failures while decoding percent-encoded bytes.
	private val LOG = LoggerFactory.getLogger("HeaderParser")

	/**
	 * Cursor over [content] offering the parsing primitives needed for `Content-Disposition`-like headers.
	 *
	 * Rules that return a nullable value return `null` when they do not match and then leave [position]
	 * where it was, so alternatives can be chained with `?:`.
	 * [parseCharset] and [parseHEXDIG] report "no match" as an empty string and `-1` respectively.
	 */
	private class Parser(val content:CharSequence) {
		/** Index of the next unread character of [content]. */
		var position = 0

		/** @return true when every character of [content] has been consumed */
		fun eof():Boolean = position >= content.length

		/** @return the next unread character; must not be called when [eof] is true */
		fun peek():Char = content[position]

		/** Skips any run of spaces and horizontal tabs (RFC822 linear white space without folding). */
		fun eatWhitespace() {
			while (!eof()) {
				val c = peek()
				if (c == ' ' || c == '\t') {
					position++
				} else {
					break
				}
			}
		}

		/**
		 * RFC2045 token: one or more ASCII characters that are not controls, space or tspecials.
		 * Leading whitespace is skipped.
		 *
		 * @return the token, or null (with [position] restored) when no token character follows
		 */
		fun parseToken():String? {
			val rollback = position
			eatWhitespace()
			val start = position
			while (!eof()) {
				val c = peek()
				if (c <= ' ' || c >= 0x7F.toChar() || c in /* RFC2045 tspecials */ "()<>@,;:\\\"/[]?=") {
					break
				}
				position++
			}
			if (start == position) {
				// At least one character is required; undo the whitespace skip like every other rule does
				position = rollback
				return null
			}
			return content.substring(start, position)
		}

		/**
		 * RFC822 quoted-string. Leading whitespace is skipped, quoted-pairs (`\x`) are unescaped.
		 *
		 * @return the content between the quotes, or null (with [position] restored) when the input does not
		 * start with `"`, the closing quote is missing, a bare CR is found or a character >= 0x7F is found
		 */
		fun parseQuotedString():String? {
			val rollback = position
			eatWhitespace()
			if (eof() || peek() != '"') {
				position = rollback
				return null
			}
			position++
			val sb = StringBuilder()
			while (!eof()) {
				var c = peek()
				if (c == '"') {
					// Closing quote, done
					position++
					return sb.toString()
				}

				// Parse qtext/quoted-pair
				if (c == '\\') {
					position++
					if (eof()) {
						// A quoted-pair needs a character after the backslash
						break
					}
					c = peek()
				} else if (c == '\r') {
					// Unquoted CR is explicitly disallowed in qtext
					break
				}

				// Plain or quoted character, accepted only if it is ASCII
				if (c >= 0x7F.toChar()) {
					break
				}
				sb.append(c)
				position++
			}

			// Failed, rollback
			position = rollback
			return null
		}

		/**
		 * BNF utility: matches [literal] exactly (optionally ignoring case) after skipping leading whitespace.
		 * An empty literal always matches and consumes nothing.
		 *
		 * @return [literal] on match, null (with [position] restored) otherwise
		 */
		fun parseLiteral(literal:String, ignoreCase:Boolean):String? {
			if (literal.isEmpty()) {
				return literal
			}

			val rollback = position
			eatWhitespace()

			for (expected in literal) {
				if (eof() || !peek().equals(expected, ignoreCase = ignoreCase)) {
					position = rollback
					return null
				}
				position++
			}

			return literal
		}

		/** RFC2045 value: a token or, failing that, a quoted-string. */
		fun parseValue():String? {
			return parseToken() ?: parseQuotedString()
		}

		/** RFC2183 disposition-type: `inline`, `attachment` or an extension token. */
		fun parseDispositionType():String? {
			return parseLiteral("inline", true)
				?: parseLiteral("attachment", true)
				?:
				// RFC2045 extension-token
				// (since that calls for knowledge of all existing RFCs, let's just parse token)
				parseToken()
		}

		/**
		 * RFC5987 mime-charset: consumes the longest run of mime-charsetc characters.
		 *
		 * @return the consumed run, possibly empty; never fails
		 */
		fun parseCharset():String {
			val start = position
			while (!eof()) {
				val c = peek()
				if (c !in 'a'..'z' && c !in 'A'..'Z' && c !in '0'..'9' && c !in "!#$%&+-^_`{}~") {
					break
				}
				position++
			}
			return content.substring(start, position)
		}

		/**
		 * RFC3986 2.1 pct-encoded single HEXDIG (either case).
		 *
		 * @return the digit value 0..15 (and the digit is consumed), or -1 when at [eof] or when the next
		 * character is not a hexadecimal digit (nothing is consumed)
		 */
		fun parseHEXDIG():Int {
			if (eof()) {
				return -1
			}
			val result = when (val c = peek()) {
				in '0'..'9' -> c - '0'
				// Only a-f are hexadecimal digits; accepting g-z would silently decode garbage like "%0z"
				in 'a'..'f' -> c - 'a' + 10
				in 'A'..'F' -> c - 'A' + 10
				else -> return -1
			}
			position++
			return result
		}

		/**
		 * RFC5987 ext-value: `charset "'" [ language ] "'" value-chars`.
		 *
		 * The language tag is consumed but ignored. Percent-encoded octets are decoded with the named charset;
		 * UTF-8 and ISO-8859-1 are matched directly, any other name is looked up via [Charset.forName] and falls
		 * back to US-ASCII when the lookup fails. Undecodable bytes become `?`.
		 * Parsing stops (successfully) at the first character that is neither an attr-char nor `%`.
		 *
		 * @return the decoded value, or null (with [position] restored) when either `'` delimiter is missing
		 * or a `%` is not followed by two hexadecimal digits
		 */
		fun parseExtendedValue():String? {
			val rollback = position
			eatWhitespace()
			val charsetName = parseCharset()
			if (parseLiteral("'", false) == null) {
				position = rollback
				return null
			}

			val charsetDecoder = when {
				charsetName.equals("UTF-8", ignoreCase = true) -> Charsets.UTF_8
				charsetName.equals("ISO-8859-1", ignoreCase = true) -> Charsets.ISO_8859_1
				else -> try {
					// Technically, this is not allowed, but we will allow it anyway
					Charset.forName(charsetName)
				} catch (e:Exception) {
					// No such charset! Append stuff directly as ascii and hope for the best.
					Charsets.US_ASCII
				}
			}.newDecoder()
				.onMalformedInput(CodingErrorAction.REPLACE)
				.onUnmappableCharacter(CodingErrorAction.REPLACE)
				.replaceWith("?")

			// Parse language
			// Language-Tag from https://tools.ietf.org/html/rfc5646#section-2.1
			// We only eat it and don't care if it is 100% correct
			while (true) {
				if (eof()) {
					// The second "'" can no longer follow
					position = rollback
					return null
				}

				val c = peek()
				if (c !in 'a'..'z' && c !in 'A'..'Z' && c !in '0'..'9' && c != '-') {
					break
				}

				position++
			}

			if (parseLiteral("'", false) == null) {
				position = rollback
				return null
			}

			// value-chars
			val sb = StringBuilder()
			// Every encoded octet takes three input characters, so this bounds the longest possible run
			val byteBuffer = ByteBuffer.allocate((content.length - position)/3)
			val charBuffer = CharBuffer.allocate(byteBuffer.capacity() * 2)// Most pessimistic estimate
			// True while byteBuffer holds octets which were not decoded into sb yet
			var byteBufferDirty = false

			while (!eof()) {
				val c = peek()
				if (c in 'a'..'z' || c in 'A'..'Z' || c in '0'..'9' || c in "!#\$&+-.^_`|~") {
					// A literal attr-char ends the current run of encoded octets, decode them first to keep order
					if (byteBufferDirty) {
						byteBufferDirty = false
						pumpDecoder(byteBuffer, charBuffer, charsetDecoder, sb)
					}

					sb.append(c)
					position++
				} else if (c == '%') {
					position++
					val firstHexDig = parseHEXDIG()
					val secondHexDig = parseHEXDIG()
					if (firstHexDig < 0 || secondHexDig < 0) {
						// 2 HEXDIG expected, got EOF or a different character
						position = rollback
						return null
					}
					byteBuffer.put(((firstHexDig shl 4) or secondHexDig).toByte())
					byteBufferDirty = true
				} else {
					// Invalid value-char, end of string!
					break
				}
			}
			if (byteBufferDirty) {
				pumpDecoder(byteBuffer, charBuffer, charsetDecoder, sb)
			}

			return sb.toString()
		}

		/**
		 * Decodes everything written into [bytes] as one complete input and appends the result to [out].
		 * [bytes] is always left cleared and ready for writing, [chars] is used as scratch space.
		 * If decoding throws, the error is logged and up to three `?` are appended instead.
		 */
		private fun pumpDecoder(bytes:ByteBuffer, chars:CharBuffer, decoder:CharsetDecoder, out:java.lang.StringBuilder) {
			try {
				bytes.flip()
				chars.clear()
				val byteCount = bytes.limit()
				try {
					// The decoder is reused for every run of octets, so follow the documented
					// reset -> decode(endOfInput = true) -> flush sequence each time
					decoder.reset()
					decoder.decode(bytes, chars, true)
					decoder.flush(chars)
					chars.flip()
				} catch (e: Exception) {
					LOG.error("Decoding {} bytes to {} failed", byteCount, decoder.charset(), e)
					for (i in 0 until min(byteCount, 3)) {
						out.append('?')
					}
					return
				}
				while (chars.hasRemaining()) {
					out.append(chars.get())
				}
			} finally {
				bytes.clear()
			}
		}

		/**
		 * RFC2045 parameter (`attribute "=" value`).
		 * Also matches RFC2183 disposition-parm subrules all into one.
		 * When the attribute name ends with `*`, the value is first tried as an RFC5987 ext-value
		 * and only then as an ordinary value.
		 *
		 * @return attribute name and parsed value, or null (with [position] restored) when any part is missing
		 */
		fun parseParameter():Pair<String, String>? {
			val rollback = position
			val attribute = parseToken() ?: return null
			if (parseLiteral("=", true) == null) {
				position = rollback
				return null
			}

			val value = if (attribute.endsWith('*')) {
				parseExtendedValue() ?: parseValue()
			} else {
				parseValue()
			}
			if (value == null) {
				position = rollback
				return null
			}

			return attribute to value
		}
	}

	/**
	 * Extracts the file name carried by the value of a Content-Disposition header.
	 * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
	 *
	 * The first `filename*` parameter wins as soon as it is seen; otherwise the last `filename` parameter is used.
	 *
	 * @param contentDispositionContent header value (without the header name), may be null
	 * @return the file name, or null when the input is null, the disposition type or any parameter
	 * reached while scanning can not be parsed, or no filename parameter is present
	 */
	fun parseContentDispositionFilename(contentDispositionContent:CharSequence?):String? {
		if (contentDispositionContent == null) {
			return null
		}
		val headerParser = Parser(contentDispositionContent)
		if (headerParser.parseDispositionType() == null) {
			return null
		}

		var plainFilename:String? = null
		while (true) {
			// Parameters are separated by ';', stop at the first thing that is not a separator
			if (headerParser.parseLiteral(";", true) == null) {
				break
			}
			val parameter = headerParser.parseParameter() ?: return null
			val name = parameter.first
			when {
				// The extended form takes precedence, no need to look any further
				name.equals("filename*", true) -> return parameter.second
				name.equals("filename", true) -> plainFilename = parameter.second
			}
		}
		return plainFilename
	}

	import org.junit.Assert
	import org.junit.Test

	class HeaderParserTest {

		/** Runs [parseContentDispositionFilename] over a table of header values paired with the expected file name. */
		@Test
		fun testParseContentDispositionFilename() {
			// expected file name to header value, checked in this order
			val cases:List<Pair<String?, String>> = listOf(
				null to "form-data; name=\"fieldName\"",
				"filename.jpg" to "form-data; name=\"fieldName\"; filename=\"filename.jpg\"",
				"cool.html" to " attachment; filename=\"cool.html\"",
				"example.txt" to "form-data; name=\"field2\"; filename=\"example.txt\"",
				null to "bar; title=Economy",
				"Economy" to "bar; filename=Economy",
				"US-$ rates" to "bar; filename=\"US-\$ rates\"",
				"£ rates" to "bar; filename*=iso-8859-1'en'%A3%20rates",
				"£ and € rates" to "bar; filename*=UTF-8''%c2%a3%20and%20%e2%82%ac%20rates",
				"€ exchange rates" to "bar; filename=\"EURO exchange rates\"; filename*=utf-8''%e2%82%ac%20exchange%20rates",
				"EURO exchange rates" to "bar; filename=\"EURO exchange rates\"; filenameNOT*=utf-8''%e2%82%ac%20exchange%20rates"
			)

			for ((expectedFilename, headerValue) in cases) {
				Assert.assertEquals(expectedFilename, parseContentDispositionFilename(headerValue))
			}
		}

	}