Last active
February 25, 2021 22:37
-
-
Save Darkyenus/bbffdcf5236d62c01a69e80f6d226e89 to your computer and use it in GitHub Desktop.
HTTP Header Content Parser (especially for Content-Disposition)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
The MIT-Zero License | |
Copyright (c) 2021 Jan Polák | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
THE SOFTWARE. | |
*/ | |
import org.slf4j.LoggerFactory | |
import java.nio.ByteBuffer | |
import java.nio.CharBuffer | |
import java.nio.charset.Charset | |
import java.nio.charset.CharsetDecoder | |
import java.nio.charset.CodingErrorAction | |
import kotlin.math.min | |
/*
 * Implements basic parsing primitives for headers as defined by the following RFCs:
 * https://tools.ietf.org/html/rfc2045 (various tokens)
 * https://tools.ietf.org/html/rfc822 (various tokens; the BNF notation used by the other RFCs, see section 2)
 * https://tools.ietf.org/html/rfc2183 (disposition type)
 * https://tools.ietf.org/html/rfc2231 (not supported)
 * https://tools.ietf.org/html/rfc5987 (supported)
 */
/** Logger registered under the name "HeaderParser"; the parser reports charset decoding failures through it. */
private val LOG = LoggerFactory.getLogger("HeaderParser")
/**
 * Small backtracking recursive-descent parser over a single header value.
 *
 * Every `parseXxx` function that returns a nullable value follows the same contract:
 * on success [position] is advanced past the recognised construct, on failure `null`
 * is returned and [position] is restored to where it was when the function was called.
 *
 * @property content the raw header value being parsed
 */
private class Parser(val content: CharSequence) {

    /** Index of the next unread character of [content]. */
    var position = 0

    /** @return `true` when every character of [content] has been consumed */
    fun eof(): Boolean = position >= content.length

    /** @return the next unread character; must not be called when [eof] is `true` */
    fun peek(): Char = content[position]

    /** Skips any run of spaces and horizontal tabs (RFC822 SPACE / HTAB). */
    fun eatWhitespace() {
        while (!eof() && (peek() == ' ' || peek() == '\t')) {
            position++
        }
    }

    /**
     * RFC2045 token: one or more printable ASCII characters that are neither
     * whitespace, control characters nor tspecials. Leading whitespace is skipped.
     *
     * @return the token text, or `null` (with [position] restored) if no token character is present
     */
    fun parseToken(): String? {
        val rollback = position
        eatWhitespace()
        val start = position
        while (!eof() && isTokenChar(peek())) {
            position++
        }
        if (position == start) {
            // At least one character is required; undo the whitespace skip so that
            // failure leaves the parser untouched, like the other primitives.
            position = rollback
            return null
        }
        return content.substring(start, position)
    }

    /**
     * RFC822 quoted-string. Leading whitespace is skipped, backslash escapes
     * (quoted-pair) are resolved, and the surrounding quotes are dropped.
     *
     * @return the unescaped content, or `null` if there is no opening quote, the string
     * is unterminated, contains a bare CR, or contains a character at or above 0x7F
     */
    fun parseQuotedString(): String? {
        val rollback = position
        eatWhitespace()
        if (eof() || peek() != '"') {
            position = rollback
            return null
        }
        position++ // opening quote

        val text = StringBuilder()
        while (!eof()) {
            var c = peek()
            if (c == '"') {
                position++ // closing quote
                return text.toString()
            }
            if (c == '\\') {
                // quoted-pair: the character after the backslash is taken literally
                position++
                if (eof()) {
                    break
                }
                c = peek()
            } else if (c == '\r') {
                // A bare CR is explicitly disallowed in qtext
                break
            }
            if (c >= '\u007F') {
                // Only ASCII is allowed, escaped or not
                break
            }
            text.append(c)
            position++
        }

        // Unterminated or invalid
        position = rollback
        return null
    }

    /**
     * BNF utility: matches [literal] exactly, after skipping leading whitespace.
     *
     * @param literal the text to match; an empty literal always matches without consuming anything
     * @param ignoreCase whether characters are compared case-insensitively
     * @return [literal] itself on success, `null` otherwise
     */
    fun parseLiteral(literal: String, ignoreCase: Boolean): String? {
        if (literal.isEmpty()) {
            return literal
        }
        val rollback = position
        eatWhitespace()
        for (expected in literal) {
            if (eof() || !peek().equals(expected, ignoreCase = ignoreCase)) {
                position = rollback
                return null
            }
            position++
        }
        return literal
    }

    /** RFC2045 value: a token or, failing that, a quoted-string. */
    fun parseValue(): String? = parseToken() ?: parseQuotedString()

    /**
     * RFC2183 disposition-type: "inline", "attachment" or an extension token.
     *
     * The whole token is read before it is classified, so a token that merely starts
     * with "inline" or "attachment" (for example "inline-preview") is kept intact
     * instead of being split after the prefix.
     *
     * @return "inline" or "attachment" in canonical lower case when the token equals
     * one of them case-insensitively, the token as written for any other type, or
     * `null` when no token is present
     */
    fun parseDispositionType(): String? {
        val type = parseToken() ?: return null
        return when {
            type.equals("inline", ignoreCase = true) -> "inline"
            type.equals("attachment", ignoreCase = true) -> "attachment"
            // RFC2045 extension-token: recognising all of them would require knowledge
            // of every registered type, so any token is accepted.
            else -> type
        }
    }

    /**
     * RFC5987 mime-charset. Consumes the longest run of mime-charset characters.
     *
     * @return the consumed text, which is empty when the next character is not a charset character
     */
    fun parseCharset(): String {
        val start = position
        while (!eof() && isMimeCharsetChar(peek())) {
            position++
        }
        return content.substring(start, position)
    }

    /**
     * A single HEXDIG of an RFC3986 section 2.1 pct-encoded octet.
     *
     * @return the digit value in `0..15`, or `-1` (consuming nothing) at end of input
     * or when the next character is not one of `0-9`, `a-f`, `A-F`
     */
    fun parseHEXDIG(): Int {
        if (eof()) {
            return -1
        }
        val c = peek()
        val digit = when (c) {
            in '0'..'9' -> c - '0'
            in 'a'..'f' -> c - 'a' + 10
            in 'A'..'F' -> c - 'A' + 10
            else -> return -1
        }
        position++
        return digit
    }

    /**
     * RFC5987 ext-value: `charset "'" [ language ] "'" value-chars`.
     *
     * Percent-encoded octets are decoded with the declared charset; malformed or
     * unmappable sequences become `?`. An unknown charset name falls back to US-ASCII.
     * The language tag is consumed but otherwise ignored.
     *
     * @return the decoded value, or `null` if either apostrophe is missing or a `%`
     * is not followed by two hexadecimal digits
     */
    fun parseExtendedValue(): String? {
        val rollback = position
        eatWhitespace()
        val charsetName = parseCharset()
        if (parseLiteral("'", ignoreCase = false) == null) {
            position = rollback
            return null
        }

        val charset = when {
            charsetName.equals("UTF-8", ignoreCase = true) -> Charsets.UTF_8
            charsetName.equals("ISO-8859-1", ignoreCase = true) -> Charsets.ISO_8859_1
            else -> try {
                // Technically other charsets are not allowed, but they are accepted anyway
                Charset.forName(charsetName)
            } catch (e: IllegalArgumentException) {
                // Illegal or unsupported charset name: treat the octets as ASCII and hope for the best
                Charsets.US_ASCII
            }
        }
        val decoder = charset.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .replaceWith("?")

        // Language-Tag (https://tools.ietf.org/html/rfc5646#section-2.1): only skipped,
        // its correctness is not verified.
        while (!eof() && (isAsciiAlphanumeric(peek()) || peek() == '-')) {
            position++
        }
        // Also fails at end of input, i.e. when the second apostrophe is missing
        if (parseLiteral("'", ignoreCase = false) == null) {
            position = rollback
            return null
        }

        // value-chars
        val result = StringBuilder()
        // Each encoded octet takes three characters, which bounds the number of octets
        val pendingBytes = ByteBuffer.allocate((content.length - position) / 3)
        val decoded = CharBuffer.allocate(pendingBytes.capacity() * 2) // pessimistic estimate
        scan@ while (!eof()) {
            val c = peek()
            when {
                isAttrChar(c) -> {
                    // A literal character ends the current run of encoded octets
                    if (pendingBytes.position() > 0) {
                        pumpDecoder(pendingBytes, decoded, decoder, result)
                    }
                    result.append(c)
                    position++
                }
                c == '%' -> {
                    position++
                    val high = parseHEXDIG()
                    val low = parseHEXDIG()
                    if (high < 0 || low < 0) {
                        // Two HEXDIG expected
                        position = rollback
                        return null
                    }
                    pendingBytes.put(((high shl 4) or low).toByte())
                }
                // Not a value-char: the value ends here
                else -> break@scan
            }
        }
        if (pendingBytes.position() > 0) {
            pumpDecoder(pendingBytes, decoded, decoder, result)
        }
        return result.toString()
    }

    /**
     * Decodes everything accumulated in [bytes] with [decoder] and appends the result
     * to [out]. [bytes] is always left cleared and ready for new octets.
     *
     * The decoder is driven through the reset / decode / flush sequence each time,
     * because every run of octets is decoded as an independent piece of input.
     * If decoding throws, the failure is logged and up to three `?` are appended instead.
     */
    private fun pumpDecoder(bytes: ByteBuffer, chars: CharBuffer, decoder: CharsetDecoder, out: StringBuilder) {
        try {
            bytes.flip()
            chars.clear()
            val byteCount = bytes.limit()
            try {
                decoder.reset()
                decoder.decode(bytes, chars, true)
                decoder.flush(chars)
                chars.flip()
            } catch (e: Exception) {
                LOG.error("Decoding {} bytes to {} failed", byteCount, decoder.charset(), e)
                repeat(min(byteCount, 3)) {
                    out.append('?')
                }
                return
            }
            while (chars.hasRemaining()) {
                out.append(chars.get())
            }
        } finally {
            bytes.clear()
        }
    }

    /**
     * RFC2045 parameter (`attribute "=" value`); also covers the RFC2183
     * disposition-parm subrules in one go.
     *
     * When the attribute name ends with `*`, the value is first tried as an RFC5987
     * ext-value and only then as a plain value.
     *
     * @return the attribute name (as written) paired with its parsed value, or `null`
     * if the attribute, the `=` or the value is missing
     */
    fun parseParameter(): Pair<String, String>? {
        val rollback = position
        val attribute = parseToken() ?: return null
        if (parseLiteral("=", ignoreCase = true) == null) {
            position = rollback
            return null
        }
        val value = if (attribute.endsWith('*')) {
            parseExtendedValue() ?: parseValue()
        } else {
            parseValue()
        }
        if (value == null) {
            position = rollback
            return null
        }
        return attribute to value
    }

    /** RFC2045 token character: printable ASCII except space and tspecials. */
    private fun isTokenChar(c: Char): Boolean = c > ' ' && c < '\u007F' && c !in TSPECIALS

    /** ASCII letter or digit. */
    private fun isAsciiAlphanumeric(c: Char): Boolean = c in 'a'..'z' || c in 'A'..'Z' || c in '0'..'9'

    /** RFC5987 mime-charsetc. */
    private fun isMimeCharsetChar(c: Char): Boolean = isAsciiAlphanumeric(c) || c in MIME_CHARSET_SYMBOLS

    /** RFC5987 attr-char. */
    private fun isAttrChar(c: Char): Boolean = isAsciiAlphanumeric(c) || c in ATTR_CHAR_SYMBOLS

    private companion object {
        /** RFC2045 tspecials. */
        const val TSPECIALS = "()<>@,;:\\\"/[]?="

        /** Non-alphanumeric characters permitted in an RFC5987 mime-charset. */
        const val MIME_CHARSET_SYMBOLS = "!#\$%&+-^_`{}~"

        /** Non-alphanumeric characters permitted unencoded in RFC5987 value-chars. */
        const val ATTR_CHAR_SYMBOLS = "!#\$&+-.^_`|~"
    }
}
/**
 * Parse filename from the Content-Disposition header content.
 * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
 *
 * A `filename*` parameter takes precedence: its value is returned as soon as it is
 * encountered. Otherwise the value of the last `filename` parameter seen is returned.
 * Parameter names are compared case-insensitively.
 *
 * Parsing is lenient towards a malformed tail: if a parameter after a `;` cannot be
 * parsed (for example a trailing `;` with nothing after it), scanning stops and the
 * `filename` found so far, if any, is returned instead of discarding it.
 *
 * @param contentDispositionContent the header value, without the header name; may be `null`
 * @return the file name, or `null` if the input is `null`, has no disposition type,
 * or carries no usable filename parameter
 */
fun parseContentDispositionFilename(contentDispositionContent: CharSequence?): String? {
    val parser = Parser(contentDispositionContent ?: return null)
    parser.parseDispositionType() ?: return null

    var plainFilename: String? = null
    while (parser.parseLiteral(";", ignoreCase = true) != null) {
        // Stop at the first unparseable parameter, keeping what has been found already
        val (key, value) = parser.parseParameter() ?: return plainFilename
        when {
            key.equals("filename*", ignoreCase = true) -> return value
            key.equals("filename", ignoreCase = true) -> plainFilename = value
        }
    }
    return plainFilename
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.junit.Assert | |
import org.junit.Test | |
/** Checks [parseContentDispositionFilename] against a table of header values and the file names expected from them. */
class HeaderParserTest {

    @Test
    fun testParseContentDispositionFilename() {
        // expected file name (null when none should be found) to header value, checked in order
        val cases: List<Pair<String?, String>> = listOf(
            null to "form-data; name=\"fieldName\"",
            "filename.jpg" to "form-data; name=\"fieldName\"; filename=\"filename.jpg\"",
            "cool.html" to " attachment; filename=\"cool.html\"",
            "example.txt" to "form-data; name=\"field2\"; filename=\"example.txt\"",
            null to "bar; title=Economy",
            "Economy" to "bar; filename=Economy",
            "US-$ rates" to "bar; filename=\"US-\$ rates\"",
            "£ rates" to "bar; filename*=iso-8859-1'en'%A3%20rates",
            "£ and € rates" to "bar; filename*=UTF-8''%c2%a3%20and%20%e2%82%ac%20rates",
            "€ exchange rates" to "bar; filename=\"EURO exchange rates\"; filename*=utf-8''%e2%82%ac%20exchange%20rates",
            "EURO exchange rates" to "bar; filename=\"EURO exchange rates\"; filenameNOT*=utf-8''%e2%82%ac%20exchange%20rates"
        )
        for ((expected, header) in cases) {
            Assert.assertEquals(expected, parseContentDispositionFilename(header))
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment