Skip to content

Instantly share code, notes, and snippets.

@Darkyenus
Last active February 25, 2021 22:37
Show Gist options
  • Save Darkyenus/bbffdcf5236d62c01a69e80f6d226e89 to your computer and use it in GitHub Desktop.
Save Darkyenus/bbffdcf5236d62c01a69e80f6d226e89 to your computer and use it in GitHub Desktop.
HTTP Header Content Parser (especially for Content-Disposition)
/*
The MIT-Zero License
Copyright (c) 2021 Jan Polák
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
import org.slf4j.LoggerFactory
import java.nio.ByteBuffer
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.nio.charset.CharsetDecoder
import java.nio.charset.CodingErrorAction
import kotlin.math.min
/*
* Implements basic parsing primitives for headers as defined by various RFCs:
* https://tools.ietf.org/html/rfc2045 (various tokens)
* https://tools.ietf.org/html/rfc822 (various tokens, BNF notation used in other RFCs, see section 2)
* https://tools.ietf.org/html/rfc2183 (disposition type)
* https://tools.ietf.org/html/rfc2231 (not supported)
* https://tools.ietf.org/html/rfc5987 (supported)
*/
// File-private logger (named "HeaderParser"); Parser.pumpDecoder uses it to report charset decoding failures.
private val LOG = LoggerFactory.getLogger("HeaderParser")
/**
 * Hand-written recursive-descent parser over a single header value.
 *
 * Each `parse*` method tries to match one grammar rule starting at [position]. On success it
 * advances [position] past the matched text and returns the result. On failure it returns `null`
 * (`-1` for [parseHEXDIG]) and leaves [position] exactly where it was before the call.
 * [parseCharset] cannot fail; it returns an empty string when nothing matches.
 */
private class Parser(val content: CharSequence) {

    /** Index into [content] of the next character to consume. */
    var position = 0

    /** `true` once every character of [content] has been consumed. */
    fun eof(): Boolean = position >= content.length

    /** Returns the next character without consuming it. Callers must check [eof] first. */
    fun peek(): Char = content[position]

    /** RFC822 SPACE: skips any run of spaces and horizontal tabs. */
    fun eatWhitespace() {
        while (!eof() && (peek() == ' ' || peek() == '\t')) {
            position++
        }
    }

    /**
     * RFC2045 token: one or more printable ASCII characters that are neither a space,
     * a control character nor one of the tspecials. Leading whitespace is skipped.
     *
     * @return the token, or `null` (with [position] restored) when no token character is present
     */
    fun parseToken(): String? {
        val rollback = position
        eatWhitespace()
        val start = position
        while (!eof() && isTokenChar(peek())) {
            position++
        }
        if (start == position) {
            // At least one character is required. Undo the whitespace skip as well, so a failed
            // match leaves the parser untouched like every other parse method here.
            position = rollback
            return null
        }
        return content.substring(start, position)
    }

    /**
     * RFC822 quoted-string. Leading whitespace is skipped, the surrounding quotes are dropped
     * and backslash escapes (quoted-pair) are resolved.
     *
     * @return the unquoted text, or `null` when there is no opening quote, the string is not
     *         terminated, or it contains a bare CR or a character at or above 0x7F
     */
    fun parseQuotedString(): String? {
        val rollback = position
        eatWhitespace()
        if (eof() || peek() != '"') {
            position = rollback
            return null
        }
        position++ // opening quote
        val text = StringBuilder()
        while (!eof()) {
            var c = peek()
            if (c == '"') {
                // Closing quote: done
                position++
                return text.toString()
            }
            if (c == '\\') {
                // quoted-pair: the character after the backslash is taken literally
                position++
                if (eof()) {
                    break
                }
                c = peek()
            } else if (c == '\r') {
                // A bare CR is explicitly disallowed in qtext
                break
            }
            if (c >= DEL) {
                // Only ASCII below DEL is accepted, escaped or not
                break
            }
            text.append(c)
            position++
        }
        // Unterminated or invalid
        position = rollback
        return null
    }

    /**
     * BNF utility: matches [literal] verbatim after skipping leading whitespace.
     * An empty [literal] always matches and consumes nothing (not even whitespace).
     *
     * @param ignoreCase whether characters are compared case-insensitively
     * @return [literal] itself on a match, `null` otherwise
     */
    fun parseLiteral(literal: String, ignoreCase: Boolean): String? {
        if (literal.isEmpty()) {
            return literal
        }
        val rollback = position
        eatWhitespace()
        for (expected in literal) {
            if (eof() || !peek().equals(expected, ignoreCase = ignoreCase)) {
                position = rollback
                return null
            }
            position++
        }
        return literal
    }

    /** RFC2045 value: a token or, failing that, a quoted-string. */
    fun parseValue(): String? = parseToken() ?: parseQuotedString()

    /**
     * RFC2183 disposition-type: "inline", "attachment" or an extension-token.
     *
     * The whole token is read first and only then compared with the known types, so an
     * extension token that merely starts with "inline" or "attachment" (e.g. "attachments")
     * is consumed as a whole instead of being cut after the known prefix.
     *
     * @return "inline" or "attachment" in lower case when the token equals one of them
     *         (ignoring case), otherwise the token as written; `null` when there is no token
     */
    fun parseDispositionType(): String? {
        // RFC2045 extension-token would call for knowledge of all existing RFCs, so any token is accepted
        val type = parseToken() ?: return null
        return KNOWN_DISPOSITION_TYPES.firstOrNull { it.equals(type, ignoreCase = true) } ?: type
    }

    /**
     * RFC5987 mime-charset: consumes the longest run of mime-charsetc characters.
     *
     * @return the consumed text, possibly empty
     */
    fun parseCharset(): String {
        val start = position
        while (!eof() && (isAlphaNumeric(peek()) || peek() in MIME_CHARSET_EXTRA)) {
            position++
        }
        return content.substring(start, position)
    }

    /**
     * RFC3986 2.1 pct-encoded: a single HEXDIG (`0-9`, `a-f`, `A-F`).
     *
     * @return the digit's value in `0..15`, or `-1` (nothing consumed) at end of input or when
     *         the next character is not a hexadecimal digit
     */
    fun parseHEXDIG(): Int {
        if (eof()) {
            return -1
        }
        val digit = when (val c = peek()) {
            in '0'..'9' -> c - '0'
            // Only a-f are hexadecimal digits; letters beyond 'f' must be rejected
            in 'a'..'f' -> c - 'a' + 10
            in 'A'..'F' -> c - 'A' + 10
            else -> return -1
        }
        position++
        return digit
    }

    /**
     * RFC5987 ext-value: `charset "'" [ language ] "'" value-chars`.
     *
     * Percent-encoded octets are decoded with the named charset; undecodable input is replaced
     * by `?`. Charsets other than UTF-8 and ISO-8859-1 are looked up by name, and an unknown or
     * invalid name falls back to US-ASCII. The language tag is consumed but ignored. Parsing of
     * the value stops at the first character that is neither an attr-char nor `%`.
     *
     * @return the decoded value, or `null` when either `'` separator is missing or a `%` is not
     *         followed by two hexadecimal digits
     */
    fun parseExtendedValue(): String? {
        val rollback = position
        eatWhitespace()
        val charsetName = parseCharset()
        if (parseLiteral("'", false) == null) {
            position = rollback
            return null
        }
        val decoder = newLenientDecoder(charsetName)

        // Language-Tag from https://tools.ietf.org/html/rfc5646#section-2.1
        // It is only skipped; nobody checks that it is 100% correct
        while (true) {
            if (eof()) {
                // The closing "'" can no longer follow
                position = rollback
                return null
            }
            val c = peek()
            if (!isAlphaNumeric(c) && c != '-') {
                break
            }
            position++
        }
        if (parseLiteral("'", false) == null) {
            position = rollback
            return null
        }

        // value-chars
        val result = StringBuilder()
        // Every encoded octet takes three input characters, which bounds the number of octets
        val pendingOctets = ByteBuffer.allocate((content.length - position) / 3)
        val decodedChars = CharBuffer.allocate(pendingOctets.capacity() * 2) // Most pessimistic estimate
        while (!eof()) {
            val c = peek()
            if (isAlphaNumeric(c) || c in ATTR_CHAR_EXTRA) {
                // A literal character ends the current run of encoded octets: decode them first
                if (pendingOctets.position() > 0) {
                    pumpDecoder(pendingOctets, decodedChars, decoder, result)
                }
                result.append(c)
                position++
            } else if (c == '%') {
                position++
                val high = parseHEXDIG()
                val low = parseHEXDIG()
                val octet = (high shl 4) or low
                if (octet !in 0..0xFF) {
                    // 2 HEXDIG expected (-1 in either digit makes the combined value negative)
                    position = rollback
                    return null
                }
                pendingOctets.put(octet.toByte())
            } else {
                // Not a value-char: end of the value
                break
            }
        }
        if (pendingOctets.position() > 0) {
            pumpDecoder(pendingOctets, decodedChars, decoder, result)
        }
        return result.toString()
    }

    /**
     * Creates a decoder for [charsetName] that substitutes `?` for malformed or unmappable input.
     * Unknown or syntactically invalid charset names fall back to US-ASCII.
     */
    private fun newLenientDecoder(charsetName: String): CharsetDecoder {
        val charset = when {
            charsetName.equals("UTF-8", ignoreCase = true) -> Charsets.UTF_8
            charsetName.equals("ISO-8859-1", ignoreCase = true) -> Charsets.ISO_8859_1
            else -> try {
                // Technically other charsets are not allowed, but they are accepted anyway
                Charset.forName(charsetName)
            } catch (e: IllegalArgumentException) {
                // Illegal or unsupported charset name (both documented exceptions are
                // IllegalArgumentExceptions): decode as ASCII and hope for the best
                Charsets.US_ASCII
            }
        }
        return charset.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .replaceWith("?")
    }

    /**
     * Decodes the octets collected in [bytes] as one complete input and appends the resulting
     * characters to [out]. [bytes] is always left cleared, ready to collect the next run.
     *
     * If decoding throws, the failure is logged and up to three `?` are appended instead.
     *
     * @param bytes buffer in write mode holding the octets to decode
     * @param chars scratch buffer for the decoded characters; its previous content is discarded
     */
    private fun pumpDecoder(bytes: ByteBuffer, chars: CharBuffer, decoder: CharsetDecoder, out: java.lang.StringBuilder) {
        try {
            bytes.flip()
            chars.clear()
            val byteCount = bytes.limit()
            try {
                // Every run is a separate decoding operation: reset, decode as final input, flush
                decoder.reset()
                decoder.decode(bytes, chars, true)
                decoder.flush(chars)
                chars.flip()
            } catch (e: Exception) {
                LOG.error("Decoding {} bytes to {} failed", byteCount, decoder.charset(), e)
                repeat(min(byteCount, 3)) {
                    out.append('?')
                }
                return
            }
            while (chars.hasRemaining()) {
                out.append(chars.get())
            }
        } finally {
            bytes.clear()
        }
    }

    /**
     * RFC2045 parameter: `attribute "=" value`.
     * Also folds the RFC2183 disposition-parm subrules into this one rule. When the attribute
     * name ends with `*`, the value is first tried as an RFC5987 ext-value and only then as a
     * plain value.
     *
     * @return the attribute name (as written) paired with its parsed value, or `null`
     */
    fun parseParameter(): Pair<String, String>? {
        val rollback = position
        val attribute = parseToken() ?: return null
        val value = when {
            parseLiteral("=", true) == null -> null
            attribute.endsWith('*') -> parseExtendedValue() ?: parseValue()
            else -> parseValue()
        }
        if (value == null) {
            position = rollback
            return null
        }
        return attribute to value
    }

    /** `true` for ASCII letters and digits. */
    private fun isAlphaNumeric(c: Char): Boolean = c in 'a'..'z' || c in 'A'..'Z' || c in '0'..'9'

    /** `true` for characters allowed in an RFC2045 token. */
    private fun isTokenChar(c: Char): Boolean = c > ' ' && c < DEL && c !in TSPECIALS

    private companion object {
        /** ASCII DEL; this and everything above it is rejected in tokens and quoted strings. */
        const val DEL = '\u007F'

        /** RFC2045 tspecials */
        const val TSPECIALS = "()<>@,;:\\\"/[]?="

        /** RFC5987 mime-charsetc characters other than letters and digits */
        const val MIME_CHARSET_EXTRA = "!#\$%&+-^_`{}~"

        /** RFC5987 attr-char characters other than letters and digits */
        const val ATTR_CHAR_EXTRA = "!#\$&+-.^_`|~"

        /** Disposition types named by RFC2183, in their canonical spelling. */
        val KNOWN_DISPOSITION_TYPES = listOf("inline", "attachment")
    }
}
/**
 * Parse filename from the Content-Disposition header content.
 * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
 *
 * The first `filename*` parameter wins and is returned as soon as it is seen. Otherwise the
 * last plain `filename` parameter is used. Parameter names are matched case-insensitively.
 *
 * Parsing stops at the first parameter that cannot be parsed (for example after a trailing `;`).
 * A `filename` found before that point is still returned.
 *
 * @param contentDispositionContent the header value, without the header name; may be `null`
 * @return the filename, or `null` when the input is `null`, has no disposition type, or no
 *         filename parameter was found
 */
fun parseContentDispositionFilename(contentDispositionContent:CharSequence?):String? {
    val parser = Parser(contentDispositionContent ?: return null)
    parser.parseDispositionType() ?: return null

    var filename:String? = null
    while (parser.parseLiteral(";", true) != null) {
        // A malformed or missing parameter ends the scan but must not discard what was already found
        val (key, value) = parser.parseParameter() ?: break
        if (key.equals("filename*", true)) {
            return value
        } else if (key.equals("filename", true)) {
            filename = value
        }
    }
    return filename
}
import org.junit.Assert
import org.junit.Test
/** Checks [parseContentDispositionFilename] against a table of sample Content-Disposition values. */
class HeaderParserTest {

    @Test
    fun testParseContentDispositionFilename() {
        // Expected filename (null when none should be found) paired with the header value
        val cases: List<Pair<String?, String>> = listOf(
            null to "form-data; name=\"fieldName\"",
            "filename.jpg" to "form-data; name=\"fieldName\"; filename=\"filename.jpg\"",
            "cool.html" to " attachment; filename=\"cool.html\"",
            "example.txt" to "form-data; name=\"field2\"; filename=\"example.txt\"",
            null to "bar; title=Economy",
            "Economy" to "bar; filename=Economy",
            "US-$ rates" to "bar; filename=\"US-\$ rates\"",
            "£ rates" to "bar; filename*=iso-8859-1'en'%A3%20rates",
            "£ and € rates" to "bar; filename*=UTF-8''%c2%a3%20and%20%e2%82%ac%20rates",
            "€ exchange rates" to "bar; filename=\"EURO exchange rates\"; filename*=utf-8''%e2%82%ac%20exchange%20rates",
            "EURO exchange rates" to "bar; filename=\"EURO exchange rates\"; filenameNOT*=utf-8''%e2%82%ac%20exchange%20rates"
        )

        for ((expected, header) in cases) {
            Assert.assertEquals(expected, parseContentDispositionFilename(header))
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment