Created
May 6, 2018 17:36
-
-
Save rossj/6e31df6933636c2c7fb1bcc6ac97c72e to your computer and use it in GitHub Desktop.
Some Node utility functions for aggregating byte information about Buffers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * The two named states of the UTF-8 validation state machine.
 *
 * The machine can also be in other numeric states while a multi-byte
 * sequence is still being read; only these two are referenced by name.
 */
const enum UTF8_STATUS {
  /** Start state; also the state reached after each complete, well-formed sequence. */
  ACCEPT = 0,
  /** The input contained a malformed sequence. */
  REJECT = 1
}
/** Aggregate facts about the bytes of one or more buffers. */
export interface MyBufAggs {
  /** True when no byte equals 0x00. */
  allNonNull: boolean;
  /** True when every byte lies in 0x20..0x7E (no NUL, control bytes, DEL, or bytes >= 0x80). */
  allAsciiChar: boolean;
  /** True when no byte is >= 0x80, i.e. the high bit is never set. */
  all7Bit: boolean;
  /** True when the bytes form well-formed UTF-8 with no sequence left incomplete at the end. */
  validUtf8: boolean;
}
// From https://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
//
// Lookup table for a table-driven UTF-8 validator, laid out in two parts:
//   * entries 0..255   map a byte value to its character class (0..11);
//   * entries 256..399 are nine rows of 16 entries, one row per state, giving
//     the next state for (state, class). Index with 256 + state * 16 + class.
// State 0 is "accept" and state 1 is "reject"; the row for state 1 contains
// only 1s, so the machine never leaves "reject" once it gets there.
const utf8d = Buffer.from([
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
  0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
  0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
  0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
  1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
  1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
  1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s7..s8
]);
/**
 * Scans a buffer once and reports byte-level facts about its contents.
 *
 * @param b - Bytes to inspect. An empty buffer yields `true` for every flag.
 * @returns Flags telling whether the buffer is free of NUL bytes, consists
 *   solely of bytes in 0x20..0x7E, stays within 7 bits, and is well-formed
 *   UTF-8 (a multi-byte sequence cut off at the end counts as invalid).
 */
export function checkStringBuf(b: Buffer): MyBufAggs {
  // The validator passes through intermediate states beyond the two named
  // enum members, so the state is held as a plain number.
  let utf8State: number = UTF8_STATUS.ACCEPT;
  let allNonNull = true;
  let allAsciiChar = true;
  let all7Bit = true;

  const len = b.length;
  for (let i = 0; i < len; ++i) {
    const byte = b[i];

    // Once the input is known to be malformed there is nothing more to learn
    // from stepping the validator, so skip the table lookups.
    if (utf8State !== UTF8_STATUS.REJECT) {
      // We don't care about the codepoint, so this is
      // a simplified version of the decode function.
      const type = utf8d[byte];
      // Transition rows start at offset 256 and are 16 entries wide.
      utf8State = utf8d[256 + utf8State * 16 + type];
    }

    if (byte === 0) {
      allNonNull = false;
      allAsciiChar = false;
    } else if (byte >= 128) {
      allAsciiChar = false;
      all7Bit = false;
    } else if (byte < 32 || byte >= 127) {
      // Remaining control bytes (0x01..0x1F) and DEL (0x7F).
      allAsciiChar = false;
    }
  }

  return {
    allNonNull,
    allAsciiChar,
    all7Bit,
    // Ending in any state other than ACCEPT means the input was malformed or
    // stopped in the middle of a multi-byte sequence.
    validUtf8: utf8State === UTF8_STATUS.ACCEPT
  };
}
/**
 * Runs `checkStringBuf` on every buffer and combines the results with a
 * logical AND, so a flag is `true` only when it holds for all buffers.
 *
 * Each buffer is checked on its own: a multi-byte UTF-8 sequence that is
 * split across two consecutive buffers makes `validUtf8` false.
 *
 * @param bufs - Buffers to inspect; the array is not modified.
 * @returns The combined flags. An empty array yields `true` for every flag.
 */
export function checkStringBufs(bufs: readonly Buffer[]): MyBufAggs {
  const aggs: MyBufAggs = {
    allNonNull: true,
    allAsciiChar: true,
    all7Bit: true,
    validUtf8: true
  };

  for (const buf of bufs) {
    const current = checkStringBuf(buf);
    aggs.allNonNull = aggs.allNonNull && current.allNonNull;
    aggs.allAsciiChar = aggs.allAsciiChar && current.allAsciiChar;
    aggs.all7Bit = aggs.all7Bit && current.all7Bit;
    aggs.validUtf8 = aggs.validUtf8 && current.validUtf8;
  }

  return aggs;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment