Skip to content

Instantly share code, notes, and snippets.

@hayes
Last active December 19, 2015 21:49
Show Gist options
  • Save hayes/6022839 to your computer and use it in GitHub Desktop.
Save hayes/6022839 to your computer and use it in GitHub Desktop.
convert a string to a byte array
/**
 * Convert a string into an array of UTF-8 byte values.
 *
 * Every index of `str` is handed to fixedCharCodeAt(). An index for which it
 * returns `false` (which it does for low-surrogate code units) adds nothing
 * to the output. Values above 0x7F are written as a two-, three- or four-byte
 * sequence; any other value is appended unchanged as a single element.
 *
 * Errors thrown by fixedCharCodeAt() propagate to the caller.
 *
 * @param {string} str - text to encode
 * @returns {number[]} the encoded bytes, in order
 */
function to_bytes(str) {
  // leading-byte markers indexed by the number of continuation bytes:
  // 110xxxxx, 1110xxxx and 11110xxx
  var lead_markers = [0x00, 0xC0, 0xE0, 0xF0]
  var out = []
  var total = str.length
  var index
  var point
  var trailing
  var shift

  for(index = 0; index < total; ++index) {
    point = fixedCharCodeAt(str, index)

    // nothing to emit for this index; its value was produced together with
    // the preceding high surrogate
    if(point === false) {
      continue
    }

    // how many 10xxxxxx continuation bytes follow the leading byte:
    // up to 11 bits fit in two bytes, up to 16 in three, up to 21 in four
    trailing = point > 0xFFFF ? 3 : point > 0x7FF ? 2 : point > 0x7F ? 1 : 0

    if(trailing === 0) {
      out.push(point)
      continue
    }

    // leading byte: length marker plus the highest-order payload bits
    out.push(lead_markers[trailing] | (point >> (6 * trailing)))

    // continuation bytes: six payload bits each, most significant group first
    for(shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
      out.push(0x80 | ((point >> shift) & 0x3F))
    }
  }

  return out
}
// from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt#Example_2.3A_Fixing_charCodeAt_to_handle_non-Basic-Multilingual-Plane_characters_if_their_presence_earlier_in_the_string_is_unknown
/**
 * A version of charCodeAt that also handles non-Basic-Multilingual-Plane
 * characters.
 *
 * @param {string} str - string to read from
 * @param {number} [idx=0] - index of the code unit to inspect; any falsy
 *   value is treated as 0
 * @returns {number|boolean} the full code point when `idx` is at a high
 *   surrogate that is followed by a low surrogate; `false` when `idx` is at a
 *   low surrogate (so loops can skip it); otherwise whatever
 *   `str.charCodeAt(idx)` returns (NaN when `idx` is out of range)
 * @throws {Error} when `idx` is at a high surrogate that is not immediately
 *   followed by a low surrogate (including at the end of the string)
 */
function fixedCharCodeAt(str, idx) {
  var index = idx || 0
  var code = str.charCodeAt(index)
  var low

  // high surrogate: combine it with the low surrogate that must follow
  if(0xD800 <= code && code <= 0xDBFF) {
    low = str.charCodeAt(index + 1)
    // Checking the range (not just isNaN) rejects a high surrogate followed
    // by an ordinary character, which would otherwise yield a bogus code
    // point. NaN (end of string) fails both comparisons and is rejected too.
    if(!(0xDC00 <= low && low <= 0xDFFF)) {
      throw new Error('High surrogate not followed by low surrogate in fixedCharCodeAt()')
    }
    return ((code - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
  }

  // low surrogate: return false so loops can skip this index
  if(0xDC00 <= code && code <= 0xDFFF) {
    return false
  }

  return code
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment