peteroupc/utf7.js

## utf7.js
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
 */
/**
 * Collects UTF-16 data, supplied either as whole code units or as
 * individual bytes (high-order byte first), and pushes the resulting
 * text onto a caller-supplied array. Unpaired surrogates and dangling
 * bytes are reported with the replacement character U+FFFD.
 */
var CodeUnitAppender = function() {
  // High surrogate waiting for its low surrogate, or -1 when none is pending.
  this.surrogate = -1;
  // First byte of a code unit waiting for its second byte, or -1 when none.
  this.lastByte = -1;
};
(function(proto, ctor) {
  var SURROGATE_MASK = 0xfc00;
  var HIGH_SURROGATE = 0xd800;
  var LOW_SURROGATE = 0xdc00;

  // Text pushed in place of malformed input.
  ctor.replacement = "\ufffd";

  /**
   * Pushes one replacement character for each piece of pending state
   * (a waiting high surrogate and/or a waiting byte), then clears both.
   *
   * @param {Array} builder - Array that receives the output strings.
   */
  proto.finalizeAndReset = function(builder) {
    var surrogatePending = this.surrogate >= 0;
    var bytePending = this.lastByte >= 0;
    if (surrogatePending) {
      builder.push(ctor.replacement);
    }
    if (bytePending) {
      builder.push(ctor.replacement);
    }
    this.surrogate = -1;
    this.lastByte = -1;
  };

  /**
   * Records that a partial byte was seen. The stored value itself is
   * irrelevant; it only has to be non-negative so that finalizeAndReset
   * reports it.
   */
  proto.appendIncompleteByte = function() {
    this.lastByte = 0;
  };

  /**
   * Feeds one byte. The first byte of a pair is remembered; the second
   * is combined with it (first byte in the high-order position) and the
   * resulting code unit is passed to appendCodeUnit.
   *
   * @param {number} value - The byte to add.
   * @param {Array} builder - Array that receives the output strings.
   */
  proto.appendByte = function(value, builder) {
    if (this.lastByte >= 0) {
      var unit = (this.lastByte << 8) | (value & 0xff);
      this.appendCodeUnit(unit, builder);
      this.lastByte = -1;
      return;
    }
    this.lastByte = value;
  };

  /**
   * Feeds one UTF-16 code unit. Non-surrogates are pushed as is, a high
   * surrogate is held back until the next code unit arrives, and any
   * surrogate that cannot be paired is pushed as the replacement
   * character.
   *
   * @param {number} codeunit - The UTF-16 code unit to add.
   * @param {Array} builder - Array that receives the output strings.
   */
  proto.appendCodeUnit = function(codeunit, builder) {
    var kind = codeunit & SURROGATE_MASK;
    var isLow = kind === LOW_SURROGATE;
    var isHigh = kind === HIGH_SURROGATE;

    if (this.surrogate >= 0) {
      // A high surrogate is waiting for its partner.
      if (isLow) {
        // Pair completed: push both halves.
        builder.push(String.fromCharCode(this.surrogate));
        builder.push(String.fromCharCode(codeunit));
        this.surrogate = -1;
        return;
      }
      // The waiting high surrogate is unpaired either way.
      builder.push(ctor.replacement);
      if (isHigh) {
        // The new high surrogate now waits in its place.
        this.surrogate = codeunit;
      } else {
        // Ordinary code unit: push it unchanged.
        builder.push(String.fromCharCode(codeunit));
        this.surrogate = -1;
      }
      return;
    }

    if (isLow) {
      // Low surrogate with no preceding high surrogate.
      builder.push(ctor.replacement);
    } else if (isHigh) {
      // High surrogate: hold it until the next code unit is known.
      this.surrogate = codeunit;
    } else {
      builder.push(String.fromCharCode(codeunit));
    }
  };

  /**
   * Discards any pending surrogate or byte without producing output.
   */
  proto.reset = function() {
    this.surrogate = -1;
    this.lastByte = -1;
  };
})(CodeUnitAppender.prototype, CodeUnitAppender);

/**
 * Decoder that turns UTF-7 encoded bytes into JavaScript strings.
 * Malformed input never throws; it is replaced with U+FFFD.
 */
var Utf7 = function() {
};

/**
 * Base64 lookup table indexed by ASCII code (0-127). Each entry is the
 * 6-bit value of that character in the base64 alphabet (A-Z, a-z, 0-9,
 * '+', '/'), or -1 when the character is not a base64 digit.
 */
Utf7.Alphabet = (function() {
  var digits = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  var table = [];
  var i;
  for (i = 0; i < 128; i++) {
    table.push(-1);
  }
  for (i = 0; i < digits.length; i++) {
    table[digits.charCodeAt(i)] = i;
  }
  return table;
})();

/**
 * Decodes UTF-7 data and returns the decoded text.
 *
 * @param {string|Uint8Array|Array|ArrayBuffer} input - The encoded data.
 *   A string is treated as a sequence of bytes, one per UTF-16 code
 *   unit; code units above 0xFF are clamped to 0xFF and therefore
 *   decode as invalid bytes. Any other value is handed to readUtf7
 *   unchanged.
 * @returns {string} The decoded text.
 * @throws {Error} If input is null or undefined.
 */
Utf7.prototype.decode = function(input) {
  if (typeof input === "string") {
    var bytes = new Uint8Array(input.length);
    for (var i = 0; i < input.length; i++) {
      bytes[i] = Math.min(0xff, input.charCodeAt(i));
    }
    input = bytes;
  }
  var builder = [];
  this.readUtf7(input, builder);
  return builder.join("");
};

/**
 * Decodes UTF-7 bytes and pushes the decoded pieces of text onto
 * builder.
 *
 * @param {Uint8Array|Array|ArrayBuffer} input - Byte values (0-255) to
 *   decode. An ArrayBuffer is read through a Uint8Array view.
 * @param {Array} builder - Array that receives the output strings.
 * @throws {Error} With message "stream" if input is null or undefined,
 *   or "builder" if builder is null or undefined.
 */
Utf7.prototype.readUtf7 = function(input, builder) {
  if (input == null) {
    throw new Error("stream");
  }
  if (builder == null) {
    throw new Error("builder");
  }
  if (input instanceof ArrayBuffer) {
    input = new Uint8Array(input);
  }
  // Element indices are relative to the start of the view or array, so
  // reading starts at 0 no matter where a typed-array view begins inside
  // its backing buffer.
  var index = 0;
  var endIndex = input.length;
  var alphavalue = 0;
  var base64value = 0;
  var base64count = 0;
  var appender = new CodeUnitAppender();
  var state = 0; // 0: not in base64; 1: start of base64; 2: continuing base64

  // Outputs a byte that appears outside a base64 run: tab, LF, CR and
  // the bytes 0x20-0x7d other than backslash are pushed as themselves;
  // everything else is illegal in UTF-7 and becomes U+FFFD.
  var pushDirect = function(b) {
    var isWhitespace = b === 0x09 || b === 0x0a || b === 0x0d;
    if (!isWhitespace && (b === 0x5c || b >= 0x7e || b < 0x20)) {
      builder.push("\ufffd");
    } else {
      builder.push(String.fromCharCode(b));
    }
  };

  while (true) {
    // -1 marks the end of the input.
    var b = (index >= endIndex) ? -1 : input[index++];
    switch (state) {
    case 0: // not in base64
      if (b < 0) {
        return;
      }
      if (b === 0x2b) {
        // Plus sign: a base64 run (or the "+-" escape) begins.
        state = 1;
        base64value = 0;
        base64count = 0;
        appender.reset();
      } else {
        pushDirect(b);
      }
      break;
    case 1: // start of base64
      if (b < 0) {
        // Input ended right after the plus sign.
        builder.push("\ufffd");
        return;
      }
      if (b === 0x2d) {
        // "+-" stands for a literal plus sign.
        state = 0;
        builder.push("+");
        break;
      }
      alphavalue = (b >= 0x80) ? -1 : Utf7.Alphabet[b];
      if (alphavalue >= 0) {
        state = 2;
        base64value = (base64value << 6) | alphavalue;
        ++base64count;
      } else {
        // The plus sign was followed by neither '-' nor a base64 digit.
        state = 0;
        builder.push("\ufffd"); // for the illegal plus
        pushDirect(b);
      }
      break;
    case 2: // continuing base64
      alphavalue = (b < 0 || b >= 0x80) ? -1 : Utf7.Alphabet[b];
      if (alphavalue >= 0) {
        base64value = (base64value << 6) | alphavalue;
        ++base64count;
        if (base64count === 4) {
          // Four digits carry exactly three bytes of UTF-16 data.
          appender.appendByte((base64value >> 16) & 0xff, builder);
          appender.appendByte((base64value >> 8) & 0xff, builder);
          appender.appendByte(base64value & 0xff, builder);
          base64value = 0;
          base64count = 0;
        }
        break;
      }
      // The base64 run ends here; flush the digits of the last group.
      state = 0;
      if (base64count === 1) {
        // A single digit holds only 6 bits: an incomplete byte.
        appender.appendIncompleteByte();
      } else if (base64count === 2) {
        // 12 bits: one byte, and the 4 leftover bits must be zero.
        base64value <<= 12;
        appender.appendByte((base64value >> 16) & 0xff, builder);
        if ((base64value & 0xffff) !== 0) {
          appender.appendIncompleteByte();
        }
      } else if (base64count === 3) {
        // 18 bits: two bytes, and the 2 leftover bits must be zero.
        base64value <<= 6;
        appender.appendByte((base64value >> 16) & 0xff, builder);
        appender.appendByte((base64value >> 8) & 0xff, builder);
        if ((base64value & 0xff) !== 0) {
          appender.appendIncompleteByte();
        }
      }
      appender.finalizeAndReset(builder);
      if (b < 0) {
        return;
      }
      if (b !== 0x2d) {
        // A hyphen only terminates the run; any other byte is output.
        pushDirect(b);
      }
      break;
    default:
      throw new Error("Unexpected state");
    }
  }
};

## utf7test.js
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
 */
    /**
     * Decodes input with a fresh Utf7 instance and logs a "failure" line
     * to the console when the result differs from expected.
     *
     * @param {string} input - UTF-7 text to decode.
     * @param {string} expected - Text the decoder should produce.
     */
    function testUtf7One(input,expected) {
      var decoded = new Utf7().decode(input);
      if (decoded == expected) {
        return;
      }
      console.log("failure " + [input, expected]);
    }

    /**
     * Runs every UTF-7 decoding case through testUtf7One, in order.
     */
    function testUtf7() {
      // Each entry is [encoded input, expected decoded text].
      var cases = [
        ["\\", "\ufffd"],
        ["~", "\ufffd"],
        ["\u0001", "\ufffd"],
        ["\u007f", "\ufffd"],
        ["\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?", "\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?"],
        ["x+--", "x+-"],
        ["x+-y", "x+y"],
        // Illegal byte after plus
        ["+!", "\ufffd!"],
        ["+\n", "\ufffd\n"],
        ["+\u007f", "\ufffd\ufffd"],
        ["+", "\ufffd"],
        // Incomplete byte
        ["+D?", "\ufffd?"],
        ["+D\u007f", "\ufffd\ufffd"],
        ["+D", "\ufffd"],
        // Only one UTF-16 byte
        ["+DE?", "\ufffd?"],
        ["+DE", "\ufffd"],
        ["+DE\u007f", "\ufffd\ufffd"],
        // UTF-16 code unit
        ["+DEE?", "\u0c41?"],
        ["+DEE", "\u0c41"],
        ["+DEE\u007f", "\u0c41\ufffd"],
        // UTF-16 code unit (redundant pad bit)
        ["+DEF?", "\u0c41\ufffd?"],
        ["+DEF", "\u0c41\ufffd"],
        ["+DEF\u007f", "\u0c41\ufffd\ufffd"],
        // High surrogate code unit
        ["+2AA?", "\ufffd?"],
        ["+2AA", "\ufffd"],
        ["+2AA\u007f", "\ufffd\ufffd"],
        // Low surrogate code unit
        ["+3AA?", "\ufffd?"],
        ["+3AA", "\ufffd"],
        ["+3AA\u007f", "\ufffd\ufffd"],
        // Surrogate pair
        ["+2ADcAA?", "\ud800\udc00?"],
        ["+2ADcAA", "\ud800\udc00"],
        ["+2ADcAA\u007f", "\ud800\udc00\ufffd"],
        // High surrogate followed by surrogate pair
        ["+2ADYANwA?", "\ufffd\ud800\udc00?"],
        ["+2ADYANwA", "\ufffd\ud800\udc00"],
        ["+2ADYANwA\u007f", "\ufffd\ud800\udc00\ufffd"],
        // High surrogate followed by non-surrogate
        ["+2AAAwA?", "\ufffd\u00c0?"],
        ["+2AAAwA", "\ufffd\u00c0"],
        ["+2AAAwA\u007f", "\ufffd\u00c0\ufffd"],
        // Two UTF-16 code units
        ["+AMAA4A?", "\u00c0\u00e0?"],
        ["+AMAA4A", "\u00c0\u00e0"],
        ["+AMAA4A-Next", "\u00c0\u00e0Next"],
        ["+AMAA4A!Next", "\u00c0\u00e0!Next"],
        ["+AMAA4A\u007f", "\u00c0\u00e0\ufffd"],
        // Two UTF-16 code units (redundant pad bit)
        ["+AMAA4B?", "\u00c0\u00e0\ufffd?"],
        ["+AMAA4B", "\u00c0\u00e0\ufffd"],
        ["+AMAA4B-Next", "\u00c0\u00e0\ufffdNext"],
        ["+AMAA4B!Next", "\u00c0\u00e0\ufffd!Next"],
        ["+AMAA4B\u007f", "\u00c0\u00e0\ufffd\ufffd"]
      ];
      for (var k = 0; k < cases.length; k++) {
        testUtf7One(cases[k][0], cases[k][1]);
      }
    }
	/*
	Written by Peter O. in 2014.
	Any copyright is dedicated to the Public Domain.
	http://creativecommons.org/publicdomain/zero/1.0/
	If you like this, you should donate to Peter O.
	at: http://upokecenter.com/d/
	*/
	/**
	 * Collects UTF-16 data, supplied either as whole code units or as
	 * individual bytes (high-order byte first), and pushes the resulting
	 * text onto a caller-supplied array. Unpaired surrogates and dangling
	 * bytes are reported with the replacement character U+FFFD.
	 */
	var CodeUnitAppender = function() {
	  // High surrogate waiting for its low surrogate, or -1 when none is pending.
	  this.surrogate = -1;
	  // First byte of a code unit waiting for its second byte, or -1 when none.
	  this.lastByte = -1;
	};
	(function(prototype, constr) {
	  // Text pushed in place of malformed input.
	  constr.replacement = "\ufffd";

	  /**
	   * Pushes one replacement character for each piece of pending state
	   * (a waiting high surrogate and/or a waiting byte), then clears both.
	   *
	   * @param {Array} builder - Array that receives the output strings.
	   */
	  prototype.finalizeAndReset = function(builder) {
	    if (this.surrogate >= 0 && this.lastByte >= 0) {
	      // Unpaired surrogate and an unpaired byte value
	      builder.push(constr.replacement);
	      builder.push(constr.replacement);
	    } else if (this.surrogate >= 0 || this.lastByte >= 0) {
	      // Unpaired surrogate or byte value remains
	      builder.push(constr.replacement);
	    }
	    this.surrogate = -1;
	    this.lastByte = -1;
	  };

	  /**
	   * Records that a partial byte was seen. The stored value itself is
	   * irrelevant; it only has to be non-negative so that finalizeAndReset
	   * reports it.
	   */
	  prototype.appendIncompleteByte = function() {
	    this.lastByte = 0;
	  };

	  /**
	   * Feeds one byte. The first byte of a pair is remembered; the second
	   * is combined with it (first byte in the high-order position) and the
	   * resulting code unit is passed to appendCodeUnit.
	   *
	   * @param {number} value - The byte to add.
	   * @param {Array} builder - Array that receives the output strings.
	   */
	  prototype.appendByte = function(value, builder) {
	    if (this.lastByte >= 0) {
	      var codeunit = this.lastByte << 8;
	      codeunit |= value & 0xff;
	      this.appendCodeUnit(codeunit, builder);
	      this.lastByte = -1;
	    } else {
	      this.lastByte = value;
	    }
	  };

	  /**
	   * Feeds one UTF-16 code unit. Non-surrogates are pushed as is, a high
	   * surrogate is held back until the next code unit arrives, and any
	   * surrogate that cannot be paired is pushed as the replacement
	   * character.
	   *
	   * @param {number} codeunit - The UTF-16 code unit to add.
	   * @param {Array} builder - Array that receives the output strings.
	   */
	  prototype.appendCodeUnit = function(codeunit, builder) {
	    if (this.surrogate >= 0) {
	      // A high surrogate is pending, so "codeunit" must be a
	      // low surrogate to complete the pair
	      if ((codeunit & 0xfc00) == 0xdc00) {
	        // valid low surrogate
	        builder.push(String.fromCharCode(this.surrogate));
	        builder.push(String.fromCharCode(codeunit));
	        this.surrogate = -1;
	      } else if ((codeunit & 0xfc00) == 0xd800) {
	        // the pending high surrogate is unpaired; the new one
	        // takes its place
	        builder.push(constr.replacement);
	        this.surrogate = codeunit;
	      } else {
	        // not a surrogate, output the first as U+FFFD
	        // and the second as is
	        builder.push(constr.replacement);
	        builder.push(String.fromCharCode(codeunit));
	        this.surrogate = -1;
	      }
	    } else {
	      if ((codeunit & 0xfc00) == 0xdc00) {
	        // unpaired low surrogate
	        builder.push(constr.replacement);
	      } else if ((codeunit & 0xfc00) == 0xd800) {
	        // high surrogate, hold it until the next code unit is known
	        this.surrogate = codeunit;
	      } else {
	        // not a surrogate
	        builder.push(String.fromCharCode(codeunit));
	      }
	    }
	  };

	  /**
	   * Discards any pending surrogate or byte without producing output.
	   */
	  prototype.reset = function() {
	    this.surrogate = -1;
	    this.lastByte = -1;
	  };
	})(CodeUnitAppender.prototype, CodeUnitAppender);

	/**
	 * Decoder that turns UTF-7 encoded bytes into JavaScript strings.
	 * Malformed input never throws; it is replaced with U+FFFD.
	 */
	var Utf7 = function() {
	};

	/**
	 * Base64 lookup table indexed by ASCII code (0-127). Each entry is the
	 * 6-bit value of that character in the base64 alphabet (A-Z, a-z, 0-9,
	 * '+', '/'), or -1 when the character is not a base64 digit.
	 */
	Utf7.Alphabet = (function() {
	  var digits = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	  var table = [];
	  var i;
	  for (i = 0; i < 128; i++) {
	    table.push(-1);
	  }
	  for (i = 0; i < digits.length; i++) {
	    table[digits.charCodeAt(i)] = i;
	  }
	  return table;
	})();

	/**
	 * Decodes UTF-7 data and returns the decoded text.
	 *
	 * @param {string|Uint8Array|Array|ArrayBuffer} input - The encoded data.
	 *   A string is treated as a sequence of bytes, one per UTF-16 code
	 *   unit; code units above 0xFF are clamped to 0xFF and therefore
	 *   decode as invalid bytes. Any other value is handed to readUtf7
	 *   unchanged.
	 * @returns {string} The decoded text.
	 * @throws {Error} If input is null or undefined.
	 */
	Utf7.prototype.decode = function(input) {
	  if (typeof input === "string") {
	    var bytes = new Uint8Array(input.length);
	    for (var i = 0; i < input.length; i++) {
	      bytes[i] = Math.min(0xff, input.charCodeAt(i));
	    }
	    input = bytes;
	  }
	  var builder = [];
	  this.readUtf7(input, builder);
	  return builder.join("");
	};

	/**
	 * Decodes UTF-7 bytes and pushes the decoded pieces of text onto
	 * builder.
	 *
	 * @param {Uint8Array|Array|ArrayBuffer} input - Byte values (0-255) to
	 *   decode. An ArrayBuffer is read through a Uint8Array view.
	 * @param {Array} builder - Array that receives the output strings.
	 * @throws {Error} With message "stream" if input is null or undefined,
	 *   or "builder" if builder is null or undefined.
	 */
	Utf7.prototype.readUtf7 = function(input, builder) {
	  if (input == null) {
	    throw new Error("stream");
	  }
	  if (builder == null) {
	    throw new Error("builder");
	  }
	  if (input instanceof ArrayBuffer) {
	    input = new Uint8Array(input);
	  }
	  // Element indices are relative to the start of the view or array, so
	  // reading starts at 0 no matter where a typed-array view begins inside
	  // its backing buffer.
	  var index = 0;
	  var endIndex = input.length;
	  var alphavalue = 0;
	  var base64value = 0;
	  var base64count = 0;
	  var appender = new CodeUnitAppender();
	  var state = 0; // 0: not in base64; 1: start of base64; 2: continuing base64

	  // Outputs a byte that appears outside a base64 run: tab, LF, CR and
	  // the bytes 0x20-0x7d other than backslash are pushed as themselves;
	  // everything else is illegal in UTF-7 and becomes U+FFFD.
	  var pushDirect = function(b) {
	    var isWhitespace = b === 0x09 || b === 0x0a || b === 0x0d;
	    if (!isWhitespace && (b === 0x5c || b >= 0x7e || b < 0x20)) {
	      builder.push("\ufffd");
	    } else {
	      builder.push(String.fromCharCode(b));
	    }
	  };

	  while (true) {
	    // -1 marks the end of the input.
	    var b = (index >= endIndex) ? -1 : input[index++];
	    switch (state) {
	    case 0: // not in base64
	      if (b < 0) {
	        return;
	      }
	      if (b === 0x2b) {
	        // Plus sign: a base64 run (or the "+-" escape) begins.
	        state = 1;
	        base64value = 0;
	        base64count = 0;
	        appender.reset();
	      } else {
	        pushDirect(b);
	      }
	      break;
	    case 1: // start of base64
	      if (b < 0) {
	        // Input ended right after the plus sign.
	        builder.push("\ufffd");
	        return;
	      }
	      if (b === 0x2d) {
	        // "+-" stands for a literal plus sign.
	        state = 0;
	        builder.push("+");
	        break;
	      }
	      alphavalue = (b >= 0x80) ? -1 : Utf7.Alphabet[b];
	      if (alphavalue >= 0) {
	        state = 2;
	        base64value = (base64value << 6) | alphavalue;
	        ++base64count;
	      } else {
	        // The plus sign was followed by neither '-' nor a base64 digit.
	        state = 0;
	        builder.push("\ufffd"); // for the illegal plus
	        pushDirect(b);
	      }
	      break;
	    case 2: // continuing base64
	      alphavalue = (b < 0 || b >= 0x80) ? -1 : Utf7.Alphabet[b];
	      if (alphavalue >= 0) {
	        base64value = (base64value << 6) | alphavalue;
	        ++base64count;
	        if (base64count === 4) {
	          // Four digits carry exactly three bytes of UTF-16 data.
	          appender.appendByte((base64value >> 16) & 0xff, builder);
	          appender.appendByte((base64value >> 8) & 0xff, builder);
	          appender.appendByte(base64value & 0xff, builder);
	          base64value = 0;
	          base64count = 0;
	        }
	        break;
	      }
	      // The base64 run ends here; flush the digits of the last group.
	      state = 0;
	      if (base64count === 1) {
	        // A single digit holds only 6 bits: an incomplete byte.
	        appender.appendIncompleteByte();
	      } else if (base64count === 2) {
	        // 12 bits: one byte, and the 4 leftover bits must be zero.
	        base64value <<= 12;
	        appender.appendByte((base64value >> 16) & 0xff, builder);
	        if ((base64value & 0xffff) !== 0) {
	          appender.appendIncompleteByte();
	        }
	      } else if (base64count === 3) {
	        // 18 bits: two bytes, and the 2 leftover bits must be zero.
	        base64value <<= 6;
	        appender.appendByte((base64value >> 16) & 0xff, builder);
	        appender.appendByte((base64value >> 8) & 0xff, builder);
	        if ((base64value & 0xff) !== 0) {
	          appender.appendIncompleteByte();
	        }
	      }
	      appender.finalizeAndReset(builder);
	      if (b < 0) {
	        return;
	      }
	      if (b !== 0x2d) {
	        // A hyphen only terminates the run; any other byte is output.
	        pushDirect(b);
	      }
	      break;
	    default:
	      throw new Error("Unexpected state");
	    }
	  }
	};