Skip to content

Instantly share code, notes, and snippets.

@PatrickPijnappel
Created March 20, 2016 03:37
Show Gist options
  • Save PatrickPijnappel/3241bba66acab9c8913f to your computer and use it in GitHub Desktop.
Save PatrickPijnappel/3241bba66acab9c8913f to your computer and use it in GitHub Desktop.
UTF8 Comparison
import Foundation
/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
public struct OldUTF8 {
// See Unicode 8.0.0, Ch 3.9, UTF-8.
// http://www.unicode.org/versions/Unicode8.0.0/ch03.pdf
/// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
/// encoding.
public typealias CodeUnit = UInt8
public init() {}
/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at MSB,
/// and bytes are read at LSB. Note that we need to use a buffer, because
/// in case of invalid subsequences we sometimes don't know whether we should
/// consume a certain byte before looking at it.
var _decodeBuffer: UInt32 = 0
/// The amount of bits in `_decodeBuffer` that are current filled.
var _bitsInBuffer: UInt8 = 0
/// Whether we have exhausted the end of the generator. Note that this
/// doesn't mean we are done decoding, as the buffer might have bytes left.
var _atEnd: Bool = false
/// Start or continue decoding a UTF sequence.
///
/// In order to decode a code unit sequence completely, this function should
/// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
/// Checking that the generator was exhausted is not sufficient. The decoder
/// can have an internal buffer that is pre-filled with data from the input
/// generator.
///
/// Because of buffering, it is impossible to find the corresponding position
/// in the generator for a given returned `UnicodeScalar` or an error.
///
/// - parameter next: A generator of code units to be decoded. Repeated
/// calls to this method on the same instance should always reuse the same
/// generator. Failing to do so will result in undefined behavior.
public mutating func decode<
G : GeneratorType where G.Element == CodeUnit
>(inout next: G) -> UnicodeDecodingResult {
// FIXME: An UTF8 instance must always reuse the same generator for
// decoding. It would be better to have the generator passed at init time.
refillBuffer: if !_atEnd {
// Bufferless ASCII fastpath.
if _fastPath(_bitsInBuffer == 0) {
if let codeUnit = next.next() {
if codeUnit & 0x80 == 0 {
return .Result(UnicodeScalar(UInt32(codeUnit)))
} else { // Non-ASCII, switch to buffering mode.
_decodeBuffer = UInt32(codeUnit)
_bitsInBuffer = 8
}
} else {
_atEnd = true
return .EmptyInput
}
} else if(_decodeBuffer & 0x80 == 0) {
// ASCII in buffer. Don't refill buffer, so we might get back to
// the faster bufferless mode.
break refillBuffer
}
// Buffering mode.
// Fill buffer back to 4 bytes (or as many as left in the generator).
_sanityCheck(_bitsInBuffer < 32)
repeat {
if let codeUnit = next.next() {
// We use & 0x1f to make the compiler omit a bounds check branch.
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
_bitsInBuffer = _bitsInBuffer &+ 8
} else { // Reached end.
_atEnd = true
if _bitsInBuffer == 0 { return .EmptyInput }
break // We still have some bytes left in our buffer.
}
} while _bitsInBuffer < 32
} else if _bitsInBuffer == 0 {
return .EmptyInput
}
// Decode one unicode scalar.
// Note our empty bytes are always 0x00, which is required for this call.
let (result, length) = NewUTF8._decodeOne(_decodeBuffer)
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
let bitsConsumed = 8 &* length
_sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer)
// Swift doesn't allow shifts greater than or equal to the # of bits.
// _decodeBuffer >>= UInt32(bitsConsumed) & 0x1f
_decodeBuffer = UInt32(truncatingBitPattern:
UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x1f))
_bitsInBuffer = _bitsInBuffer &- bitsConsumed
if _fastPath(result != nil) {
return .Result(UnicodeScalar(result!))
} else {
return .Error // Ill-formed UTF-8 code unit sequence.
}
}
/// Attempts to decode a single UTF-8 code unit sequence starting at the LSB
/// of `buffer`.
///
/// - Returns:
/// - result: The decoded code point if the code unit sequence is
/// well-formed; `nil` otherwise.
/// - length: The length of the code unit sequence in bytes if it is
/// well-formed; otherwise the *maximal subpart of the ill-formed
/// sequence* (Unicode 8.0.0, Ch 3.9, D93b), i.e. the number of leading
/// code units that were valid or 1 in case none were valid. Unicode
/// recommends to skip these bytes bytes and replace them by a single
/// replacement character (U+FFFD).
///
/// - Requires: There is at least one used byte in `buffer`, and the unused
/// space in `buffer` is filled with some value not matching the UTF-8
/// continuation byte form (`0b10xxxxxx`).
@warn_unused_result
public // @testable
static func _decodeOne(buffer: UInt32) -> (result: UInt32?, length: UInt8) {
if buffer & 0x80 == 0 { // 1-byte sequence (ASCII), [ XXX XXX XXX CU0 ].
let value = buffer & 0xff
return (value, 1)
}
// Determine sequence length using high 5 bits of 1st byte. We use a
// look-up table to branch less. 1-byte sequences are handled above.
//
// case | pattern | description
// ----------------------------
// 00 | 110xx | 2-byte sequence
// 01 | 1110x | 3-byte sequence
// 10 | 11110 | 4-byte sequence
// 11 | other | invalid
//
// 11xxx 10xxx 01xxx 00xxx
let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111
let index = (buffer >> 3) & 0x1f
let bit0 = (lut0 >> index) & 1
let bit1 = (lut1 >> index) & 1
switch (bit1, bit0) {
case (0, 0): // 2-byte sequence, [ XXX XXX CU1 CU0 ].
// Require 10xx xxxx 110x xxxx.
if _slowPath(buffer & 0xc0e0 != 0x80c0) { return (nil, 1) }
// Disallow xxxx xxxx xxx0 000x (<= 7 bits case).
if _slowPath(buffer & 0x001e == 0x0000) { return (nil, 1) }
// Extract data bits.
let value =
(buffer & 0x3f00) >> 8
| (buffer & 0x001f) << 6
return (value, 2)
case (0, 1): // 3-byte sequence, [ XXX CU2 CU1 CU0 ].
// Disallow xxxx xxxx xx0x xxxx xxxx 0000 (<= 11 bits case).
if _slowPath(buffer & 0x00200f == 0x000000) { return (nil, 1) }
// Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
if _slowPath(buffer & 0x00200f == 0x00200d) { return (nil, 1) }
// Require 10xx xxxx 10xx xxxx 1110 xxxx.
if _slowPath(buffer & 0xc0c0f0 != 0x8080e0) {
if buffer & 0x00c000 != 0x008000 { return (nil, 1) }
return (nil, 2) // Only return this if all checks on byte 2 passed.
}
// Extract data bits.
let value =
(buffer & 0x3f0000) >> 16
| (buffer & 0x003f00) >> 2
| (buffer & 0x00000f) << 12
return (value, 3)
case (1, 0): // 4-byte sequence, [ CU3 CU2 CU1 CU0 ].
// Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (<= 16 bits case).
if _slowPath(buffer & 0x00003007 == 0x00000000) { return (nil, 1) }
// Case xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx.
if buffer & 0x00000004 == 0x00000004 {
// Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (<= 0x10FFFF).
if _slowPath(buffer & 0x00003003 != 0x00000000) { return (nil, 1) }
}
// Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
if _slowPath(buffer & 0xc0c0c0f8 != 0x808080f0) {
if buffer & 0x0000c000 != 0x00008000 { return (nil, 1) }
// We can return 2/3 here because all other checks on those CUs passed.
if buffer & 0x00c00000 != 0x00800000 { return (nil, 2) }
return (nil, 3)
}
// Extract data bits.
let value =
(buffer & 0x3f000000) >> 24
| (buffer & 0x003f0000) >> 10
| (buffer & 0x00003f00) << 4
| (buffer & 0x00000007) << 18
return (value, 4)
default: return (nil, 1) // Invalid sequence.
}
}
}
/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
public struct NewUTF8 {
// See Unicode 8.0.0, Ch 3.9, UTF-8.
// http://www.unicode.org/versions/Unicode8.0.0/ch03.pdf
/// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
/// encoding.
public typealias CodeUnit = UInt8
public init() {}
/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at MSB,
/// and bytes are read at LSB. Note that we need to use a buffer, because
/// in case of invalid subsequences we sometimes don't know whether we should
/// consume a certain byte before looking at it.
var _decodeBuffer: UInt32 = 0
/// The amount of bits in `_decodeBuffer` that are current filled.
var _bitsInBuffer: UInt8 = 0
/// Start or continue decoding a UTF sequence.
///
/// In order to decode a code unit sequence completely, this function should
/// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
/// Checking that the generator was exhausted is not sufficient. The decoder
/// can have an internal buffer that is pre-filled with data from the input
/// generator.
///
/// Because of buffering, it is impossible to find the corresponding position
/// in the generator for a given returned `UnicodeScalar` or an error.
///
/// - parameter next: A generator of code units to be decoded. Repeated
/// calls to this method on the same instance should always reuse the same
/// generator. Failing to do so will result in undefined behavior.
public mutating func decode<
G : GeneratorType where G.Element == CodeUnit
>(inout next: G) -> UnicodeDecodingResult {
// FIXME: An UTF8 instance must always reuse the same generator for
// decoding. It would be better to have the generator passed at init time.
refillBuffer: do {
// Bufferless ASCII fastpath.
if _fastPath(_bitsInBuffer == 0) {
if let codeUnit = next.next() {
if codeUnit & 0x80 == 0 {
return .Result(UnicodeScalar(UInt32(codeUnit)))
} else { // Non-ASCII, switch to buffering mode.
_decodeBuffer = UInt32(codeUnit)
_bitsInBuffer = 8
}
} else {
return .EmptyInput
}
} else if(_decodeBuffer & 0x80 == 0) {
// ASCII in buffer. Don't refill buffer, so we might get back to
// the faster bufferless mode.
break refillBuffer
}
// Buffering mode.
// Fill buffer back to 4 bytes (or as many as left in the generator).
_sanityCheck(_bitsInBuffer < 32)
repeat {
if let codeUnit = next.next() {
// We use & 0x1f to make the compiler omit a bounds check branch.
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
_bitsInBuffer = _bitsInBuffer &+ 8
} else { // Reached end.
if _bitsInBuffer == 0 { return .EmptyInput }
break // We still have some bytes left in our buffer.
}
} while _bitsInBuffer < 32
}
// Decode one unicode scalar.
// Note our empty bytes are always 0x00, which is required for this call.
let (result, length) = OldUTF8._decodeOne(_decodeBuffer)
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
let bitsConsumed = 8 &* length
_sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer)
// Swift doesn't allow shifts greater than or equal to the # of bits.
// _decodeBuffer >>= UInt32(bitsConsumed) & 0x1f
_decodeBuffer = UInt32(truncatingBitPattern:
UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x1f))
_bitsInBuffer = _bitsInBuffer &- bitsConsumed
if _fastPath(result != nil) {
return .Result(UnicodeScalar(result!))
} else {
return .Error // Ill-formed UTF-8 code unit sequence.
}
}
/// Attempts to decode a single UTF-8 code unit sequence starting at the LSB
/// of `buffer`.
///
/// - Returns:
/// - result: The decoded code point if the code unit sequence is
/// well-formed; `nil` otherwise.
/// - length: The length of the code unit sequence in bytes if it is
/// well-formed; otherwise the *maximal subpart of the ill-formed
/// sequence* (Unicode 8.0.0, Ch 3.9, D93b), i.e. the number of leading
/// code units that were valid or 1 in case none were valid. Unicode
/// recommends to skip these bytes bytes and replace them by a single
/// replacement character (U+FFFD).
///
/// - Requires: There is at least one used byte in `buffer`, and the unused
/// space in `buffer` is filled with some value not matching the UTF-8
/// continuation byte form (`0b10xxxxxx`).
@warn_unused_result
public // @testable
static func _decodeOne(buffer: UInt32) -> (result: UInt32?, length: UInt8) {
if buffer & 0x80 == 0 { // 1-byte sequence (ASCII), [ XXX XXX XXX CU0 ].
let value = buffer & 0xff
return (value, 1)
}
// Determine sequence length using high 5 bits of 1st byte. We use a
// look-up table to branch less. 1-byte sequences are handled above.
//
// case | pattern | description
// ----------------------------
// 00 | 110xx | 2-byte sequence
// 01 | 1110x | 3-byte sequence
// 10 | 11110 | 4-byte sequence
// 11 | other | invalid
//
// 11xxx 10xxx 01xxx 00xxx
let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111
let index = (buffer >> 3) & 0x1f
let bit0 = (lut0 >> index) & 1
let bit1 = (lut1 >> index) & 1
switch (bit1, bit0) {
case (0, 0): // 2-byte sequence, [ XXX XXX CU1 CU0 ].
// Require 10xx xxxx 110x xxxx.
if _slowPath(buffer & 0xc0e0 != 0x80c0) { return (nil, 1) }
// Disallow xxxx xxxx xxx0 000x (<= 7 bits case).
if _slowPath(buffer & 0x001e == 0x0000) { return (nil, 1) }
// Extract data bits.
let value =
(buffer & 0x3f00) >> 8
| (buffer & 0x001f) << 6
return (value, 2)
case (0, 1): // 3-byte sequence, [ XXX CU2 CU1 CU0 ].
// Disallow xxxx xxxx xx0x xxxx xxxx 0000 (<= 11 bits case).
if _slowPath(buffer & 0x00200f == 0x000000) { return (nil, 1) }
// Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
if _slowPath(buffer & 0x00200f == 0x00200d) { return (nil, 1) }
// Require 10xx xxxx 10xx xxxx 1110 xxxx.
if _slowPath(buffer & 0xc0c0f0 != 0x8080e0) {
if buffer & 0x00c000 != 0x008000 { return (nil, 1) }
return (nil, 2) // Only return this if all checks on byte 2 passed.
}
// Extract data bits.
let value =
(buffer & 0x3f0000) >> 16
| (buffer & 0x003f00) >> 2
| (buffer & 0x00000f) << 12
return (value, 3)
case (1, 0): // 4-byte sequence, [ CU3 CU2 CU1 CU0 ].
// Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (<= 16 bits case).
if _slowPath(buffer & 0x00003007 == 0x00000000) { return (nil, 1) }
// Case xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx.
if buffer & 0x00000004 == 0x00000004 {
// Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (<= 0x10FFFF).
if _slowPath(buffer & 0x00003003 != 0x00000000) { return (nil, 1) }
}
// Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
if _slowPath(buffer & 0xc0c0c0f8 != 0x808080f0) {
if buffer & 0x0000c000 != 0x00008000 { return (nil, 1) }
// We can return 2/3 here because all other checks on those CUs passed.
if buffer & 0x00c00000 != 0x00800000 { return (nil, 2) }
return (nil, 3)
}
// Extract data bits.
let value =
(buffer & 0x3f000000) >> 24
| (buffer & 0x003f0000) >> 10
| (buffer & 0x00003f00) << 4
| (buffer & 0x00000007) << 18
return (value, 4)
default: return (nil, 1) // Invalid sequence.
}
}
}
// MARK: - Test
let asciiData = "Swift is a multi-paradigm, compiled programming language created for iOS, OS X, watchOS, tvOS and Linux development by Apple Inc. Swift is designed to work with Apple's Cocoa and Cocoa Touch frameworks and the large body of existing Objective-C code written for Apple products. Swift is intended to be more resilient to erroneous code (\"safer\") than Objective-C and also more concise. It is built with the LLVM compiler framework included in Xcode 6 and later and uses the Objective-C runtime, which allows C, Objective-C, C++ and Swift code to run within a single program."
let data = Array(asciiData.utf8)
func testOldUTF8() -> NSTimeInterval {
let start = NSDate()
for _ in 0..<1_000_000 {
var utf8 = OldUTF8()
var generator = data.generate()
while !utf8.decode(&generator).isEmptyInput() { }
}
return NSDate().timeIntervalSinceDate(start)
}
func testNewUTF8() -> NSTimeInterval {
let start = NSDate()
for _ in 0..<1_000_000 {
var utf8 = NewUTF8()
var generator = data.generate()
while !utf8.decode(&generator).isEmptyInput() { }
}
return NSDate().timeIntervalSinceDate(start)
}
let oldTime = testOldUTF8()
print("OldUTF8: \(oldTime)")
let newTime = testNewUTF8()
print("NewUTF8: \(newTime)")
let factor = oldTime/newTime
print("Speed-up: \(factor)x")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment