Last active November 29, 2015 03:07
Testing Swift's ability to optimize code using simple custom Float4 and Float4x4 value types, compared with their SIMD counterparts. The subscript implementation turns out to make a big difference for F4x4 but not for F4 ...
// This program is possibly demonstrating an issue/opportunity for improvement
// of the optimizer.
// (Tested with Xcode 7.2 beta 4, OS X 10.10.5, MacBook Pro late 2013)
// (Compiled (-O -gnone) as command line app)
// There is a test at the end which measures the time it takes to do some
// matrix (and vector) type operations.
// It does this using SIMD float4x4 vs my F4x4 (and float4 vs my F4).
// F4x4 is identical in structure to F4, and it has four F4 as elements.
// The program defines two ways of implementing the subscripts, Subs1 and Subs2.
// So F4x4 and F4 can be set to use one of them by conforming to Subs1 or Subs2.
// But here is the thing:
// Using Subs1 or Subs2 for F4 doesn't affect the timings of the test, BUT:
// Using Subs1 or Subs2 for F4x4 will produce ~ 8 times faster or slower code.
// (The + operator for F4x4 uses F4 and its + operator and subscript.)
// So how come Subs1/Subs2 makes a difference for F4x4 and not for F4?
// (And also, using F4x4:Subs2 with safety checks disabled makes it even slower.
// Why should disabling safety checks make it slower?)
import Cocoa
import simd
// NOTE: Protocols, protocol extensions and generics are used only to make the
// code smaller and clearer. The issue(?) has nothing to do with these features.
// (I've tried and the results are the same with no protocols or generics.)
/// A type that stores exactly four elements of a single type in a tuple and
/// exposes them through an integer subscript.
///
/// `description` is supplied by a protocol extension; the subscript is supplied
/// by refining protocols, so a conforming type only has to declare `elements`.
protocol FourElementSubscriptable : CustomStringConvertible {
    /// The type of each of the four stored elements.
    typealias Element
    /// Reads or writes the element at `index`.
    subscript(index: Int) -> Element { get set }
    /// The underlying four-element tuple storage.
    var elements: (Element, Element, Element, Element) { get set }
}
extension FourElementSubscriptable {
    /// Textual form of the value: the `elements` tuple rendered through string
    /// interpolation.
    var description: String { return "\(elements)" }
}
// Subs1 and Subs2 are the two different ways of implementing subscripts:
/// Opt-in marker: conforming types receive the `switch`-based subscript that is
/// defined in the `Subs1` protocol extension.
protocol Subs1: FourElementSubscriptable {
}
/// Opt-in marker: conforming types receive the unsafe-pointer-based subscript
/// that is defined in the `Subs2` protocol extension.
protocol Subs2: FourElementSubscriptable {
}
extension Subs1 {
    /// Accesses the element at `index` by switching over the four fields of
    /// the `elements` tuple.
    ///
    /// Any index other than 0, 1, 2 or 3 stops the program via `fatalError`.
    subscript(index: Int) -> Element {
        get {
            switch index {
            case 0: return elements.0
            case 1: return elements.1
            case 2: return elements.2
            case 3: return elements.3
            default: fatalError("Index out of bounds")
            }
        }
        set {
            switch index {
            case 0: elements.0 = newValue
            case 1: elements.1 = newValue
            case 2: elements.2 = newValue
            case 3: elements.3 = newValue
            default: fatalError("Index out of bounds")
            }
        }
    }
}
extension Subs2 {
    /// Accesses the element at `index` by reinterpreting the memory of `self`
    /// as a pointer to `Element` and indexing into it.
    ///
    /// The index must satisfy `0 <= index < 4`; this is enforced with
    /// `precondition` before any memory is touched.
    ///
    /// NOTE(review): this relies on the conforming type's memory starting with
    /// four contiguous `Element` values — confirm for every conformer.
    subscript(index: Int) -> Element {
        get {
            precondition(index >= 0 && index < 4)
            // A mutable copy is required because `withUnsafePointer` takes
            // its argument `inout`, and `self` is immutable inside a getter.
            var selfCopy = self
            return withUnsafePointer(&selfCopy) { UnsafePointer<Element>($0)[index] }
        }
        set {
            precondition(index >= 0 && index < 4)
            withUnsafeMutablePointer(&self) { UnsafeMutablePointer<Element>($0)[index] = newValue }
        }
    }
}
// F4 - To be compared to SIMD float4.
// NOTE: Using Subs1 or Subs2 for this type will not affect the test timings.
/// Four `Float` values stored in a tuple; the hand-written counterpart of the
/// SIMD `float4` type in this benchmark.
struct F4 : Subs1 { // <--- Same timings no matter if using Subs1 or Subs2.
    /// The four stored components.
    var elements : (Float, Float, Float, Float)

    /// Creates a vector whose four components are all zero.
    init() { elements = (0, 0, 0, 0) }

    /// Creates a vector from the given four-component tuple.
    init(_ elements: (Float, Float, Float, Float)) { self.elements = elements }

    /// Combines this vector with `other` component by component.
    ///
    /// - Parameter other: Right-hand operand. It is a `var` parameter, so the
    ///   local copy is overwritten in place and then returned.
    /// - Parameter transform: Called as `transform(self[i], other[i])` for
    ///   each index `i` in `0 ..< 4`.
    /// - Returns: A vector holding the four transformed components.
    func mapWith(var other: F4, @noescape transform: (Float, Float) -> Float) -> F4 {
        for i in 0 ..< 4 { other[i] = transform(self[i], other[i]) }
        return other
    }
}
/// Component-wise sum of two `F4` values, computed by handing `Float` addition
/// to `mapWith`.
func +(lhs: F4, rhs: F4) -> F4 {
    let componentSums = lhs.mapWith(rhs, transform: +)
    return componentSums
}
// F4x4 - To be compared to SIMD float4x4.
// NOTE: Using Subs2 instead of Subs1 will make it about 8 times slower.
/// Four `F4` values stored in a tuple; the hand-written counterpart of the
/// SIMD `float4x4` type in this benchmark.
struct F4x4 : Subs1 { // <--- Try Subs1 & Subs2, NOTE: Subs2 is 8x slower, why?
    /// The four stored `F4` elements.
    var elements: (F4, F4, F4, F4)

    /// Creates a matrix from four default-initialized `F4` values.
    init() { elements = (F4(), F4(), F4(), F4()) }

    /// Creates a matrix from the given tuple of four `F4` values.
    init(_ elements: (F4, F4, F4, F4)) { self.elements = elements }

    /// Combines this matrix with `other` element by element.
    ///
    /// - Parameter other: Right-hand operand. It is a `var` parameter, so the
    ///   local copy is overwritten in place and then returned.
    /// - Parameter transform: Called as `transform(self[i], other[i])` for
    ///   each index `i` in `0 ..< 4`.
    /// - Returns: A matrix holding the four transformed elements.
    func mapWith(var other: F4x4, @noescape transform: (F4, F4) -> F4) -> F4x4 {
        for i in 0 ..< 4 { other[i] = transform(self[i], other[i]) }
        return other
    }
}
/// Element-wise sum of two `F4x4` values, computed by handing `F4` addition to
/// `mapWith`.
func +(lhs: F4x4, rhs: F4x4) -> F4x4 {
    let elementSums = lhs.mapWith(rhs, transform: +)
    return elementSums
}
// Some convenience (no effect on issue; same without protocols and generics)
/// Requirements a type must meet to be benchmarked by `test`.
protocol Testable : CustomStringConvertible {
    /// Creates the starting value for an accumulation. Generic code in this
    /// file calls `Self()` / `T()`, which is only valid when the protocol
    /// declares this initializer.
    init()
    /// Returns one randomly generated value.
    static func random() -> Self
    /// Returns the sum of `lhs` and `rhs`.
    func +(lhs: Self, rhs: Self) -> Self
}
extension Testable {
    /// Returns an array of `num` values, each produced by a separate call to
    /// `random()`.
    ///
    /// The array is built directly with `map`, so no placeholder values are
    /// created and then overwritten.
    static func random(num: Int) -> [Self] {
        return (0 ..< num).map { _ in Self.random() }
    }

    /// Returns a tuple of four values, each produced by a separate call to
    /// `random()`.
    static func random4() -> (Self, Self, Self, Self) {
        return (.random(), .random(), .random(), .random())
    }
}
extension float4 : CustomStringConvertible {
    /// The `x`, `y`, `z` and `w` components, comma-separated inside parentheses.
    public var description: String {
        return "(\(x), \(y), \(z), \(w))"
    }
}
extension float4x4 : CustomStringConvertible {
    /// The values of `self[0]` through `self[3]`, comma-separated inside
    /// parentheses.
    public var description: String {
        return "(\(self[0]), \(self[1]), \(self[2]), \(self[3]))"
    }
}
extension Float : Testable {
    /// Returns `arc4random()` divided by `UInt32.max`; the division is done in
    /// `Double` and the quotient is then converted to `Float`.
    static func random() -> Float {
        let raw = Double(arc4random())
        let scale = Double(UInt32.max)
        return Float(raw / scale)
    }
}
extension float4 : Testable {
    /// Builds a `float4` from an array of four random `Float` values.
    static func random() -> float4 {
        let components: [Float] = Float.random(4)
        return float4(components)
    }
}
extension float4x4 : Testable {
    /// Builds a `float4x4` from an array of four random `float4` values.
    static func random() -> float4x4 {
        let vectors: [float4] = float4.random(4)
        return float4x4(vectors)
    }
}
extension F4 : Testable {
    /// Builds an `F4` from a tuple of four random `Float` values.
    static func random() -> F4 {
        let components = Float.random4()
        return F4(components)
    }
}
extension F4x4 : Testable {
    /// Builds an `F4x4` from a tuple of four random `F4` values.
    static func random() -> F4x4 {
        let parts = F4.random4()
        return F4x4(parts)
    }
}
// The test
/// Benchmarks repeated `+` accumulation over ten million random values of `T`.
///
/// The random inputs are generated once. Four timed passes follow; each pass
/// starts from `T()`, adds every input in order, and prints the elapsed time
/// reported by `CACurrentMediaTime()`. The hash of the final sum's description
/// is printed too, so the summation loop has an observable result.
///
/// - Parameter _: The type to benchmark; only used to select `T`.
func test<T: Testable>(_: T.Type){
    let num = 10_000_000
    print("Preparing ... (arc4random() is slow ...)")
    let a = T.random(num)
    print("Testing \(T.self):")
    for _ in 0 ..< 4 {
        var sum = T()
        let t0 = CACurrentMediaTime()
        for i in 0 ..< num { sum = sum + a[i] }
        let t1 = CACurrentMediaTime()
        print(String(format: "time: %8.6f ( deadcodeeliminationprevention: \(sum.description.hashValue) )", t1 - t0))
    }
}
/// Overload chosen for types conforming to `Subs1`; reports that variant's name.
func subsVariant<T: Subs1>(_:T.Type) -> String {
    let variantName = "Subs1"
    return variantName
}
/// Overload chosen for types conforming to `Subs2`; reports that variant's name.
func subsVariant<T: Subs2>(_:T.Type) -> String {
    let variantName = "Subs2"
    return variantName
}
// Benchmark driver. Each announcement is followed by the runs it announces:
// first the SIMD matrix type and the custom F4x4, then the SIMD vector type
// and the custom F4.
print("Testing float4x4 and F4x4 : \(subsVariant(F4x4)) <--- Edit code to make F4x4 conform to Subs1 / Subs2 to see difference.")
test(float4x4)
test(F4x4)
print("\nMight as well run the test for float4 and F4 : \(subsVariant(F4)) too:")
test(float4)
test(F4)
