Skip to content

Instantly share code, notes, and snippets.

@niwinz
Last active October 13, 2022 11:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save niwinz/2eeca31c4357a2a354762ec30be0468a to your computer and use it in GitHub Desktop.
Save niwinz/2eeca31c4357a2a354762ec30be0468a to your computer and use it in GitHub Desktop.
WASM experiments

README

How to compile:

clang example.c --target=wasm32 -O3 -msimd128 -nostdlib -Wl,--export-all -Wl,--no-entry -Wl,--allow-undefined --output example.wasm

How to run:

node example.mjs

Output:

~ ❯ node --version                                                                                                                                                                                                 13:07:09
v16.18.0
~ ❯ node example.mjs                                                                                                                                                                                               13:14:23
==> MEASURE [wasm instance]: 1.420547ns
=> Data size: 1200000
=> Memory available: 30277632
=> Required memory:  28800000
==> MEASURE [mem init]: 158.288570ns
=> Data Size: 9600000
==> MEASURE [mem fill]: 4.993731ns
==> MEASURE [run JS]: 6.862855ns
=> Result JS: 86046.8079
=> Result WASM[E]: 86046.8079
==> MEASURE [run WASM E]: 2.745297ns
=> Result WASM[I]: 86046.8079
==> MEASURE [run WASM I]: 2.101041ns

NOTE: example.wat is included only for convenience; it is not needed to run this example code.

#include <wasm_simd128.h>
/*
 * Explicit SIMD usage: element-wise product output[i] = data1[i] * data2[i]
 * for i in [0, length), processing two doubles per iteration with 128-bit
 * WebAssembly SIMD intrinsics.
 *
 * output, data1, data2: buffers holding at least `length` doubles each.
 * length: number of elements to process; values <= 0 do nothing.
 *
 * The vector loop only runs while a complete pair [i, i+1] is in range, and a
 * scalar step handles the last element of an odd-length input. The previous
 * version stepped by 2 with `i < length`, which loaded and stored one element
 * past the end of all three buffers whenever `length` was odd.
 */
void mulExplicitSimdFloat64 (double *output, double *data1, double *data2, int length) {
  int i = 0;

  /* Vector body: two f64 lanes per iteration. */
  for (; i + 1 < length; i += 2) {
    v128_t a = wasm_v128_load(&data1[i]);
    v128_t b = wasm_v128_load(&data2[i]);
    wasm_v128_store(&output[i], wasm_f64x2_mul(a, b));
  }

  /* Scalar tail: at most one element remains (odd length). */
  if (i < length) {
    output[i] = data1[i] * data2[i];
  }
}
/*
 * Implicit SIMD: a plain scalar element-wise product with no intrinsics.
 * The expectation is that the compiler vectorizes this loop on its own,
 * without any explicit SIMD operations in the source.
 *
 * Writes output[i] = data1[i] * data2[i] for i in [0, length); a length
 * of zero or less leaves output untouched.
 */
void mulImplicitSimdFloat64 (double *output, double *data1, double *data2, int length) {
  double *dst = output;
  const double *lhs = data1;
  const double *rhs = data2;

  for (int remaining = length; remaining > 0; remaining--) {
    *dst = *lhs * *rhs;
    dst++;
    lhs++;
    rhs++;
  }
}
/*
 * Convenience reduction: returns input[0] + input[1] + ... + input[length-1],
 * accumulated left to right starting from 0. Returns 0 when length <= 0.
 * The compiler will probably turn this into SIMD operations as well.
 */
double sumAll(double *input, int length) {
  double acc = 0;
  const double *cursor = input;

  for (int remaining = length; remaining > 0; remaining--) {
    acc += *cursor;
    cursor++;
  }

  return acc;
}
import fs from "fs";
// -----------------------
// HELPERS
// -----------------------
// Start timestamps keyed by label, as returned by performance.now()
// (milliseconds). A Map avoids collisions with Object.prototype keys such as
// "constructor".
const measures = new Map();
// NOTE(review): the two bindings below are not referenced anywhere in the
// visible script; kept only so existing names stay declared.
let measureLabel = null;
let measureStart = null;

/**
 * Records the current high-resolution time as the start of `label`.
 * Calling it again with the same label restarts that measurement.
 *
 * @param {string} label - Name of the measurement.
 */
function startMeasure(label) {
  measures.set(label, performance.now());
}

/**
 * Logs the time elapsed since startMeasure(label) was called.
 *
 * performance.now() reports milliseconds, so the value is printed with an
 * "ms" suffix (it was previously mislabelled as "ns").
 *
 * @param {string} label - Name previously passed to startMeasure.
 * @throws {Error} If no measurement was started for `label`; previously this
 *   case silently printed "NaN".
 */
function endMeasure(label) {
  const start = measures.get(label);
  if (start === undefined) {
    throw new Error(`endMeasure("${label}") called without a matching startMeasure`);
  }
  const elapsedMs = performance.now() - start;
  console.log(`==> MEASURE [${label}]: ${elapsedMs.toFixed(6)}ms`);
}
/**
 * Produces one pseudo-random number via Math.random().
 * Takes no parameters, so it can be handed directly to Array.from as the
 * mapping callback (any arguments passed are ignored).
 *
 * @returns {number} A value in the range [0, 1).
 */
function randFloat() {
  const sample = Math.random();
  return sample;
}
/**
 * Overwrites every element of `data` with 0, in place.
 *
 * Uses the built-in fill() instead of a hand-written index loop; both Array
 * and TypedArray provide it.
 *
 * @param {Array<number>|Float64Array} data - Array or typed array to reset.
 * @returns {void}
 */
function fillZero(data) {
  data.fill(0);
}
// -----------------------
// WASM INSTANCE
// -----------------------
// Number of float64 elements in each operand vector and in the result vector.
const DATA_SIZE = 4 * 300000;

// == INITIALIZE
// Timed section: read the compiled module from the current working
// directory, instantiate it with an empty import object, and enlarge its
// linear memory.
startMeasure("wasm instance");
const buf = fs.readFileSync("./example.wasm");
const res = await WebAssembly.instantiate(buf, {});
const wasmExports = res.instance.exports;
const mulExplicitSimdFloat64 = wasmExports.mulExplicitSimdFloat64;
const mulImplicitSimdFloat64 = wasmExports.mulImplicitSimdFloat64;
const sumAll = wasmExports.sumAll;
const memory = wasmExports.memory;
// Add 460 pages on top of the module's initial allocation so the three
// DATA_SIZE-element float64 vectors fit.
await memory.grow(460);
endMeasure("wasm instance");
// == MEM ALLOC
// Byte size of ONE vector of DATA_SIZE float64 values; three are needed
// (two inputs and one output).
const requiredMemory = Float64Array.BYTES_PER_ELEMENT * DATA_SIZE;
const totalRequiredMemory = requiredMemory * 3;
console.log(`=> Data size: ${DATA_SIZE}`);
console.log("=> Memory available:", memory.buffer.byteLength);
console.log("=> Required memory: ", totalRequiredMemory);
if (memory.buffer.byteLength < totalRequiredMemory) {
  throw new RangeError(
    `WASM memory too small: need ${totalRequiredMemory} bytes, have ${memory.buffer.byteLength}`
  );
}
// The second Float64Array argument is a BYTE offset, not an element index.
// The views are therefore placed requiredMemory bytes apart so they do not
// overlap. The previous offsets (DATA_SIZE and DATA_SIZE*2) started bdata2
// and boutput inside bdata1, so filling one vector corrupted the others.
// NOTE(review): the region still starts at byte 0, below the module's exported
// __heap_base (66560 in example.wat). That looks safe only while the exported
// functions keep no data of their own in linear memory — confirm, or start the
// views at __heap_base instead.
const bdata1 = new Float64Array(memory.buffer, 0, DATA_SIZE);
const bdata2 = new Float64Array(memory.buffer, requiredMemory, DATA_SIZE);
const boutput = new Float64Array(memory.buffer, requiredMemory * 2, DATA_SIZE);
// == MEM INIT
// "mem init" times the creation of two plain JS arrays of DATA_SIZE random
// values each.
startMeasure("mem init");
const randomVectorShape = {length: DATA_SIZE};
const data1 = Array.from(randomVectorShape, () => randFloat());
const data2 = Array.from(randomVectorShape, () => randFloat());
endMeasure("mem init");

// "mem fill" times copying those arrays into the typed-array views that sit
// on top of the wasm memory buffer.
startMeasure("mem fill");
bdata1.set(data1);
bdata2.set(data2);
console.log("=> Data Size:", bdata1.byteLength);
endMeasure("mem fill");
startMeasure("run JS");
// Pure-JS baseline: element-wise product into `output`, followed by a
// left-to-right sum of every element of `output`.
const multiplyAndSumJs = (output, data1, data2) => {
  let idx = 0;
  while (idx < DATA_SIZE) {
    output[idx] = data1[idx] * data2[idx];
    idx++;
  }

  const count = output.length;
  let total = 0.0;
  let pos = 0;
  while (pos < count) {
    total += output[pos];
    pos++;
  }
  return total;
};
let result = multiplyAndSumJs(boutput, bdata1, bdata2);
endMeasure("run JS");
console.log(`=> Result JS: ${result.toFixed(4)}`);

// Clear the output view so the next run cannot reuse these products.
fillZero(boutput);
// Call the WASM kernels and display the results.
// The exported functions take linear-memory addresses, which for these views
// are their byte offsets into memory.buffer.
const outputPtr = boutput.byteOffset;
const data1Ptr = bdata1.byteOffset;
const data2Ptr = bdata2.byteOffset;

// The timer is stopped BEFORE logging, exactly as the "run JS" section does,
// so console I/O is not counted against the WASM runs.
startMeasure("run WASM E");
mulExplicitSimdFloat64(outputPtr, data1Ptr, data2Ptr, DATA_SIZE);
result = sumAll(outputPtr, DATA_SIZE);
endMeasure("run WASM E");
console.log(`=> Result WASM[E]: ${result.toFixed(4)}`);

// Clear the output view so the implicit run cannot reuse the products above.
fillZero(boutput);

startMeasure("run WASM I");
mulImplicitSimdFloat64(outputPtr, data1Ptr, data2Ptr, DATA_SIZE);
result = sumAll(outputPtr, DATA_SIZE);
endMeasure("run WASM I");
console.log(`=> Result WASM[I]: ${result.toFixed(4)}`);
;; Text-format listing of the module that exports mulExplicitSimdFloat64,
;; mulImplicitSimdFloat64 and sumAll, plus its memory and layout globals.
;; In the comments below pN means parameter N and lN means local N.
;; Presumably p0..p3 of the two mul functions are (output, data1, data2, length)
;; and p0..p1 of sumAll are (input, length), matching the C signatures — confirm.
(module
(type (;0;) (func))
(type (;1;) (func (param i32 i32 i32 i32)))
(type (;2;) (func (param i32 i32) (result f64)))
;; Empty function: no body.
(func $__wasm_call_ctors (type 0))
;; Stores the lane-wise f64x2 product of the 16-byte blocks at p2 and p1 into
;; p0, advancing all three addresses by 16 bytes per iteration.
(func $mulExplicitSimdFloat64 (type 1) (param i32 i32 i32 i32)
(local i32)
block ;; label = @1
;; Leave immediately when p3 < 1 (signed).
local.get 3
i32.const 1
i32.lt_s
br_if 0 (;@1;)
;; l4 = 0: element counter, advanced by 2 per iteration.
i32.const 0
local.set 4
loop ;; label = @2
;; store128(p0, load128(p2) * load128(p1)); align=1 on every access.
local.get 0
local.get 2
v128.load align=1
local.get 1
v128.load align=1
f64x2.mul
v128.store align=1
;; p1 += 16
local.get 1
i32.const 16
i32.add
local.set 1
;; p2 += 16
local.get 2
i32.const 16
i32.add
local.set 2
;; p0 += 16
local.get 0
i32.const 16
i32.add
local.set 0
;; l4 += 2; repeat while l4 < p3 (signed).
local.get 4
i32.const 2
i32.add
local.tee 4
local.get 3
i32.lt_s
br_if 0 (;@2;)
end
end)
;; Same product as above, but with a guarded f64x2 loop followed by a scalar
;; f64 loop for whatever the vector loop did not cover.
(func $mulImplicitSimdFloat64 (type 1) (param i32 i32 i32 i32)
(local i32 i32 i32 i32 i32)
block ;; label = @1
;; Leave immediately when p3 < 1 (signed).
local.get 3
i32.const 1
i32.lt_s
br_if 0 (;@1;)
;; l4 = 0: number of elements already handled by the vector loop.
i32.const 0
local.set 4
block ;; label = @2
;; A single element (p3 == 1) skips the vector loop.
local.get 3
i32.const 1
i32.eq
br_if 0 (;@2;)
;; l5 = p3 << 3 (byte length). Skip the vector loop when the byte ranges
;; [p0, p0+l5) and [p1, p1+l5) overlap: (p1+l5 > p0) & (p0+l5 > p1).
local.get 1
local.get 3
i32.const 3
i32.shl
local.tee 5
i32.add
local.get 0
i32.gt_u
local.get 0
local.get 5
i32.add
local.tee 6
local.get 1
i32.gt_u
i32.and
br_if 0 (;@2;)
;; Same overlap test between [p0, p0+l5) and [p2, p2+l5).
local.get 2
local.get 5
i32.add
local.get 0
i32.gt_u
local.get 6
local.get 2
i32.gt_u
i32.and
br_if 0 (;@2;)
;; l4 = p3 & -2 (p3 rounded down to an even count); l7 = l4 is the countdown.
local.get 3
i32.const -2
i32.and
local.tee 4
local.set 7
;; Working cursors: l5 = p0 (store), l6 = p2, l8 = p1.
local.get 0
local.set 5
local.get 2
local.set 6
local.get 1
local.set 8
loop ;; label = @3
;; store128(l5, load128(l6) * load128(l8)); align=8 on every access.
local.get 5
local.get 6
v128.load align=8
local.get 8
v128.load align=8
f64x2.mul
v128.store align=8
;; Advance the three cursors by 16 bytes.
local.get 5
i32.const 16
i32.add
local.set 5
local.get 6
i32.const 16
i32.add
local.set 6
local.get 8
i32.const 16
i32.add
local.set 8
;; l7 -= 2; repeat while l7 != 0.
local.get 7
i32.const -2
i32.add
local.tee 7
br_if 0 (;@3;)
end
;; Done if the vector loop covered every element (l4 == p3).
local.get 4
local.get 3
i32.eq
br_if 1 (;@1;)
end
;; Scalar loop setup: l7 = p3 - l4 elements remain, starting l4 << 3 bytes in.
local.get 3
local.get 4
i32.sub
local.set 7
;; l5 = p1 + (l4 << 3); l8 temporarily holds the byte offset.
local.get 1
local.get 4
i32.const 3
i32.shl
local.tee 8
i32.add
local.set 5
;; l6 = p2 + offset
local.get 2
local.get 8
i32.add
local.set 6
;; l8 = p0 + offset (store cursor)
local.get 0
local.get 8
i32.add
local.set 8
loop ;; label = @2
;; f64.store(l8, f64.load(l6) * f64.load(l5))
local.get 8
local.get 6
f64.load
local.get 5
f64.load
f64.mul
f64.store
;; Advance the three cursors by 8 bytes.
local.get 5
i32.const 8
i32.add
local.set 5
local.get 6
i32.const 8
i32.add
local.set 6
local.get 8
i32.const 8
i32.add
local.set 8
;; l7 -= 1; repeat while l7 != 0.
local.get 7
i32.const -1
i32.add
local.tee 7
br_if 0 (;@2;)
end
end)
;; Returns the f64 sum of p1 values starting at address p0: an f64x2
;; accumulation over the even part, then a scalar loop for the remainder.
(func $sumAll (type 2) (param i32 i32) (result f64)
(local f64 i32 v128 i32 i32)
block ;; label = @1
;; Continue only when p1 >= 1 (signed); otherwise return 0.0.
local.get 1
i32.const 1
i32.ge_s
br_if 0 (;@1;)
f64.const 0x0p+0 (;=0;)
return
end
block ;; label = @1
block ;; label = @2
block ;; label = @3
;; p1 != 1 goes to the vector path; p1 == 1 sets l2 = 0.0, l3 = 0 and
;; jumps straight to the scalar loop.
local.get 1
i32.const 1
i32.ne
br_if 0 (;@3;)
f64.const 0x0p+0 (;=0;)
local.set 2
i32.const 0
local.set 3
br 1 (;@2;)
end
;; l4 = two-lane accumulator initialised to (0.0, 0.0).
f64.const 0x0p+0 (;=0;)
f64x2.splat
local.set 4
;; l3 = p1 & -2 (even element count); l5 = l3 is the countdown; l6 = p0 cursor.
local.get 1
i32.const -2
i32.and
local.tee 3
local.set 5
local.get 0
local.set 6
loop ;; label = @3
;; l4 = load128(l6) + l4
local.get 6
v128.load align=8
local.get 4
f64x2.add
local.set 4
;; l6 += 16
local.get 6
i32.const 16
i32.add
local.set 6
;; l5 -= 2; repeat while l5 != 0.
local.get 5
i32.const -2
i32.add
local.tee 5
br_if 0 (;@3;)
end
;; Horizontal add: the shuffle moves bytes 8..15 of l4 (its high lane) into
;; the low lane, so lane 0 of the sum is low + high; that becomes l2.
local.get 4
local.get 4
local.get 4
i8x16.shuffle 8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0
f64x2.add
f64x2.extract_lane 0
local.set 2
;; Done if the vector loop covered every element (l3 == p1).
local.get 3
local.get 1
i32.eq
br_if 1 (;@1;)
end
;; Scalar loop setup: l5 = p1 - l3 elements remain; l6 = p0 + (l3 << 3).
local.get 1
local.get 3
i32.sub
local.set 5
local.get 0
local.get 3
i32.const 3
i32.shl
i32.add
local.set 6
loop ;; label = @2
;; l2 = f64.load(l6) + l2
local.get 6
f64.load
local.get 2
f64.add
local.set 2
;; l6 += 8
local.get 6
i32.const 8
i32.add
local.set 6
;; l5 -= 1; repeat while l5 != 0.
local.get 5
i32.const -1
i32.add
local.tee 5
br_if 0 (;@2;)
end
end
;; Result: the accumulated sum in l2.
local.get 2)
(table (;0;) 1 1 funcref)
;; Linear memory with an initial size of 2 pages.
(memory (;0;) 2)
;; Global 0 is mutable and not exported — presumably the stack pointer (confirm).
(global (;0;) (mut i32) (i32.const 66560))
;; Globals 1-6 are immutable constants, exported below under layout names.
(global (;1;) i32 (i32.const 1024))
(global (;2;) i32 (i32.const 1024))
(global (;3;) i32 (i32.const 1024))
(global (;4;) i32 (i32.const 66560))
(global (;5;) i32 (i32.const 0))
(global (;6;) i32 (i32.const 1))
;; Exports: the memory, every function, and the layout globals.
(export "memory" (memory 0))
(export "__wasm_call_ctors" (func $__wasm_call_ctors))
(export "mulExplicitSimdFloat64" (func $mulExplicitSimdFloat64))
(export "mulImplicitSimdFloat64" (func $mulImplicitSimdFloat64))
(export "sumAll" (func $sumAll))
(export "__dso_handle" (global 1))
(export "__data_end" (global 2))
(export "__global_base" (global 3))
(export "__heap_base" (global 4))
(export "__memory_base" (global 5))
(export "__table_base" (global 6)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment