Skip to content

Instantly share code, notes, and snippets.

@MartinNowak
Created September 22, 2011 17:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinNowak/1235470 to your computer and use it in GitHub Desktop.
Save MartinNowak/1235470 to your computer and use it in GitHub Desktop.
vector array operations
import core.bitop, core.cpuid;
version (D_InlineAsm_X86)
{
enum GeneralPurposeRegs : string
{
AX = "EAX",
CX = "ECX",
DX = "EDX",
BX = "EBX",
SP = "ESP",
BP = "EBP",
SI = "ESI",
DI = "EDI",
}
}
else version (D_InlineAsm_X86_64)
{
enum GeneralPurposeRegs : string
{
AX = "RAX",
CX = "RCX",
DX = "RDX",
BX = "RBX",
SP = "RSP",
BP = "RBP",
SI = "RSI",
DI = "RDI",
}
}
alias GeneralPurposeRegs GP;
version (D_InlineAsm_X86)
version = AsmX86;
else version (D_InlineAsm_X86_64)
version = AsmX86;
enum MM : string
{
_0 = "MM0",
_1 = "MM1",
_2 = "MM2",
_3 = "MM3",
_4 = "MM4",
_5 = "MM5",
_6 = "MM6",
_7 = "MM7",
}
enum XMM : string
{
_0 = "XMM0",
_1 = "XMM1",
_2 = "XMM2",
_3 = "XMM3",
_4 = "XMM4",
_5 = "XMM5",
_6 = "XMM6",
_7 = "XMM7",
}
enum UnitMask
{
native,
mmx,
amd,
sse,
}
alias core.cpuid.sse sse;
alias core.cpuid.amd3dnow amd3dnow;
alias core.cpuid.mmx mmx;
version (unittest)
{
@property uint machineMask()
{
return sse() << UnitMask.sse | amd3dnow() << UnitMask.amd |
mmx() << UnitMask.mmx | 1 << UnitMask.native;
}
}
else
{
static this()
{
_machineMask = sse() << UnitMask.sse | amd3dnow() << UnitMask.amd |
mmx() << UnitMask.mmx | 1 << UnitMask.native;
}
@property uint machineMask()
{
return _machineMask;
}
private uint _machineMask;
}
enum Op { Add, Sub, Mul, Div }
Op opStringToEnum(string op)
{
switch (op)
{
case "+": return Op.Add;
case "-": return Op.Sub;
case "*": return Op.Mul;
case "/": return Op.Div;
default: assert(0);
}
}
bool isCommutative(string op)
{
switch (op)
{
case "+":
case "*":
return true;
default:
return false;
}
}
string toString(size_t sz)
{
char[sz.sizeof == 8 ? 20 : 10] res;
size_t idx = res.length;
do
{
res[--idx] = '0' + (sz % 10);
sz /= 10;
} while (sz > 0);
return res[idx .. $].idup;
}
template nativeUnit()
{
enum shouldAlign = false;
template inst(T)
{
enum optab = ["+", "-", "*", "/"];
}
}
template mmxUnit()
{
alias MM MMRegs;
enum regsz = 8;
enum shouldAlign = false;
template inst(T) if(is(T == int) || is(T == uint))
{
enum amov = "movq";
enum umov = "movq";
enum optab = ["paddd", "psubd", null, null];
}
template inst(T) if(is(T == short) || is(T == ushort))
{
enum amov = "movq";
enum umov = "movq";
enum optab = ["paddw", "psubw", null, null];
}
template inst(T) if(is(T == byte) || is(T == ubyte))
{
enum amov = "movq";
enum umov = "movq";
enum optab = ["paddb", "psubb", null, null];
}
enum loopCleanup = "emms;";
}
template amdUnit()
{
alias MM MMRegs;
enum regsz = 8;
enum shouldAlign = false;
template inst(T) if(is(T == float))
{
enum amov = "movq";
enum umov = "movq";
enum optab = ["pfadd", "pfsub", "pfmul", null];
}
enum loopCleanup = "emms;";
}
template sseUnit()
{
alias XMM MMRegs;
enum regsz = 16;
enum shouldAlign = true;
template inst(T) if(is(T == double))
{
enum amov = "movapd";
enum umov = "movupd";
enum optab = ["addpd", "subpd", "mulpd", "divpd"];
}
template inst(T) if(is(T == float))
{
enum amov = "movaps";
enum umov = "movups";
enum optab = ["addps", "subps", "mulps", "divps"];
}
template inst(T) if(is(T == int) || is(T == uint))
{
enum amov = "movdqa";
enum umov = "movdqu";
enum optab = ["paddd", "psubd", null, null];
}
template inst(T) if(is(T == short) || is(T == ushort))
{
enum amov = "movdqa";
enum umov = "movdqu";
enum optab = ["paddw", "psubw", null, null];
}
template inst(T) if(is(T == byte) || is(T == ubyte))
{
enum amov = "movdqa";
enum umov = "movdqu";
enum optab = ["paddb", "psubb", null, null];
}
enum loopCleanup = "";
}
// ==================== 2 array operands ====================
void arrayOp(string op, T)(T[] a, T[] b, T[] c)
in
{
assert(a.length == b.length && b.length == c.length);
}
body
{
if (a.length == 0)
return;
alias void function(size_t n, T*, T*, T*) Fn;
static struct FnTab
{
uint mask;
Fn[] fptrs;
}
FnTab buildFnTab()
{
enum opidx = opStringToEnum(op);
FnTab result;
void addUnit(alias unit)(UnitMask m)
{
// unit support for T and op ?
static if (is(typeof(unit.inst!(T).optab)) &&
unit.inst!(T).optab[opidx] != null)
{
result.mask |= (1 << m);
result.fptrs ~= &arrayOpUnit!(unit, op, T);
}
else
{
result.fptrs ~= null;
}
}
addUnit!(nativeUnit!())(UnitMask.native);
version (AsmX86)
{
addUnit!(mmxUnit!())(UnitMask.mmx);
addUnit!(amdUnit!())(UnitMask.amd);
addUnit!(sseUnit!())(UnitMask.sse);
}
return result;
}
static immutable units = buildFnTab();
immutable chosen = bsr(units.mask & machineMask);
return units.fptrs[chosen](a.length, a.ptr, b.ptr, c.ptr);
}
void arrayOpUnit(alias unit, string op, T)(size_t n, T* a, T* b, T* c)
{
static if (unit.shouldAlign)
{
static immutable alignedTab = [
&arrayOpImpl!(unit, op, 0b000, T),
&arrayOpImpl!(unit, op, 0b001, T),
&arrayOpImpl!(unit, op, 0b010, T),
&arrayOpImpl!(unit, op, 0b011, T),
&arrayOpImpl!(unit, op, 0b100, T),
&arrayOpImpl!(unit, op, 0b101, T),
&arrayOpImpl!(unit, op, 0b110, T),
&arrayOpImpl!(unit, op, 0b111, T),
];
if (auto nbytes = (-cast(size_t)a & unit.regsz - 1))
{
if ((nbytes & (T.sizeof - 1)) == 0)
{
if (nbytes > n * T.sizeof)
nbytes = n;
else
nbytes /= T.sizeof;
n -= nbytes;
do
{
*a = cast(T)mixin(`*b `~op~` *c`);
++a; ++b; ++c;
--nbytes;
} while (nbytes);
}
}
size_t done;
if (n >= unit.regsz / T.sizeof)
{
auto tabIdx =
((cast(size_t)a & unit.regsz - 1) == 0) << 2 |
((cast(size_t)b & unit.regsz - 1) == 0) << 1 |
((cast(size_t)c & unit.regsz - 1) == 0) << 0;
done = alignedTab[tabIdx](n, a, b, c);
}
}
else
{
size_t done = arrayOpImpl!(unit, op, 0b000, T)(n, a, b, c);
}
while (done < n)
{
a[done] = cast(T)mixin(`b[done] `~op~` c[done]`);
++done;
}
}
// Bugzilla 6701
// size_t arrayOpImpl_6701_NU(alias unit : nativeUnit!(), string op, uint algnMsk, T)
// (size_t len, T* a, T* b, T* c)
size_t arrayOpImpl(alias unit, string op, uint algnMsk, T)
(size_t len, T* a, T* b, T* c) if (unit.mangleof == nativeUnit!().mangleof)
{
auto n = len;
while (n--)
*a++ = cast(T)mixin(`*b++ `~op~` *c++`);
return len;
}
size_t arrayOpImpl(alias unit, string op, uint alignMask, T)
(size_t len, T* a, T* b, T* c) if (unit.mangleof != nativeUnit!().mangleof)
in
{
assert(!(alignMask & 0b100) || !(cast(size_t)a & (unit.regsz - 1)));
assert(!(alignMask & 0b010) || !(cast(size_t)b & (unit.regsz - 1)));
assert(!(alignMask & 0b001) || !(cast(size_t)c & (unit.regsz - 1)));
}
body
{
enum mova = (alignMask & 0b100) ? unit.inst!(T).amov : unit.inst!(T).umov;
enum movb = (alignMask & 0b010) ? unit.inst!(T).amov : unit.inst!(T).umov;
enum movc = (alignMask & 0b001) ? unit.inst!(T).amov : unit.inst!(T).umov;
enum opinst = unit.inst!(T).optab[opStringToEnum(op)];
alias unit.MMRegs MM;
enum regsz = toString(unit.regsz);
auto n = len;
mixin(`
asm {
mov `~GP.DI~`, a; // destination
mov `~GP.SI~`, b; // 1st oprnd source
mov `~GP.DX~`, c; // 2nd oprnd source
mov `~GP.CX~`, n; // loop counter
align `~toString(2 * size_t.sizeof)~`;
LloopA:
cmp `~GP.CX~`, `~toString(4 * unit.regsz / T.sizeof)~`;
jb LloopB;
`~movb~` `~MM._0~`, [`~GP.SI~` + 0*`~regsz~`];
`~movb~` `~MM._1~`, [`~GP.SI~` + 1*`~regsz~`];
`~movb~` `~MM._2~`, [`~GP.SI~` + 2*`~regsz~`];
`~movb~` `~MM._3~`, [`~GP.SI~` + 3*`~regsz~`];
add `~GP.SI~`, 4*`~regsz~`;
`~movc~` `~MM._4~`, [`~GP.DX~` + 0*`~regsz~`];
`~movc~` `~MM._5~`, [`~GP.DX~` + 1*`~regsz~`];
`~movc~` `~MM._6~`, [`~GP.DX~` + 2*`~regsz~`];
`~movc~` `~MM._7~`, [`~GP.DX~` + 3*`~regsz~`];
add `~GP.DX~`, 4*`~regsz~`;
`~opinst~` `~MM._0~`, `~MM._4~`;
`~opinst~` `~MM._1~`, `~MM._5~`;
`~opinst~` `~MM._2~`, `~MM._6~`;
`~opinst~` `~MM._3~`, `~MM._7~`;
`~mova~`[`~GP.DI~` + 0*`~regsz~`], `~MM._0~`;
`~mova~`[`~GP.DI~` + 1*`~regsz~`], `~MM._1~`;
`~mova~`[`~GP.DI~` + 2*`~regsz~`], `~MM._2~`;
`~mova~`[`~GP.DI~` + 3*`~regsz~`], `~MM._3~`;
add `~GP.DI~`, 4*`~regsz~`;
sub `~GP.CX~`, `~toString(4 * unit.regsz / T.sizeof)~`;
jmp LloopA;
align `~toString(2 * size_t.sizeof)~`;
LloopB:
cmp `~GP.CX~`, `~toString(unit.regsz / T.sizeof)~`;
jb Ldone;
`~movb~` `~MM._0~`, [`~GP.SI~`];
add `~GP.SI~`, `~regsz~`;
`~movc~` `~MM._4~`, [`~GP.DX~`];
add `~GP.DX~`, `~regsz~`;
`~opinst~` `~MM._0~`, `~MM._4~`;
`~mova~`[`~GP.DI~`], `~MM._0~`;
add `~GP.DI~`, `~regsz~`;
sub `~GP.CX~`, `~toString(unit.regsz / T.sizeof)~`;
jmp LloopB;
Ldone:
`~unit.loopCleanup~`
mov n, `~GP.CX~`;
}
`);
return len - n;
}
// ==================== 1 array 1 scalar operands ====================
void arrayOp(string op, T)(T[] a, T[] b, T c)
in
{
assert(a.length == b.length);
}
body
{
if (a.length == 0)
return;
alias void function(size_t n, T*, T*, T) Fn;
static struct FnTab
{
uint mask;
Fn[] fptrs;
}
FnTab buildFnTab()
{
enum opidx = opStringToEnum(op);
FnTab result;
void addUnit(alias unit)(UnitMask m)
{
// unit support for T and op ?
static if (is(typeof(unit.inst!(T).optab)) &&
unit.inst!(T).optab[opidx] != null)
{
result.mask |= (1 << m);
result.fptrs ~= &arrayOpScalarUnit!(unit, op, T);
}
else
{
result.fptrs ~= null;
}
}
addUnit!(nativeUnit!())(UnitMask.native);
version (AsmX86)
{
addUnit!(mmxUnit!())(UnitMask.mmx);
addUnit!(amdUnit!())(UnitMask.amd);
addUnit!(sseUnit!())(UnitMask.sse);
}
return result;
}
static immutable units = buildFnTab();
immutable chosen = bsr(units.mask & machineMask);
return units.fptrs[chosen](a.length, a.ptr, b.ptr, c);
}
void arrayOpScalarUnit(alias unit, string op, T)(size_t n, T* a, T* b, T c)
{
// used for fast aligned loads to register
static if (unit.shouldAlign)
{
static immutable alignedTab = [
&arrayOpScalarImpl!(unit, op, 0b00, T),
&arrayOpScalarImpl!(unit, op, 0b01, T),
&arrayOpScalarImpl!(unit, op, 0b10, T),
&arrayOpScalarImpl!(unit, op, 0b11, T),
];
if (auto nbytes = (-cast(size_t)a & unit.regsz - 1))
{
if ((nbytes & (T.sizeof - 1)) == 0)
{
if (nbytes > n * T.sizeof)
nbytes = n;
else
nbytes /= T.sizeof;
n -= nbytes;
do
{
*a = cast(T)mixin(`*b `~op~` c`);
++a; ++b;
--nbytes;
} while (nbytes);
}
}
size_t done;
if (n >= unit.regsz / T.sizeof)
{
auto tabIdx =
((cast(size_t)a & unit.regsz - 1) == 0) << 1 |
((cast(size_t)b & unit.regsz - 1) == 0) << 0;
done = alignedTab[tabIdx](n, a, b, c);
}
}
else
{
size_t done = arrayOpScalarImpl!(unit, op, 0b000, T)(n, a, b, c);
}
while (done < n)
{
a[done] = cast(T)mixin(`b[done] `~op~` c`);
++done;
}
}
// Bugzilla 6701
// size_t arrayOpImpl_6701_NU(alias unit : nativeUnit!(), string op, uint algnMsk, T)
// (size_t len, T* a, T* b, T* c)
size_t arrayOpScalarImpl(alias unit, string op, uint algnMsk, T)
(size_t len, T* a, T* b, T c) if (unit.mangleof == nativeUnit!().mangleof)
{
auto n = len;
while (n--)
*a++ = cast(T)mixin(`*b++ `~op~`c`);
return len;
}
string broadcast(T, MMR)(MMR dst, string src) if (is(T == short))
{
string code = `
mov AX , `~src~`;
movd `~dst~`, EAX;
pshufw `~dst~`, `~dst~`, 0;`;
static if (is(MMR == XMM))
code ~=
`shufpd `~dst~`, `~dst~`, 0;`;
return code;
}
string broadcast(T, MMR)(MMR dst, string src) if (is(T == int))
{
string code =
`movd `~dst~`, `~src~`;`;
static if (is(MMR == MM))
code ~=
`pshufw `~dst~`, `~dst~`, 0;`; // BUG
else static if (is(MMR == XMM))
code ~=
`pshufd `~dst~`, `~dst~`, 0;`;
else
static assert(0);
return code;
}
string broadcast(T, MMR)(MMR dst, string src) if (is(T == double))
{
string code =
`movq `~dst~`, `~src~`;`;
static if (is(MMR == XMM))
code ~=
`shufpd `~dst~`, `~dst~`, 0;`;
return code;
}
size_t arrayOpScalarImpl(alias unit, string op, uint alignMask, T)
(size_t len, T* a, T* b, T c) if (unit.mangleof != nativeUnit!().mangleof)
in
{
assert(!(alignMask & 0b10) || !(cast(size_t)a & (unit.regsz - 1)));
assert(!(alignMask & 0b01) || !(cast(size_t)b & (unit.regsz - 1)));
}
body
{
enum mova = (alignMask & 0b10) ? unit.inst!(T).amov : unit.inst!(T).umov;
enum movb = (alignMask & 0b01) ? unit.inst!(T).amov : unit.inst!(T).umov;
enum amov = unit.inst!(T).amov;
enum opinst = unit.inst!(T).optab[opStringToEnum(op)];
alias unit.MMRegs MM;
enum regsz = toString(unit.regsz);
auto n = len;
mixin(`
asm {
mov `~GP.DI~`, a; // destination
mov `~GP.SI~`, b; // 1st oprnd source
`~broadcast!(T)(MM._4, `c[`~GP.BP~`]`)~`
mov `~GP.CX~`, n; // loop counter
align `~toString(2 * size_t.sizeof)~`;
LloopA:
cmp `~GP.CX~`, `~toString(4 * unit.regsz / T.sizeof)~`;
jb LloopB;
`~movb~` `~MM._0~`, [`~GP.SI~` + 0*`~regsz~`];
`~movb~` `~MM._1~`, [`~GP.SI~` + 1*`~regsz~`];
`~movb~` `~MM._2~`, [`~GP.SI~` + 2*`~regsz~`];
`~movb~` `~MM._3~`, [`~GP.SI~` + 3*`~regsz~`];
add `~GP.SI~`, 4*`~regsz~`;
`~opinst~` `~MM._0~`, `~MM._4~`;
`~opinst~` `~MM._1~`, `~MM._4~`;
`~opinst~` `~MM._2~`, `~MM._4~`;
`~opinst~` `~MM._3~`, `~MM._4~`;
`~mova~`[`~GP.DI~` + 0*`~regsz~`], `~MM._0~`;
`~mova~`[`~GP.DI~` + 1*`~regsz~`], `~MM._1~`;
`~mova~`[`~GP.DI~` + 2*`~regsz~`], `~MM._2~`;
`~mova~`[`~GP.DI~` + 3*`~regsz~`], `~MM._3~`;
add `~GP.DI~`, 4*`~regsz~`;
sub `~GP.CX~`, `~toString(4 * unit.regsz / T.sizeof)~`;
jmp LloopA;
align `~toString(2 * size_t.sizeof)~`;
LloopB:
cmp `~GP.CX~`, `~toString(unit.regsz / T.sizeof)~`;
jb Ldone;
`~movb~` `~MM._0~`, [`~GP.SI~`];
add `~GP.SI~`, `~regsz~`;
`~opinst~` `~MM._0~`, `~MM._4~`;
`~mova~`[`~GP.DI~`], `~MM._0~`;
add `~GP.DI~`, `~regsz~`;
sub `~GP.CX~`, `~toString(unit.regsz / T.sizeof)~`;
jmp LloopB;
Ldone:
`~unit.loopCleanup~`
mov n, `~GP.CX~`;
}
`);
return len - n;
}
// ==================== unittests ====================
version (unittest)
{
template TypeTuple(T...)
{
alias T TypeTuple;
}
}
unittest
{
alias TypeTuple!(byte, ubyte, short, ushort, int, uint, long, ulong,
float, double, real) NumericTypes;
alias TypeTuple!("+", "-", "*", "/") Ops;
enum maxSize = 1030;
foreach(T; NumericTypes)
{
auto dst = new T[](maxSize);
auto oprnd1 = new T[](maxSize);
auto oprnd2 = new T[](maxSize);
foreach(i; 0 .. maxSize)
{
T val = cast(T)(i);
if (val == 0)
val = 1; // avoid 0 to test division
oprnd1[i] = val;
oprnd2[i] = val;
}
for (size_t sz = 1; sz < maxSize; sz <<= 1)
{
foreach(j; 0 .. sz >= 16 ? 16 : sz)
{
foreach(op; Ops)
{
dst[j .. sz] = 0;
arrayOp!op(dst[j .. sz], oprnd1[0 .. sz - j], oprnd2[0 .. sz - j]);
foreach(i; 0 .. sz - j)
{
import std.string;
assert(dst[i + j] == cast(T)mixin(`oprnd1[i]`~op~`oprnd2[i]`),
std.string.format("%s [%d]: %s = %s %s %s",
sz, i, dst[i + j], oprnd1[i], op, oprnd2[i]));
}
}
}
}
foreach(i; 0 .. maxSize)
{
auto val = cast(T)i;
if (val != 0)
{
assert(oprnd1[i] == val);
assert(oprnd2[i] == val);
}
}
}
}
// ==================== test driver ====================
extern(C) float[] _arraySliceSliceAddSliceAssign_f(float[] a, float[] b, float[] c);
extern(C) float[] _arraySliceExpAddSliceAssign_f(float[] a, float value, float[] b);
void main()
{
import std.datetime, std.stdio;
auto ary1 = new float[]((1 << 10) - 1);
ary1[] = 1;
auto ary2 = new float[]((1 << 10) - 1);
ary2[] = 2;
auto sw = StopWatch(AutoStart.yes);
foreach(_; 0 .. 20_000)
arrayOp!("+")(ary1, ary1, 2.0f);
//_arraySliceExpAddSliceAssign_f(ary1, 2.0f, ary1);
//_arraySliceSliceAddSliceAssign_f(ary1, ary1, ary2);
sw.stop();
writeln(sw.peek().usecs);
// writeln(ary1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment