Last active
January 17, 2016 06:10
-
-
Save lukego/450e108c9c9cf420252d to your computer and use it in GitHub Desktop.
blit.dasl: assembler routine for memory copies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This is initial work-in-progress code for exploring an efficient
design for a "blitter" written in assembler.
The idea here is to take a large number of memory copy operations (for
example 100), sort them into buckets based on length (in cache lines),
and then execute several of them in parallel. This should be efficient
for copies that are bounded by the memory subsystem (e.g. L3 cache
latency) and therefore don't achieve the maximum throughput (~32
bytes/cycle) when executed serially with memcpy.
This code is hard-coded for 128-byte copies, to explore whether the
idea is effective before generalizing it.
In initial tests I am seeing a large speedup (~30%) on the end-to-end
packet forwarding benchmark with a DPDK VM. This suggests to me that
the idea is worth pursuing, but there is still a risk that it won't pan
out for some reason.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- blit.lua - offload asynchronous memory operations | |
module(..., package.seeall) | |
local dasm = require("dasm") | |
local ffi = require("ffi") | |
|.arch x64 | |
|.actionlist actions | |
-- Table reserved for code generators (not referenced elsewhere in this file).
local gen = {}

-- Every assembled machine-code buffer is stored in this table so that
-- the garbage collector never frees code that is still callable.
local anchor = {}

-- When true, assemble() prints a dump of each routine it builds.
debug = true
-- Utility: run a code generator and hand back the result as a callable.
--
--   name      - label printed in front of the debug dump
--   prototype - C function pointer type that the machine code is cast to
--   generator - function called with a fresh DynASM state; it emits the code
--
-- Returns the machine code cast to `prototype`. When the module-level
-- `debug` flag is set, the generated code is also dumped.
function assemble (name, prototype, generator)
   local state = dasm.new(actions)
   generator(state)
   local code, nbytes = state:build()
   -- Hold a reference so the buffer outlives this call.
   anchor[#anchor + 1] = code
   if debug then
      print("mcode dump: "..name)
      dasm.dump(code, nbytes)
   end
   return ffi.cast(prototype, code)
end
-- Selects which implementation of copy()/barrier() this module defines:
--   'simple' - copy immediately with ffi.copy; barrier is a no-op
--   'skip'   - drop every copy (nothing is moved); barrier is a no-op
--   'batch'  - queue copies and run them in barrier() (defined further down)
mode = 'batch'

if mode == 'simple' then
   -- Note the argument order: copy() takes the source first, ffi.copy
   -- takes the destination first.
   copy = function (src, dst, size)
      ffi.copy(dst, src, size)
   end
   barrier = function () end
elseif mode == 'skip' then
   copy = function (src, dst, size) end
   barrier = function () end
end
if mode == 'batch' then | |
-- Queue of deferred copies. Entry i is described by source[i], dest[i]
-- and length[i]; each array has room for 10240 entries.
local source  = ffi.new("void*[10240]")
local dest    = ffi.new("void*[10240]")
local length  = ffi.new("int[10240]")
-- Dummy buffer that barrier() uses as both source and destination of
-- the extra entry it appends after the last queued copy.
local scratch = ffi.new("char[10240]")
-- Number of copies currently queued.
local n = 0
-- Queue a copy of `len` bytes from `src` to `dst`. Nothing is moved
-- until barrier() is called.
--
--   src, dst - pointers stored as-is in the source/dest queues
--   len      - byte count; recorded rounded up to whole 32-byte units
--
-- Raises an error when the queue is full. The capacity check runs
-- before the queue is modified, so a failed call leaves the queue
-- untouched; previously the check ran after the increment, which left
-- `n` at 10240 and let any later call index past the end of the arrays.
-- One slot is always kept free because barrier() writes an extra
-- entry at index n.
function copy (src, dst, len)
   assert(n + 1 < 10240, "blit: copy queue is full, call barrier() first")
   source[n] = src
   dest[n] = dst
   length[n] = bit.rshift(len+31, 5)
   n = n + 1
end
-- Emit the machine code for the batch barrier routine.
--
--   Dst - DynASM state that the instructions are emitted into
--
-- The generated function has the prototype void(*)(int); its argument
-- is the number of queued copies. It walks the source/dest pointer
-- queues two entries at a time and moves 128 bytes per entry (the
-- length queue is not consulted). When the count is odd the routine
-- also processes entry [count], so the caller must store a valid
-- entry there.
--
-- Ordering within a pair: all eight 32-byte loads are issued first,
-- then the first copy is stored completely, then the second copy.
-- Overlapping destinations therefore end up with the data of the later
-- copy, exactly as if the two copies had run one after the other. The
-- previous version interleaved the stores of both copies word by word,
-- so a later word of the first copy could overwrite an earlier word of
-- the second one.
--
-- NOTE: the second copy of a pair is still loaded before the first one
-- is stored, so a copy must not read from the destination of the copy
-- queued immediately before it.
function gen_barrier (Dst)
   -- r12-r15 are callee-saved and must be preserved.
   | push r12; push r13; push r14; push r15
   | xor rax, rax
   -- The count is a 32-bit int in edi; the upper half of rdi is not
   -- guaranteed to be clear. Writing edx zero-extends into rdx.
   | mov edx, edi
   | mov64 rsi, source
   | mov64 rdi, dest
   |1:
   | cmp rax, rdx
   | jge >9
   -- Fetch the pointers of two queue entries: A = (r12 -> r13) and
   -- B = (r14 -> r15).
   | mov r12, [rsi+rax*8]
   | mov r13, [rdi+rax*8]
   | mov r14, [rsi+rax*8+8]
   | mov r15, [rdi+rax*8+8]
   -- Load 128 bytes of A into ymm0-3 and 128 bytes of B into ymm4-7.
   -- The loads are interleaved so that both streams are in flight.
   | vmovdqu ymm0, [r12]
   | vmovdqu ymm4, [r14]
   | vmovdqu ymm1, [r12+32]
   | vmovdqu ymm5, [r14+32]
   | vmovdqu ymm2, [r12+64]
   | vmovdqu ymm6, [r14+64]
   | vmovdqu ymm3, [r12+96]
   | vmovdqu ymm7, [r14+96]
   -- Store A completely before B to keep queue order for overlapping
   -- destinations.
   | vmovdqu [r13], ymm0
   | vmovdqu [r13+32], ymm1
   | vmovdqu [r13+64], ymm2
   | vmovdqu [r13+96], ymm3
   | vmovdqu [r15], ymm4
   | vmovdqu [r15+32], ymm5
   | vmovdqu [r15+64], ymm6
   | vmovdqu [r15+96], ymm7
   | add rax, 2
   | jmp <1
   |9:
   | pop r15; pop r14; pop r13; pop r12
   | ret
end
-- Machine-code routine that performs the queued copies; it is called
-- with the number of queued entries.
asmbarrier = assemble("barrier", "void(*)(int)", gen_barrier)

-- Execute every copy queued since the previous barrier, then empty
-- the queue.
function barrier ()
   local pending = n
   if pending > 0 then
      -- The routine handles entries in pairs, so append one harmless
      -- scratch-to-scratch entry behind the last real one.
      source[pending], dest[pending] = scratch, scratch
      asmbarrier(pending)
   end
   n = 0
end
end | |
-- Self-test: apply the same random sequence of 128-byte copies to two
-- identical buffers, one through copy()/barrier() and one directly
-- with ffi.copy, and check that both buffers end up identical.
--
-- Sources are taken from the upper half of the buffer (offset 5120 and
-- up) and destinations from the lower part, so a source never overlaps
-- a destination; destinations may overlap each other.
--
-- Raises an error naming the first differing byte. The 32-byte window
-- containing that byte is hexdumped from both buffers first (the dump
-- previously always showed bytes 0..31, wherever the mismatch was).
function selftest ()
   print("selftest: blit")
   local membytes = 10240
   local memx = ffi.new("char[?]", membytes)
   local memy = ffi.new("char[?]", membytes)
   for round = 0, 10 do
      print("loop "..round)
      -- Initialize memx and memy with identical randomly chosen bytes
      for pos = 0, membytes-1 do
         local value = math.random(0, 255)
         memx[pos] = value
         memy[pos] = value
      end
      -- Perform some random copies
      for _ = 0, math.random(1000) do
         local srcoffset = math.random(1000)
         local dstoffset = math.random(1000)
         local length = 128
         copy(memx+srcoffset+5120, memx+dstoffset, length)
         ffi.copy(memy+dstoffset, memy+srcoffset+5120, length)
      end
      -- Execute deferred copies
      barrier()
      -- Check for same contents
      for pos = 0, membytes-1 do
         if memx[pos] ~= memy[pos] then
            -- Dump the aligned 32-byte window that holds the mismatch.
            local start = pos - pos % 32
            local count = math.min(32, membytes - start)
            local hexdump = require("core.lib").hexdump
            print("bytes "..start..".."..(start + count - 1).." (blit, then reference):")
            print(hexdump(ffi.string(memx + start, count)))
            print(hexdump(ffi.string(memy + start, count)))
            error("mismatch at byte " .. pos)
         end
      end
   end
   print("selftest: ok")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment