@lukego
Last active January 17, 2016 06:10
blit.dasl: assembler routine for memory copies
This is initial work-in-progress code for exploring an efficient design for a "blitter" written in assembler.

The idea is to take a large number of memory copy operations, for example 100, sort them into buckets based on length (in cache lines), and then execute several of them in parallel. This should pay off for copies that are bounded by the memory subsystem (e.g. L3 cache latency) and don't reach the maximum throughput (~32 bytes/cycle) when executed serially with memcpy.

This code is hard-coded for 128-byte copies in order to test whether the idea is effective before generalizing it.

In initial tests I am seeing a large speedup (~30%) on the end-to-end packet forwarding benchmark with a DPDK VM. This suggests to me that the idea is worth pursuing, but there is still a risk that it won't pan out for some reason.
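
For context, a minimal usage sketch of the intended calling pattern (hypothetical caller code, not part of the gist; the module name and packet buffers are assumptions): queue a batch of copies with copy() and then execute them all with barrier().

   local blit = require("blit")  -- assumed module name

   -- Queue many independent 128-byte copies without executing them yet...
   for i = 1, 100 do
      blit.copy(rx_buffers[i], tx_buffers[i], 128)  -- hypothetical buffers
   end

   -- ...then execute the whole batch with the generated assembler routine.
   blit.barrier()
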
-- blit.dasl - offload asynchronous memory operations
module(..., package.seeall)

local dasm = require("dasm")
local ffi = require("ffi")
local bit = require("bit")

|.arch x64
|.actionlist actions

local gen = {}

-- Table keeping machine code alive to the GC.
local anchor = {}

-- Set to true to dump the generated machine code.
debug = true

-- Utility: assemble code and optionally dump disassembly.
function assemble (name, prototype, generator)
   local Dst = dasm.new(actions)
   generator(Dst)
   local mcode, size = Dst:build()
   table.insert(anchor, mcode)
   if debug then
      print("mcode dump: "..name)
      dasm.dump(mcode, size)
   end
   return ffi.cast(prototype, mcode)
end
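
-- Copy mode:
--   'simple' performs each copy immediately with ffi.copy,
--   'skip'   discards copies (useful for measuring overhead in isolation),
--   'batch'  defers copies until barrier() and runs them via generated code.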
mode = 'batch'

if mode == 'simple' then
   function copy (src, dst, size)
      ffi.copy(dst, src, size)
   end
   function barrier ()
   end
end

if mode == 'skip' then
   function copy (src, dst, size)
   end
   function barrier ()
   end
end
if mode == 'batch' then
   -- Queues of pending copy operations. length is recorded in 32-byte
   -- units but is not yet used: the generated code is hard-coded for
   -- 128-byte copies.
   local source = ffi.new("void*[10240]")
   local dest = ffi.new("void*[10240]")
   local length = ffi.new("int[10240]")
   local n = 0
   local scratch = ffi.new("char[10240]")

   -- Queue a copy to be executed at the next barrier().
   function copy (src, dst, len)
      source[n] = src
      dest[n] = dst
      length[n] = bit.rshift(len+31, 5) -- length rounded up to 32-byte units
      n = n + 1
      assert(n < 10240)
   end

   -- Generate the routine that executes all queued copies.
   -- The copy count arrives in rdi (first integer argument); rax indexes
   -- the source/dest pointer arrays.
   function gen_barrier (Dst)
      | push r12; push r13; push r14; push r15
      | xor rax, rax
      | mov rdx, rdi
      | mov64 rsi, source
      | mov64 rdi, dest
      |1:
      | cmp rax, rdx
      | jge >9
      -- unrolled 128-byte copy, two in parallel:
      -- r12/r13 = src/dst of copy rax, r14/r15 = src/dst of copy rax+1
      | mov r12, [rsi+rax*8]
      | mov r13, [rdi+rax*8]
      | mov r14, [rsi+rax*8+8]
      | mov r15, [rdi+rax*8+8]
      for w = 0, 3 do
         | vmovdqu ymm0, [r12+w*32]
         | vmovdqu ymm1, [r14+w*32]
         | vmovdqu [r13+w*32], ymm0
         | vmovdqu [r15+w*32], ymm1
      end
      | add rax, 2 -- XXX selftest passes by changing 2 to 1
      | jmp <1
      |9:
      | pop r15; pop r14; pop r13; pop r12
      | ret
   end

   asmbarrier = assemble("barrier", "void(*)(int)", gen_barrier)

   -- Execute all queued copies and reset the queue. Padding the arrays
   -- with an extra scratch-to-scratch entry at index n lets the
   -- two-at-a-time loop safely read one entry past the end when n is odd.
   function barrier ()
      if n > 0 then
         source[n] = scratch
         dest[n] = scratch
         asmbarrier(n)
      end
      n = 0
   end
end
function selftest ()
   print("selftest: blit")
   local membytes = 10240
   local memx = ffi.new("char[?]", membytes)
   local memy = ffi.new("char[?]", membytes)
   for i = 0, 10 do
      print("loop "..i)
      -- Initialize memx and memy with identical randomly chosen values
      for i = 0, membytes-1 do
         local n = math.random(256)
         memx[i] = n
         memy[i] = n
      end
      -- Perform some random copies: memx via the blitter (deferred),
      -- memy via ffi.copy as the reference result
      for i = 0, math.random(1000) do
         local srcoffset = math.random(1000)
         local dstoffset = math.random(1000)
         -- srcoffset = srcoffset - (srcoffset%32)
         -- dstoffset = dstoffset - (dstoffset%32)
         local length = 128
         copy(memx+srcoffset+5120, memx+dstoffset, length)
         ffi.copy(memy+dstoffset, memy+srcoffset+5120, length)
      end
      -- Execute deferred copies
      barrier()
      -- Check for same contents
      for i = 0, membytes-1 do
         if memx[i] ~= memy[i] then
            print(require("core.lib").hexdump(ffi.string(memx, 32)))
            print(require("core.lib").hexdump(ffi.string(memy, 32)))
            error("mismatch at byte " .. i)
         end
      end
   end
   print("selftest: ok")
end