Skip to content

Instantly share code, notes, and snippets.

@Yawning
Created August 14, 2024 09:38
Show Gist options
  • Save Yawning/2f7826f10a5d7af813d9b9b7121eb117 to your computer and use it in GitHub Desktop.
Save Yawning/2f7826f10a5d7af813d9b9b7121eb117 to your computer and use it in GitHub Desktop.
index_byte: Alternative approach
// Constants for the 256-bit scan path, compiled in only when targeting
// amd64 with AVX2 available at compile time.
when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
// Per-lane indices 0..31.  For lanes that matched, the index is
// selected; reduce_min over the selection then yields the lowest
// (first) matching lane.
@(private)
SCANNER_INDICES_256 : simd.u8x32 : {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
}
// Sentinel for non-matching lanes: 0xff is greater than any valid
// lane index (max 31), so it never wins the reduce_min.
@(private)
SCANNER_SENTINEL_256: simd.u8x32 : u8(0xff)
}
// Per-lane indices 0..15 for the 128-bit scan path.  For lanes that
// matched, the index is selected; reduce_min over the selection then
// yields the lowest (first) matching lane.
@(private)
SCANNER_INDICES_128 : simd.u8x16 : {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
}
// Sentinel for non-matching lanes: 0xff is greater than any valid
// lane index (max 15), so it never wins the reduce_min.
@(private)
SCANNER_SENTINEL: simd.u8x16 : u8(0xff)
/*
Scan a slice of bytes for a specific byte.
This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
- c: The byte to search for.
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
i, l := 0, len(s)
// Don't bother with SIMD if the slice is smaller than a SIMD register.
if l < 16 {
for /**/; i < l; i += 1 {
if s[i] == c {
return i
}
}
return -1
}
// Splat `c` across all 16 lanes.  lanes_eq against this produces an
// all-ones byte in every lane equal to `c` and zero elsewhere.
scanner: simd.u8x16 = c
when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
if l > 64 {
// Widen the scanner to 256 bits by duplicating the low 16 lanes.
scanner_256 := simd.swizzle(
scanner,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
)
// Scan 128-byte chunks, using 256-bit SIMD.
for nr_blocks := l / 128; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+32:]))
s2 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+64:]))
s3 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+96:]))
c0 := simd.lanes_eq(s0, scanner_256)
c1 := simd.lanes_eq(s1, scanner_256)
c2 := simd.lanes_eq(s2, scanner_256)
c3 := simd.lanes_eq(s3, scanner_256)
// reduce_or is non-zero iff any lane in that vector matched.
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
m2 := simd.reduce_or(c2)
m3 := simd.reduce_or(c3)
if m0 | m1 | m2 | m3 > 0 {
if m0 > 0 {
// Select the lane index for matching lanes (0xff sentinel for
// the rest); the minimum is the first match within the vector.
sel := simd.select(c0, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
if m2 > 0 {
sel := simd.select(c2, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 64 + int(off)
}
if m3 > 0 {
sel := simd.select(c3, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 96 + int(off)
}
}
i += 128
}
// Scan 64-byte chunks, using 256-bit SIMD.
// After the 128-byte loop at most 127 bytes remain, so this runs
// at most once.
for nr_blocks := (l - i) / 64; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+32:]))
c0 := simd.lanes_eq(s0, scanner_256)
c1 := simd.lanes_eq(s1, scanner_256)
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
if m0 | m1 > 0 {
if m0 > 0 {
sel := simd.select(c0, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
}
i += 64
}
}
} else {
// Note: ARM can probably do 128 or 256-bytes at a time.
// Scan 64-byte chunks, using 128-bit SIMD.
for nr_blocks := (l - i) / 64; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+16:]))
s2 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+32:]))
s3 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+48:]))
c0 := simd.lanes_eq(s0, scanner)
c1 := simd.lanes_eq(s1, scanner)
c2 := simd.lanes_eq(s2, scanner)
c3 := simd.lanes_eq(s3, scanner)
// reduce_or is non-zero iff any lane in that vector matched.
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
m2 := simd.reduce_or(c2)
m3 := simd.reduce_or(c3)
if m0 | m1 | m2 | m3 > 0 {
if m0 > 0 {
// Select the lane index for matching lanes (0xff sentinel for
// the rest); the minimum is the first match within the vector.
sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 16 + int(off)
}
if m2 > 0 {
sel := simd.select(c2, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
if m3 > 0 {
sel := simd.select(c3, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 48 + int(off)
}
}
i += 64
}
}
// Scan 16-byte chunks, using 128-bit SIMD.
// Common tail for both compile-time paths; fewer than 64 bytes remain
// here on the non-AVX2 path, fewer than 64 on the AVX2 path as well.
for nr_blocks := (l - i) / 16; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i:]))
c0 := simd.lanes_eq(s0, scanner)
if simd.reduce_or(c0) > 0 {
sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + int(off)
}
i += 16
}
// Iterate as a scalar over the remaining non-SIMD register sized
// portion.
for /**/; i < l; i += 1 {
if s[i] == c {
return i
}
}
return -1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment