Skip to content

Instantly share code, notes, and snippets.

@alex
Last active May 4, 2020 03:07
Show Gist options
  • Save alex/40ef49ae0e56343cdc610b4c1240f11c to your computer and use it in GitHub Desktop.
Save alex/40ef49ae0e56343cdc610b4c1240f11c to your computer and use it in GitHub Desktop.
vectorized contains4 implementation in rust
[package]
name = "f"
version = "0.1.0"
authors = ["Alex Gaynor <alex.gaynor@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
packed_simd = "0.3"
[dev-dependencies]
quickcheck = "0.9"
quickcheck_macros = "0.9"
#![feature(test)]
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
#[macro_use(quickcheck)]
extern crate quickcheck_macros;
pub fn contains4(data: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool {
let (prefix, body, suffix) = unsafe {
data.align_to::<packed_simd::u8x16>()
};
for &el in prefix {
if el == a || el == b || el == c || el == d {
return true;
}
}
let a_vec = packed_simd::u8x16::splat(a);
let b_vec = packed_simd::u8x16::splat(b);
let c_vec = packed_simd::u8x16::splat(c);
let d_vec = packed_simd::u8x16::splat(d);
for &data_vec in body {
// We now have a set of bool vectors whether any byte matches each of
// a/b/c/d
let a_matches = data_vec.eq(a_vec);
let b_matches = data_vec.eq(b_vec);
let c_matches = data_vec.eq(c_vec);
let d_matches = data_vec.eq(d_vec);
// Bit-or of these together "did any byte match at this position"
if (a_matches | b_matches | c_matches | d_matches).any() {
return true;
}
}
// Check the remaining things non-vectorized
for &el in suffix {
if el == a || el == b || el == c || el == d {
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::contains4;
extern crate test;
#[quickcheck]
fn contains4_vs_naive(data: Vec<u8>, a: u8, b: u8, c: u8, d: u8) {
let expected = data
.iter()
.any(|&el| el == a || el == b || el == c || el == d);
assert_eq!(contains4(&data, a, b, c, d), expected);
}
#[bench]
fn bench_naive(b: &mut test::Bencher) {
b.iter(|| {
let data = test::black_box(b"fdsjalkfadlkfjdsafjl;dsafjdsaiofewjoa;fjdsia;fds;alfjdsioafjiworfewjaifdsfdsafisa;fdsfsidj;offdsk;l");
test::black_box(data.iter().any(|&el| el == b'm' || el == b'n' || el == b'p' || el == b'q'));
});
}
#[bench]
fn bench_contains4(b: &mut test::Bencher) {
b.iter(|| {
let data = test::black_box(b"fdsjalkfadlkfjdsafjl;dsafjdsaiofewjoa;fjdsia;fds;alfjdsioafjiworfewjaifdsfdsafisa;fdsfsidj;offdsk;l");
test::black_box(contains4(data, b'm', b'n', b'p', b'q'));
});
}
}
.text
.file "f.6oqy8ea2-cgu.0"
.section .text._ZN1f9contains417h8b9a88902e2bd619E,"ax",@progbits
.globl _ZN1f9contains417h8b9a88902e2bd619E
.p2align 4, 0x90
.type _ZN1f9contains417h8b9a88902e2bd619E,@function
_ZN1f9contains417h8b9a88902e2bd619E:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
pushq %r14
.cfi_def_cfa_offset 24
pushq %r12
.cfi_def_cfa_offset 32
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset %rbx, -40
.cfi_offset %r12, -32
.cfi_offset %r14, -24
.cfi_offset %r15, -16
movl %edi, %ebx
andl $15, %ebx
movl $16, %eax
subq %rbx, %rax
testq %rbx, %rbx
cmoveq %rbx, %rax
cmpq %rsi, %rax
jbe .LBB0_2
leaq .L__unnamed_1(%rip), %r10
xorl %r15d, %r15d
movq %r10, %r14
xorl %r11d, %r11d
jmp .LBB0_3
.LBB0_2:
leaq (%rdi,%rax), %r14
subq %rax, %rsi
movq %rsi, %r15
shrq $4, %r15
movl %esi, %r11d
andl $15, %r11d
subq %r11, %rsi
addq %r14, %rsi
movq %rsi, %r10
movq %rax, %rsi
.LBB0_3:
xorl %r12d, %r12d
.p2align 4, 0x90
.LBB0_4:
cmpq %r12, %rsi
je .LBB0_5
movzbl (%rdi,%r12), %ebx
movb $1, %al
cmpb %r9b, %bl
je .LBB0_14
cmpb %r8b, %bl
je .LBB0_14
cmpb %dl, %bl
je .LBB0_14
addq $1, %r12
cmpb %cl, %bl
jne .LBB0_4
jmp .LBB0_14
.LBB0_5:
movzbl %dl, %eax
movd %eax, %xmm0
punpcklbw %xmm0, %xmm0
pshuflw $224, %xmm0, %xmm0
pshufd $0, %xmm0, %xmm0
movzbl %cl, %eax
movd %eax, %xmm1
punpcklbw %xmm1, %xmm1
pshuflw $224, %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
movzbl %r8b, %eax
movd %eax, %xmm2
punpcklbw %xmm2, %xmm2
pshuflw $224, %xmm2, %xmm2
pshufd $0, %xmm2, %xmm2
movzbl %r9b, %eax
movd %eax, %xmm3
punpcklbw %xmm3, %xmm3
pshuflw $224, %xmm3, %xmm3
pshufd $0, %xmm3, %xmm3
shlq $4, %r15
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_6:
cmpq %rax, %r15
je .LBB0_7
movdqa (%r14,%rax), %xmm4
movdqa %xmm4, %xmm5
pcmpeqb %xmm0, %xmm5
movdqa %xmm4, %xmm6
pcmpeqb %xmm1, %xmm6
por %xmm5, %xmm6
movdqa %xmm4, %xmm5
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm4
por %xmm5, %xmm4
por %xmm6, %xmm4
pmovmskb %xmm4, %esi
addq $16, %rax
testw %si, %si
je .LBB0_6
movb $1, %al
jmp .LBB0_14
.LBB0_7:
xorl %esi, %esi
.p2align 4, 0x90
.LBB0_8:
cmpq %rsi, %r11
je .LBB0_9
leaq (%r10,%rsi), %rax
movzbl (%rax), %ebx
movb $1, %al
cmpb %r9b, %bl
je .LBB0_14
cmpb %r8b, %bl
je .LBB0_14
cmpb %dl, %bl
je .LBB0_14
addq $1, %rsi
cmpb %cl, %bl
jne .LBB0_8
jmp .LBB0_14
.LBB0_9:
xorl %eax, %eax
.LBB0_14:
popq %rbx
.cfi_def_cfa_offset 32
popq %r12
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size _ZN1f9contains417h8b9a88902e2bd619E, .Lfunc_end0-_ZN1f9contains417h8b9a88902e2bd619E
.cfi_endproc
.type .L__unnamed_1,@object
.section .rodata..L__unnamed_1,"a",@progbits
.p2align 4
.L__unnamed_1:
.size .L__unnamed_1, 0
.section ".note.GNU-stack","",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment