Last active
May 4, 2020 03:07
-
-
Save alex/40ef49ae0e56343cdc610b4c1240f11c to your computer and use it in GitHub Desktop.
vectorized contains4 implementation in rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package] | |
name = "f" | |
version = "0.1.0" | |
authors = ["Alex Gaynor <alex.gaynor@gmail.com>"] | |
edition = "2018" | |
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | |
[dependencies] | |
packed_simd = "0.3" | |
[dev-dependencies] | |
quickcheck = "0.9" | |
quickcheck_macros = "0.9" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(test)] | |
#[cfg(test)] | |
extern crate quickcheck; | |
#[cfg(test)] | |
#[macro_use(quickcheck)] | |
extern crate quickcheck_macros; | |
pub fn contains4(data: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool { | |
let (prefix, body, suffix) = unsafe { | |
data.align_to::<packed_simd::u8x16>() | |
}; | |
for &el in prefix { | |
if el == a || el == b || el == c || el == d { | |
return true; | |
} | |
} | |
let a_vec = packed_simd::u8x16::splat(a); | |
let b_vec = packed_simd::u8x16::splat(b); | |
let c_vec = packed_simd::u8x16::splat(c); | |
let d_vec = packed_simd::u8x16::splat(d); | |
for &data_vec in body { | |
// We now have a set of bool vectors whether any byte matches each of | |
// a/b/c/d | |
let a_matches = data_vec.eq(a_vec); | |
let b_matches = data_vec.eq(b_vec); | |
let c_matches = data_vec.eq(c_vec); | |
let d_matches = data_vec.eq(d_vec); | |
// Bit-or of these together "did any byte match at this position" | |
if (a_matches | b_matches | c_matches | d_matches).any() { | |
return true; | |
} | |
} | |
// Check the remaining things non-vectorized | |
for &el in suffix { | |
if el == a || el == b || el == c || el == d { | |
return true; | |
} | |
} | |
false | |
} | |
#[cfg(test)] | |
mod tests { | |
use super::contains4; | |
extern crate test; | |
#[quickcheck] | |
fn contains4_vs_naive(data: Vec<u8>, a: u8, b: u8, c: u8, d: u8) { | |
let expected = data | |
.iter() | |
.any(|&el| el == a || el == b || el == c || el == d); | |
assert_eq!(contains4(&data, a, b, c, d), expected); | |
} | |
#[bench] | |
fn bench_naive(b: &mut test::Bencher) { | |
b.iter(|| { | |
let data = test::black_box(b"fdsjalkfadlkfjdsafjl;dsafjdsaiofewjoa;fjdsia;fds;alfjdsioafjiworfewjaifdsfdsafisa;fdsfsidj;offdsk;l"); | |
test::black_box(data.iter().any(|&el| el == b'm' || el == b'n' || el == b'p' || el == b'q')); | |
}); | |
} | |
#[bench] | |
fn bench_contains4(b: &mut test::Bencher) { | |
b.iter(|| { | |
let data = test::black_box(b"fdsjalkfadlkfjdsafjl;dsafjdsaiofewjoa;fjdsia;fds;alfjdsioafjiworfewjaifdsfdsafisa;fdsfsidj;offdsk;l"); | |
test::black_box(contains4(data, b'm', b'n', b'p', b'q')); | |
}); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.text | |
.file "f.6oqy8ea2-cgu.0" | |
.section .text._ZN1f9contains417h8b9a88902e2bd619E,"ax",@progbits | |
.globl _ZN1f9contains417h8b9a88902e2bd619E | |
.p2align 4, 0x90 | |
.type _ZN1f9contains417h8b9a88902e2bd619E,@function | |
_ZN1f9contains417h8b9a88902e2bd619E: | |
.cfi_startproc | |
pushq %r15 | |
.cfi_def_cfa_offset 16 | |
pushq %r14 | |
.cfi_def_cfa_offset 24 | |
pushq %r12 | |
.cfi_def_cfa_offset 32 | |
pushq %rbx | |
.cfi_def_cfa_offset 40 | |
.cfi_offset %rbx, -40 | |
.cfi_offset %r12, -32 | |
.cfi_offset %r14, -24 | |
.cfi_offset %r15, -16 | |
movl %edi, %ebx | |
andl $15, %ebx | |
movl $16, %eax | |
subq %rbx, %rax | |
testq %rbx, %rbx | |
cmoveq %rbx, %rax | |
cmpq %rsi, %rax | |
jbe .LBB0_2 | |
leaq .L__unnamed_1(%rip), %r10 | |
xorl %r15d, %r15d | |
movq %r10, %r14 | |
xorl %r11d, %r11d | |
jmp .LBB0_3 | |
.LBB0_2: | |
leaq (%rdi,%rax), %r14 | |
subq %rax, %rsi | |
movq %rsi, %r15 | |
shrq $4, %r15 | |
movl %esi, %r11d | |
andl $15, %r11d | |
subq %r11, %rsi | |
addq %r14, %rsi | |
movq %rsi, %r10 | |
movq %rax, %rsi | |
.LBB0_3: | |
xorl %r12d, %r12d | |
.p2align 4, 0x90 | |
.LBB0_4: | |
cmpq %r12, %rsi | |
je .LBB0_5 | |
movzbl (%rdi,%r12), %ebx | |
movb $1, %al | |
cmpb %r9b, %bl | |
je .LBB0_14 | |
cmpb %r8b, %bl | |
je .LBB0_14 | |
cmpb %dl, %bl | |
je .LBB0_14 | |
addq $1, %r12 | |
cmpb %cl, %bl | |
jne .LBB0_4 | |
jmp .LBB0_14 | |
.LBB0_5: | |
movzbl %dl, %eax | |
movd %eax, %xmm0 | |
punpcklbw %xmm0, %xmm0 | |
pshuflw $224, %xmm0, %xmm0 | |
pshufd $0, %xmm0, %xmm0 | |
movzbl %cl, %eax | |
movd %eax, %xmm1 | |
punpcklbw %xmm1, %xmm1 | |
pshuflw $224, %xmm1, %xmm1 | |
pshufd $0, %xmm1, %xmm1 | |
movzbl %r8b, %eax | |
movd %eax, %xmm2 | |
punpcklbw %xmm2, %xmm2 | |
pshuflw $224, %xmm2, %xmm2 | |
pshufd $0, %xmm2, %xmm2 | |
movzbl %r9b, %eax | |
movd %eax, %xmm3 | |
punpcklbw %xmm3, %xmm3 | |
pshuflw $224, %xmm3, %xmm3 | |
pshufd $0, %xmm3, %xmm3 | |
shlq $4, %r15 | |
xorl %eax, %eax | |
.p2align 4, 0x90 | |
.LBB0_6: | |
cmpq %rax, %r15 | |
je .LBB0_7 | |
movdqa (%r14,%rax), %xmm4 | |
movdqa %xmm4, %xmm5 | |
pcmpeqb %xmm0, %xmm5 | |
movdqa %xmm4, %xmm6 | |
pcmpeqb %xmm1, %xmm6 | |
por %xmm5, %xmm6 | |
movdqa %xmm4, %xmm5 | |
pcmpeqb %xmm2, %xmm5 | |
pcmpeqb %xmm3, %xmm4 | |
por %xmm5, %xmm4 | |
por %xmm6, %xmm4 | |
pmovmskb %xmm4, %esi | |
addq $16, %rax | |
testw %si, %si | |
je .LBB0_6 | |
movb $1, %al | |
jmp .LBB0_14 | |
.LBB0_7: | |
xorl %esi, %esi | |
.p2align 4, 0x90 | |
.LBB0_8: | |
cmpq %rsi, %r11 | |
je .LBB0_9 | |
leaq (%r10,%rsi), %rax | |
movzbl (%rax), %ebx | |
movb $1, %al | |
cmpb %r9b, %bl | |
je .LBB0_14 | |
cmpb %r8b, %bl | |
je .LBB0_14 | |
cmpb %dl, %bl | |
je .LBB0_14 | |
addq $1, %rsi | |
cmpb %cl, %bl | |
jne .LBB0_8 | |
jmp .LBB0_14 | |
.LBB0_9: | |
xorl %eax, %eax | |
.LBB0_14: | |
popq %rbx | |
.cfi_def_cfa_offset 32 | |
popq %r12 | |
.cfi_def_cfa_offset 24 | |
popq %r14 | |
.cfi_def_cfa_offset 16 | |
popq %r15 | |
.cfi_def_cfa_offset 8 | |
retq | |
.Lfunc_end0: | |
.size _ZN1f9contains417h8b9a88902e2bd619E, .Lfunc_end0-_ZN1f9contains417h8b9a88902e2bd619E | |
.cfi_endproc | |
.type .L__unnamed_1,@object | |
.section .rodata..L__unnamed_1,"a",@progbits | |
.p2align 4 | |
.L__unnamed_1: | |
.size .L__unnamed_1, 0 | |
.section ".note.GNU-stack","",@progbits |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment