Skip to content

Instantly share code, notes, and snippets.

@nelhage
Forked from alex/Cargo.toml
Last active May 4, 2020 03:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nelhage/57c0bb3e02fefa502f8b0642e373b9dc to your computer and use it in GitHub Desktop.
Save nelhage/57c0bb3e02fefa502f8b0642e373b9dc to your computer and use it in GitHub Desktop.
vectorized contains4 implementation in rust
[package]
name = "f"
version = "0.1.0"
authors = ["Alex Gaynor <alex.gaynor@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
packed_simd = "0.3"
rand = {version = "0.7.3", features = ["small_rng"]}
[dev-dependencies]
quickcheck = "0.9"
quickcheck_macros = "0.9"
#![feature(test)]
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
#[macro_use(quickcheck)]
extern crate quickcheck_macros;
use core::arch::x86_64;
#[target_feature(enable = "sse4.2")]
unsafe fn _contains4_pcmp(data: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool {
let (prefix, body, suffix) = data.align_to::<x86_64::__m128i>();
for &el in prefix {
if el == a || el == b || el == c || el == d {
return true;
}
}
let needle = x86_64::_mm_set_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, a as i8, b as i8, c as i8, d as i8,
);
for &data_vec in body {
if x86_64::_mm_cmpistrc(data_vec, needle, 0) != 0 {
return true;
}
}
// Check the remaining things non-vectorized
for &el in suffix {
if el == a || el == b || el == c || el == d {
return true;
}
}
false
}
pub fn contains4_pcmp(data: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool {
unsafe { _contains4_pcmp(data, a, b, c, d) }
}
pub fn contains4(data: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool {
let (prefix, body, suffix) = unsafe { data.align_to::<packed_simd::u8x16>() };
for &el in prefix {
if el == a || el == b || el == c || el == d {
return true;
}
}
let a_vec = packed_simd::u8x16::splat(a);
let b_vec = packed_simd::u8x16::splat(b);
let c_vec = packed_simd::u8x16::splat(c);
let d_vec = packed_simd::u8x16::splat(d);
for &data_vec in body {
// We now have a set of bool vectors whether any byte matches each of
// a/b/c/d
let a_matches = data_vec.eq(a_vec);
let b_matches = data_vec.eq(b_vec);
let c_matches = data_vec.eq(c_vec);
let d_matches = data_vec.eq(d_vec);
// Bit-or of these together "did any byte match at this position"
if (a_matches | b_matches | c_matches | d_matches).any() {
return true;
}
}
// Check the remaining things non-vectorized
for &el in suffix {
if el == a || el == b || el == c || el == d {
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::*;
extern crate rand;
extern crate test;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
#[quickcheck]
fn contains4_vs_naive(data: Vec<u8>, a: u8, b: u8, c: u8, d: u8) {
let expected = data
.iter()
.any(|&el| el == a || el == b || el == c || el == d);
assert_eq!(contains4(&data, a, b, c, d), expected);
}
#[quickcheck]
fn contains4_pcmp_vs_naive(data: Vec<u8>, a: u8, b: u8, c: u8, d: u8) {
let expected = data
.iter()
.any(|&el| el == a || el == b || el == c || el == d);
if data.contains(&0) || a == 0 || b == 0 || c == 0 || d == 0 {
// the PCMPI* variant can't handle NULL bytes
return;
}
assert_eq!(contains4_pcmp(&data, a, b, c, d), expected);
}
static HAYSTACK_SHORT: &'static [u8] = b"fdsjalkfadlkfjdsafjl;dsafjdsaiofewjoa;fjdsia;fds;alfjdsioafjiworfewjaifdsfdsafisa;fdsfsidj;offdsk;l";
#[bench]
fn bench_naive(b: &mut test::Bencher) {
b.iter(|| {
let data = test::black_box(HAYSTACK_SHORT);
test::black_box(
data.iter()
.any(|&el| el == b'm' || el == b'n' || el == b'p' || el == b'q'),
);
});
}
#[bench]
fn bench_contains4(b: &mut test::Bencher) {
b.iter(|| {
let data = test::black_box(HAYSTACK_SHORT);
test::black_box(contains4(data, b'm', b'n', b'p', b'q'));
});
}
#[bench]
fn bench_contains4_pcmp(b: &mut test::Bencher) {
b.iter(|| {
let data = test::black_box(HAYSTACK_SHORT);
test::black_box(contains4_pcmp(data, b'm', b'n', b'p', b'q'));
});
}
const LONG_HAYSTACK_LEN: usize = 1 << 20;
fn long_haystack() -> Vec<u8> {
let mut ret = Vec::with_capacity(LONG_HAYSTACK_LEN);
let mut rng = SmallRng::seed_from_u64(1);
while ret.len() < LONG_HAYSTACK_LEN {
let b = rng.gen::<u8>();
if b == 0 || b == b'm' || b == b'n' || b == b'p' || b == b'q' {
continue;
}
ret.push(b);
}
ret
}
#[bench]
fn bench_naive_long(b: &mut test::Bencher) {
let data = test::black_box(long_haystack());
b.iter(|| {
test::black_box(
data.iter()
.any(|&el| el == b'm' || el == b'n' || el == b'p' || el == b'q'),
);
});
}
#[bench]
fn bench_contains4_long(b: &mut test::Bencher) {
let data = test::black_box(long_haystack());
b.iter(|| {
test::black_box(contains4(&data, b'm', b'n', b'p', b'q'));
});
}
#[bench]
fn bench_contains4_pcmp_long(b: &mut test::Bencher) {
let data = test::black_box(long_haystack());
b.iter(|| {
test::black_box(contains4_pcmp(&data, b'm', b'n', b'p', b'q'));
});
}
}
.text
.file "f.6oqy8ea2-cgu.0"
.section .text._ZN1f9contains417h8b9a88902e2bd619E,"ax",@progbits
.globl _ZN1f9contains417h8b9a88902e2bd619E
.p2align 4, 0x90
.type _ZN1f9contains417h8b9a88902e2bd619E,@function
_ZN1f9contains417h8b9a88902e2bd619E:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
pushq %r14
.cfi_def_cfa_offset 24
pushq %r12
.cfi_def_cfa_offset 32
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset %rbx, -40
.cfi_offset %r12, -32
.cfi_offset %r14, -24
.cfi_offset %r15, -16
movl %edi, %ebx
andl $15, %ebx
movl $16, %eax
subq %rbx, %rax
testq %rbx, %rbx
cmoveq %rbx, %rax
cmpq %rsi, %rax
jbe .LBB0_2
leaq .L__unnamed_1(%rip), %r10
xorl %r15d, %r15d
movq %r10, %r14
xorl %r11d, %r11d
jmp .LBB0_3
.LBB0_2:
leaq (%rdi,%rax), %r14
subq %rax, %rsi
movq %rsi, %r15
shrq $4, %r15
movl %esi, %r11d
andl $15, %r11d
subq %r11, %rsi
addq %r14, %rsi
movq %rsi, %r10
movq %rax, %rsi
.LBB0_3:
xorl %r12d, %r12d
.p2align 4, 0x90
.LBB0_4:
cmpq %r12, %rsi
je .LBB0_5
movzbl (%rdi,%r12), %ebx
movb $1, %al
cmpb %r9b, %bl
je .LBB0_14
cmpb %r8b, %bl
je .LBB0_14
cmpb %dl, %bl
je .LBB0_14
addq $1, %r12
cmpb %cl, %bl
jne .LBB0_4
jmp .LBB0_14
.LBB0_5:
movzbl %dl, %eax
movd %eax, %xmm0
punpcklbw %xmm0, %xmm0
pshuflw $224, %xmm0, %xmm0
pshufd $0, %xmm0, %xmm0
movzbl %cl, %eax
movd %eax, %xmm1
punpcklbw %xmm1, %xmm1
pshuflw $224, %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
movzbl %r8b, %eax
movd %eax, %xmm2
punpcklbw %xmm2, %xmm2
pshuflw $224, %xmm2, %xmm2
pshufd $0, %xmm2, %xmm2
movzbl %r9b, %eax
movd %eax, %xmm3
punpcklbw %xmm3, %xmm3
pshuflw $224, %xmm3, %xmm3
pshufd $0, %xmm3, %xmm3
shlq $4, %r15
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_6:
cmpq %rax, %r15
je .LBB0_7
movdqa (%r14,%rax), %xmm4
movdqa %xmm4, %xmm5
pcmpeqb %xmm0, %xmm5
movdqa %xmm4, %xmm6
pcmpeqb %xmm1, %xmm6
por %xmm5, %xmm6
movdqa %xmm4, %xmm5
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm4
por %xmm5, %xmm4
por %xmm6, %xmm4
pmovmskb %xmm4, %esi
addq $16, %rax
testw %si, %si
je .LBB0_6
movb $1, %al
jmp .LBB0_14
.LBB0_7:
xorl %esi, %esi
.p2align 4, 0x90
.LBB0_8:
cmpq %rsi, %r11
je .LBB0_9
leaq (%r10,%rsi), %rax
movzbl (%rax), %ebx
movb $1, %al
cmpb %r9b, %bl
je .LBB0_14
cmpb %r8b, %bl
je .LBB0_14
cmpb %dl, %bl
je .LBB0_14
addq $1, %rsi
cmpb %cl, %bl
jne .LBB0_8
jmp .LBB0_14
.LBB0_9:
xorl %eax, %eax
.LBB0_14:
popq %rbx
.cfi_def_cfa_offset 32
popq %r12
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size _ZN1f9contains417h8b9a88902e2bd619E, .Lfunc_end0-_ZN1f9contains417h8b9a88902e2bd619E
.cfi_endproc
.type .L__unnamed_1,@object
.section .rodata..L__unnamed_1,"a",@progbits
.p2align 4
.L__unnamed_1:
.size .L__unnamed_1, 0
.section ".note.GNU-stack","",@progbits

results (on my AMD Ryzen 9 3900)

test tests::bench_contains4           ... bench:           8 ns/iter (+/- 0)
test tests::bench_contains4_long      ... bench:      46,040 ns/iter (+/- 328)
test tests::bench_contains4_pcmp      ... bench:           7 ns/iter (+/- 0)
test tests::bench_contains4_pcmp_long ... bench:      31,210 ns/iter (+/- 304)
test tests::bench_naive               ... bench:          51 ns/iter (+/- 0)
test tests::bench_naive_long          ... bench:     398,034 ns/iter (+/- 22,295)

results (on my Intel Xeon E3-1230)

test tests::bench_contains4           ... bench:           9 ns/iter (+/- 0)
test tests::bench_contains4_long      ... bench:      56,849 ns/iter (+/- 1,532)
test tests::bench_contains4_pcmp      ... bench:          10 ns/iter (+/- 0)
test tests::bench_contains4_pcmp_long ... bench:      50,624 ns/iter (+/- 448)
test tests::bench_naive               ... bench:          39 ns/iter (+/- 0)
test tests::bench_naive_long          ... bench:     568,305 ns/iter (+/- 13,586)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment