//! Implementation of SipHash from [Jean-Philippe Aumasson](https://www.aumasson.jp/) and Daniel J. Bernstein.
//!
//! SipHash is a general-purpose hashing function: it runs at a good
//! speed (competitive with Spooky and City) and permits strong _keyed_
//! hashing. This lets you key your hash tables from a strong RNG, such as
//! [`rand::os::OsRng`](https://docs.rs/rand/latest/rand/rngs/struct.OsRng.html).
//!
//! Although the SipHash algorithm is considered to be generally strong,
//! it is not intended for cryptographic purposes. As such, all
//! cryptographic uses of this implementation are _strongly discouraged_.
//!
//! # Reference
//! - <https://www.aumasson.jp/siphash/siphash.pdf>
//! - <https://131002.net/siphash/>
//! - <https://github.com/floodyberry/supercop/blob/master/crypto_auth/siphash24/sse41/siphash.c>
//! - <https://github.com/google/highwayhash/blob/master/highwayhash/sip_hash.h>
//! - <https://github.com/rust-lang/rust/blob/master/library/core/src/hash/sip.rs>
//!
//! # Reimplementation reasons
//! - The Rust standard library implementation is not const generic over the number of
//!   rounds, is deprecated, and has no SIMD optimizations.
//! - The [most popular Rust library](https://github.com/jedisct1/rust-siphash/tree/master)
//!   is just a port of the standard library implementation and shares the same problems,
//!   minus the deprecation.
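//!
//! # Example
//!
//! A minimal usage sketch of the keyed 64-bit hasher defined below (marked `ignore`
//! since it is only illustrative):
//!
//! ```ignore
//! use core::hash::Hasher;
//!
//! // Key the hasher with 16 secret bytes, e.g. drawn from a strong RNG.
//! let key = [0x42u8; 16];
//! let mut hasher = <Sip64Scalar<2, 4>>::new_with_key(key);
//! hasher.write(b"some bytes to hash");
//! let hash: u64 = hasher.finish();
//! ```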
pub type SipHash64<const C: usize, const D: usize> = Sip64Scalar<C, D>;
/// Loads an integer of the desired type from a byte stream, in LE order. Uses
/// `copy_nonoverlapping` to let the compiler generate the most efficient way
/// to load it from a possibly unaligned address.
///
/// Unsafe because: unchecked indexing at `i..i+size_of(int_ty)`
macro_rules! load_int_le {
($buf:expr, $i:expr, $int_ty:ident) => {{
debug_assert!($i + core::mem::size_of::<$int_ty>() <= $buf.len());
let mut data = 0 as $int_ty;
core::ptr::copy_nonoverlapping(
$buf.as_ptr().add($i),
&mut data as *mut _ as *mut u8,
core::mem::size_of::<$int_ty>(),
);
data.to_le()
}};
}
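// Sanity check for the loader above: reading four bytes at an offset must match
// `u32::from_le_bytes` on the same bytes, on both little- and big-endian targets.
#[cfg(test)]
mod test_load_int_le {
    #[test]
    fn loads_little_endian() {
        let bytes: &[u8] = &[0xAA, 0xBB, 0xCC, 0xDD, 0xEE];
        // Read a u32 starting at offset 1: bytes 0xBB 0xCC 0xDD 0xEE, little-endian.
        let value = unsafe { load_int_le!(bytes, 1, u32) };
        assert_eq!(value, u32::from_le_bytes([0xBB, 0xCC, 0xDD, 0xEE]));
    }
}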
/// Loads a u64 using up to 7 bytes of a byte slice. It looks clumsy but the
/// `copy_nonoverlapping` calls that occur (via `load_int_le!`) all have fixed
/// sizes and avoid calling `memcpy`, which is good for speed.
///
/// Unsafe because: unchecked indexing at start..start+len
#[inline]
unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
debug_assert!(len < 8);
let mut i = 0; // current byte index (from LSB) in the output u64
let mut out = 0;
if i + 3 < len {
out = load_int_le!(buf, start + i, u32) as u64;
i += 4;
}
if i + 1 < len {
out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
i += 2
}
if i < len {
out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
i += 1;
}
debug_assert_eq!(i, len);
out
}
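// Sanity check for the partial loader above: up to seven bytes are packed into the low
// bits of the result, in little-endian order.
#[cfg(test)]
mod test_u8to64_le {
    use super::u8to64_le;
    #[test]
    fn packs_up_to_seven_bytes() {
        let buf = [0x01u8, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07];
        assert_eq!(unsafe { u8to64_le(&buf, 0, 7) }, 0x0007_0605_0403_0201);
        assert_eq!(unsafe { u8to64_le(&buf, 2, 3) }, 0x0000_0000_0005_0403);
    }
}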
/// Port of the 64-bit SipHash from <https://github.com/veorq/SipHash>.
#[derive(Debug, Clone, Copy)]
#[repr(align(16), C)] // alignment and C repr so the compiler can use SIMD instructions
pub struct Sip64Scalar<const C: usize, const D: usize> {
// interleave the v values so that the compiler can optimize with simd
v0: u64,
v2: u64,
v1: u64,
v3: u64,
k0: u64,
k1: u64,
    /// previous message word
m: u64,
/// how many bytes we've processed
length: usize,
/// buffer of unprocessed bytes in little endian order
tail: u64,
/// how many bytes in tail are valid
ntail: usize,
}
impl<const C: usize, const D: usize> core::default::Default for Sip64Scalar<C, D> {
#[inline]
fn default() -> Self {
Self::new()
}
}
impl<const C: usize, const D: usize> crate::hash::SeedableHasher for Sip64Scalar<C, D> {
type SeedType = [u8; 16];
/// Creates a new hasher with the given seed.
#[inline(always)]
fn new_with_seed(seed: Self::SeedType) -> Self {
Self::new_with_key(seed)
}
}
impl<const C: usize, const D: usize> Sip64Scalar<C, D> {
#[inline]
pub fn new() -> Self {
// same default key as rust implementation
Self::new_with_key([0; 16])
}
#[inline(always)]
pub fn new_with_key(key: [u8; 16]) -> Self {
Self::new_with_key_and_state(
key,
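            // standard SipHash initialization constants: the hex digits spell
            // "somepseudorandomlygeneratedbytes" in ASCII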
[
0x736f6d6570736575,
0x646f72616e646f6d,
0x6c7967656e657261,
0x7465646279746573,
],
)
}
#[inline(always)]
pub fn new_with_key_and_state(key: [u8; 16], state: [u64; 4]) -> Self {
let mut res = Self {
v0: state[0],
v1: state[1],
v2: state[2],
v3: state[3],
k0: u64::from_le_bytes(key[0..8].try_into().unwrap()),
k1: u64::from_le_bytes(key[8..16].try_into().unwrap()),
m: 0,
length: 0,
tail: 0,
ntail: 0,
};
res.v3 ^= res.k1;
res.v2 ^= res.k0;
res.v1 ^= res.k1;
res.v0 ^= res.k0;
res
}
#[inline(always)]
fn round(&mut self) {
self.v0 = self.v0.wrapping_add(self.v1);
self.v2 = self.v2.wrapping_add(self.v3);
self.v1 = self.v1.rotate_left(13);
self.v3 = self.v3.rotate_left(16);
self.v1 ^= self.v0;
self.v3 ^= self.v2;
self.v0 = self.v0.rotate_left(32);
self.v0 = self.v0.wrapping_add(self.v3);
self.v2 = self.v2.wrapping_add(self.v1);
self.v1 = self.v1.rotate_left(17);
self.v3 = self.v3.rotate_left(21);
self.v3 ^= self.v0;
self.v1 ^= self.v2;
self.v2 = self.v2.rotate_left(32);
}
// A specialized write function for values with size <= 8.
//
// The hashing of multi-byte integers depends on endianness. E.g.:
// - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])`
// - big-endian: `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])`
//
// This function does the right thing for little-endian hardware. On
// big-endian hardware `x` must be byte-swapped first to give the right
// behaviour. After any byte-swapping, the input must be zero-extended to
// 64-bits. The caller is responsible for the byte-swapping and
// zero-extension.
#[inline]
pub fn short_write<T>(&mut self, _x: T, x: u64) {
let size = core::mem::size_of::<T>();
self.length += size;
// The original number must be zero-extended, not sign-extended.
debug_assert!(if size < 8 { x >> (8 * size) == 0 } else { true });
// The number of bytes needed to fill `self.tail`.
let needed = 8 - self.ntail;
self.tail |= x << (8 * self.ntail);
if size < needed {
self.ntail += size;
return;
}
// `self.tail` is full, process it.
self.v3 ^= self.tail;
for _ in 0..C {
self.round();
}
self.v0 ^= self.tail;
self.ntail = size - needed;
self.tail = if needed < 8 { x >> (8 * needed) } else { 0 };
}
}
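// The endianness contract documented on `short_write` above, spelled out as a test: on
// little-endian targets `write_u32(0xDDCCBBAA)` hashes the same as `write(&[0xAA, 0xBB, 0xCC, 0xDD])`.
#[cfg(test)]
mod test_short_write {
    use super::*;
    use core::hash::Hasher;
    #[test]
    #[cfg(target_endian = "little")]
    fn write_u32_matches_byte_writes() {
        let mut a = <Sip64Scalar<2, 4>>::new();
        a.write_u32(0xDDCCBBAA);
        let mut b = <Sip64Scalar<2, 4>>::new();
        b.write(&[0xAA, 0xBB, 0xCC, 0xDD]);
        assert_eq!(a.finish(), b.finish());
    }
}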
impl<const C: usize, const D: usize> core::hash::Hasher for Sip64Scalar<C, D> {
#[inline]
fn write(&mut self, msg: &[u8]) {
let length = msg.len();
self.length += length;
let mut needed = 0;
if self.ntail != 0 {
needed = 8 - self.ntail;
self.tail |=
unsafe { u8to64_le(msg, 0, core::cmp::min(length, needed)) } << (8 * self.ntail);
if length < needed {
self.ntail += length;
return;
} else {
self.v3 ^= self.tail;
for _ in 0..C {
self.round();
}
self.v0 ^= self.tail;
self.ntail = 0;
}
}
// Buffered tail is now flushed, process new input.
let len = length - needed;
let left = len & 0x7;
let mut i = needed;
while i < len - left {
let mi = unsafe { load_int_le!(msg, i, u64) };
self.v3 ^= mi;
for _ in 0..C {
self.round();
}
self.v0 ^= mi;
i += 8;
}
self.tail = unsafe { u8to64_le(msg, i, left) };
self.ntail = left;
}
#[inline]
fn finish(&self) -> u64 {
let mut state = *self;
let b: u64 = ((self.length as u64 & 0xff) << 56) | self.tail;
state.v3 ^= b;
for _ in 0..C {
state.round();
}
state.v0 ^= b;
state.v2 ^= 0xff;
for _ in 0..D {
state.round();
}
state.v0 ^ state.v1 ^ state.v2 ^ state.v3
}
#[inline]
fn write_usize(&mut self, i: usize) {
self.short_write(i, i.to_le() as u64);
}
#[inline]
fn write_u8(&mut self, i: u8) {
self.short_write(i, i as u64);
}
#[inline]
fn write_u32(&mut self, i: u32) {
self.short_write(i, i.to_le() as u64);
}
#[inline]
fn write_u64(&mut self, i: u64) {
self.short_write(i, i.to_le());
}
}
/// Port of the 128-bit SipHash from <https://github.com/veorq/SipHash>.
#[derive(Debug, Clone, Copy)]
#[repr(transparent)]
pub struct Sip128Scalar<const C: usize, const D: usize>(Sip64Scalar<C, D>);
impl<const C: usize, const D: usize> core::default::Default for Sip128Scalar<C, D> {
#[inline]
fn default() -> Self {
Self::new()
}
}
impl<const C: usize, const D: usize> Sip128Scalar<C, D> {
#[inline]
pub fn new() -> Self {
Self(<Sip64Scalar<C, D>>::new())
}
#[inline(always)]
pub fn new_with_key(key: [u8; 16]) -> Self {
Self(<Sip64Scalar<C, D>>::new_with_key(key))
}
#[inline(always)]
pub fn new_with_key_and_state(key: [u8; 16], state: [u64; 4]) -> Self {
Self(<Sip64Scalar<C, D>>::new_with_key_and_state(key, state))
}
}
impl<const C: usize, const D: usize> crate::hash::SeedableHasher for Sip128Scalar<C, D> {
type SeedType = [u8; 16];
/// Creates a new hasher with the given seed.
#[inline(always)]
fn new_with_seed(seed: Self::SeedType) -> Self {
Self::new_with_key(seed)
}
}
impl<const C: usize, const D: usize> crate::hash::Hasher for Sip128Scalar<C, D> {
type HashType = u128;
#[inline(always)]
fn write(&mut self, msg: &[u8]) {
self.0.write(msg)
}
#[inline]
fn finish(&self) -> u128 {
let mut state = *self;
let b: u64 = ((self.0.length as u64 & 0xff) << 56) | self.0.tail;
state.0.v3 ^= b;
for _ in 0..C {
state.0.round();
}
state.0.v0 ^= b;
state.0.v2 ^= 0xff;
for _ in 0..D {
state.0.round();
}
let low = state.0.v0 ^ state.0.v1 ^ state.0.v2 ^ state.0.v3;
state.0.v1 ^= 0xdd;
for _ in 0..D {
state.0.round();
}
let high = state.0.v0 ^ state.0.v1 ^ state.0.v2 ^ state.0.v3;
((high as u128) << 64) | (low as u128)
}
#[inline]
fn write_usize(&mut self, i: usize) {
self.0.write_usize(i);
}
#[inline]
fn write_u8(&mut self, i: u8) {
self.0.write_u8(i);
}
#[inline]
fn write_u32(&mut self, i: u32) {
self.0.write_u32(i);
}
#[inline]
fn write_u64(&mut self, i: u64) {
self.0.write_u64(i);
}
}
#[cfg(test)]
mod test {
use crate::prelude::Hasher;
use super::*;
#[test]
fn test_siphasher() {
let data = (0..255_u8).collect::<Vec<_>>();
for i in 0..16 {
#[allow(deprecated)]
let mut sip =
core::hash::SipHasher::new_with_keys(0x0706050403020100, 0x0f0e0d0c0b0a0908);
sip.write(&data[..i]);
let truth = sip.finish();
let mut sip = <Sip64Scalar<2, 4>>::new_with_key([
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
]);
sip.write(&data[..i]);
assert_eq!(sip.finish(), truth);
}
}
#[test]
fn test_default() {
let data = (0..255_u8).collect::<Vec<_>>();
for i in 0..16 {
#[allow(deprecated)]
let mut sip = std::collections::hash_map::DefaultHasher::new();
sip.write(&data[..i]);
let truth = sip.finish();
let mut sip = <Sip64Scalar<1, 3>>::new();
sip.write(&data[..i]);
assert_eq!(sip.finish(), truth);
}
}
}
pub struct SipHalf32<const C: usize, const D: usize> {}
pub struct SipHalf64<const C: usize, const D: usize> {}
#[cfg(target_feature = "sse")]
use core::arch::x86_64::*;
#[allow(non_snake_case)]
#[inline(always)]
const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
}
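// The selector encoding produced by `_MM_SHUFFLE` above: two bits per lane, with the
// highest selector in the highest bits (same convention as the C intrinsic macro).
#[cfg(test)]
mod test_mm_shuffle {
    use super::_MM_SHUFFLE;
    #[test]
    fn encodes_two_bits_per_lane() {
        assert_eq!(_MM_SHUFFLE(2, 1, 0, 3), 0b10_01_00_11);
        assert_eq!(_MM_SHUFFLE(0, 1, 3, 2), 0b00_01_11_10);
    }
}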
/// Port of the 64-bit SipHash from <https://github.com/veorq/SipHash>, based on the SSE
/// implementation in <https://github.com/floodyberry/supercop/blob/master/crypto_auth/siphash24/sse41/siphash.c>.
#[derive(Debug, Clone, Copy)]
#[repr(align(16), C)] // alignment and C repr so we can use SIMD instructions
pub struct SipHash64SSE<const C: usize, const D: usize> {
v0v2: __m128i,
v1v3: __m128i,
k0k1: __m128i,
m: u64,
length: u64,
    /// buffer of unprocessed bytes, in little-endian order
    tail: u64,
ntail: usize,
}
impl<const C: usize, const D: usize> SipHash64SSE<C, D> {
#[inline(always)]
pub fn new() -> Self {
// same default key as rust implementation
Self::new_with_key([0; 16])
}
#[inline(always)]
pub fn new_with_key(key: [u8; 16]) -> Self {
unsafe {
let mut res = Self {
v0v2: _mm_set_epi64x(0x6c7967656e657261, 0x736f6d6570736575),
v1v3: _mm_set_epi64x(0x7465646279746573, 0x646f72616e646f6d),
k0k1: _mm_loadu_si128(key.as_ptr() as *const __m128i),
m: 0,
length: 0,
                tail: 0,
ntail: 0,
};
res.v1v3 = _mm_xor_si128(res.v1v3, res.k0k1);
res.v0v2 = _mm_xor_si128(res.v0v2, res.k0k1);
res
}
}
#[inline(always)]
fn round(&mut self) {
unsafe {
// v0 += v1, v2 += v3
self.v0v2 = _mm_add_epi64(self.v0v2, self.v1v3);
// rotl(v1v3, 13)
let b1 = _mm_xor_si128(
_mm_slli_epi64(self.v1v3, 13),
_mm_srli_epi64(self.v1v3, 64 - 13),
);
// rotl(v3, 16)
let b2 = _mm_shufflehi_epi16::<{ _MM_SHUFFLE(2, 1, 0, 3) }>(self.v1v3);
// select v1 from rotl(v1v3, 13) and v3 from rotl(v3, 16)
self.v1v3 = _mm_blend_epi16(b1, b2, 0b1111_0000);
// rotl 32 v2
self.v1v3 = _mm_shuffle_epi32::<{ _MM_SHUFFLE(0, 1, 3, 2) }>(self.v0v2);
// xor the two halves
self.v1v3 = _mm_xor_si128(self.v1v3, self.v0v2);
// rotl(v1v3, 17)
let b1 = _mm_xor_si128(
_mm_slli_epi64(self.v1v3, 17),
_mm_srli_epi64(self.v1v3, 64 - 17),
);
// rotl(v1v3, 21)
let b2 = _mm_xor_si128(
_mm_slli_epi64(self.v1v3, 21),
_mm_srli_epi64(self.v1v3, 64 - 21),
);
            // select v1 from rotl(v1v3, 17) and v3 from rotl(v1v3, 21)
self.v1v3 = _mm_blend_epi16(b1, b2, 0b1111_0000);
// rotl 32 v2
self.v0v2 = _mm_shuffle_epi32::<{ _MM_SHUFFLE(0, 1, 3, 2) }>(self.v0v2);
// xor the two halves
self.v1v3 = _mm_xor_si128(self.v1v3, self.v0v2);
}
}
#[inline]
pub fn short_write<T>(&mut self, _x: T, x: u64) {
todo!();
}
}
impl<const C: usize, const D: usize> core::hash::Hasher for SipHash64SSE<C, D> {
#[inline]
fn write(&mut self, msg: &[u8]) {
let length = msg.len();
self.length += length as u64;
let mut needed = 0;
if self.ntail != 0 {
needed = 8 - self.ntail;
self.tail |=
unsafe { u8to64_le(msg, 0, core::cmp::min(length, needed)) } << (8 * self.ntail);
if length < needed {
self.ntail += length;
return;
} else {
                // v3 ^= tail (v3 is the high lane of v1v3)
                self.v1v3 =
                    unsafe { _mm_xor_si128(self.v1v3, _mm_set_epi64x(self.tail as i64, 0)) };
for _ in 0..C {
self.round();
}
                // v0 ^= tail (v0 is the low lane of v0v2)
                self.v0v2 =
                    unsafe { _mm_xor_si128(self.v0v2, _mm_set_epi64x(0, self.tail as i64)) };
self.ntail = 0;
}
}
// Buffered tail is now flushed, process new input.
let len = length - needed;
let left = len & 0x7;
let mut i = needed;
while i < len - left {
let mi = unsafe { load_int_le!(msg, i, u64) };
            // v3 ^= mi
            self.v1v3 = unsafe { _mm_xor_si128(self.v1v3, _mm_set_epi64x(mi as i64, 0)) };
for _ in 0..C {
self.round();
}
            // v0 ^= mi
            self.v0v2 = unsafe { _mm_xor_si128(self.v0v2, _mm_set_epi64x(0, mi as i64)) };
i += 8;
}
self.tail = unsafe { u8to64_le(msg, i, left) };
self.ntail = left;
}
#[inline]
    fn finish(&self) -> u64 {
        let mut state = *self;
        unsafe {
            let b: u64 = ((self.length & 0xff) << 56) | self.tail;
            // v3 ^= b (v3 is the high lane of v1v3)
            state.v1v3 = _mm_xor_si128(state.v1v3, _mm_set_epi64x(b as i64, 0));
            for _ in 0..C {
                state.round();
            }
            // v0 ^= b and v2 ^= 0xff
            state.v0v2 = _mm_xor_si128(state.v0v2, _mm_set_epi64x(0xff, b as i64));
            for _ in 0..D {
                state.round();
            }
            // v0 ^ v1 ^ v2 ^ v3
            let tmp = _mm_xor_si128(state.v0v2, state.v1v3);
            (_mm_extract_epi64(tmp, 0) ^ _mm_extract_epi64(tmp, 1)) as u64
        }
    }
#[inline]
fn write_usize(&mut self, i: usize) {
self.short_write(i, i.to_le() as u64);
}
#[inline]
fn write_u8(&mut self, i: u8) {
self.short_write(i, i as u64);
}
#[inline]
fn write_u32(&mut self, i: u32) {
self.short_write(i, i.to_le() as u64);
}
#[inline]
fn write_u64(&mut self, i: u64) {
self.short_write(i, i.to_le());
}
}
//! SpookyHash is a public domain noncryptographic hash function producing
//! well-distributed 128-bit hash values for byte arrays of any length. It can
//! produce 64-bit and 32-bit hash values too, at the same speed, just use the
//! bottom n bits. The C++ reference implementation is specific to 64-bit x86
//! platforms, in particular it assumes the processor is little endian. Long
//! keys hash in 3 bytes per cycle, short keys take about 1 byte per cycle, and
//! there is a 30 cycle startup cost. Keys can be supplied in fragments. The
//! function allows a 128-bit seed. It's named SpookyHash because it was
//! released on Halloween.
//!
//! # Reference
//! - <http://burtleburtle.net/bob/hash/spooky.html>
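//!
//! # Example
//!
//! A minimal usage sketch (marked `ignore` since it is only illustrative and relies on
//! the crate's `SeedableHasher` and `Hasher` traits being in scope):
//!
//! ```ignore
//! let mut hasher = SpookyHashV2::new_with_seed(0xdead_beef);
//! hasher.write(b"some bytes to hash");
//! let hash128: u128 = hasher.finish();
//! // A 64-bit (or 32-bit) hash is just the bottom bits, as noted above.
//! let hash64 = hash128 as u64;
//! ```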
use crate::hash;
use super::SeedableHasher;
///
/// `SC_CONST`: a constant which:
/// * is not zero
/// * is odd
/// * is a not-very-regular mix of 1's and 0's
/// * does not need any other special mathematical properties
///
pub const SC_CONST: u64 = 0xdeadbeefdeadbeef;
#[repr(align(8))] // so it's safe to read data as u64s
pub struct SpookyHashV2 {
    data: [u8; 192],  // unhashed data, for partial messages (BUF_SIZE bytes)
    state: State,     // internal state of the hash
    length: usize,    // total length of the input so far
    remainder: u8,    // length of unhashed data stashed in `data`
}
impl SpookyHashV2 {
const ALLOW_UNALIGNED_READS: bool = false;
const NUM_VARS: usize = 12;
const BLOCK_SIZE: usize = Self::NUM_VARS * 8;
const BUF_SIZE: usize = Self::BLOCK_SIZE * 2;
}
impl core::default::Default for SpookyHashV2 {
#[inline(always)]
fn default() -> Self {
Self::new_with_seed(0)
}
}
impl SeedableHasher for SpookyHashV2 {
type SeedType = u128;
#[inline(always)]
fn new_with_seed(seed: Self::SeedType) -> Self {
let seed1 = seed as u64;
let seed2 = (seed >> 64) as u64;
Self {
            data: [0; 192],
state: State {
s0: seed1,
s1: seed2,
s2: SC_CONST,
s3: seed1,
s4: seed2,
s5: SC_CONST,
s6: seed1,
s7: seed2,
s8: SC_CONST,
s9: seed1,
s10: seed2,
s11: SC_CONST,
},
length: 0,
remainder: 0,
}
}
}
impl super::Hasher for SpookyHashV2 {
type HashType = u128;
fn write(&mut self, mut msg: &[u8]) {
        let new_length = self.remainder as usize + msg.len();
        self.length += msg.len();
        // Is this message fragment too short? If it is, stuff it away.
        if new_length < Self::BUF_SIZE {
            self.data[self.remainder as usize..new_length].copy_from_slice(msg);
            self.remainder = new_length as u8;
            return;
        }
        // if we've got anything stuffed away, use it now
        if self.remainder != 0 {
            let prefix = Self::BUF_SIZE - self.remainder as usize;
            self.data[self.remainder as usize..Self::BUF_SIZE].copy_from_slice(&msg[..prefix]);
            let low = unsafe { self.data[..Self::BLOCK_SIZE].align_to().1 };
            let high = unsafe { self.data[Self::BLOCK_SIZE..].align_to().1 };
            mix(low, &mut self.state);
            mix(high, &mut self.state);
            msg = &msg[prefix..];
        }
        // handle all whole blocks of BLOCK_SIZE bytes
        let blocks = msg.len() / Self::BLOCK_SIZE;
        if Self::ALLOW_UNALIGNED_READS || (msg.as_ptr() as usize & 0x7) == 0 {
            for _ in 0..blocks {
                let block = unsafe { msg[..Self::BLOCK_SIZE].align_to().1 };
                mix(block, &mut self.state);
                msg = &msg[Self::BLOCK_SIZE..];
            }
        } else {
            for _ in 0..blocks {
                self.data[..Self::BLOCK_SIZE].copy_from_slice(&msg[..Self::BLOCK_SIZE]);
                let block = unsafe { self.data[..Self::BLOCK_SIZE].align_to().1 };
                mix(block, &mut self.state);
                msg = &msg[Self::BLOCK_SIZE..];
            }
        }
        // stuff away the last few bytes
        self.data[..msg.len()].copy_from_slice(msg);
        self.remainder = msg.len() as u8;
}
    fn finish(&self) -> Self::HashType {
        // init the variables
        if self.length < Self::BUF_SIZE {
            let hash = ((self.state.s1 as u128) << 64) | (self.state.s0 as u128);
            return short(&self.data[..self.remainder as usize], hash);
        }
        let mut state = self.state;
        let mut remainder = self.remainder as usize;
        let mut start = 0;
        if remainder >= Self::BLOCK_SIZE {
            // `data` can contain two blocks; handle any whole first block
            let block = unsafe { self.data[..Self::BLOCK_SIZE].align_to().1 };
            mix(block, &mut state);
            start = Self::BLOCK_SIZE;
            remainder -= Self::BLOCK_SIZE;
        }
        // mix in the last partial block, and the length mod BLOCK_SIZE
        let mut block = [0u64; Self::NUM_VARS];
        {
            let bytes = unsafe { block.align_to_mut::<u8>().1 };
            bytes[..remainder].copy_from_slice(&self.data[start..start + remainder]);
            bytes[Self::BLOCK_SIZE - 1] = remainder as u8;
        }
        // do some final mixing
        end(&block, &mut state);
        ((state.s1 as u128) << 64) | (state.s0 as u128)
    }
}
/// The state is kept in named fields rather than an array so that the compiler is free
/// to reorder them for SIMD operations.
#[derive(Clone, Copy)]
struct State {
s0: u64,
s1: u64,
s2: u64,
s3: u64,
s4: u64,
s5: u64,
s6: u64,
s7: u64,
s8: u64,
s9: u64,
s10: u64,
s11: u64,
}
#[inline(always)]
/// This is used if the input is 96 bytes long or longer.
///
/// The internal state is fully overwritten every 96 bytes.
/// Every input bit appears to cause at least 128 bits of entropy
/// before 96 other bytes are combined, when run forward or backward
/// For every input bit,
/// Two inputs differing in just that input bit
/// Where "differ" means xor or subtraction
/// And the base value is random
/// When run forward or backwards one Mix
/// I tried 3 pairs of each; they all differed by at least 212 bits.
///
fn mix(data: &[u64], s: &mut State) {
debug_assert_eq!(data.len(), 12);
s.s0 = s.s0.wrapping_add(data[0]);
s.s2 ^= s.s10;
s.s11 ^= s.s0;
s.s0 = s.s0.rotate_left(11);
s.s11 = s.s11.wrapping_add(s.s1);
s.s1 = s.s1.wrapping_add(data[1]);
s.s3 ^= s.s11;
s.s0 ^= s.s1;
s.s1 = s.s1.rotate_left(32);
s.s0 = s.s0.wrapping_add(s.s2);
s.s2 = s.s2.wrapping_add(data[2]);
s.s4 ^= s.s0;
s.s1 ^= s.s2;
s.s2 = s.s2.rotate_left(43);
s.s1 = s.s1.wrapping_add(s.s3);
s.s3 = s.s3.wrapping_add(data[3]);
s.s5 ^= s.s1;
s.s2 ^= s.s3;
s.s3 = s.s3.rotate_left(31);
s.s2 = s.s2.wrapping_add(s.s4);
s.s4 = s.s4.wrapping_add(data[4]);
s.s6 ^= s.s2;
s.s3 ^= s.s4;
s.s4 = s.s4.rotate_left(17);
s.s3 = s.s3.wrapping_add(s.s5);
s.s5 = s.s5.wrapping_add(data[5]);
s.s7 ^= s.s3;
s.s4 ^= s.s5;
s.s5 = s.s5.rotate_left(28);
s.s4 = s.s4.wrapping_add(s.s6);
s.s6 = s.s6.wrapping_add(data[6]);
s.s8 ^= s.s4;
s.s5 ^= s.s6;
s.s6 = s.s6.rotate_left(39);
s.s5 = s.s5.wrapping_add(s.s7);
s.s7 = s.s7.wrapping_add(data[7]);
s.s9 ^= s.s5;
s.s6 ^= s.s7;
s.s7 = s.s7.rotate_left(57);
s.s6 = s.s6.wrapping_add(s.s8);
s.s8 = s.s8.wrapping_add(data[8]);
s.s10 ^= s.s6;
s.s7 ^= s.s8;
s.s8 = s.s8.rotate_left(55);
s.s7 = s.s7.wrapping_add(s.s9);
s.s9 = s.s9.wrapping_add(data[9]);
s.s11 ^= s.s7;
s.s8 ^= s.s9;
s.s9 = s.s9.rotate_left(54);
s.s8 = s.s8.wrapping_add(s.s10);
s.s10 = s.s10.wrapping_add(data[10]);
s.s0 ^= s.s8;
s.s9 ^= s.s10;
s.s10 = s.s10.rotate_left(22);
s.s9 = s.s9.wrapping_add(s.s11);
s.s11 = s.s11.wrapping_add(data[11]);
s.s1 ^= s.s9;
s.s10 ^= s.s11;
s.s11 = s.s11.rotate_left(46);
s.s10 = s.s10.wrapping_add(s.s0);
}
#[inline(always)]
/// Mix all 12 inputs together so that h0, h1 are a hash of them all.
///
/// For two inputs differing in just the input bits
/// Where "differ" means xor or subtraction
/// And the base value is random, or a counting value starting at that bit
/// The final result will have each bit of h0, h1 flip
/// For every input bit,
/// with probability 50 +- .3%
/// For every pair of input bits,
/// with probability 50 +- 3%
///
/// This does not rely on the last Mix() call having already mixed some.
/// Two iterations was almost good enough for a 64-bit result, but a
/// 128-bit result is reported, so End() does three iterations.
///
fn end_partial(h: &mut State) {
h.s11 = h.s11.wrapping_add(h.s1);
h.s2 ^= h.s11;
h.s1 = h.s1.rotate_left(44);
h.s0 = h.s0.wrapping_add(h.s2);
h.s3 ^= h.s0;
h.s2 = h.s2.rotate_left(15);
h.s1 = h.s1.wrapping_add(h.s3);
h.s4 ^= h.s1;
h.s3 = h.s3.rotate_left(34);
h.s2 = h.s2.wrapping_add(h.s4);
h.s5 ^= h.s2;
h.s4 = h.s4.rotate_left(21);
h.s3 = h.s3.wrapping_add(h.s5);
h.s6 ^= h.s3;
h.s5 = h.s5.rotate_left(38);
h.s4 = h.s4.wrapping_add(h.s6);
h.s7 ^= h.s4;
h.s6 = h.s6.rotate_left(33);
h.s5 = h.s5.wrapping_add(h.s7);
h.s8 ^= h.s5;
h.s7 = h.s7.rotate_left(10);
h.s6 = h.s6.wrapping_add(h.s8);
h.s9 ^= h.s6;
h.s8 = h.s8.rotate_left(13);
h.s7 = h.s7.wrapping_add(h.s9);
h.s10 ^= h.s7;
h.s9 = h.s9.rotate_left(38);
h.s8 = h.s8.wrapping_add(h.s10);
h.s11 ^= h.s8;
h.s10 = h.s10.rotate_left(53);
h.s9 = h.s9.wrapping_add(h.s11);
h.s0 ^= h.s9;
h.s11 = h.s11.rotate_left(42);
h.s10 = h.s10.wrapping_add(h.s0);
h.s1 ^= h.s10;
h.s0 = h.s0.rotate_left(54);
}
#[inline(always)]
fn end(data: &[u64], h: &mut State) {
debug_assert_eq!(data.len(), 12);
    h.s0 = h.s0.wrapping_add(data[0]);
    h.s1 = h.s1.wrapping_add(data[1]);
    h.s2 = h.s2.wrapping_add(data[2]);
    h.s3 = h.s3.wrapping_add(data[3]);
    h.s4 = h.s4.wrapping_add(data[4]);
    h.s5 = h.s5.wrapping_add(data[5]);
    h.s6 = h.s6.wrapping_add(data[6]);
    h.s7 = h.s7.wrapping_add(data[7]);
    h.s8 = h.s8.wrapping_add(data[8]);
    h.s9 = h.s9.wrapping_add(data[9]);
    h.s10 = h.s10.wrapping_add(data[10]);
    h.s11 = h.s11.wrapping_add(data[11]);
end_partial(h);
end_partial(h);
end_partial(h);
}
///
/// The goal is for each bit of the input to expand into 128 bits of
/// apparent entropy before it is fully overwritten.
/// n trials both set and cleared at least m bits of h0 h1 h2 h3
/// n: 2 m: 29
/// n: 3 m: 46
/// n: 4 m: 57
/// n: 5 m: 107
/// n: 6 m: 146
/// n: 7 m: 152
/// when run forwards or backwards
/// for all 1-bit and 2-bit diffs
/// with diffs defined by either xor or subtraction
/// with a base of all zeros plus a counter, or plus another bit, or random
///
#[inline(always)]
fn short_mix(h: &mut State) {
h.s2 = h.s2.rotate_left(50);
h.s2 = h.s2.wrapping_add(h.s3);
h.s0 ^= h.s2;
h.s3 = h.s3.rotate_left(52);
h.s3 = h.s3.wrapping_add(h.s0);
h.s1 ^= h.s3;
h.s0 = h.s0.rotate_left(30);
h.s0 = h.s0.wrapping_add(h.s1);
h.s2 ^= h.s0;
h.s1 = h.s1.rotate_left(41);
h.s1 = h.s1.wrapping_add(h.s2);
h.s3 ^= h.s1;
h.s2 = h.s2.rotate_left(54);
h.s2 = h.s2.wrapping_add(h.s3);
h.s0 ^= h.s2;
h.s3 = h.s3.rotate_left(48);
h.s3 = h.s3.wrapping_add(h.s0);
h.s1 ^= h.s3;
h.s0 = h.s0.rotate_left(38);
h.s0 = h.s0.wrapping_add(h.s1);
h.s2 ^= h.s0;
h.s1 = h.s1.rotate_left(37);
h.s1 = h.s1.wrapping_add(h.s2);
h.s3 ^= h.s1;
h.s2 = h.s2.rotate_left(62);
h.s2 = h.s2.wrapping_add(h.s3);
h.s0 ^= h.s2;
h.s3 = h.s3.rotate_left(34);
h.s3 = h.s3.wrapping_add(h.s0);
h.s1 ^= h.s3;
h.s0 = h.s0.rotate_left(5);
h.s0 = h.s0.wrapping_add(h.s1);
h.s2 ^= h.s0;
h.s1 = h.s1.rotate_left(36);
h.s1 = h.s1.wrapping_add(h.s2);
h.s3 ^= h.s1;
}
///
/// Mix all 4 inputs together so that h0, h1 are a hash of them all.
///
/// For two inputs differing in just the input bits
/// Where "differ" means xor or subtraction
/// And the base value is random, or a counting value starting at that bit
/// The final result will have each bit of h0, h1 flip
/// For every input bit,
/// with probability 50 +- .3% (it is probably better than that)
/// For every pair of input bits,
/// with probability 50 +- .75% (the worst case is approximately that)
///
#[inline(always)]
fn short_end(h: &mut State) {
h.s3 ^= h.s2;
h.s2 = h.s2.rotate_left(15);
h.s3 = h.s3.wrapping_add(h.s2);
h.s0 ^= h.s3;
h.s3 = h.s3.rotate_left(52);
h.s0 = h.s0.wrapping_add(h.s3);
h.s1 ^= h.s0;
h.s0 = h.s0.rotate_left(26);
h.s1 = h.s1.wrapping_add(h.s0);
h.s2 ^= h.s1;
h.s1 = h.s1.rotate_left(51);
h.s2 = h.s2.wrapping_add(h.s1);
h.s3 ^= h.s2;
h.s2 = h.s2.rotate_left(28);
h.s3 = h.s3.wrapping_add(h.s2);
h.s0 ^= h.s3;
h.s3 = h.s3.rotate_left(9);
h.s0 = h.s0.wrapping_add(h.s3);
h.s1 ^= h.s0;
h.s0 = h.s0.rotate_left(47);
h.s1 = h.s1.wrapping_add(h.s0);
h.s2 ^= h.s1;
h.s1 = h.s1.rotate_left(54);
h.s2 = h.s2.wrapping_add(h.s1);
h.s3 ^= h.s2;
h.s2 = h.s2.rotate_left(32);
h.s3 = h.s3.wrapping_add(h.s2);
h.s0 ^= h.s3;
h.s3 = h.s3.rotate_left(25);
h.s0 = h.s0.wrapping_add(h.s3);
h.s1 ^= h.s0;
h.s0 = h.s0.rotate_left(63);
h.s1 = h.s1.wrapping_add(h.s0);
}
#[inline(always)]
fn short(data: &[u8], hash: u128) -> u128 {
todo!();
}