Skip to content

Instantly share code, notes, and snippets.

@mmozeiko mmozeiko/meow_hash_armv8.h
Last active Oct 27, 2018

Embed
What would you like to do?
Meow hash for ARMv8 (v0.2)
/* ========================================================================
Meow - A Fast Non-cryptographic Hash for Large Data Sizes
(C) Copyright 2018 by Molly Rocket, Inc. (https://mollyrocket.com)
See https://mollyrocket.com/meowhash for details.
========================================================================
zlib License
(C) Copyright 2018 Molly Rocket, Inc.
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
========================================================================
Source below is heavily modified to port to ARMv8 using Neon intrinsics.
======================================================================== */
#pragma once
#include <arm_neon.h>
#include <stdint.h>
#define MEOW_HASH_VERSION 2
#define MEOW_HASH_VERSION_NAME "0.2/Ragdoll"
#define MEOW_HASH_ALIGNMENT 1
#define MEOW_HASH_BLOCK_SIZE_SHIFT 8
#define meow_u8 uint8_t
#define meow_u32 uint32_t
#define meow_u64 uint64_t
#define meow_u128 uint8x16_t
typedef union meow_hash
{
meow_u128 u128;
meow_u64 u64[2];
meow_u32 u32[4];
} meow_hash;
#ifndef MEOW_API
#define MEOW_API static
#endif
typedef meow_hash meow_hash_implementation(meow_u64 seed, meow_u64 length, const void* data);
MEOW_API
meow_hash meow_hash_armv8(meow_u64 seed, meow_u64 length, const void* data)
{
const meow_u128 zero = vdupq_n_u8(0);
meow_u128 S0 = zero;
meow_u128 S1 = zero;
meow_u128 S2 = zero;
meow_u128 S3 = zero;
meow_u128 S4 = zero;
meow_u128 S5 = zero;
meow_u128 S6 = zero;
meow_u128 S7 = zero;
meow_u128 S8 = zero;
meow_u128 S9 = zero;
meow_u128 SA = zero;
meow_u128 SB = zero;
meow_u128 SC = zero;
meow_u128 SD = zero;
meow_u128 SE = zero;
meow_u128 SF = zero;
meow_u128 T0 = zero;
meow_u128 T1 = zero;
meow_u128 T2 = zero;
meow_u128 T3 = zero;
meow_u128 T4 = zero;
meow_u128 T5 = zero;
meow_u128 T6 = zero;
meow_u128 T7 = zero;
meow_u128 T8 = zero;
meow_u128 T9 = zero;
meow_u128 TA = zero;
meow_u128 TB = zero;
meow_u128 TC = zero;
meow_u128 TD = zero;
meow_u128 TE = zero;
meow_u128 TF = zero;
meow_u64 total = length;
const meow_u8* bytes = (meow_u8*)data;
meow_u64 blocks = length >> MEOW_HASH_BLOCK_SIZE_SHIFT;
length -= blocks << MEOW_HASH_BLOCK_SIZE_SHIFT;
while (blocks--)
{
S0 = vaesimcq_u8(vaesdq_u8(S0, T0)); T0 = vld1q_u8(bytes);
S0 = vaesimcq_u8(vaesdq_u8(S0, T0)); T0 = vld1q_u8(bytes);
S1 = vaesimcq_u8(vaesdq_u8(S1, T1)); T1 = vld1q_u8(bytes + 16);
S2 = vaesimcq_u8(vaesdq_u8(S2, T2)); T2 = vld1q_u8(bytes + 32);
S3 = vaesimcq_u8(vaesdq_u8(S3, T3)); T3 = vld1q_u8(bytes + 48);
S4 = vaesimcq_u8(vaesdq_u8(S4, T4)); T4 = vld1q_u8(bytes + 64);
S5 = vaesimcq_u8(vaesdq_u8(S5, T5)); T5 = vld1q_u8(bytes + 80);
S6 = vaesimcq_u8(vaesdq_u8(S6, T6)); T6 = vld1q_u8(bytes + 96);
S7 = vaesimcq_u8(vaesdq_u8(S7, T7)); T7 = vld1q_u8(bytes + 112);
S8 = vaesimcq_u8(vaesdq_u8(S8, T8)); T8 = vld1q_u8(bytes + 128);
S9 = vaesimcq_u8(vaesdq_u8(S9, T9)); T9 = vld1q_u8(bytes + 144);
SA = vaesimcq_u8(vaesdq_u8(SA, TA)); TA = vld1q_u8(bytes + 160);
SB = vaesimcq_u8(vaesdq_u8(SB, TB)); TB = vld1q_u8(bytes + 176);
SC = vaesimcq_u8(vaesdq_u8(SC, TC)); TC = vld1q_u8(bytes + 192);
SD = vaesimcq_u8(vaesdq_u8(SD, TD)); TD = vld1q_u8(bytes + 208);
SE = vaesimcq_u8(vaesdq_u8(SE, TE)); TE = vld1q_u8(bytes + 224);
SF = vaesimcq_u8(vaesdq_u8(SF, TF)); TF = vld1q_u8(bytes + 240);
bytes += 1 << MEOW_HASH_BLOCK_SIZE_SHIFT;
}
switch (length >> 4)
{
case 15: SE = vaesimcq_u8(vaesdq_u8(SE, TE)); TE = vld1q_u8(bytes + 224);
case 14: SD = vaesimcq_u8(vaesdq_u8(SD, TD)); TD = vld1q_u8(bytes + 208);
case 13: SC = vaesimcq_u8(vaesdq_u8(SC, TC)); TC = vld1q_u8(bytes + 192);
case 12: SB = vaesimcq_u8(vaesdq_u8(SB, TB)); TB = vld1q_u8(bytes + 176);
case 11: SA = vaesimcq_u8(vaesdq_u8(SA, TA)); TA = vld1q_u8(bytes + 160);
case 10: S9 = vaesimcq_u8(vaesdq_u8(S9, T9)); T9 = vld1q_u8(bytes + 144);
case 9: S8 = vaesimcq_u8(vaesdq_u8(S8, T8)); T8 = vld1q_u8(bytes + 128);
case 8: S7 = vaesimcq_u8(vaesdq_u8(S7, T7)); T7 = vld1q_u8(bytes + 112);
case 7: S6 = vaesimcq_u8(vaesdq_u8(S6, T6)); T6 = vld1q_u8(bytes + 96);
case 6: S5 = vaesimcq_u8(vaesdq_u8(S5, T5)); T5 = vld1q_u8(bytes + 80);
case 5: S4 = vaesimcq_u8(vaesdq_u8(S4, T4)); T4 = vld1q_u8(bytes + 64);
case 4: S3 = vaesimcq_u8(vaesdq_u8(S3, T3)); T3 = vld1q_u8(bytes + 48);
case 3: S2 = vaesimcq_u8(vaesdq_u8(S2, T2)); T2 = vld1q_u8(bytes + 32);
case 2: S1 = vaesimcq_u8(vaesdq_u8(S1, T1)); T1 = vld1q_u8(bytes + 16);
case 1: S0 = vaesimcq_u8(vaesdq_u8(S0, T0)); T0 = vld1q_u8(bytes);
default:;
}
if (length & 0xF)
{
meow_u128 partial;
if (total >= 16)
{
partial = vld1q_u8(bytes + length - 16);
}
else
{
partial = zero;
meow_u8* dest = (meow_u8*)&partial;
while (length--)
{
*dest++ = *bytes++;
}
}
SF = vaesimcq_u8(vaesdq_u8(SF, TF)); TF = partial;
}
S0 = veorq_u8(S0, T0);
S1 = veorq_u8(S1, T1);
S2 = veorq_u8(S2, T2);
S3 = veorq_u8(S3, T3);
S4 = veorq_u8(S4, T4);
S5 = veorq_u8(S5, T5);
S6 = veorq_u8(S6, T6);
S7 = veorq_u8(S7, T7);
S8 = veorq_u8(S8, T8);
S9 = veorq_u8(S9, T9);
SA = veorq_u8(SA, TA);
SB = veorq_u8(SB, TB);
SC = veorq_u8(SC, TC);
SD = veorq_u8(SD, TD);
SE = veorq_u8(SE, TE);
SF = veorq_u8(SF, TF);
meow_u128 M0 = S7;
meow_u128 M1 = zero;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SA;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S4;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S5;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SC;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S8;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S0;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S1;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S9;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SD;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S2;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S6;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SE;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = S3;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SB;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = SF;
uint64x2_t mixer64 = vcombine_u64(vcreate_u64(seed - total), vcreate_u64(seed + total + 1));
meow_u128 mixer = vreinterpretq_u8_u64(mixer64);
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = mixer;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = mixer;
M0 = vaesimcq_u8(vaesdq_u8(M0, M1)); M1 = mixer;
meow_hash Result;
Result.u128 = veorq_u8(M0, M1);
return(Result);
}
#define MEOW_API static __attribute__((noinline))
#include "meow_hash_armv8.h"
// header and kernel module helper to access PMU cycle counter from user-space code
// download & build from https://github.com/zhiyisun/enable_arm_pmu
#include "enable_arm_pmu/armpmu_lib.h"
#define __rdtsc() read_pmu()
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ArrayCount(Array) (sizeof(Array)/sizeof((Array)[0]))
typedef struct
{
meow_u64 Size;
meow_u64 Clocks;
} best_result;
static void
FuddleBuffer(meow_u64 Size, void *Buffer)
{
// NOTE(casey): This code is here for literally no purpose other than to prevent CLANG from
// optimizing out loads, since apparently it thinks it doesn't have to actually read from
// uninitialized memory, WHICH IS ABSURD and this whole undefined behavior thing is completely
// unacceptable. Spec writers are ALL FIRED.
meow_u8 *Dest = (meow_u8 *)Buffer;
for(meow_u64 Index = 0;
Index < Size;
++Index)
{
Dest[Index] = 13*Index;
}
}
static void
PrintSize(FILE *Stream, double Size, int Fixed)
{
char *Suffix = Fixed ? (char *)"b " : (char *)"b";
if(Size >= 1024.0)
{
Suffix = (char *)"kb";
Size /= 1024.0;
if(Size >= 1024.0)
{
Suffix = (char *)"mb";
Size /= 1024.0;
if(Size >= 1024.0)
{
Suffix = (char *)"gb";
Size /= 1024.0;
}
}
}
fprintf(Stream, Fixed ? "%4.0f%s" : "%0.0f%s", Size, Suffix);
}
int main()
{
enable_pmu(0x008);
meow_u64 MaxClocksWithoutDrop = 4000000000ULL/8;
best_result Bests[40] = {};
double BytesPerCycle[ArrayCount(Bests)] = {};
{
int BestIndex = 0;
Bests[BestIndex++].Size = 1;
Bests[BestIndex++].Size = 7;
Bests[BestIndex++].Size = 8;
Bests[BestIndex++].Size = 15;
Bests[BestIndex++].Size = 16;
Bests[BestIndex++].Size = 31;
Bests[BestIndex++].Size = 32;
Bests[BestIndex++].Size = 63;
Bests[BestIndex++].Size = 64;
Bests[BestIndex++].Size = 127;
Bests[BestIndex++].Size = 128;
Bests[BestIndex++].Size = 255;
Bests[BestIndex++].Size = 256;
Bests[BestIndex++].Size = 511;
Bests[BestIndex++].Size = 512;
Bests[BestIndex++].Size = 1023;
Bests[BestIndex++].Size = 1024;
meow_u64 Size = Bests[BestIndex - 1].Size;
while (BestIndex < ArrayCount(Bests))
{
Size *= 2;
Bests[BestIndex++].Size = Size;
}
}
{
fprintf(stderr, "Single-threaded performance:\n");
for (int Batch = 0; Batch < ArrayCount(Bests); ++Batch)
{
best_result *ThisBest = Bests + Batch;
meow_u64 Size = ThisBest->Size;
ThisBest->Clocks = (meow_u64)-1ULL;
void *Buffer = aligned_alloc(MEOW_HASH_ALIGNMENT, Size);
if (Buffer)
{
FuddleBuffer(Size, Buffer);
fprintf(stderr, " Fewest cycles to hash ");
PrintSize(stderr, Size, 0);
meow_u64 ClocksSinceLastDrop = 0;
meow_u64 BestClocks = (meow_u64)-1ULL;
int TryIndex = 0;
while ((TryIndex < 10) || (ClocksSinceLastDrop < MaxClocksWithoutDrop))
{
meow_u64 StartClock = __rdtsc();
meow_hash h = meow_hash_armv8(0, Size, Buffer);
meow_u64 EndClock = __rdtsc();
volatile meow_hash h0;
h0 = h;
meow_u64 Clocks = EndClock - StartClock;
ClocksSinceLastDrop += Clocks;
if (BestClocks > Clocks)
{
ClocksSinceLastDrop = 0;
BestClocks = Clocks;
}
++TryIndex;
}
double BPC = (double)Size / (double)BestClocks;
fprintf(stderr, "%10.0f (%3.03f bytes/cycle)\n", (double)BestClocks, BPC);
fflush(stderr);
BytesPerCycle[Batch] = BPC;
if (ThisBest->Clocks > BestClocks)
{
ThisBest->Clocks = BestClocks;
}
free(Buffer);
}
}
}
fprintf(stderr, "\n");
fprintf(stderr, "Leaderboard:\n");
for (int BestIndex = 0; BestIndex < ArrayCount(Bests); ++BestIndex)
{
best_result *Best = Bests + BestIndex;
fprintf(stderr, " ");
PrintSize(stderr, Best->Size, 1);
double BPC = (double)Best->Size / (double)Best->Clocks;
fprintf(stderr, ": %10.0f (%3.03f bytes/cycle)\n", (double)Best->Clocks, BPC);
}
fprintf(stderr, "\n");
disable_pmu(0x008);
}
# Measured on Pine A64, Cortex-A53, 1152Mhz
# http://wiki.pine64.org/index.php/PINE_A64_Main_Page#SoC_and_Memory_Specification
# benchmark compiled with: clang -mcpu=cortex-a53 -O3 meow_hash_bench.c -o meow_hash_bench.exe
# clang version 7.0.0
# disassembly of hash function: https://godbolt.org/z/R3w6mV
$ ./meow_hash_bench.exe
[...]
1b : 126 (0.008 bytes/cycle)
7b : 150 (0.047 bytes/cycle)
8b : 154 (0.052 bytes/cycle)
15b : 182 (0.082 bytes/cycle)
16b : 109 (0.147 bytes/cycle)
31b : 122 (0.254 bytes/cycle)
32b : 112 (0.286 bytes/cycle)
63b : 128 (0.492 bytes/cycle)
64b : 118 (0.542 bytes/cycle)
127b : 141 (0.901 bytes/cycle)
128b : 130 (0.985 bytes/cycle)
255b : 164 (1.555 bytes/cycle)
256b : 137 (1.869 bytes/cycle)
511b : 199 (2.568 bytes/cycle)
512b : 168 (3.048 bytes/cycle)
1023b : 261 (3.920 bytes/cycle)
1kb: 231 (4.433 bytes/cycle)
2kb: 354 (5.785 bytes/cycle)
4kb: 602 (6.804 bytes/cycle)
8kb: 1098 (7.461 bytes/cycle)
16kb: 2090 (7.839 bytes/cycle)
32kb: 4074 (8.043 bytes/cycle)
64kb: 8042 (8.149 bytes/cycle)
128kb: 15978 (8.203 bytes/cycle)
256kb: 31850 (8.231 bytes/cycle)
512kb: 63594 (8.244 bytes/cycle)
1mb: 127082 (8.251 bytes/cycle)
2mb: 254058 (8.255 bytes/cycle)
4mb: 508010 (8.256 bytes/cycle)
8mb: 1015914 (8.257 bytes/cycle)
16mb: 2036333 (8.239 bytes/cycle)
32mb: 4073273 (8.238 bytes/cycle)
64mb: 8147040 (8.237 bytes/cycle)
128mb: 16295195 (8.237 bytes/cycle)
256mb: 32595564 (8.235 bytes/cycle)
512mb: 65195721 (8.235 bytes/cycle)
1gb: 130394613 (8.235 bytes/cycle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.