Skip to content

Instantly share code, notes, and snippets.

@pinkeen
Last active September 8, 2019 20:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pinkeen/efcc81232e4c2d7c53837e97b68cc677 to your computer and use it in GitHub Desktop.
Save pinkeen/efcc81232e4c2d7c53837e97b68cc677 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdint.h>
/**
* Simple string hash with minimal memory requirements.
*
* This is meant for uCs with 128-512 byte SRAM -
* the purpose is to avoid having to store string
* identifier in RAM when parsing some incoming data.
*
* A hash can be computed as the string is consumed and
* then only the 16bit value needs to be compared instead
* of the buffering whole string.
*
* Of course this should consume char stream instead of
* iterating over buffered string.
*
* Collisions should not be an issue with small indentifier
* space to cover - maybe the hash size can be even decreased
* to 8bit?
*
* This is not crypto-safe! In fact the exact opposite πŸ˜…
**/
/* Use a simple xorshift hash function
* instead of storing the full mixing table. */
uint8_t T(uint8_t i) {
i ^= i >> 2;
i ^= i << 1;
return i;
}
/* Simple pearson hash algo. */
uint16_t H(char *c) {
uint8_t h[2];
while (*c != '\0') {
uint8_t *hb = h + ((size_t)c & 1);
*hb = T(*hb ^ (uint8_t)*c);
++c;
}
return h[0] + (h[1] << 8);
}
void print_str_hash(char *str) {
uint16_t HV = H(str);
printf("%05d [0x%04x] = H(\"%s\")\n", HV, HV, str);
}
int main() {
/* On a single core of i7-8700 the speed is roughly
* 27M iterations / second for the word "identifier"
* which translates to 270M chars/sec. */
for (unsigned int i = 0; i < 0xFFFFFFF; ++i) {
H("identifier");
}
print_str_hash("identifier");
print_str_hash("identifierB");
print_str_hash("identif1er");
print_str_hash("/identif1er");
print_str_hash("a.very.long.path.to[1].something");
print_str_hash("value");
print_str_hash("a_value_a");
print_str_hash("a_value_b");
print_str_hash("a_value_c");
print_str_hash("a_value_d");
print_str_hash("a_value_e");
print_str_hash("a_value_f");
print_str_hash("a_v4lue_f");
print_str_hash("a_valu3_f");
print_str_hash("a_valu3_f4324234");
print_str_hash("name");
print_str_hash("what to you mean ___ ??? !!Ε›Δ‡_.fewqrwe%$#@$#@$!(_@)#_)!8FJLDK:FJLDS:34234234!");
print_str_hash("aaaa");
print_str_hash("zzbb");
print_str_hash("00cc");
print_str_hash("__zz");
print_str_hash("abab");
print_str_hash("abba");
/* No reason why UTF8 would not work 🧐 */
print_str_hash("πŸ˜ŽπŸ˜…πŸŽ‰πŸ˜‚πŸŒ…");
}
/** Output **
59378 [0xe7f2] = H("identifier")
62100 [0xf294] = H("identifierB")
55527 [0xd8e7] = H("identif1er")
33255 [0x81e7] = H("/identif1er")
18000 [0x4650] = H("a.very.long.path.to[1].something")
17384 [0x43e8] = H("value")
56218 [0xdb9a] = H("a_value_a")
56223 [0xdb9f] = H("a_value_b")
56220 [0xdb9c] = H("a_value_c")
56214 [0xdb96] = H("a_value_d")
56213 [0xdb95] = H("a_value_e")
56208 [0xdb90] = H("a_value_f")
59280 [0xe790] = H("a_v4lue_f")
56204 [0xdb8c] = H("a_valu3_f")
13117 [0x333d] = H("a_valu3_f4324234")
21119 [0x527f] = H("name")
54759 [0xd5e7] = H("what to you mean ___ ??? !!Ε›Δ‡_.fewqrwe%$#@$#@$!(_@)#_)!8FJLDK:FJLDS:34234234!")
28784 [0x7070] = H("aaaa")
01799 [0x0707] = H("zzbb")
29298 [0x7272] = H("00cc")
40606 [0x9e9e] = H("__zz")
31088 [0x7970] = H("abab")
30076 [0x757c] = H("abba")
40919 [0x9fd7] = H("πŸ˜ŽπŸ˜…πŸŽ‰πŸ˜‚πŸŒ…")
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment