Last active
April 25, 2024 12:52
-
-
Save pervognsen/3a0429996ddf4271bcd0556f59fb7fc6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <string.h> | |
const char *keywords[] = { | |
"completed_in", | |
"contributors", | |
"contributors_enabled", | |
"coordinates", | |
"count", | |
"created_at", | |
"default_profile", | |
"default_profile_image", | |
"description", | |
"display_url", | |
"entities", | |
"expanded_url", | |
"favorite_count", | |
"favorited", | |
"favourites_count", | |
"follow_request_sent", | |
"followers_count", | |
"following", | |
"friends_count", | |
"geo", | |
"geo_enabled", | |
"h", | |
"hashtags", | |
"id", | |
"id_str", | |
"in_reply_to_screen_name", | |
"in_reply_to_status_id", | |
"in_reply_to_status_id_str", | |
"in_reply_to_user_id", | |
"in_reply_to_user_id_str", | |
"indices", | |
"is_translation_enabled", | |
"is_translator", | |
"iso_language_code", | |
"lang", | |
"large", | |
"listed_count", | |
"location", | |
"max_id", | |
"max_id_str", | |
"media", | |
"media_url", | |
"media_url_https", | |
"medium", | |
"metadata", | |
"name", | |
"next_results", | |
"notifications", | |
"place", | |
"possibly_sensitive", | |
"profile_background_color", | |
"profile_background_image_url", | |
"profile_background_image_url_https", | |
"profile_background_tile", | |
"profile_banner_url", | |
"profile_image_url", | |
"profile_image_url_https", | |
"profile_link_color", | |
"profile_sidebar_border_color", | |
"profile_sidebar_fill_color", | |
"profile_text_color", | |
"profile_use_background_image", | |
"protected", | |
"query", | |
"refresh_url", | |
"resize", | |
"result_type", | |
"retweet_count", | |
"retweeted", | |
"retweeted_status", | |
"screen_name", | |
"search_metadata", | |
"since_id", | |
"since_id_str", | |
"sizes", | |
"small", | |
"source", | |
"source_status_id", | |
"source_status_id_str", | |
"statuses", | |
"statuses_count", | |
"symbols", | |
"text", | |
"thumb", | |
"time_zone", | |
"truncated", | |
"type", | |
"url", | |
"urls", | |
"user", | |
"user_mentions", | |
"utc_offset", | |
"verified", | |
"w", | |
}; | |
enum | |
{ | |
num_keywords = sizeof(keywords) / sizeof(*keywords) | |
}; | |
uint64_t signatures[num_keywords]; | |
void init(void) | |
{ | |
srand(0); | |
for (int i = 0; i < num_keywords; i++) | |
{ | |
const char *keyword = keywords[i]; | |
size_t keyword_len = strlen(keyword); | |
uint64_t signature = 0; | |
// The 7 high bytes of the signature are characters 0, 2, 4, etc. | |
for (int j = 0; 2 * j < keyword_len && j < 7; j++) | |
{ | |
signature = (uint8_t)keyword[2 * j] + signature * 256; | |
} | |
// The low byte of the signature is the length. | |
signature = keyword_len + signature * 256; | |
for (int j = 0; j < i; j++) | |
{ | |
if (signatures[j] == signature) | |
{ | |
printf("Signature collision.\n"); | |
exit(1); | |
} | |
} | |
signatures[i] = signature; | |
} | |
} | |
int check(uint64_t a, uint8_t max_collisions) | |
{ | |
uint8_t slot[256] = {0}; | |
for (int i = 0; i < num_keywords; i++) | |
{ | |
uint8_t hash = (uint8_t)((a * signatures[i]) >> 56); | |
if (slot[hash] > max_collisions) | |
return 0; | |
slot[hash]++; | |
} | |
return 1; | |
} | |
uint64_t rand16(void) | |
{ | |
return rand(); | |
} | |
uint64_t rand64(void) | |
{ | |
return rand16() | (rand16() << 16ull) | (rand16() << 32ull) | (rand16() << 48ull); | |
} | |
int main(int argc, char **argv) | |
{ | |
init(); | |
uint64_t max_iterations = 1ull << 40; | |
uint8_t max_collisions = 0; | |
for (uint64_t i = 0; i < max_iterations; i++) | |
{ | |
uint64_t a = rand64(); | |
// a = 15564265448486149432 | |
if (check(a, max_collisions)) | |
{ | |
printf("a = %llu\n", a); | |
return 0; | |
} | |
} | |
printf("No hash function with max %d collisions found.\n", max_collisions); | |
return 1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks, I fixed it and also added the missing string.h include. The bounds condition should be 2 * j < keyword_len, which is not actually the same as i < keyword_len / 2 due to rounding (consider i = 1, keyword_len = 3); if you want to move the scaling to the other side of the inequality to fold it into the loop-invariant upper bound, you'd need to round up, i.e. use i < (keyword_len + 1) / 2.
Incidentally, one of the annoying things about rearranging equations and inequalities is that even when they're mathematically equivalent as statements about arbitrary-precision integers they're often inequivalent when you consider finite-precision overflow conditions. For example, 2 * j < keyword_len overflows when j > INT_MAX/2 (SIZE_MAX/2 if I'd made it a size_t) whereas keyword_len / 2 cannot overflow for any value of keyword_len and (keyword_len + 1) / 2 can only overflow for keyword_len = SIZE_MAX. In this program it doesn't matter anyway since j < 7 is part of the loop condition and with 64-bit indices and sizes we can often get away with being sloppier about this stuff since even an adversary would often need to push adversary-controlled strings into the O(2^64) length range to trigger overflows.