-
-
Save pervognsen/3a0429996ddf4271bcd0556f59fb7fc6 to your computer and use it in GitHub Desktop.
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <string.h> | |
const char *keywords[] = { | |
"completed_in", | |
"contributors", | |
"contributors_enabled", | |
"coordinates", | |
"count", | |
"created_at", | |
"default_profile", | |
"default_profile_image", | |
"description", | |
"display_url", | |
"entities", | |
"expanded_url", | |
"favorite_count", | |
"favorited", | |
"favourites_count", | |
"follow_request_sent", | |
"followers_count", | |
"following", | |
"friends_count", | |
"geo", | |
"geo_enabled", | |
"h", | |
"hashtags", | |
"id", | |
"id_str", | |
"in_reply_to_screen_name", | |
"in_reply_to_status_id", | |
"in_reply_to_status_id_str", | |
"in_reply_to_user_id", | |
"in_reply_to_user_id_str", | |
"indices", | |
"is_translation_enabled", | |
"is_translator", | |
"iso_language_code", | |
"lang", | |
"large", | |
"listed_count", | |
"location", | |
"max_id", | |
"max_id_str", | |
"media", | |
"media_url", | |
"media_url_https", | |
"medium", | |
"metadata", | |
"name", | |
"next_results", | |
"notifications", | |
"place", | |
"possibly_sensitive", | |
"profile_background_color", | |
"profile_background_image_url", | |
"profile_background_image_url_https", | |
"profile_background_tile", | |
"profile_banner_url", | |
"profile_image_url", | |
"profile_image_url_https", | |
"profile_link_color", | |
"profile_sidebar_border_color", | |
"profile_sidebar_fill_color", | |
"profile_text_color", | |
"profile_use_background_image", | |
"protected", | |
"query", | |
"refresh_url", | |
"resize", | |
"result_type", | |
"retweet_count", | |
"retweeted", | |
"retweeted_status", | |
"screen_name", | |
"search_metadata", | |
"since_id", | |
"since_id_str", | |
"sizes", | |
"small", | |
"source", | |
"source_status_id", | |
"source_status_id_str", | |
"statuses", | |
"statuses_count", | |
"symbols", | |
"text", | |
"thumb", | |
"time_zone", | |
"truncated", | |
"type", | |
"url", | |
"urls", | |
"user", | |
"user_mentions", | |
"utc_offset", | |
"verified", | |
"w", | |
}; | |
enum | |
{ | |
num_keywords = sizeof(keywords) / sizeof(*keywords) | |
}; | |
uint64_t signatures[num_keywords]; | |
void init(void) | |
{ | |
srand(0); | |
for (int i = 0; i < num_keywords; i++) | |
{ | |
const char *keyword = keywords[i]; | |
size_t keyword_len = strlen(keyword); | |
uint64_t signature = 0; | |
// The 7 high bytes of the signature are characters 0, 2, 4, etc. | |
for (int j = 0; 2 * j < keyword_len && j < 7; j++) | |
{ | |
signature = (uint8_t)keyword[2 * j] + signature * 256; | |
} | |
// The low byte of the signature is the length. | |
signature = keyword_len + signature * 256; | |
for (int j = 0; j < i; j++) | |
{ | |
if (signatures[j] == signature) | |
{ | |
printf("Signature collision.\n"); | |
exit(1); | |
} | |
} | |
signatures[i] = signature; | |
} | |
} | |
int check(uint64_t a, uint8_t max_collisions) | |
{ | |
uint8_t slot[256] = {0}; | |
for (int i = 0; i < num_keywords; i++) | |
{ | |
uint8_t hash = (uint8_t)((a * signatures[i]) >> 56); | |
if (slot[hash] > max_collisions) | |
return 0; | |
slot[hash]++; | |
} | |
return 1; | |
} | |
uint64_t rand16(void) | |
{ | |
return rand(); | |
} | |
uint64_t rand64(void) | |
{ | |
return rand16() | (rand16() << 16ull) | (rand16() << 32ull) | (rand16() << 48ull); | |
} | |
int main(int argc, char **argv) | |
{ | |
init(); | |
uint64_t max_iterations = 1ull << 40; | |
uint8_t max_collisions = 0; | |
for (uint64_t i = 0; i < max_iterations; i++) | |
{ | |
uint64_t a = rand64(); | |
// a = 15564265448486149432 | |
if (check(a, max_collisions)) | |
{ | |
printf("a = %llu\n", a); | |
return 0; | |
} | |
} | |
printf("No hash function with max %d collisions found.\n", max_collisions); | |
return 1; | |
} |
In function
init
, the index intokeyword[2*j]
can go out of bounds, i.e the loop condition should be(j < keyword_len / 2) && (j < 7)
?for (int j = 0; j < keyword_len && j < 7; j++) { signature = (uint8_t)keyword[2*j] + signature * 256; }
Thanks, I fixed it and also added the missing string.h include. The bounds condition should be 2 * j < keyword_len, which is not actually the same as i < keyword_len / 2 due to rounding (consider i = 1, keyword_len = 3); if you want to move the scaling to the other side of the inequality to fold it into the loop-invariant upper bound, you'd need to round up, i.e. use i < (keyword_len + 1) / 2.
Incidentally, one of the annoying things about rearranging equations and inequalities is that even when they're mathematically equivalent as statements about arbitrary-precision integers they're often inequivalent when you consider finite-precision overflow conditions. For example, 2 * j < keyword_len overflows when j > INT_MAX/2 (SIZE_MAX/2 if I'd made it a size_t) whereas keyword_len / 2 cannot overflow for any value of keyword_len and (keyword_len + 1) / 2 can only overflow for keyword_len = SIZE_MAX. In this program it doesn't matter anyway since j < 7 is part of the loop condition and with 64-bit indices and sizes we can often get away with being sloppier about this stuff since even an adversary would often need to push adversary-controlled strings into the O(2^64) length range to trigger overflows.
In function
init
, the index intokeyword[2*j]
can go out of bounds, i.e the loop condition should be(j < keyword_len / 2) && (j < 7)
?