Created
August 13, 2018 08:31
-
-
Save FlorianMerkle/3648fc0af44f3de24f594d429c949546 to your computer and use it in GitHub Desktop.
Implementation of a spell checker with mmap instead of malloc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Implements a dictionary's functionality | |
#include <ctype.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <strings.h> | |
#include <sys/mman.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <unistd.h> | |
#include "dictionary.h" | |
// Definitions | |
#define hashmod buckets | |
#define buckets 350000 | |
#define seedx 0 | |
// Global declarations | |
node *hashtable [buckets] = {NULL}; | |
unsigned int wcounter = 0; | |
int arraynr = 0; | |
// Returns true if word is in dictionary else false | |
bool check(const char *word) | |
{ | |
char lword[LENGTH + 1]; | |
strncpy(lword, word, strlen(word) + 1); | |
for (int i = 0; i < strlen(lword); i++) // Convert input to non-const, all-lower case string | |
{ | |
lword[i] = tolower(lword[i]); | |
} | |
arraynr = hash((unsigned const char *) lword, strlen(word), seedx) % buckets; // Assign hash-based index | |
node *travers = hashtable[arraynr]; // Create travers node | |
if (travers == NULL) | |
{ | |
return false; | |
} | |
while (travers->next != NULL) // Loop through SLL and compare every element | |
{ | |
if (strcmp(travers->value, lword) == 0) | |
{ | |
return true; | |
} | |
else | |
{ | |
travers = travers->next; | |
} | |
} | |
if (strcmp(travers->value, lword) == 0) // Check the last element | |
{ | |
return true; | |
} | |
else | |
{ | |
return false; | |
} | |
} | |
// Loads dictionary into memory, returning true if successful else false | |
bool load(const char *dictionary) | |
{ | |
char input [LENGTH + 1]; // Input buffer | |
FILE *dictptr = fopen(dictionary, "r"); | |
while (fscanf(dictptr, "%s", input) != EOF) // Scan whole document for strings and write each into SLL | |
{ | |
wcounter++; | |
int len = strlen(input); | |
arraynr = hash((unsigned const char *)input, len, seedx) % buckets; // Assign hash-based index | |
if (hashtable[arraynr] == NULL) // If it is the first element to be written into this SLL, create SLL | |
{ | |
//hashtable[arraynr] = malloc(sizeof(node)); // Allocate memory for (struct) node | |
hashtable[arraynr] = mmap(NULL, sizeof(node), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); | |
/*if (hashtable[arraynr] == NULL) // Check if mem allocation was successfull | |
{ | |
fprintf(stderr, "could not allocate memory\n"); | |
return false; | |
}*/ | |
strncpy(hashtable[arraynr]->value, input, len + 1); // Write value into node | |
hashtable[arraynr]->next = NULL; // Set adress to NULL | |
} | |
else // If the SLL already exist, insert a new node | |
{ | |
//node *newnode = malloc(sizeof(node)); // Allocate memory for buffer (struct) node | |
node *newnode = mmap(NULL, sizeof(node), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); | |
/*if (newnode == NULL) // Check if mem allocation was successfull | |
{ | |
fprintf(stderr, "could not allocate memory\n"); | |
return false; | |
}*/ | |
strncpy(newnode->value, input, len + 1); // Write value to the node | |
newnode->next = hashtable[arraynr]; // Connect the new node with (in front of) the old head of the list | |
hashtable[arraynr] = newnode; // Hashtable points now to the new node as the first node of the SLL | |
} | |
} | |
fclose(dictptr); | |
return true; | |
} | |
// Returns number of words in dictionary if loaded else 0 if not yet loaded | |
unsigned int size(void) | |
{ | |
return wcounter; | |
} | |
// Unloads dictionary from memory, returning true if successful else false | |
bool unload(void) | |
{ | |
/* for (int i = 0; i < buckets; i++) | |
{ | |
if (hashtable[i] != NULL) | |
{ | |
delete_sll(hashtable[i]); | |
} | |
}*/ | |
return true; | |
} | |
// Delete a whole SLL | |
void delete_sll (node *head) | |
{ | |
if (head->next == NULL) // If current element is the last element of the SLL | |
{ | |
free(head); // Free memory | |
} | |
else | |
{ | |
node *travers = head->next; // Else move to next element | |
delete_sll(travers); // Stop here on this stack and do it over again for the next element | |
free(head); // After all other elements are freeed, free memory of this element | |
} | |
} | |
// 32-bit Murmur 3 hash algoritm, code from Wikipedia @ https://en.wikipedia.org/wiki/MurmurHash | |
unsigned int hash(const unsigned char *key, int len, unsigned int seed) | |
{ | |
uint32_t h = seed; | |
if (len > 3) | |
{ | |
const uint32_t *key_x4 = (const uint32_t*) key; | |
size_t i = len >> 2; | |
do | |
{ | |
uint32_t k = *key_x4++; | |
k *= 0xcc9e2d51; | |
k = (k << 15) | (k >> 17); | |
k *= 0x1b873593; | |
h ^= k; | |
h = (h << 13) | (h >> 19); | |
h = (h * 5) + 0xe6546b64; | |
} | |
while (--i); | |
key = (const uint8_t*) key_x4; | |
} | |
if (len & 3) | |
{ | |
size_t i = len & 3; | |
uint32_t k = 0; | |
key = &key[i - 1]; | |
do | |
{ | |
k <<= 8; | |
k |= *key--; | |
} | |
while (--i); | |
k *= 0xcc9e2d51; | |
k = (k << 15) | (k >> 17); | |
k *= 0x1b873593; | |
h ^= k; | |
} | |
h ^= len; | |
h ^= h >> 16; | |
h *= 0x85ebca6b; | |
h ^= h >> 13; | |
h *= 0xc2b2ae35; | |
h ^= h >> 16; | |
return h; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment