Skip to content

Instantly share code, notes, and snippets.

@odeke-em
Last active August 29, 2015 14:02
Show Gist options
  • Save odeke-em/7b5e1597e633946087d3 to your computer and use it in GitHub Desktop.
Save odeke-em/7b5e1597e633946087d3 to your computer and use it in GitHub Desktop.
Dictionary implementation using a Trie.
#include <stdlib.h>
#include "element.h"
inline Bool hasNext(Element *e) { return e != NULL && e->next != NULL; }
inline Element *getNext(Element *e) { return e == NULL ? NULL : e->next; }
Element *addToTail(Element *sl, void *data) {
sl = addToTailWithMetaInfo(sl, data, 0);
return sl;
}
Element *addToTailWithMetaInfo(Element *sl, void *data, const int metaInfo) {
if (sl == NULL) {
sl = initElement(sl);
sl->value = data;
sl->metaInfo = metaInfo;
} else {
sl->next = initElement(sl->next);
sl->value = data;
sl->metaInfo = metaInfo;
sl->next = sl->next->next;
}
return sl;
}
Element *addToHeadWithRank(Element *sl, void *data, const double rank) {
sl = addToHead(sl, data);
if (sl != NULL) {
sl->rank = rank;
}
return sl;
}
Element *addToHead(Element *sl, void *data) {
if (sl == NULL) {
sl = initElement(sl);
sl->value = data;
} else {
Element *newElem = NULL;
newElem = initElement(newElem);
newElem->value = data;
newElem->next = sl;
sl = newElem;
}
return sl;
}
Element *initElement(Element *elem) {
if (elem == NULL) {
elem = (Element *)malloc(sizeof(Element));
}
elem->next = NULL;
elem->value = NULL;
elem->rank = 1; // Iniitally rank is at 100%
elem->metaInfo = 0;
elem->dTag = False; // Hasn't been discovered
return elem;
}
long int destroySList(Element *sl) {
long int nValueFrees = 0;
if (sl != NULL) {
// printf("Sl == NULL: %d\n", sl != NULL);
Element *tmp;
while (sl != NULL) {
tmp = sl->next;
#ifdef DEBUG
printf("Freeing:: curHead: %p Next: %p\n", sl, tmp);
#endif
if (sl->value != NULL) {
free(sl->value);
++nValueFrees;
}
free(sl);
sl = tmp;
}
sl = NULL;
}
return nValueFrees;
}
#ifndef _ELEMENT_H
#define _ELEMENT_H
#ifndef _BOOL_D
#define _BOOL_D
typedef enum {
False=0, True=1
} Bool;
#endif // _BOOL_H
typedef struct MetaRankMule_ {
int rank;
int metaInfo;
int hash;
} MetaRankMule;
typedef struct Element_ {
Bool dTag; // Discovery tag
void *value;
double rank;
int metaInfo;
struct Element_ *next;
} Element;
inline Bool hasNext(Element *e);
inline Element *getNext(Element *e);
Element *addToTail(Element *sl, void *data);
Element *addToTailWithMetaInfo(Element *sl, void *data, const int metaInfo);
Element *addToHeadWithRank(Element *sl, void *data, const double rank);
Element *addToHead(Element *sl, void *data);
Element *initElement(Element *elem);
long int destroySList(Element *sl);
#endif // _ELEMENT_H
#ifndef _ERRORS_H
#define _ERRORS_H
#include <stdio.h>
typedef enum {
TypeError, ValueError, IndexError, SyntaxError, BufferOverFlow,
AssertionError, NullPointerException, IndexOutOfBoundsException,
ZeroDivisionException, CorruptedDataException
} Exception;
#define raiseWarning(errMsg){\
fprintf(stderr,"\033[31m%s on line %d in function '%s' file '%s'\033[00m\n",\
errMsg,__LINE__,__func__,__FILE__);\
}
#define throwException(errCode,errMsg){\
if (errMsg != NULL){\
fprintf(stderr, "%s\n", #errMsg);\
}\
raiseWarning(#errCode);\
exit(-1);\
}
#undef assert
#define assert(validExpression){\
if (! (validExpression))\
raiseError((validExpression));\
}
#define raiseError(args) {\
fprintf(stderr, "Traceback most recent call at line: %d ", __LINE__);\
fprintf(stderr, "of file: %s\n\033[31m%s\033[00m\n", \
__FILE__, #args);\
exit(-2);\
}
#define raiseExceptionIfNull(expression){\
if (! expression)\
throwException(NullPointerException, expression);\
}
#endif
CC := gcc
radTest: radLoadWords.c radTrie.o element.o wordTransition.o
$(CC) -DTEST_LOAD_WORDS radTrie.o element.o wordTransition.o radLoadWords.c -o $@
%.o: %.c
$(CC) -c $< -o $@
clean:
rm -f *.o radTest
// Author: Emmanuel Odeke <odeke@ualberta.ca>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "radTrie.h"
#include "errors.h"
#include "radLoadWords.h"
#include "wordTransition.h"
#define tolower(x) (x | ('a' - 'A'))
#define BUF_SIZ 10 // Starting LinearizedTrie size for buffers
#define ALIGN_PAGE_SIZE(x, pageSize) (((x) + (pageSize) -1)/(pageSize) * (pageSize))
RTrie *fileToRTrie(const char *filePath) {
RTrie *rt = NULL;
int fd = open(filePath, 0, O_RDONLY);
if (fd < 0) {
raiseError(strerror(errno));
}
else {
struct stat st;
if (fstat(fd, &st) != 0) {
close(fd);
raiseError(strerror(errno));
}
else {
int pageSize = sysconf(_SC_PAGE_SIZE);
int mapLength = ALIGN_PAGE_SIZE(st.st_size, pageSize);
#ifdef DEBUG
printf("pageSize: %d mapLength: %d stSize: %d\n", pageSize, mapLength, st.st_size);
#endif
char *buf = mmap(NULL, mapLength, PROT_READ, MAP_SHARED, fd, 0);
if (buf == MAP_FAILED) {
raiseError(strerror(errno));
}
else {
register int i=0, j;
char c = '\0';
while (i < st.st_size && c != EOF) {
int wBufLen = 10;
j = 0;
char *wordIn = (char *)malloc(sizeof(char) * wBufLen);
while ((c = buf[i++]) != EOF && isalpha(c)) {
if (j >= wBufLen) {
wBufLen += 10;
wordIn = (char *)realloc(wordIn, sizeof(char) * wBufLen);
}
wordIn[j++] = tolower(c);
}
if (! j) {
if (wordIn != NULL)
free(wordIn);
}
else {
wordIn = (char *)realloc(wordIn, sizeof(char) * (j + 1));
wordIn[j] = '\0';
unsigned long int h = pjwCharHash(wordIn);
void *gotten = get(rt, h);
if (gotten == NULL)
rt = put(rt, h, (void *)wordIn, True);
else
free(wordIn);
}
}
// Now for the clean-up
if (munmap(buf, mapLength)) {
raiseWarning(strerror(errno));
}
}
}
close(fd);
}
return rt;
}
LinearizedTrie *linearizeRTrie(RTrie *rt, LinearizedTrie *mp) {
if (rt != NULL && rt->keys != NULL) {
if (rt->value != NULL)
mp = addToHead(mp, rt->value);
register unsigned int i=0;
while (i < BASE) {
if (*(rt->keys + i) != NULL)
mp = linearizeRTrie(*(rt->keys + i), mp);
++i;
}
}
return mp;
}
LinearizedTrie *destroyLinearizedTrie(LinearizedTrie *lt) {
// Won't be freeing the saved data since it's rightful owner is the Trie that got linearized
LinearizedTrie *next = NULL;
while (lt != NULL) {
next = lt->next;
free(lt);
lt = next;
}
return lt;
}
Element *matches(
const char *query, RTrie *dict,
const unsigned int ownRank, const double percentMatch
) {
Element *matchL = NULL;
if (query != NULL && dict != NULL) {
double threshHold = ownRank * percentMatch;
if (dict->meta == NULL)
dict->meta = linearizeRTrie(dict, (LinearizedTrie *)dict->meta);
LinearizedTrie *trav = (LinearizedTrie *)dict->meta;
while (trav != NULL) {
if (trav->value != NULL) {
int rank = getRank(query, (char *)trav->value);
if (rank >= threshHold)
matchL = addToHeadWithRank(matchL, trav->value, (double)rank/ownRank);
}
trav = trav->next;
}
}
return matchL;
}
Element *getCloseMatches(const char *query, RTrie *dict, const double percentMatch) {
if (query == NULL || dict == NULL)
return NULL;
else {
// First check if the query exists in the dict
void *check = get(dict, pjwCharHash(query));
// In case of any collisions, strcmp should help sort things out
if (check != NULL) {
Element *matchList = NULL;
matchList = addToHead(matchList, check);
return matchList;
}
else {
int ownRank = getRank(query, query);
return matches(query, dict, ownRank, percentMatch);
}
}
}
void printLinearizedTrie(LinearizedTrie *lt) {
if (lt != NULL) {
LinearizedTrie *trav = lt;
while (trav != NULL) {
printf("%s\n", (char *)trav->value);
trav = trav->next;
}
}
}
#ifdef TEST_LOAD_WORDS
int main() {
RTrie *rt = fileToRTrie(__FILE__);
printf("\033[47mDone loading words\033[00m\n");
LinearizedTrie *linear = NULL;
linear = linearizeRTrie(rt, linear);
printLinearizedTrie(linear);
linear = destroyLinearizedTrie(linear);
Element *matches = getCloseMatches("matchong\0", rt, 0.6);
printLinearizedTrie(matches);
matches = destroyLinearizedTrie(matches);
rt = destroyRTrie(rt);
return 0;
}
#endif
// Author: Emmanuel Odeke <odeke@ualberta.ca>
#ifndef _RADLOADWORDS_H
#define _RADLOADWORDS_H
#include "element.h"
typedef Element LinearizedTrie;
RTrie *fileToRTrie(const char *filePath);
Element *getCloseMatches(const char *query, RTrie *dict, const double percentMatch);
Element *matches(const char *query, RTrie *dict, const unsigned int r, const double percentMatch);
void printLinearizedTrie(LinearizedTrie *lt);
LinearizedTrie *linearizeRTrie(RTrie *rt, LinearizedTrie *mp);
LinearizedTrie *destroyLinearizedTrie(LinearizedTrie *lt);
#endif // _RADLOADWORDS.H
// Author: Emmanuel Odeke <odeke@ualberta.ca>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "radTrie.h"
#include "errors.h"
inline RTrie *allocRTrie(void) {
return (RTrie *)malloc(sizeof(RTrie));
}
unsigned int pjwCharHash(const char *srcW) {
// PJW hashing algorithm
unsigned int h = 0, g, i, srcLen = strlen(srcW) / sizeof(char);
for (i=0; i < srcLen; ++i) {
h = (h << 4) + srcW[i];
g = h & 0xf0000000;
if (g) {
h ^= (g >> 24);
h ^= g;
}
}
return h;
}
RTrie *newRTrie(void) {
RTrie *rt = allocRTrie();
assert(rt);
rt->EOS = 0;
rt->keys = NULL;
rt->value = NULL;
rt->availBlock = 0;
return rt;
}
RTrie *put(RTrie *rt, unsigned long int hash, void *data, Bool heapBool) {
if (rt == NULL) {
rt = newRTrie();
}
if (rt->keys == NULL) {
rt->keys = (RTrie **)malloc(sizeof(RTrie *) * BASE);
}
unsigned int residue = hash % BASE;
unsigned int bitPos = 1 << residue;
hash /= BASE;
if (! (rt->availBlock & bitPos)) {
rt->availBlock |= bitPos;
// printf("bP: %d residue: %d hash: %ld avB: %d\n", bitPos, residue, hash, rt->availBlock);
*(rt->keys + residue) = NULL;
}
if (hash || residue) {
*(rt->keys + residue) = put(*(rt->keys + residue), hash, data, heapBool);
}
else {
// TODO: Define what to do if the value was already present
// printf("setting eos for %p\n", data);
// printf("origValue: %s newValue: %s\n", (char *)rt->value, (char *)data);
rt->EOS = 1;
rt->value = data;
rt->valueIsHeapd = heapBool;
}
return rt;
}
void *__rAccess(RTrie *rt, unsigned long int hash, Bool isGetOp) {
if (rt == NULL)
return NULL;
else {
unsigned int residue = hash % BASE, bitPos;
bitPos = 1 << residue;
hash /= BASE;
if (! (rt->availBlock & bitPos))
return NULL;
else {
if (hash || residue)
return __rAccess(*(rt->keys + residue), hash, isGetOp);
else {
// printf("EOS: %d data: %p\n", rt->EOS, rt->value);
if (rt->EOS) {
if (isGetOp)
return rt->value;
else {
void *popd = rt->value;
rt->value = NULL;
rt->EOS = False;
return popd;
}
}
else
return NULL;
}
}
}
}
void *get(RTrie *rt, unsigned long int hash) {
return __rAccess(rt, hash, True);
}
void *pop(RTrie *rt, unsigned long int hash) {
return __rAccess(rt, hash, False);
}
RTrie *destroyRTrie(RTrie *rt) {
if (rt != NULL) {
if (rt->keys != NULL) {
unsigned int i=0, bitPos;
while (i < BASE) {
bitPos = 1<<i;
if (rt->availBlock & bitPos) {
// printf("bitPos: %d avB: %d\n", bitPos, rt->availBlock);
*(rt->keys + i) = destroyRTrie(*(rt->keys + i));
}
++i;
}
free(rt->keys);
rt->keys = NULL;
}
if (rt->valueIsHeapd && rt->value != NULL) {
free(rt->value);
rt->value = NULL;
}
// printf("Freeing rt: %p\n", rt);
free(rt);
rt = NULL;
}
return rt;
}
#ifdef TEST_RAD_TRIE
int main() {
return 0;
}
#endif
// Author: Emmanuel Odeke <odeke@ualberta.ca>
// Hash map implementation in the form of a trie.
// Helps solve the limited size of hashlists as well enables lookup, adding and removal
// For every step takes the modulo remainder of the hash
// given in, creating a path, the purpose is to preserve memory ie setting pointers of keys
// only if ever __rAccessed otherwise, they are left uninitialized
// given h = WXYZ and base M. Step 1: h % M = Z => Create path in Z, h /= M
// Step 2: h % M = Y => Create path in Y, h /= M
// ... Step 4: h % M = W => Set payload to data, and then EOS to True
#ifndef _RAD_TRIE_H
#define _RAD_TRIE_H
#ifndef _BOOL_D
#define _BOOL_D
typedef enum {
False=0, True=1
} Bool;
#endif // _BOOL_D
static unsigned int BASE = 10;
typedef struct RTrie_ {
void *value;
Bool EOS:1; // EndOfSequence
Bool valueIsHeapd:1;
struct RTrie_ **keys;
unsigned int availBlock; // Each set bit will indicate the presence of a key in that position
} RTrie;
inline RTrie *allocRTrie(void);
RTrie *newRTrie(void);
RTrie *destroyRTrie(RTrie *);
void *get(RTrie *rt, unsigned long int hash);
void *pop(RTrie *rt, unsigned long int hash);
void *__rAccess(RTrie *rt, unsigned long int hash, Bool isGetOp);
RTrie *put(RTrie *rt, unsigned long int hash, void *value, Bool heapBool);
unsigned int pjwCharHash(const char *srcW);
#endif
// Author: Emmanuel Odeke <odeke@ualberta.ca>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "errors.h"
#include "wordTransition.h"
#define ALPHA_SIZE 26
// #define DEBUG
void printIndexNode(IndexNode **r) {
int i=0;
for (i=0; i < ALPHA_SIZE; ++i) {
printf("%c => [", i + 'a');
IndexNode *t = *(r + i);
while (t != NULL) {
printf("%d", t->index);
if (t->next != NULL) printf(", ");
t = t->next;
}
printf("]\n");
}
}
void initEditStat(EditStat *est) {
if (est != NULL) {
est->moves = est->inplace = est->deletions = 0;
est->reuses = est->stringLen = est->additions = 0;
}
}
inline EditStat *allocEditStat() {
return (EditStat *)malloc(sizeof(EditStat));
}
EditStat *allocAndInitEditStat() {
EditStat *freshEditStat = allocEditStat();
if (freshEditStat == NULL) {
raiseError("Run out of memory");
} else {
initEditStat(freshEditStat);
}
return freshEditStat;
}
void printStat(const EditStat *est) {
if (est != NULL) {
printf(
"\033[32mInplace: %d Moves: %d\nReuses: %d Deletions: %d\nAdditions: %d\033[00m\n",
est->inplace, est->moves, est->reuses, est->deletions, est->additions
);
}
}
int getRank(const char *subject, const char *base) {
int rank = -1;
if (subject != NULL && base != NULL) {
IndexNode *r[ALPHA_SIZE];
int i;
for (i=0; i < ALPHA_SIZE; ++i) {
*(r + i) = NULL;
}
// For later ascending order traversal, add indices from high to low
i = strlen(subject)/1;
while (--i >= 0) {
if (isalpha(*(subject + i))) {
int index = tolower(*(subject + i)) - 'a';
IndexNode *t = (IndexNode *)malloc(sizeof(IndexNode));
t->index = i;
t->next = *(r + index);
*(r + index) = t;
}
}
// printIndexNode(r);
int reuses=0, moves=0, inplace=0, deletions=0, additions=0;
int baseLen = strlen(base)/1;
for (i =0; i < baseLen; ++i) {
if (isalpha(*(base + i))) {
int index = tolower(*(base + i)) - 'a';
if (*(r + index) == NULL) {
++deletions;
} else {
++reuses;
if ((*(r + index))->index == i) {
++inplace;
} else {
++moves;
}
// Time to pop it off
IndexNode *tmp = (*(r + index))->next;
free(*(r + index));
*(r + index) = tmp;
}
}
}
// Cleaning up
IndexNode **tIt = r, **tEnd = tIt + ALPHA_SIZE, *tmp;
while (tIt < tEnd) {
while (*tIt != NULL) {
tmp = (*tIt)->next;
free(*tIt);
*tIt = tmp;
++additions;
}
++tIt;
}
#ifdef DEBUG
printf("Add: %d Move: %d Delete: %d Keep: %d\n",
additions, moves, deletions, inplace
);
#endif
rank = (inplace * 3) + (moves * 2) + ((deletions + additions) * -1);
}
return rank;
}
#ifdef SAMPLE
int main() {
char *w = "googre\0", *base[] = {
"monk\0", "bolton\0", "goo gle\0"
};
char **trav = base, **end = base + sizeof(base)/sizeof(base[0]);
int setRank = getRank(w, w);
while (trav != end) {
printf("\033[33mTo get %s from %s\033[00m\n", w, *trav);
int rank = getRank(*trav, w); //w, *trav);
printf("Base: %d Rank : %d\n", setRank, rank);
++trav;
}
return 0;
}
#endif
// Author: Emmanuel Odeke <odeke@ualberta.ca>
#ifndef _WORD_TRANSITION_H
#define _WORD_TRANSITION_H
typedef unsigned int uint32;
typedef struct {
uint32 moves,
reuses,
inplace,
additions,
deletions,
stringLen;
} EditStat;
typedef struct IndexNode_ {
int index;
struct IndexNode_ *next;
} IndexNode;
void initEditStat(EditStat *est);
inline EditStat *allocEditStat(void);
EditStat *allocAndInitEditStat(void);
void printStat(const EditStat *);
void printIndexNode(IndexNode **r);
int getRank(const char *query, const char *from);
EditStat *getEditStats(const char *subject, const char *base);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment