Skip to content

Instantly share code, notes, and snippets.

@douglas-vaz
Last active December 25, 2015 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save douglas-vaz/7019574 to your computer and use it in GitHub Desktop.
Save douglas-vaz/7019574 to your computer and use it in GitHub Desktop.
Glib Bigrams using a 2-level hash table
/**
-#Copyright (c) 2013 Douglas Vaz, Sharvari Bhosale
-#
-#Permission is hereby granted, free of charge, to any person obtaining a copy
-#of this software and associated documentation files (the "Software"), to deal
-#in the Software without restriction, including without limitation the rights
-#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#copies of the Software, and to permit persons to whom the Software is
-#furnished to do so, subject to the following conditions:
-#
-#The above copyright notice and this permission notice shall be included in
-#all copies or substantial portions of the Software.
-#
-#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-#THE SOFTWARE.
-
*/
#include <glib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
* Discard the rest of the current stdin line, including its newline.
* Also stops at end-of-file or on a read error, so input that ends without
* a trailing newline cannot make this loop forever.
* http://stackoverflow.com/questions/7898215/how-to-clear-input-buffer-in-c
*/
void clear (void)
{
    int c;

    do {
        c = getchar();
    } while (c != '\n' && c != EOF);
}
/*
* GHFunc callback: print one key-value pair.
* key       - NUL-terminated string used as the hash table key
* value     - integer count packed into the pointer with GINT_TO_POINTER
* user_data - printf format string with one %s and one %d conversion, in that order
*/
void value_iterator(gpointer key, gpointer value, gpointer user_data) {
    /* Unpack the count first: handing the raw pointer to a %d conversion is
       undefined behaviour and breaks where pointers and ints differ in size. */
    printf((const gchar *)user_data, (const gchar *)key, GPOINTER_TO_INT(value));
}
/*
* GHFunc callback for the outer table: prints the key with the format given
* in user_data, then prints every entry of the nested hash table stored as
* the value.
*/
void iterator(gpointer key, gpointer value, gpointer user_data) {
    GHashTable *nested = value;
    const gchar *heading = user_data;

    printf(heading, key);
    g_hash_table_foreach(nested, (GHFunc)value_iterator, "{%s: %d}\n");
}
/*
* Split a line into lowercase tokens.
* str - NUL-terminated input line; it is not modified
* len - unused; kept so existing callers keep compiling
* Returns a newly allocated, NULL-terminated string vector that the caller
* releases with g_strfreev(). Adjacent delimiters produce empty tokens.
*/
gchar** tokenize(gchar* str, const gint len)
{
    /* NOTE(review): g_strsplit_set() matches delimiters byte by byte, so the
       multi-byte quote at the end of this set is treated as its individual
       bytes - confirm that this is acceptable for non-ASCII input. */
    const gchar *delims = " !,\"'-;.?‘";
    gchar *lowered;
    gchar **tokens;

    (void)len;

    lowered = g_ascii_strdown(str, -1);
    tokens = g_strsplit_set(lowered, delims, -1);
    /* g_strsplit_set() copies every token, so the lowercase copy is no
       longer needed; without this free each call leaked it. */
    g_free(lowered);
    return tokens;
}
/*
* Update the word counts and the bigram counts from one tokenized line.
*
* tokens - NULL-terminated token vector; empty strings are skipped. The
*          token strings are stored in the tables as keys without being
*          copied, so they must stay allocated while the tables are in use.
* bigram - in/out: table mapping a word to a nested table
*          {following word -> count}; created here when *bigram is NULL.
* counts - in/out: table mapping a word to its number of occurrences;
*          created here when *counts is NULL.
*
* A word with no following word on the line still gets an (empty) nested
* table in *bigram.
*/
void bigrams(gchar** tokens, GHashTable** bigram, GHashTable** counts)
{
    gint i, j, count;
    GHashTable* values;

    if(*bigram == NULL){
        *bigram = g_hash_table_new(g_str_hash, g_str_equal);
    }
    if(*counts == NULL){
        *counts = g_hash_table_new(g_str_hash, g_str_equal);
    }
    if(tokens == NULL)
        return;

    //Word counts
    for(i = 0; tokens[i] != NULL; ++i)
    {
        if(tokens[i][0] == '\0')
            continue;
        count = GPOINTER_TO_INT(g_hash_table_lookup(*counts, tokens[i]));
        //Increment the integer, not the pointer: arithmetic on void* is not standard C
        g_hash_table_insert(*counts, tokens[i], GINT_TO_POINTER(count + 1));
    }

    //Bigram counts
    i = 0;
    while(tokens[i] != NULL)
    {
        //Set i to a non-empty token
        while(tokens[i] != NULL && tokens[i][0] == '\0')
            ++i;
        if(tokens[i] == NULL)
            break;

        //Add token as a key if it doesn't exist
        values = g_hash_table_lookup(*bigram, tokens[i]);
        if(values == NULL)
        {
            values = g_hash_table_new(g_str_hash, g_str_equal);
            g_hash_table_insert(*bigram, tokens[i], values);
        }

        //Set j to next non-empty token
        j = i + 1;
        while(tokens[j] != NULL && tokens[j][0] == '\0')
            ++j;
        if(tokens[j] == NULL)
            break;

        //Increment the count of token[j] occurring right after token[i].
        //The nested table is updated in place, so it does not need to be
        //looked up again or re-inserted into *bigram.
        count = GPOINTER_TO_INT(g_hash_table_lookup(values, tokens[j])) + 1;
        g_hash_table_insert(values, tokens[j], GINT_TO_POINTER(count));

        i = j;
    }
}
int main(void)
{
gint n, line;
const gint size = 1000;
gchar *str = (char*)malloc(size); //Input string
gchar** result; //Array of tokens
GHashTable* bigram = g_hash_table_new(g_str_hash, g_str_equal);
GHashTable* counts = g_hash_table_new(g_str_hash, g_str_equal);
//Read number of lines
scanf("%d", &n);
clear();
for (line = 0; line < n; ++line)
{
//Read lines from stdin
scanf("%[^\n]",str);
clear();
result = tokenize(str, size);
bigrams(result, &bigram, &counts);
}
//Display all words with counts
g_hash_table_foreach(bigram, (GHFunc)iterator, "%s:\n");
g_hash_table_foreach(counts, (GHFunc)value_iterator, "%s: %d\n");
//Cleanup
g_strfreev(result);
g_hash_table_destroy(bigram);
g_hash_table_destroy(counts);
g_free(str);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment