Skip to content

Instantly share code, notes, and snippets.

@amakukha
Last active August 7, 2022 18:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amakukha/8d1ac179c42c13bff3d007b56a48dcd4 to your computer and use it in GitHub Desktop.
Save amakukha/8d1ac179c42c13bff3d007b56a48dcd4 to your computer and use it in GitHub Desktop.
Find k most frequent words in a file (fast implementation)
/*
* Solution of the Jon Bentley's k most frequent words problem using a prefix tree and
* a heap of size k. Worst case time complexity is O(N log k), where N is the total
* number of words.
*
* The problem is formulated in the Communications of the ACM 29,5 (May 1986), 364-369:
* "Given a text file and an integer k, you are to print the k
* most common words in the file (and the number of their
* occurrences) in decreasing frequency."
*
* Donald Knuth famously solved the problem by using hash tries in a an 8-pages-long
* WEB program. Doug McIlroy, in response, solved it in six lines of shell script
* reusing standard Unix utilities (even though his solution is suboptimal).
*/
// Original: https://www.geeksforgeeks.org/find-the-k-most-frequent-words-from-a-file/
// Licence: Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
// Modifications by Andriy Makukha:
// - increased max. word size
// - scanning input for alphabetic words only, ignoring the rest
// - taking arguments from command line
// - sorting output
#include <stdio.h>
#include <string>
#include <algorithm>
# define MAX_CHARS 26
# define BUFFER_SIZE 10000 // both max. word size and max. distance between words (+1)
// A Trie node
struct TrieNode
{
bool isEnd; // indicates end of word
unsigned frequency; // the number of occurrences of a word
int indexMinHeap; // the index of the word in minHeap
TrieNode* child[MAX_CHARS]; // represents 26 slots each for 'a' to 'z'.
};
// A Min Heap node
struct MinHeapNode
{
TrieNode* root; // indicates the leaf node of TRIE
unsigned frequency; // number of occurrences
char* word; // the actual word stored
};
// A Min Heap
struct MinHeap
{
unsigned capacity; // the total size a min heap
int count; // indicates the number of slots filled.
MinHeapNode* array; // represents the collection of minHeapNodes
};
bool compareMinHeapNodes(MinHeapNode n1, MinHeapNode n2) {
return n1.frequency > n2.frequency;
};
// A routine to create a new Trie node
TrieNode* newTrieNode()
{
// Allocate memory for Trie Node
TrieNode* trieNode = new TrieNode;
// Initialize values for new node
trieNode->isEnd = 0;
trieNode->frequency = 0;
trieNode->indexMinHeap = -1;
for( int i = 0; i < MAX_CHARS; ++i )
trieNode->child[i] = NULL;
return trieNode;
}
// A routine to create a Min Heap of given capacity
MinHeap* createMinHeap( int capacity )
{
MinHeap* minHeap = new MinHeap;
minHeap->capacity = capacity;
minHeap->count = 0;
// Allocate memory for array of min heap nodes
minHeap->array = new MinHeapNode [ minHeap->capacity ];
return minHeap;
}
// A routine to swap two min heap nodes. This function
// is needed in minHeapify
void swapMinHeapNodes ( MinHeapNode* a, MinHeapNode* b )
{
MinHeapNode temp = *a;
*a = *b;
*b = temp;
}
// This is the standard minHeapify function. It does one thing extra.
// It updates the minHeapIndex in Trie when two nodes are swapped in
// in min heap
void minHeapify( MinHeap* minHeap, int idx )
{
int left, right, smallest;
left = 2 * idx + 1;
right = 2 * idx + 2;
smallest = idx;
if ( left < minHeap->count &&
minHeap->array[ left ]. frequency <
minHeap->array[ smallest ]. frequency
)
smallest = left;
if ( right < minHeap->count &&
minHeap->array[ right ]. frequency <
minHeap->array[ smallest ]. frequency
)
smallest = right;
if( smallest != idx )
{
// Update the corresponding index in Trie node.
minHeap->array[ smallest ]. root->indexMinHeap = idx;
minHeap->array[ idx ]. root->indexMinHeap = smallest;
// Swap nodes in min heap
swapMinHeapNodes (&minHeap->array[ smallest ], &minHeap->array[ idx ]);
minHeapify( minHeap, smallest );
}
}
// A standard function to build a heap
void buildMinHeap( MinHeap* minHeap )
{
int n, i;
n = minHeap->count - 1;
for( i = ( n - 1 ) / 2; i >= 0; --i )
minHeapify( minHeap, i );
}
// Inserts a word to heap, the function handles the 3 cases explained above
void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word )
{
// Case 1: the word is already present in minHeap
if( (*root)->indexMinHeap != -1 )
{
++( minHeap->array[ (*root)->indexMinHeap ]. frequency );
// percolate down
minHeapify( minHeap, (*root)->indexMinHeap );
}
// Case 2: Word is not present and heap is not full
else if( minHeap->count < minHeap->capacity )
{
int count = minHeap->count;
minHeap->array[ count ]. frequency = (*root)->frequency;
minHeap->array[ count ]. word = strdup( word );
minHeap->array[ count ]. root = *root;
(*root)->indexMinHeap = minHeap->count;
++( minHeap->count );
buildMinHeap( minHeap );
}
// Case 3: Word is not present and heap is full. And frequency of word
// is more than root. The root is the least frequent word in heap,
// replace root with new word
else if ( (*root)->frequency > minHeap->array[0]. frequency )
{
minHeap->array[ 0 ]. root->indexMinHeap = -1;
minHeap->array[ 0 ]. root = *root;
minHeap->array[ 0 ]. root->indexMinHeap = 0;
minHeap->array[ 0 ]. frequency = (*root)->frequency;
// delete previously allocated memory and
free( minHeap->array[ 0 ]. word );
minHeap->array[ 0 ]. word = strdup( word );
minHeapify ( minHeap, 0 );
}
}
// Inserts a new word to both Trie and Heap
void insertUtil ( TrieNode** root, MinHeap* minHeap,
const char* word, const char* dupWord )
{
// Base Case
if ( *root == NULL )
*root = newTrieNode();
// There are still more characters in word
if ( *word != '\0' )
insertUtil ( &((*root)->child[ tolower( *word ) - 97 ]),
minHeap, word + 1, dupWord );
else // The complete word is processed
{
// word is already present, increase the frequency
if ( (*root)->isEnd )
++( (*root)->frequency );
else
{
(*root)->isEnd = 1;
(*root)->frequency = 1;
}
// Insert in min heap also
insertInMinHeap( minHeap, root, dupWord );
}
}
// add a word to Trie & min heap. A wrapper over the insertUtil
void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap)
{
insertUtil( root, minHeap, word, word );
}
// A routine to show results, The min heap
// contains k most frequent words so far, at any time
void displayMinHeap( MinHeap* minHeap )
{
// Sort in decreasing order of frequency
std::sort(minHeap->array, minHeap->array + minHeap->count, compareMinHeapNodes);
// print top K word with frequency
for(int i = 0; i < minHeap->count; ++i ) {
// convert the stored word to lowercase for consistency
for(int j = 0; minHeap->array[i].word[j]; j++)
minHeap->array[i].word[j] = tolower( minHeap->array[i].word[j] );
// display word and frequency
printf("%d %s\n",
minHeap->array[i].frequency,
minHeap->array[i].word
);
}
}
// The main function that takes a file as input, add words to heap
// and Trie, finally shows result from heap
void printKMostFreq( FILE* fp, int k )
{
// Create a Min Heap of Size k
MinHeap* minHeap = createMinHeap( k );
// Create an empty Trie
TrieNode* root = NULL;
// A buffer to store one word at a time (as well as junk inbetween words)
char buffer[BUFFER_SIZE];
// Prepare format strings for input scanning
char format[20];
sprintf(format, "%%%d[a-zA-Z]", (int)sizeof(buffer)-1);
char junkformat[20];
sprintf(junkformat, "%%%d[^a-zA-Z]", (int)sizeof(buffer)-1);
// Read words one by one from file. Insert the word in Trie and Min Heap
fscanf( fp, junkformat, buffer );
while( fscanf( fp, format, buffer ) != EOF ) {
insertTrieAndHeap(buffer, &root, minHeap);
fscanf( fp, junkformat, buffer );
}
// The Min Heap will have the k most frequent words, so print Min Heap nodes
displayMinHeap( minHeap );
}
int main(int argc, char* argv[])
{
if (argc < 3) {
printf("Usage: %s k filename\n", argv[0]);
printf(" k - number of most frequent words to display\n");
printf(" fn - text file name\n");
return 1;
}
int k = std::stoi(argv[1]);
FILE *fp = fopen (argv[2], "r");
if (fp == NULL)
printf ("file \"%s\" doesn't exist\n", argv[2]);
else
printKMostFreq (fp, k);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment