Skip to content

Instantly share code, notes, and snippets.

@saidinesh5
Created January 24, 2015 18:36
Show Gist options
  • Save saidinesh5/97a6c31278787d8c7550 to your computer and use it in GitHub Desktop.
Save saidinesh5/97a6c31278787d8c7550 to your computer and use it in GitHub Desktop.
C++ implementation of a PerfectHash algorithm as described in: http://www.drdobbs.com/architecture-and-design/generating-perfect-hash-functions/184404506
/*
* Perfect Hash implementation as described in:
* http://www.drdobbs.com/architecture-and-design/generating-perfect-hash-functions/184404506
*
* Copyright 2015-2020 Dinesh Manajipet <saidinesh5@gmail.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of the nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <iostream>
#include <initializer_list>
#include <utility>
#include <cmath>
#include <algorithm>
using namespace std;
//Your Key must support std::div() and operator==
template <typename K, typename V,
const K invalid_key, const V invalid_value,
const int n> // maximum allowed number of buckets
class PerfectHash {
public:
PerfectHash( initializer_list< pair<K,V> > data )
{
if(!initializeData(data))
{
m_count = 0;
m_rows = 0;
}
}
PerfectHash( const vector< pair <K,V> >& data )
{
if(!initializeData(data))
{
m_count = 0;
m_rows = 0;
}
}
// How many (key,value) pairs could this list have taken
inline int capacity() const { return m_data.size(); }
// How many (key, value) pairs are present in this list
inline int count() const { return m_count; }
// Is there a (key,value) pair with this key?
inline bool contains(K key) const
{
const auto d = std::div(key , m_rows);
if(d.quot >= (int)m_rows)
return false;
const int offset = m_rowOffsets[d.quot];
if(offset < 0)
return false;
const unsigned int c = offset + d.rem;
return c < m_data.size() && m_data[ c ].first == key;
}
// Return the value associated with this key
// If nothing found, this returns a default value
inline V value(K key) const
{
const auto d = std::div(key, m_rows);
if(d.quot >= (int)m_rows)
return invalid_value;
const int offset = m_rowOffsets[d.quot];
if(offset < 0)
return invalid_value;
const unsigned int c = offset + d.rem;
if(c < m_data.size() && m_data[c].first == key)
return m_data[c].second;
return invalid_value;
}
//Overloading value(K key)
inline V operator[](K key) const { return value(key); }
private:
typedef vector< vector<K> > ScratchPad;
struct RowInfo { int index, count; };
bool initializeData(const vector< pair<K,V> >& rawData)
{
K kMax = max_element(rawData.begin(), rawData.end())->first;
m_rows = sqrt(kMax) + 1;
//The 2D array for inserting the keys into
ScratchPad scratchpad(m_rows, vector<K>(m_rows, invalid_key) );
vector<RowInfo> rowInfo;
//No row is shifted at all
for(unsigned int r = 0; r < m_rows; r++)
{
m_rowOffsets.push_back(-1);
rowInfo.push_back({r,0}); // Assume that each row has 0 keys
}
//Insert the keys into the Key Array
for(const auto element : rawData)
{
K k = element.first;
int r = k/m_rows;
int c = k%m_rows;
if(scratchpad[r][c] == invalid_key)
{
scratchpad[r][c] = k;
rowInfo[r].count++;
}
}
// Process the keys and populate the hashmap
return firstFitDescending(rawData, scratchpad, rowInfo);
}
bool firstFitDescending( const vector< pair<K,V> >& rawData,
ScratchPad& scratchPad,
vector<RowInfo> &rowInfo )
{
// A temporary large array to hold all the keys being hashed
vector<K> keys = vector<K>(n, invalid_key);
// 0. Sort the Rows based on the number of elements they contain
sort(rowInfo.begin(), rowInfo.end(),
[](const RowInfo& A, const RowInfo& B){ return A.count > B.count; });
// For each non-empty row:
for(int r = 0; rowInfo[r].count > 0; r++)
{
unsigned int row = rowInfo[r].index;
// 1. Shift the row right until none of its items collide with
// any of the items in previous rows.
for(unsigned int offset = 0; offset < n - m_rows - 1; offset++)
{
unsigned int y = 0;
for(y = 0; y < m_rows; y++)
{
if( (keys[offset + y] != invalid_key) && (scratchPad[row][y] != invalid_key) )
break;
}
if( y == m_rows )
{
// 2. Record the shift amount in m_rowOffsets
//cout << row << "--->" << offset << endl;
m_rowOffsets[row] = offset;
// 3. Copy the keys from this row into the keys already hashed
for(unsigned int x = 0; x < m_rows; x++)
{
if(scratchPad[row][x] != invalid_key)
{
keys[offset + x] = scratchPad[row][x];
//cout << "Keys[" << offset + x << "] = " << keys[offset+x] <<endl;
}
}
//Move on to the next row
break;
}
if(offset == n - m_rows - 1)
{
cout << "Error::Failed to fit the Row " << row << " in hash table." << endl;
return false;
}
}
}
// Now copy the elements of rawData into their appropriate buckets
int capacity = 0;
for(int i = 0; i < n; i++)
if(keys[i] != invalid_key) capacity = i + 1;
m_data = vector< pair<K,V> >(capacity, { invalid_key, invalid_value} );
m_count = 0;
for(unsigned int i = 0; i < m_data.size(); i++)
{
const auto currentKey = keys[i];
//Bother only if currentKey is a valid key
if( currentKey == invalid_key) continue;
for(const auto element : rawData)
{
if(element.first == currentKey)
{
//cout << "Bucket " << offset + x << " :: " << element.first << "," << element.second << endl;
m_data[i] = element;
break;
}
}
//cout << i << "::" << m_data[i].first << "," << m_data[i].second << endl;
m_count++;
}
return true;
}
unsigned int m_rows; // Number of rows in the scratchPad
vector<int> m_rowOffsets; // Offset of each row in m_data
unsigned int m_count; // Total number of keys hashed into m_data
vector< pair<K,V> > m_data; // Actual data
};
int main(int argc, char* argv[])
{
vector < pair<int, char> > items;
for(int i = 0; i<26; i++)
{
items.push_back( {i+'A',i+'a'} );
items.push_back( {i+'a',i+'A'} );
}
PerfectHash<int, char, -1, -1, 1000> hash{ items };
cout << "Hash capacity:" << hash.capacity() << endl;
cout << "Hash count:" << hash.count() << endl;
for(int i = 0 ; i < 125 ; i++ )
{
if(hash.contains(i)) cout << "Hash[" << (char)i << "] = " << (char)(hash[i]) << endl;
else cout << "Hash[" << i << "] doesn't exist" << endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment