matklad/hash_tables.cpp

## hash_tables.cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <forward_list>
#include <iostream>
#include <list>
#include <unordered_set>
#include <vector>

// Better: power of two and a proper hash function.
const size_t capacity = 1'000'007; // Yay C++14!!
const double load_factor = 0.9;

struct std_us {
  std_us() : data_() { data_.reserve(capacity); }

  void add(uint32_t x) { data_.insert(x); }
  bool contains(uint32_t x) { return data_.find(x) != end(data_); }

private:
  std::unordered_set<uint32_t> data_;
};

struct chaining {
  chaining() : data_(capacity, std::forward_list<uint32_t>()) {}

  void add(uint32_t x) {
    size_t idx = hash(x);
    // duplicates :(
    data_[idx].push_front(x);
  }

  bool contains(uint32_t x) {
    auto const &bucket = data_[hash(x)];
    return std::find(begin(bucket), end(bucket), x) != end(bucket);
  }

private:
  // forward_list -> vector = 3x as much space.
  // Can steal one element though...
  std::vector<std::forward_list<uint32_t>> data_;

  size_t hash(uint32_t x) { return x % capacity; }
};

struct chaining2h {
  chaining2h() : data_(capacity, std::list<uint32_t>()) {}

  void add(uint32_t x) {
    auto &bucket1 = data_[hash1(x)];
    auto &bucket2 = data_[hash1(x)];
    (bucket1.size() < bucket2.size() ? bucket1 : bucket2).push_front(x);
  }

  bool contains(uint32_t x) {
    return bucket_contains(data_[hash1(x)], x) ||
           bucket_contains(data_[hash2(x)], x);
  }

private:
  bool bucket_contains(std::list<uint32_t> const &bucket, uint32_t x) {
    return std::find(begin(bucket), end(bucket), x) != end(bucket);
  }
  std::vector<std::list<uint32_t>> data_;

  size_t hash1(uint32_t x) { return x % capacity; }
  size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
};

struct linear_probing {
  linear_probing() : data_(capacity, EMPTY) {}

  void add(uint32_t x) { data_[insertion_index(x)] = x; }

  bool contains(uint32_t x) { return data_[insertion_index(x)] == x; }

private:
  const uint32_t EMPTY = 0;
  const uint32_t TOMBSTONE = 1;
  std::vector<uint32_t> data_;

  size_t insertion_index(uint32_t x) {
    assert(x != EMPTY && x != TOMBSTONE); // :(
    auto index = hash(x);
    while (data_[index] != 0 && data_[index] != x) {
      index = (index + 1) % capacity;
    }
    return index;
  }

  size_t hash(uint32_t x) { return x % capacity; }
};

struct robin_hood {
  robin_hood() : data_(capacity, EMPTY) {}

  void add(uint32_t x) {
    assert(x != EMPTY && x != TOMBSTONE); // :(
    auto index = hash(x);
    size_t curr_probe = 0;
    while (data_[index] != 0 && data_[index] != x) {
      auto existing_probe = probe_dist(index);
      if (existing_probe < curr_probe) {
        std::swap(x, data_[index]);
        curr_probe = existing_probe;
      }
      index = (index + 1) % capacity;
      ++curr_probe;
    }
    data_[index] = x;
  }

  bool contains(uint32_t x) {
    assert(x != EMPTY && x != TOMBSTONE); // :(
    auto index = hash(x);
    size_t curr_probe = 0;
    while (data_[index] != 0 && data_[index] != x) {
      auto existing_probe = probe_dist(index);
      if (existing_probe < curr_probe) {
        return false; // better than plain linear probing
      }
      index = (index + 1) % capacity;
      ++curr_probe;
    }
    return data_[index] == x;
  }

private:
  const uint32_t EMPTY = 0;
  const uint32_t TOMBSTONE = 1;
  std::vector<uint32_t> data_;

  size_t hash(uint32_t x) { return x % capacity; }

  size_t probe_dist(size_t index) {
    auto ideal_index = hash(data_[index]);
    return (index + capacity - ideal_index) % capacity;
  }
};

struct cuckoo {
  const uint32_t EMPTY = 0;

  void add(uint32_t x) {
    for (;;) {
      assert(x != EMPTY);
      auto index = hash1(x);
      std::swap(x, data1_[index]);
      if (x == EMPTY)
        return;

      index = hash2(x);
      std::swap(x, data2_[index]);
      if (x == EMPTY)
        return;
    }
  }

  bool contains(uint32_t x) {
    assert(x != EMPTY);
    return data1_[hash1(x)] == x || data2_[hash2(x)] == x;
  }

  cuckoo() : data1_(capacity, EMPTY), data2_(capacity, EMPTY) {}

private:
  std::vector<uint32_t> data1_;
  std::vector<uint32_t> data2_;
  size_t hash1(uint32_t x) { return x % capacity; }
  size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
};

// use std::chrono
clock_t tick_begin;

void tick() { tick_begin = clock(); }

double tack() { return (clock() - tick_begin); }

std::vector<uint32_t> random_numbers() {
  // use proper C++ random numbers!
  std::vector<uint32_t> result;
  for (size_t i = 0; i < capacity * load_factor; i++) {
    result.push_back(rand());
  }
  return result;
}

template <typename T>
void bench(std::vector<uint32_t> const &to_add,
           std::vector<uint32_t> const &to_lookup) {
  srand(time(nullptr));
  T map;

  tick();
  for (auto x : to_add) {
    map.add(x);
  }
  std::cout << "insertion         " << tack() << std::endl;

  tick();
  for (auto x : to_add) {
    if (!map.contains(x)) {
      std::cout << "oups :(" << std::endl;
    }
  }
  std::cout << "successful lookup " << tack() << std::endl;

  tick();
  uint32_t num_failures = 0;
  for (auto x : to_lookup) {
    if (!map.contains(x)) {
      num_failures += 1;
    }
  }

  std::cout << "failed lookup     " << tack() << std::endl;
  std::cout << "failures          " << num_failures << std::endl;
}

int main() {
  auto to_add = random_numbers();
  auto to_lookup = random_numbers();
  std::cout << "std::unordered_set" << std::endl;
  bench<std_us>(to_add, to_lookup);

  std::cout << "\nchaining" << std::endl;
  bench<chaining>(to_add, to_lookup);

  std::cout << "\nchaining two hashes" << std::endl;
  bench<chaining2h>(to_add, to_lookup);

  std::cout << "\nlinear probing" << std::endl;
  bench<linear_probing>(to_add, to_lookup);

  std::cout << "\nrobin hood" << std::endl;
  bench<robin_hood>(to_add, to_lookup);

  //  std::cout << "\ncuckoo" << std::endl;
  //  bench<cuckoo>(to_add, to_lookup);
}
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstdlib>
	#include <ctime>
	#include <forward_list>
	#include <iostream>
	#include <list>
	#include <unordered_set>
	#include <vector>

	// Better: power of two and a proper hash function.
	const size_t capacity = 1'000'007; // Yay C++14!!
	const double load_factor = 0.9;

	struct std_us {
	std_us() : data_() { data_.reserve(capacity); }

	void add(uint32_t x) { data_.insert(x); }
	bool contains(uint32_t x) { return data_.find(x) != end(data_); }

	private:
	std::unordered_set<uint32_t> data_;
	};

	struct chaining {
	chaining() : data_(capacity, std::forward_list<uint32_t>()) {}

	void add(uint32_t x) {
	size_t idx = hash(x);
	// duplicates :(
	data_[idx].push_front(x);
	}

	bool contains(uint32_t x) {
	auto const &bucket = data_[hash(x)];
	return std::find(begin(bucket), end(bucket), x) != end(bucket);
	}

	private:
	// forward_list -> vector = 3x as much space.
	// Can steal one element though...
	std::vector<std::forward_list<uint32_t>> data_;

	size_t hash(uint32_t x) { return x % capacity; }
	};

	struct chaining2h {
	chaining2h() : data_(capacity, std::list<uint32_t>()) {}

	void add(uint32_t x) {
	auto &bucket1 = data_[hash1(x)];
	auto &bucket2 = data_[hash1(x)];
	(bucket1.size() < bucket2.size() ? bucket1 : bucket2).push_front(x);
	}

	bool contains(uint32_t x) {
	return bucket_contains(data_[hash1(x)], x) \|\|
	bucket_contains(data_[hash2(x)], x);
	}

	private:
	bool bucket_contains(std::list<uint32_t> const &bucket, uint32_t x) {
	return std::find(begin(bucket), end(bucket), x) != end(bucket);
	}
	std::vector<std::list<uint32_t>> data_;

	size_t hash1(uint32_t x) { return x % capacity; }
	size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
	};

	struct linear_probing {
	linear_probing() : data_(capacity, EMPTY) {}

	void add(uint32_t x) { data_[insertion_index(x)] = x; }

	bool contains(uint32_t x) { return data_[insertion_index(x)] == x; }

	private:
	const uint32_t EMPTY = 0;
	const uint32_t TOMBSTONE = 1;
	std::vector<uint32_t> data_;

	size_t insertion_index(uint32_t x) {
	assert(x != EMPTY && x != TOMBSTONE); // :(
	auto index = hash(x);
	while (data_[index] != 0 && data_[index] != x) {
	index = (index + 1) % capacity;
	}
	return index;
	}

	size_t hash(uint32_t x) { return x % capacity; }
	};

	struct robin_hood {
	robin_hood() : data_(capacity, EMPTY) {}

	void add(uint32_t x) {
	assert(x != EMPTY && x != TOMBSTONE); // :(
	auto index = hash(x);
	size_t curr_probe = 0;
	while (data_[index] != 0 && data_[index] != x) {
	auto existing_probe = probe_dist(index);
	if (existing_probe < curr_probe) {
	std::swap(x, data_[index]);
	curr_probe = existing_probe;
	}
	index = (index + 1) % capacity;
	++curr_probe;
	}
	data_[index] = x;
	}

	bool contains(uint32_t x) {
	assert(x != EMPTY && x != TOMBSTONE); // :(
	auto index = hash(x);
	size_t curr_probe = 0;
	while (data_[index] != 0 && data_[index] != x) {
	auto existing_probe = probe_dist(index);
	if (existing_probe < curr_probe) {
	return false; // better than plain linear probing
	}
	index = (index + 1) % capacity;
	++curr_probe;
	}
	return data_[index] == x;
	}

	private:
	const uint32_t EMPTY = 0;
	const uint32_t TOMBSTONE = 1;
	std::vector<uint32_t> data_;

	size_t hash(uint32_t x) { return x % capacity; }

	size_t probe_dist(size_t index) {
	auto ideal_index = hash(data_[index]);
	return (index + capacity - ideal_index) % capacity;
	}
	};

	struct cuckoo {
	const uint32_t EMPTY = 0;

	void add(uint32_t x) {
	for (;;) {
	assert(x != EMPTY);
	auto index = hash1(x);
	std::swap(x, data1_[index]);
	if (x == EMPTY)
	return;

	index = hash2(x);
	std::swap(x, data2_[index]);
	if (x == EMPTY)
	return;
	}
	}

	bool contains(uint32_t x) {
	assert(x != EMPTY);
	return data1_[hash1(x)] == x \|\| data2_[hash2(x)] == x;
	}

	cuckoo() : data1_(capacity, EMPTY), data2_(capacity, EMPTY) {}

	private:
	std::vector<uint32_t> data1_;
	std::vector<uint32_t> data2_;
	size_t hash1(uint32_t x) { return x % capacity; }
	size_t hash2(uint32_t x) { return (x * 15485863) % capacity; }
	};

	// use std::chrono
	clock_t tick_begin;

	void tick() { tick_begin = clock(); }

	double tack() { return (clock() - tick_begin); }

	std::vector<uint32_t> random_numbers() {
	// use proper C++ random numbers!
	std::vector<uint32_t> result;
	for (size_t i = 0; i < capacity * load_factor; i++) {
	result.push_back(rand());
	}
	return result;
	}

	template <typename T>
	void bench(std::vector<uint32_t> const &to_add,
	std::vector<uint32_t> const &to_lookup) {
	srand(time(nullptr));
	T map;

	tick();
	for (auto x : to_add) {
	map.add(x);
	}
	std::cout << "insertion " << tack() << std::endl;

	tick();
	for (auto x : to_add) {
	if (!map.contains(x)) {
	std::cout << "oups :(" << std::endl;
	}
	}
	std::cout << "successful lookup " << tack() << std::endl;

	tick();
	uint32_t num_failures = 0;
	for (auto x : to_lookup) {
	if (!map.contains(x)) {
	num_failures += 1;
	}
	}

	std::cout << "failed lookup " << tack() << std::endl;
	std::cout << "failures " << num_failures << std::endl;
	}

	int main() {
	auto to_add = random_numbers();
	auto to_lookup = random_numbers();
	std::cout << "std::unordered_set" << std::endl;
	bench<std_us>(to_add, to_lookup);

	std::cout << "\nchaining" << std::endl;
	bench<chaining>(to_add, to_lookup);

	std::cout << "\nchaining two hashes" << std::endl;
	bench<chaining2h>(to_add, to_lookup);

	std::cout << "\nlinear probing" << std::endl;
	bench<linear_probing>(to_add, to_lookup);

	std::cout << "\nrobin hood" << std::endl;
	bench<robin_hood>(to_add, to_lookup);

	// std::cout << "\ncuckoo" << std::endl;
	// bench<cuckoo>(to_add, to_lookup);
	}