Skip to content

Instantly share code, notes, and snippets.

@engelmarkus
Last active July 13, 2016 21:23
Show Gist options
  • Save engelmarkus/895a79811bfa0b3b4564f3833da31889 to your computer and use it in GitHub Desktop.
Save engelmarkus/895a79811bfa0b3b4564f3833da31889 to your computer and use it in GitHub Desktop.
k-way merge sort - sorting data using a fixed amount of memory
#include <algorithm>
#include <array>
#include <fstream>
#include <numeric>
#include <random>
using namespace std;
// Step 1 - Create a file with 256 MiB of data
array<uint64_t, 256 * 1024 * 1024 / sizeof(uint64_t)> data;
int main() {
iota(begin(data), end(data), 0);
random_device rd;
mt19937 gen(rd());
shuffle(begin(data), end(data), gen);
ofstream file("data.dat");
file.write(reinterpret_cast<char*>(data.data()), data.size() * sizeof(uint64_t));
}
#include <array>
#include <algorithm>
#include <fstream>
#include <limits>
#include <vector>
using namespace std;
// Step 3 - Merge chunks
constexpr size_t const ElemsPerBlock = 1024 * 1024;
uint64_t getElemAt(ifstream& ifs, size_t blockNum, size_t index) {
ifs.seekg((blockNum * ElemsPerBlock + index) * sizeof(uint64_t), ifs.beg);
uint64_t result;
ifs.read(reinterpret_cast<char*>(&result), sizeof(uint64_t));
return result;
}
int main() {
ifstream din("data_sorted.dat");
din.seekg(0, din.end);
auto length = din.tellg();
din.seekg(0, din.beg);
ofstream dout("data_merged.dat");
vector<uint64_t> data(length / sizeof(uint64_t) / ElemsPerBlock);
vector<uint64_t> indizes(length / sizeof(uint64_t) / ElemsPerBlock);
// init data
for (auto i = 0; i < data.size(); ++i) {
data[i] = getElemAt(din, i, 0);
}
auto to_write = length / sizeof(uint64_t);
auto written = 0;
while (written < to_write) {
// pick smallest element
auto minelem = min_element(begin(data), end(data));
auto pos = distance(begin(data), minelem);
dout.write(reinterpret_cast<char*>(&*minelem), sizeof(uint64_t));
written++;
indizes[pos]++;
data[pos] = (indizes[pos] >= ElemsPerBlock) ? numeric_limits<uint64_t>::max() : getElemAt(din, pos, indizes[pos]);
}
}
#include <array>
#include <algorithm>
#include <fstream>
using namespace std;
constexpr size_t const BufferSize = 1024 * 1024;
using Buffer = array<uint64_t, BufferSize>;
// Step 2 - Sort chunks of data using only 8 MiB of RAM
Buffer buf;
int main() {
ifstream din("data.dat");
din.seekg(0, din.end);
auto length = din.tellg();
din.seekg(0, din.beg);
ofstream dout("data_sorted.dat");
for (auto i = 0; i < length; i += buf.size() * sizeof(Buffer::value_type)) {
din.read(reinterpret_cast<char*>(buf.data()), buf.size() * sizeof(Buffer::value_type));
sort(begin(buf), end(buf));
dout.write(reinterpret_cast<char*>(buf.data()), buf.size() * sizeof(Buffer::value_type));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment