Skip to content

Instantly share code, notes, and snippets.

@sebclaeys
Created September 21, 2011 21:03
Show Gist options
  • Save sebclaeys/1233306 to your computer and use it in GitHub Desktop.
Save sebclaeys/1233306 to your computer and use it in GitHub Desktop.
Read and merge multiple sorted file in C++
shell $ g++ mergetest.cpp
shell $ cat f1.txt
a:1
c:2
f:3
h:4
j:5
l:6
shell $ cat f2.txt
a:1
b:2
d:3
e:4
g:5
shell $ ./a.out f1.txt f2.txt
a:1
a:1
b:2
c:2
d:3
e:4
f:3
g:5
h:4
j:5
l:6
total line read = 11
#include "merge_reader.hpp"
#include <fstream>
#include <iostream>
#include <string>
// Implement what a line is for you
struct Line
{
std::string key;
int val;
bool operator() (const Line& line)
{
return strcmp(key.c_str(), line.key.c_str()) < 0;
}
};
// Implement the way to read your line
std::istream& operator>> (std::istream& is, Line& line)
{
getline(is, line.key, ':');
is >> line.val;
is.ignore(256, '\n');
return is;
}
// Use the merge_reader to read and merge multiple sorted files of this format
int main(int argc, char** argv)
{
MergeReader<Line, LineCompare> mr(argv + 1, argc - 1); // Take the list of files in argument
Line line;
while (mr.get_next(line)) // Iterate on each line returned in the correct order regardless the source file.
std::cout << line.key << ":" << line.val << '\n';
std::cout << "total line read = " << mr.count() << std::endl;
}
#ifndef __MERGE_READER_HPP__
# define __MERGE_READER_HPP__
# include <map>
# include <vector>
# include <iostream>
# include <algorithm>
# include <fstream>
template<typename T>
struct Less
{
bool operator() (T* s1, T* s2)
{
return *s1 < *s2;
}
};
template<typename T>
struct LoopAllocator
{
LoopAllocator(uint32_t size): _size(size), _pos(0)
{
_pointer_array = new T*[size*2];
for (uint i = 0; i < _size; ++i)
_pointer_array[i] = new T();
}
~LoopAllocator()
{
for (uint i = 0; i < _size; ++i)
delete _pointer_array[i];
delete _pointer_array;
}
inline T* allocate ()
{
T* tmp = _pointer_array[_pos++];
_pos = _pos % _size;
return tmp;
}
inline void free(T*)
{
return;
}
private:
uint32_t _size;
uint32_t _pos;
T** _pointer_array;
};
template<typename T>
struct NewAllocator
{
NewAllocator(uint32_t size) {}
inline T* allocate()
{
return new T();
}
inline void free(T* pointer)
{
delete pointer;
}
};
/**
* MergeReader
* Inpue: array of sorted file of the same format.
* Specialization: The type of the block to read (define the file format)
*/
template <typename Block, typename BlockCompare = Less<Block>, typename BlockAllocator = LoopAllocator<Block> >
class MergeReader
{
MergeReader () {};
public:
MergeReader (std::vector<std::string> filenames);
MergeReader (char** filenames, int nb_files);
~MergeReader ();
Block* see_next (); // Return a pointer to the next block. (do not change the state of the reader)
Block* get_next (int*); // Return a pointer to the current block. Then buffurze the next one. User have to delete the block
bool get_next (Block&, int*); // Fill the reference with the current block.
// MergeReader& operator>>(Block& block);
/**
* the optional parameter idx is filled with the index of the file from which the returned block has been extracted.
* this index refer to the file position in the array or vector passed to the constructor
*/
uint64_t count (); // Return the number of block extracted.
protected:
void init();
void buffurize();
void load_block();
protected:
BlockAllocator _allocator;
uint64_t _count;
int _nb_files;
std::istream** _files;
Block** _read_buf;
std::multimap<Block*, int, BlockCompare> _map;
typename std::multimap<Block*, int, BlockCompare>::iterator _map_iter;
int _idx;
};
template <typename Block, typename BlockCompare, typename BlockAllocator>
MergeReader<Block, BlockCompare, BlockAllocator>::MergeReader(std::vector<std::string> filenames)
: _allocator(filenames.size())
{
_nb_files = filenames.size();
this->init();
for (_idx = 0; _idx < _nb_files; _idx++)
{
_files[_idx] = new std::ifstream(filenames[_idx].c_str());
this->load_block();
}
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
MergeReader<Block, BlockCompare, BlockAllocator>::MergeReader(char** filenames, int nb_files)
: _allocator(nb_files), _nb_files(nb_files)
{
this->init();
for (_idx = 0; _idx < _nb_files; _idx++)
{
_files[_idx] = new std::ifstream(filenames[_idx]);
this->load_block();
}
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
void MergeReader<Block, BlockCompare, BlockAllocator>::init()
{
_count = 0;
_files = new std::istream*[_nb_files];
_read_buf = new Block*[_nb_files];
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
MergeReader<Block, BlockCompare, BlockAllocator >::~MergeReader()
{
for (int i = 0; i < _nb_files; i++)
delete _files[i];
delete _read_buf;
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
inline void MergeReader<Block, BlockCompare, BlockAllocator>::buffurize()
{
_read_buf[_idx] = _allocator.allocate();
(*_files[_idx]) >> *(_read_buf[_idx]);
if (!_files[_idx]->good())
{
_allocator.free(_read_buf[_idx]);
_read_buf[_idx] = 0;
}
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
uint64_t MergeReader<Block, BlockCompare, BlockAllocator>::count()
{
return _count;
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
inline void MergeReader<Block, BlockCompare, BlockAllocator>::load_block()
{
this->buffurize();
if (_read_buf[_idx])
_map.insert(std::pair<Block*, int>(_read_buf[_idx], _idx));
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
inline Block* MergeReader<Block, BlockCompare, BlockAllocator>::get_next(int* idx=0)
{
_map_iter = _map.begin();
if (_map_iter == _map.end())
return 0;
_idx = _map_iter->second;
Block* tmp = _read_buf[_idx];
_map.erase(_map_iter);
if (idx)
*idx = _idx;
this->load_block();
_count += 1;
return tmp;
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
inline bool MergeReader<Block, BlockCompare, BlockAllocator >::get_next(Block& block, int* idx=0)
{
Block* tmp = get_next(idx);
if (!tmp)
return false;
block = *tmp;
_allocator.free(tmp);
return true;
}
template <typename Block, typename BlockCompare, typename BlockAllocator>
inline Block* MergeReader<Block, BlockCompare, BlockAllocator>::see_next()
{
_map_iter = _map.begin();
if (_map_iter == _map.end())
return 0;
return _map_iter->first;
}
#endif /* !__MERGE_READER_HPP__ */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment