Skip to content

Instantly share code, notes, and snippets.

@barrybingo
Created February 1, 2015 16:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save barrybingo/4513180a262786038c12 to your computer and use it in GitHub Desktop.
Save barrybingo/4513180a262786038c12 to your computer and use it in GitHub Desktop.
/*
Testing read file speed for the three read functions from
http://cpp.indi.frih.net/blog/2014/09/how-to-read-an-entire-file-into-memory-in-cpp/
compile with -std=c++11
*/
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <ios>
#include <istream>
#include <iterator>
#include <limits>
#include <memory>
#include <ostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
/* The go-to method for small files (<100K):
auto ss = std::ostringstream{};
ss << in.rdbuf();
auto s = ss.str();
Problem: the data has to be copied from the ostringstream into a string, so for
large inputs two copies of the data are held in memory at the same time.
*/
/*
Reads everything that remains in `in` into a string by streaming the input's
buffer into an output string stream.

in:    stream to drain, starting at its current read position.
alloc: allocator handed to the string that seeds the string stream.
Returns the characters read; an exhausted but otherwise healthy stream yields
an empty string.
Throws std::ios_base::failure when the transfer fails for any reason other
than the source simply having nothing left to read.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename Allocator = std::allocator<CharT> >
std::basic_string<CharT, Traits, Allocator> read_stream_into_string(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  using String = std::basic_string<CharT, Traits, Allocator>;

  const String seed(std::move(alloc));
  std::basic_ostringstream<CharT, Traits, Allocator> ss(seed);

  if (!(ss << in.rdbuf())) {
    // Inserting a stream buffer sets failbit when *no* characters were
    // transferred, which also happens for a perfectly valid empty source.
    // Only treat it as an error when the source is missing, already failed
    // (e.g. a file that could not be opened), or still has unread data.
    auto* const source = in.rdbuf();
    const bool source_exhausted =
        source != nullptr && !in.fail() &&
        Traits::eq_int_type(source->sgetc(), Traits::eof());
    if (ss.bad() || !source_exhausted)
      throw std::ios_base::failure{ "error" };
  }
  return ss.str();
}
/* reading straight into a container:
If you are dealing with files it can be faster to count all the
characters first, then do one big allocation and one big whopper of a read:
auto const start_pos = in.tellg();
in.ignore(std::numeric_limits<std::streamsize>::max());
auto const char_count = in.gcount();
in.seekg(start_pos);
auto s = std::string(char_count, char{});
in.read(&s[0], s.size());
in.ignore() is a safe way to count the bytes in a file but means this method
requires reading the file twice, once to count bytes and once to read them in.
*/
/*
Reads the rest of a seekable stream into a contiguous container with a single
allocation: the remaining characters are counted with ignore(), the stream is
rewound to where it started, and the data is read in one call.

Container: std::basic_string<CharT, Traits, A>, or std::vector of CharT or of
           its signed/unsigned counterpart (CharT must be an integral
           character type for the signed/unsigned lookup to compile).
in:        stream to read; it must support tellg()/seekg().
alloc:     allocator used to construct the returned container.
Returns the container holding every character from the current position to
the end of the stream.
Throws std::ios_base::failure when the position cannot be queried or restored,
or when counting or reading fails.
*/
template <typename Container = std::string, typename CharT = char,
          typename Traits = std::char_traits<CharT> >
Container read_stream_into_container(
    std::basic_istream<CharT, Traits>& in,
    typename Container::allocator_type alloc = {}) {
  using Alloc = typename Container::allocator_type;
  // make_unsigned/make_signed are traits; the usable type is their ::type.
  using UnsignedCharT = typename std::make_unsigned<CharT>::type;
  using SignedCharT = typename std::make_signed<CharT>::type;
  using Stream = std::basic_istream<CharT, Traits>;

  static_assert(
      // Allow only strings...
      std::is_same<Container,
                   std::basic_string<CharT, Traits, Alloc> >::value ||
          // ... and vectors of the plain, signed, and
          // unsigned flavours of CharT.
          std::is_same<Container, std::vector<CharT, Alloc> >::value ||
          std::is_same<Container, std::vector<UnsignedCharT, Alloc> >::value ||
          std::is_same<Container, std::vector<SignedCharT, Alloc> >::value,
      "only strings and vectors of ((un)signed) CharT allowed");

  // tellg() reports failure as pos_type(-1).
  auto const start_pos = in.tellg();
  if (start_pos ==
      typename Stream::pos_type(typename Stream::off_type(-1)))
    throw std::ios_base::failure{ "error" };

  // Skipping to end-of-stream leaves the number of skipped characters in
  // gcount(); reaching EOF here sets eofbit only, which is not a failure.
  if (!in.ignore(std::numeric_limits<std::streamsize>::max()))
    throw std::ios_base::failure{ "error" };
  auto const char_count = in.gcount();

  if (!in.seekg(start_pos))
    throw std::ios_base::failure{ "error" };

  auto container = Container(std::move(alloc));
  container.resize(static_cast<typename Container::size_type>(char_count));

  // &container[0] is only valid for a non-empty container.
  if (0 != container.size()) {
    if (!in.read(reinterpret_cast<CharT*>(&container[0]), char_count))
      throw std::ios_base::failure{ "error" };
  }
  return container;
}
/* read chunks into a deque:
If you’re expecting enormous files (at least several hundreds of megabytes,
on average) and you don’t want to seek on the stream, read the file in chunks
into a deque. The advantage is that no copy is made, unless you can't work with
the deque and end up copying the data out of it.
*/
/*
Reads the rest of `in` into a deque, BUFSIZ characters at a time, without
seeking on the stream.

CharO: element type of the deque; must be CharT or its signed/unsigned
       counterpart (CharT must be an integral character type for that lookup
       to compile).
in:    stream to drain, starting at its current read position.
alloc: allocator used to construct the returned deque.
Returns the deque holding every character read; empty when nothing was left.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename CharO = CharT, typename Allocator = std::allocator<CharO> >
std::deque<CharO, Allocator> read_file_into_deque(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  // make_unsigned/make_signed are traits; the usable type is their ::type.
  static_assert(
      std::is_same<CharT, CharO>::value ||
          std::is_same<typename std::make_unsigned<CharT>::type,
                       CharO>::value ||
          std::is_same<typename std::make_signed<CharT>::type, CharO>::value,
      "char type of deque must be same "
      "as stream char type "
      "(possibly signed or unsigned)");
  using std::begin;
  using std::end;

  auto const chunk_size = std::size_t{ BUFSIZ };
  auto container = std::deque<CharO, Allocator>(std::move(alloc));
  auto chunk = std::array<CharO, chunk_size>{};

  // A short final read fails the stream but still reports the characters it
  // delivered through gcount(), so keep going while either condition holds.
  while (in.read(reinterpret_cast<CharT*>(chunk.data()), chunk.size()) ||
         in.gcount())
    container.insert(end(container), begin(chunk),
                     begin(chunk) + in.gcount());
  return container;
}
/*
Testing section
*/
#include <chrono>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <cassert>
#include <cctype>
/*
humanize from
https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libtcplay/humanize.c
*/
/* Unit letters for successive powers of 1024; index 0 stands for plain bytes. */
static const char prefixes[] = " KMGTPE";

/*
Formats a byte count with a binary unit suffix, e.g. 1536 -> "1.5K".

num: value to format; it is converted to an unsigned 64-bit integer first.
Returns the whole part, then ".d" with one truncated decimal digit when that
digit is non-zero, then the unit letter. Values below 1024 carry no suffix.
*/
template <typename T> std::string humanize(T num) {
  // Index of the largest unit ('E'); sizeof counts the terminating NUL.
  const std::size_t last_unit = sizeof(prefixes) - 2;

  std::uint64_t whole = static_cast<std::uint64_t>(num);
  std::uint64_t remainder = 0;
  std::size_t unit = 0;
  // ">=" so that exactly 1024 becomes "1K" rather than staying "1024".
  while (whole >= 1024 && unit < last_unit) {
    remainder = whole % 1024;
    whole /= 1024;
    ++unit;
  }

  std::string text = std::to_string(whole);
  // remainder is in [0, 1023], so this yields a single digit 0..9.
  const std::uint64_t tenths = (remainder * 10) / 1024;
  if (tenths > 0) {
    text += '.';
    text += std::to_string(tenths);
  }
  if (unit > 0)
    text += prefixes[unit];
  return text;
}
/*
dehumanize_number from
http://cvsweb.netbsd.org/bsdweb.cgi/~checkout~/src/lib/libc/gen/dehumanize_number.c
*/
/*
Parses a size such as "512", "4k", "1M" or "2g" into a byte count.

str: decimal number optionally followed by one unit letter (case-insensitive):
     b = 1, k = 1024, m = 1024^2, g = 1024^3.
Returns the number of bytes, or 0 when the text is empty, is not a
non-negative decimal number, carries an unknown suffix, or the result does
not fit in 64 bits.
*/
uint64_t dehumanize_number(const std::string& str) {
  if (str.empty()) {
    return 0;
  }

  uint64_t multiplier = 1;
  std::size_t number_length = str.size();
  const unsigned char unit = static_cast<unsigned char>(str[str.size() - 1]);
  if (std::isalpha(unit)) {
    switch (std::tolower(unit)) {
      case 'b':
        multiplier = 1;
        break;
      case 'k':
        multiplier = uint64_t{ 1 } << 10;
        break;
      case 'm':
        multiplier = uint64_t{ 1 } << 20;
        break;
      case 'g':
        multiplier = uint64_t{ 1 } << 30;
        break;
      default:
        return 0; /* Invalid suffix. */
    }
    --number_length;
  }

  // stoull accepts a leading '-' and wraps the value around; a size can
  // never be negative, so reject it up front.
  if (str.find('-') != std::string::npos) {
    return 0;
  }

  std::size_t parsed = 0;
  unsigned long long value = 0;
  try {
    value = std::stoull(str, &parsed, 10);
  } catch (const std::invalid_argument&) {
    return 0; /* Not a number. */
  } catch (const std::out_of_range&) {
    return 0; /* Out of range. */
  }
  // The digits must run exactly up to the suffix (or the end of the text).
  if (parsed != number_length) {
    return 0; /* Not a number. */
  }

  if (value > std::numeric_limits<uint64_t>::max() / multiplier) {
    return 0; /* Out of range. */
  }
  return static_cast<uint64_t>(value) * multiplier;
}
/*
Creates a temporary file of exactly `size` bytes whose last byte is 'X'.

Only the final byte is written explicitly: the stream seeks to offset
size - 1 and writes the marker there.

size: total file size in bytes; must be at least 1 to hold the marker.
Returns the path of the created file; the caller is responsible for removing it.
Throws std::ios_base::failure when size is 0, no temporary name can be
generated, or the file cannot be created, positioned, written or closed.
*/
std::string create_empty_file(size_t size) {
  // size - 1 would wrap around for an empty file.
  if (size == 0)
    throw std::ios_base::failure{ "error" };

  // Despite the usual warning, tmpnam is portable and poses no risk for this
  // program. It returns a null pointer when no name could be generated.
  const char* const generated = std::tmpnam(nullptr);
  if (generated == nullptr)
    throw std::ios_base::failure{ "error" };
  std::string name = generated;

  std::ofstream out(name, std::ofstream::binary);
  const std::ofstream::pos_type last_byte(
      static_cast<std::ofstream::off_type>(size - 1));
  if (!out.seekp(last_byte))
    throw std::ios_base::failure{ "error" };
  out << 'X';
  // Close explicitly so a failed flush is reported instead of being lost in
  // the destructor.
  out.close();
  if (!out)
    throw std::ios_base::failure{ "error" };
  return name;
}
/*
Times one read function against a freshly created temporary file and checks
what it returned.

test_name:        label printed in front of the timing.
fn:               callable invoked as fn(stream, {}); its result must provide
                  size() and operator[].
test_size:        size in bytes of the temporary file to create.
skip_long_verify: when true, only the size and the last byte are checked.
Returns true when the result has test_size elements, ends in 'X' and (unless
skipped) every other element is zero.
*/
template <typename F>
bool test(const std::string& test_name, F fn, size_t test_size,
          bool skip_long_verify) {
  const std::string temp_filename = create_empty_file(test_size);

  // Deletes the temporary file on every exit path, including an exception
  // thrown by fn. Declared before the stream so that the stream is closed
  // first when the scope unwinds.
  struct TempFileRemover {
    const std::string& path;
    ~TempFileRemover() { std::remove(path.c_str()); }
  } remover{ temp_filename };

  std::ifstream in(temp_filename, std::ifstream::binary);
  const auto start = std::chrono::high_resolution_clock::now();
  auto s = fn(in, {});
  const auto end = std::chrono::high_resolution_clock::now();

  std::cout << test_name << ":"
            << std::chrono::duration_cast<std::chrono::milliseconds>(
                   end - start).count() << "ms\n";

  // content tests
  if (s.size() != test_size) {
    std::cout << "FAILED:wrong size " << s.size() << '\n';
    return false;
  }
  if (s.size() == 0 || s[s.size() - 1] != 'X') {
    std::cout << "FAILED:last byte is wrong\n";
    return false;
  }
  if (skip_long_verify) {
    return true;
  }

  // Every byte before the trailing marker, including the first, must be zero.
  for (size_t i = 0; i + 1 < s.size(); ++i) {
    if (s[i]) {
      std::cout << "FAILED:[" << i << "]!=0\n";
      return false;
    }
  }
  return true;
}
/*
Runs the three read strategies against a file of test_size bytes and prints a
header line plus one timing line per strategy.

test_size:        size in bytes of the file each strategy has to read.
skip_long_verify: forwarded to test(); skips the byte-by-byte content check.
Returns true only when all three strategies passed; every strategy is run
even if an earlier one failed.
*/
bool test_all(size_t test_size, bool skip_long_verify = false) {
  std::cout << "test_size:" << humanize(test_size);
  std::cout << ", BUFSIZ:" << BUFSIZ << '\n';

  bool all_passed = true;

  if (!test("read_stream_into_string", &read_stream_into_string<char>,
            test_size, skip_long_verify)) {
    all_passed = false;
  }
  if (!test("read_stream_into_container",
            &read_stream_into_container<std::string>, test_size,
            skip_long_verify)) {
    all_passed = false;
  }
  if (!test("read_file_into_deque", read_file_into_deque<char>, test_size,
            skip_long_verify)) {
    all_passed = false;
  }

  // Blank separator line; endl also flushes the output.
  std::cout << std::endl;
  return all_passed;
}
/*
Prints the command-line help text to standard output.

prog_name: program name shown in the usage and example lines.
*/
void usage(const std::string& prog_name) {
  std::ostream& out = std::cout;

  out << "Usage: " << prog_name;
  out << " [test_file_size] [max_size step]\n\n";

  out << "  Example: " << prog_name;
  out << " 1m 100m 500k\n";

  out << "  Times read algorithms with files of 1M to 100M in steps of 500K\n";
  out << "Minimum size for files is 2 and using step 0 is troublemaking.\n";
}
/*
Entry point: times the read functions for one file size, or for a range of
sizes.

Arguments: [test_file_size] [max_size step]. Without arguments a single 1M
file is used; max_size and step are only read when both are given.
Returns 0 when every executed test passed, and 1 when the arguments are
invalid (after printing the usage text) or when a test failed.
*/
int main(int argc, char* argv[]) {
  size_t test_size = dehumanize_number("1M");
  if (argc >= 2) {
    test_size = dehumanize_number(argv[1]);
  }

  size_t test_size_max = test_size;
  size_t step = 1;
  if (argc == 4) {
    test_size_max = dehumanize_number(argv[2]);
    step = dehumanize_number(argv[3]);
  }

  // dehumanize_number yields 0 for unparsable input, so these checks also
  // reject malformed arguments.
  if ((test_size <= 1) || (test_size > test_size_max) ||
      (test_size_max <= 1) || (step == 0)) {
    usage(argv[0]);
    return 1;
  }

  for (;;) {
    if (!test_all(test_size)) {
      return 1;  // Report the failed run through the exit status.
    }
    // Stop before stepping past the maximum; comparing the remaining
    // distance avoids wrapping test_size around on overflow.
    if (test_size_max - test_size < step) {
      break;
    }
    test_size += step;
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment