barrybingo/file_reading_speed_test.cpp

## file_reading_speed_test.cpp
/*
 Testing read file speed for the three read functions from
http://cpp.indi.frih.net/blog/2014/09/how-to-read-an-entire-file-into-memory-in-cpp/

compile with -std=c++11
*/

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <exception>
#include <ios>
#include <istream>
#include <limits>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

/* Go-to method for small files (<100K):
      auto ss = std::ostringstream{};
      ss << in.rdbuf();
      auto s = ss.str();
  Problem: the data has to be copied from the ostringstream into a string, so
  for large data this could be an issue because two copies of the data are held
  in memory.
*/
/**
 * Reads everything remaining in a stream's buffer into a string by streaming
 * it through a string stream.
 *
 * @param in    Stream whose stream buffer is drained.
 * @param alloc Allocator handed to the string that seeds the string stream.
 * @return The characters read; an input already at end-of-file yields an
 *         empty string.
 * @throws std::ios_base::failure if the transfer fails while input remains,
 *         or if the stream has no buffer.
 */
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename Allocator = std::allocator<CharT> >
std::basic_string<CharT, Traits, Allocator> read_stream_into_string(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  std::basic_ostringstream<CharT, Traits, Allocator> ss(
      std::basic_string<CharT, Traits, Allocator>(std::move(alloc)));

  if (!(ss << in.rdbuf())) {
    // operator<<(streambuf*) also sets failbit when it inserted nothing, so an
    // empty source has to be told apart from a genuine failure.
    auto* const source = in.rdbuf();
    const bool source_exhausted =
        source != nullptr &&
        Traits::eq_int_type(source->sgetc(), Traits::eof());
    if (!source_exhausted || ss.bad())
      throw std::ios_base::failure{ "error" };
  }

  return ss.str();
}

/* reading straight into a container:
If you are dealing with files it can be faster to count all the
characters first, then do one big allocation and one big whopper of a read:
    auto const start_pos = in.tellg();
    in.ignore(std::numeric_limits<std::streamsize>::max());
    auto const char_count = in.gcount();
    in.seekg(start_pos);
    auto s = std::string(char_count, char{});
    in.read(&s[0], s.size());
in.ignore() is a safe way to count the bytes in a file but means this method
requires reading the file twice, once to count bytes and once to read them in.
*/
/**
 * Reads the rest of a seekable stream into a contiguous container with a
 * single allocation: the remaining characters are counted with ignore(), the
 * stream is rewound, and everything is read in one call.
 *
 * @tparam Container std::basic_string<CharT, Traits, A> or a std::vector of
 *                   CharT or of its signed/unsigned counterpart (CharT must
 *                   therefore be an integral character type).
 * @param in    Stream to read; it must support tellg()/seekg().
 * @param alloc Allocator used to construct the returned container.
 * @return Container holding every character from the current position to
 *         the end of the stream.
 * @throws std::ios_base::failure if the position cannot be queried or
 *         restored, or if counting or reading fails.
 */
template <typename Container = std::string, typename CharT = char,
          typename Traits = std::char_traits<CharT> >
Container read_stream_into_container(
    std::basic_istream<CharT, Traits>& in,
    typename Container::allocator_type alloc = {}) {
  using allocator_type = typename Container::allocator_type;
  // make_(un)signed are traits: the usable type is their nested ::type.
  using unsigned_char_type = typename std::make_unsigned<CharT>::type;
  using signed_char_type = typename std::make_signed<CharT>::type;
  using pos_type = typename std::basic_istream<CharT, Traits>::pos_type;
  using off_type = typename std::basic_istream<CharT, Traits>::off_type;

  static_assert(
      // Allow only strings...
      std::is_same<Container,
                   std::basic_string<CharT, Traits, allocator_type> >::value ||
          // ... and vectors of the plain, signed, and
          // unsigned flavours of CharT.
          std::is_same<Container,
                       std::vector<CharT, allocator_type> >::value ||
          std::is_same<Container, std::vector<unsigned_char_type,
                                              allocator_type> >::value ||
          std::is_same<Container, std::vector<signed_char_type,
                                              allocator_type> >::value,
      "only strings and vectors of ((un)signed) CharT allowed");

  auto const start_pos = in.tellg();
  if (start_pos == pos_type(off_type(-1)))
    throw std::ios_base::failure{ "error" };

  // First pass: skip to the end purely to learn how many characters remain.
  if (!in.ignore(std::numeric_limits<std::streamsize>::max()))
    throw std::ios_base::failure{ "error" };

  auto const char_count = in.gcount();

  if (!in.seekg(start_pos))
    throw std::ios_base::failure{ "error" };

  auto container = Container(std::move(alloc));
  container.resize(static_cast<typename Container::size_type>(char_count));

  // Second pass: one bulk read into the pre-sized storage.
  if (!container.empty()) {
    if (!in.read(reinterpret_cast<CharT*>(&container[0]), char_count))
      throw std::ios_base::failure{ "error" };
  }

  return container;
}

/* read chunks into a deque:
If you’re expecting enormous files (at least several hundreds of megabytes,
on average) and you don’t want to seek on the stream, read the file in chunks
into a deque. The advantage is that no copy is made, unless you can't work with
the deque and end up copying the data out of it.
*/
/**
 * Reads the rest of a stream into a deque in BUFSIZ-sized chunks, without
 * seeking on the stream.
 *
 * @tparam CharO Element type of the deque: CharT or its signed/unsigned
 *               counterpart (CharT must therefore be an integral character
 *               type).
 * @param in    Stream to drain.
 * @param alloc Allocator used to construct the returned deque.
 * @return Deque holding every character read until the stream stopped
 *         delivering data.
 */
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename CharO = CharT, typename Allocator = std::allocator<CharO> >
std::deque<CharO, Allocator> read_file_into_deque(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  // make_(un)signed are traits: the usable type is their nested ::type.
  static_assert(
      std::is_same<CharT, CharO>::value ||
          std::is_same<typename std::make_unsigned<CharT>::type,
                       CharO>::value ||
          std::is_same<typename std::make_signed<CharT>::type, CharO>::value,
      "char type of deque must be same "
      "as stream char type "
      "(possibly signed or unsigned)");

  constexpr std::size_t chunk_size = BUFSIZ;

  auto container = std::deque<CharO, Allocator>(std::move(alloc));

  std::array<CharO, chunk_size> chunk{};

  // A short final read fails the stream but still reports its size through
  // gcount(), so keep going while either a full or a partial chunk arrived.
  while (in.read(reinterpret_cast<CharT*>(chunk.data()),
                 static_cast<std::streamsize>(chunk_size)) ||
         in.gcount() != 0) {
    container.insert(container.end(), chunk.begin(),
                     chunk.begin() + in.gcount());
  }

  return container;
}

/*
 Testing section
 */
#include <chrono>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <cassert>
#include <cctype>


/*
  humanize from
https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libtcplay/humanize.c
*/
// Unit suffixes in ascending powers of 1024; the leading blank stands for
// plain bytes.
static const char prefixes[] = " KMGTPE";

/**
 * Formats a count with a binary (1024-based) unit suffix, e.g. 1536 -> "1.5K".
 *
 * The value is scaled down while it is at least 1024 and a larger unit is
 * available. One decimal digit is shown, truncated rather than rounded, and
 * omitted when it is zero. Plain byte counts carry no suffix.
 *
 * @param num Count to format; it is converted to uint64_t first.
 * @return The formatted text.
 */
template <typename T> std::string humanize(T num) {
  uint64_t whole = static_cast<uint64_t>(num);
  uint64_t tenths = 0;
  const char* prefixp = prefixes;

  while (whole >= 1024 && prefixp[1] != '\0') {
    // Tenths of the next unit contributed by the remainder (0..9).
    tenths = (whole % 1024) * 10 / 1024;
    whole /= 1024;
    ++prefixp;
  }

  std::string text = std::to_string(whole);
  if (tenths > 0) {
    text += '.';
    text += std::to_string(tenths);
  }
  if (*prefixp != ' ')
    text += *prefixp;
  return text;
}

/*
  dehumanize_number from
  http://cvsweb.netbsd.org/bsdweb.cgi/~checkout~/src/lib/libc/gen/dehumanize_number.c
*/
/**
 * Parses a size such as "512", "10k", "1M" or "2g" into a number of bytes.
 *
 * The text must be a run of decimal digits optionally followed by a single
 * case-insensitive unit letter: b (x1), k (x1024), m (x1024^2) or g (x1024^3).
 * Signs, whitespace and fractions are rejected.
 *
 * @param str Text to parse.
 * @return The value in bytes, or 0 when the text is empty, malformed, carries
 *         an unknown suffix, or the result does not fit in uint64_t.
 */
uint64_t dehumanize_number(const std::string& str) {
  if (str.empty()) {
    return 0;
  }

  uint64_t multiplier = 1;
  std::size_t digit_count = str.size();

  const unsigned char unit = static_cast<unsigned char>(str.back());
  if (std::isalpha(unit)) {
    switch (std::tolower(unit)) {
      case 'b':
        multiplier = 1;
        break;

      case 'k':
        multiplier = uint64_t{ 1 } << 10;
        break;

      case 'm':
        multiplier = uint64_t{ 1 } << 20;
        break;

      case 'g':
        multiplier = uint64_t{ 1 } << 30;
        break;

      default:
        return 0; /* Invalid suffix. */
    }

    --digit_count;
  }

  if (digit_count == 0)
    return 0; /* A suffix with no number in front of it. */

  const uint64_t limit = std::numeric_limits<uint64_t>::max();
  uint64_t value = 0;
  for (std::size_t pos = 0; pos < digit_count; ++pos) {
    const unsigned char ch = static_cast<unsigned char>(str[pos]);
    if (!std::isdigit(ch))
      return 0; /* Not a number. */

    const uint64_t digit = static_cast<uint64_t>(ch - '0');
    // Check before multiplying so the accumulation itself never wraps.
    if (value > (limit - digit) / 10)
      return 0; /* Out of range. */
    value = value * 10 + digit;
  }

  if (value > limit / multiplier)
    return 0; /* Out of range. */

  return value * multiplier;
}

/**
 * Creates a temporary file of exactly `size` bytes whose last byte is 'X'.
 *
 * The file is produced by seeking to the final offset and writing one byte;
 * the bytes before it are never written explicitly (the tests in this file
 * expect them to read back as zero).
 *
 * @param size Desired file size in bytes; must be at least 1.
 * @return Name of the created file. The caller is responsible for removing it.
 * @throws std::ios_base::failure if size is 0, no temporary name could be
 *         generated, or the file could not be opened, positioned or written.
 */
std::string create_empty_file(size_t size) {
  // size - 1 below would wrap around for an empty file.
  if (size == 0)
    throw std::ios_base::failure{ "error" };

  // Despite a warning, tmpnam is portable and there is no risk for this
  // program.
  const char* const generated = std::tmpnam(nullptr);
  if (generated == nullptr)
    throw std::ios_base::failure{ "error" };
  std::string name = generated;

  std::ofstream out(name, std::ofstream::binary);
  if (!out)
    throw std::ios_base::failure{ "error" };
  if (!out.seekp(static_cast<std::streamoff>(size - 1)))
    throw std::ios_base::failure{ "error" };
  if (!(out << 'X') || !out.flush())
    throw std::ios_base::failure{ "error" };
  return name;
}

/**
 * Times one read function against a freshly created temporary file and checks
 * what it returned.
 *
 * @param test_name        Label printed in front of the timing.
 * @param fn               Callable invoked as fn(stream, {}); its result must
 *                         offer size() and operator[].
 * @param test_size        Size in bytes of the temporary file to create.
 * @param skip_long_verify When true, only the size and the last byte are
 *                         checked; otherwise every byte before the trailing
 *                         'X' must also be zero.
 * @return true when all requested checks pass.
 */
template <typename F>
bool test(const std::string& test_name, F fn, size_t test_size,
          bool skip_long_verify) {
  const std::string temp_filename = create_empty_file(test_size);

  // Removes the temporary file on every exit path, including an exception
  // thrown by fn. Declared before the stream so the stream is closed first.
  struct TempFileCleanup {
    const std::string& path;
    ~TempFileCleanup() { std::remove(path.c_str()); }
  } cleanup{ temp_filename };

  std::ifstream in(temp_filename, std::ifstream::binary);
  if (!in) {
    std::cout << "FAILED:cannot open temporary file\n";
    return false;
  }

  auto start = std::chrono::high_resolution_clock::now();
  auto s = fn(in, {});
  auto end = std::chrono::high_resolution_clock::now();

  std::cout << test_name << ":"
            << std::chrono::duration_cast<std::chrono::milliseconds>(
                   end - start).count() << "ms\n";

  // content tests
  if (s.size() != test_size) {
    std::cout << "FAILED:wrong size " << s.size() << '\n';
    return false;
  }
  if (s.size() == 0 || s[s.size() - 1] != 'X') {
    std::cout << "FAILED:last byte is wrong\n";
    return false;
  }

  if (skip_long_verify) { return true; }

  // Every byte before the trailing 'X' must be zero, byte 0 included.
  for (size_t i = 0; i + 1 < s.size(); ++i) {
    if (s[i]) {
      std::cout << "FAILED:[" << i << "]!=0\n";
      return false;
    }
  }

  return true;
}

/**
 * Runs the three read benchmarks for one file size.
 *
 * Prints a header with the size and BUFSIZ, executes every benchmark
 * regardless of earlier failures, then prints a blank line and flushes.
 *
 * @param test_size        Size in bytes of the file each benchmark reads.
 * @param skip_long_verify Forwarded to test().
 * @return true only when all three benchmarks passed.
 */
bool test_all(size_t test_size, bool skip_long_verify = false) {
  std::cout << "test_size:" << humanize(test_size);
  std::cout << ", BUFSIZ:" << BUFSIZ << '\n';

  const bool string_ok =
      test("read_stream_into_string", &read_stream_into_string<char>,
           test_size, skip_long_verify);

  const bool container_ok =
      test("read_stream_into_container",
           &read_stream_into_container<std::string>, test_size,
           skip_long_verify);

  const bool deque_ok = test("read_file_into_deque",
                             read_file_into_deque<char>, test_size,
                             skip_long_verify);

  std::cout << std::endl;

  if (!string_ok) return false;
  if (!container_ok) return false;
  return deque_ok;
}

/**
 * Prints the command-line help to standard output.
 *
 * @param prog_name Program name shown in the synopsis and in the example.
 */
void usage(const std::string& prog_name) {
  std::ostream& out = std::cout;

  // Synopsis, followed by an empty line.
  out << "Usage: " << prog_name << " [test_file_size] [max_size step]\n\n";

  // Worked example and its explanation.
  out << " Example: " << prog_name << " 1m 100m 500k\n";
  out << "  Times read algorithms with files of 1M to 100M in steps of 500K\n";

  // Constraints on the arguments.
  out << "Minimum size for files is 2 and using step 0 is troublemaking.\n";
}

/**
 * Entry point: runs the read benchmarks for one file size or a range of sizes.
 *
 * Accepted forms: no arguments (1M), `size`, or `size max_size step`.
 *
 * @return 0 when every benchmark passed; 1 on invalid arguments, on a failed
 *         benchmark, or when an exception was reported.
 */
int main(int argc, char* argv[]) {
  const std::string prog_name =
      (argc > 0 && argv[0] != nullptr) ? argv[0] : "file_reading_speed_test";

  // The synopsis allows zero, one, or three arguments only.
  if (argc == 3 || argc > 4) {
    usage(prog_name);
    return 1;
  }

  try {
    size_t test_size = dehumanize_number("1M");
    if (argc >= 2) {
      test_size = dehumanize_number(argv[1]);
    }

    size_t test_size_max = test_size;
    size_t step = 1;

    if (argc == 4) {
      test_size_max = dehumanize_number(argv[2]);
      step = dehumanize_number(argv[3]);
    }

    if ((test_size <= 1) || (test_size > test_size_max) || (step == 0)) {
      usage(prog_name);
      return 1;
    }

    for (;;) {
      if (!test_all(test_size))
        return 1;
      // Stop before the next size would pass the maximum; phrased as a
      // subtraction so the addition below can never wrap around.
      if (test_size_max - test_size < step)
        break;
      test_size += step;
    }
  } catch (const std::exception& e) {
    std::cerr << "error: " << e.what() << '\n';
    return 1;
  }

  return 0;
}
	/*
	Testing read file speed for the three read functions from
	http://cpp.indi.frih.net/blog/2014/09/how-to-read-an-entire-file-into-memory-in-cpp/

	compile with -std=c++11
	*/

	#include <type_traits>
	#include <ostream>
	#include <sstream>
	#include <limits>
	#include <array>
	#include <vector>
	#include <deque>

	/* Go-to method for small files (<100K):
	auto ss = std::ostringstream{};
	ss << in.rdbuf();
	auto s = ss.str();
	Problem: the data has to be copied from the ostringstream into a string, so
	for large data this could be an issue because two copies of the data are held
	in memory.
	*/
	/**
	 * Reads everything remaining in a stream's buffer into a string by streaming
	 * it through a string stream.
	 *
	 * @param in    Stream whose stream buffer is drained.
	 * @param alloc Allocator handed to the string that seeds the string stream.
	 * @return The characters read; an input already at end-of-file yields an
	 *         empty string.
	 * @throws std::ios_base::failure if the transfer fails while input remains,
	 *         or if the stream has no buffer.
	 */
	template <typename CharT, typename Traits = std::char_traits<CharT>,
	          typename Allocator = std::allocator<CharT> >
	std::basic_string<CharT, Traits, Allocator> read_stream_into_string(
	    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
	  std::basic_ostringstream<CharT, Traits, Allocator> ss(
	      std::basic_string<CharT, Traits, Allocator>(std::move(alloc)));

	  if (!(ss << in.rdbuf())) {
	    // operator<<(streambuf*) also sets failbit when it inserted nothing, so
	    // an empty source has to be told apart from a genuine failure.
	    auto* const source = in.rdbuf();
	    const bool source_exhausted =
	        source != nullptr &&
	        Traits::eq_int_type(source->sgetc(), Traits::eof());
	    if (!source_exhausted || ss.bad())
	      throw std::ios_base::failure{ "error" };
	  }

	  return ss.str();
	}

	/* reading straight into a container:
	If you are dealing with files it can be faster to count all the
	characters first, then do one big allocation and one big whopper of a read:
	auto const start_pos = in.tellg();
	in.ignore(std::numeric_limits<std::streamsize>::max());
	auto const char_count = in.gcount();
	in.seekg(start_pos);
	auto s = std::string(char_count, char{});
	in.read(&s[0], s.size());
	in.ignore() is a safe way to count the bytes in a file but means this method
	requires reading the file twice, once to count bytes and once to read them in.
	*/
	/**
	 * Reads the rest of a seekable stream into a contiguous container with a
	 * single allocation: the remaining characters are counted with ignore(), the
	 * stream is rewound, and everything is read in one call.
	 *
	 * @tparam Container std::basic_string<CharT, Traits, A> or a std::vector of
	 *                   CharT or of its signed/unsigned counterpart (CharT must
	 *                   therefore be an integral character type).
	 * @param in    Stream to read; it must support tellg()/seekg().
	 * @param alloc Allocator used to construct the returned container.
	 * @return Container holding every character from the current position to
	 *         the end of the stream.
	 * @throws std::ios_base::failure if the position cannot be queried or
	 *         restored, or if counting or reading fails.
	 */
	template <typename Container = std::string, typename CharT = char,
	          typename Traits = std::char_traits<CharT> >
	Container read_stream_into_container(
	    std::basic_istream<CharT, Traits>& in,
	    typename Container::allocator_type alloc = {}) {
	  using allocator_type = typename Container::allocator_type;
	  // make_(un)signed are traits: the usable type is their nested ::type.
	  using unsigned_char_type = typename std::make_unsigned<CharT>::type;
	  using signed_char_type = typename std::make_signed<CharT>::type;
	  using pos_type = typename std::basic_istream<CharT, Traits>::pos_type;
	  using off_type = typename std::basic_istream<CharT, Traits>::off_type;

	  static_assert(
	      // Allow only strings...
	      std::is_same<Container,
	                   std::basic_string<CharT, Traits, allocator_type> >::value ||
	          // ... and vectors of the plain, signed, and
	          // unsigned flavours of CharT.
	          std::is_same<Container,
	                       std::vector<CharT, allocator_type> >::value ||
	          std::is_same<Container, std::vector<unsigned_char_type,
	                                              allocator_type> >::value ||
	          std::is_same<Container, std::vector<signed_char_type,
	                                              allocator_type> >::value,
	      "only strings and vectors of ((un)signed) CharT allowed");

	  auto const start_pos = in.tellg();
	  if (start_pos == pos_type(off_type(-1)))
	    throw std::ios_base::failure{ "error" };

	  // First pass: skip to the end purely to learn how many characters remain.
	  if (!in.ignore(std::numeric_limits<std::streamsize>::max()))
	    throw std::ios_base::failure{ "error" };

	  auto const char_count = in.gcount();

	  if (!in.seekg(start_pos))
	    throw std::ios_base::failure{ "error" };

	  auto container = Container(std::move(alloc));
	  container.resize(static_cast<typename Container::size_type>(char_count));

	  // Second pass: one bulk read into the pre-sized storage.
	  if (!container.empty()) {
	    if (!in.read(reinterpret_cast<CharT*>(&container[0]), char_count))
	      throw std::ios_base::failure{ "error" };
	  }

	  return container;
	}

	/* read chunks into a deque:
	If you’re expecting enormous files (at least several hundreds of megabytes,
	on average) and you don’t want to seek on the stream, read the file in chunks
	into a deque. The advantage is that no copy is made, unless you can't work with
	the deque and end up copying the data out of it.
	*/
	/**
	 * Reads the rest of a stream into a deque in BUFSIZ-sized chunks, without
	 * seeking on the stream.
	 *
	 * @tparam CharO Element type of the deque: CharT or its signed/unsigned
	 *               counterpart (CharT must therefore be an integral character
	 *               type).
	 * @param in    Stream to drain.
	 * @param alloc Allocator used to construct the returned deque.
	 * @return Deque holding every character read until the stream stopped
	 *         delivering data.
	 */
	template <typename CharT, typename Traits = std::char_traits<CharT>,
	          typename CharO = CharT, typename Allocator = std::allocator<CharO> >
	std::deque<CharO, Allocator> read_file_into_deque(
	    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
	  // make_(un)signed are traits: the usable type is their nested ::type.
	  static_assert(
	      std::is_same<CharT, CharO>::value ||
	          std::is_same<typename std::make_unsigned<CharT>::type,
	                       CharO>::value ||
	          std::is_same<typename std::make_signed<CharT>::type, CharO>::value,
	      "char type of deque must be same "
	      "as stream char type "
	      "(possibly signed or unsigned)");

	  constexpr std::size_t chunk_size = BUFSIZ;

	  auto container = std::deque<CharO, Allocator>(std::move(alloc));

	  std::array<CharO, chunk_size> chunk{};

	  // A short final read fails the stream but still reports its size through
	  // gcount(), so keep going while either a full or a partial chunk arrived.
	  while (in.read(reinterpret_cast<CharT*>(chunk.data()),
	                 static_cast<std::streamsize>(chunk_size)) ||
	         in.gcount() != 0) {
	    container.insert(container.end(), chunk.begin(),
	                     chunk.begin() + in.gcount());
	  }

	  return container;
	}

	/*
	Testing section
	*/
	#include <chrono>
	#include <iostream>
	#include <fstream>
	#include <stdio.h>
	#include <cassert>
	#include <cctype>


	/*
	humanize from
	https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libtcplay/humanize.c
	*/
	// Unit suffixes in ascending powers of 1024; the leading blank stands for
	// plain bytes.
	static const char prefixes[] = " KMGTPE";

	/**
	 * Formats a count with a binary (1024-based) unit suffix, e.g. 1536 -> "1.5K".
	 *
	 * The value is scaled down while it is at least 1024 and a larger unit is
	 * available. One decimal digit is shown, truncated rather than rounded, and
	 * omitted when it is zero. Plain byte counts carry no suffix.
	 *
	 * @param num Count to format; it is converted to uint64_t first.
	 * @return The formatted text.
	 */
	template <typename T> std::string humanize(T num) {
	  uint64_t whole = static_cast<uint64_t>(num);
	  uint64_t tenths = 0;
	  const char* prefixp = prefixes;

	  while (whole >= 1024 && prefixp[1] != '\0') {
	    // Tenths of the next unit contributed by the remainder (0..9).
	    tenths = (whole % 1024) * 10 / 1024;
	    whole /= 1024;
	    ++prefixp;
	  }

	  std::string text = std::to_string(whole);
	  if (tenths > 0) {
	    text += '.';
	    text += std::to_string(tenths);
	  }
	  if (*prefixp != ' ')
	    text += *prefixp;
	  return text;
	}

	/*
	dehumanize_number from
	http://cvsweb.netbsd.org/bsdweb.cgi/~checkout~/src/lib/libc/gen/dehumanize_number.c
	*/
	/**
	 * Parses a size such as "512", "10k", "1M" or "2g" into a number of bytes.
	 *
	 * The text must be a run of decimal digits optionally followed by a single
	 * case-insensitive unit letter: b (x1), k (x1024), m (x1024^2) or g
	 * (x1024^3). Signs, whitespace and fractions are rejected.
	 *
	 * @param str Text to parse.
	 * @return The value in bytes, or 0 when the text is empty, malformed, carries
	 *         an unknown suffix, or the result does not fit in uint64_t.
	 */
	uint64_t dehumanize_number(const std::string& str) {
	  if (str.empty()) {
	    return 0;
	  }

	  uint64_t multiplier = 1;
	  std::size_t digit_count = str.size();

	  const unsigned char unit = static_cast<unsigned char>(str.back());
	  if (std::isalpha(unit)) {
	    switch (std::tolower(unit)) {
	      case 'b':
	        multiplier = 1;
	        break;

	      case 'k':
	        multiplier = uint64_t{ 1 } << 10;
	        break;

	      case 'm':
	        multiplier = uint64_t{ 1 } << 20;
	        break;

	      case 'g':
	        multiplier = uint64_t{ 1 } << 30;
	        break;

	      default:
	        return 0; /* Invalid suffix. */
	    }

	    --digit_count;
	  }

	  if (digit_count == 0)
	    return 0; /* A suffix with no number in front of it. */

	  const uint64_t limit = std::numeric_limits<uint64_t>::max();
	  uint64_t value = 0;
	  for (std::size_t pos = 0; pos < digit_count; ++pos) {
	    const unsigned char ch = static_cast<unsigned char>(str[pos]);
	    if (!std::isdigit(ch))
	      return 0; /* Not a number. */

	    const uint64_t digit = static_cast<uint64_t>(ch - '0');
	    // Check before multiplying so the accumulation itself never wraps.
	    if (value > (limit - digit) / 10)
	      return 0; /* Out of range. */
	    value = value * 10 + digit;
	  }

	  if (value > limit / multiplier)
	    return 0; /* Out of range. */

	  return value * multiplier;
	}

	/**
	 * Creates a temporary file of exactly `size` bytes whose last byte is 'X'.
	 *
	 * The file is produced by seeking to the final offset and writing one byte;
	 * the bytes before it are never written explicitly (the tests in this file
	 * expect them to read back as zero).
	 *
	 * @param size Desired file size in bytes; must be at least 1.
	 * @return Name of the created file. The caller is responsible for removing
	 *         it.
	 * @throws std::ios_base::failure if size is 0, no temporary name could be
	 *         generated, or the file could not be opened, positioned or written.
	 */
	std::string create_empty_file(size_t size) {
	  // size - 1 below would wrap around for an empty file.
	  if (size == 0)
	    throw std::ios_base::failure{ "error" };

	  // Despite a warning, tmpnam is portable and there is no risk for this
	  // program.
	  const char* const generated = std::tmpnam(nullptr);
	  if (generated == nullptr)
	    throw std::ios_base::failure{ "error" };
	  std::string name = generated;

	  std::ofstream out(name, std::ofstream::binary);
	  if (!out)
	    throw std::ios_base::failure{ "error" };
	  if (!out.seekp(static_cast<std::streamoff>(size - 1)))
	    throw std::ios_base::failure{ "error" };
	  if (!(out << 'X') || !out.flush())
	    throw std::ios_base::failure{ "error" };
	  return name;
	}

	/**
	 * Times one read function against a freshly created temporary file and
	 * checks what it returned.
	 *
	 * @param test_name        Label printed in front of the timing.
	 * @param fn               Callable invoked as fn(stream, {}); its result must
	 *                         offer size() and operator[].
	 * @param test_size        Size in bytes of the temporary file to create.
	 * @param skip_long_verify When true, only the size and the last byte are
	 *                         checked; otherwise every byte before the trailing
	 *                         'X' must also be zero.
	 * @return true when all requested checks pass.
	 */
	template <typename F>
	bool test(const std::string& test_name, F fn, size_t test_size,
	          bool skip_long_verify) {
	  const std::string temp_filename = create_empty_file(test_size);

	  // Removes the temporary file on every exit path, including an exception
	  // thrown by fn. Declared before the stream so the stream is closed first.
	  struct TempFileCleanup {
	    const std::string& path;
	    ~TempFileCleanup() { std::remove(path.c_str()); }
	  } cleanup{ temp_filename };

	  std::ifstream in(temp_filename, std::ifstream::binary);
	  if (!in) {
	    std::cout << "FAILED:cannot open temporary file\n";
	    return false;
	  }

	  auto start = std::chrono::high_resolution_clock::now();
	  auto s = fn(in, {});
	  auto end = std::chrono::high_resolution_clock::now();

	  std::cout << test_name << ":"
	            << std::chrono::duration_cast<std::chrono::milliseconds>(
	                   end - start).count() << "ms\n";

	  // content tests
	  if (s.size() != test_size) {
	    std::cout << "FAILED:wrong size " << s.size() << '\n';
	    return false;
	  }
	  if (s.size() == 0 || s[s.size() - 1] != 'X') {
	    std::cout << "FAILED:last byte is wrong\n";
	    return false;
	  }

	  if (skip_long_verify) { return true; }

	  // Every byte before the trailing 'X' must be zero, byte 0 included.
	  for (size_t i = 0; i + 1 < s.size(); ++i) {
	    if (s[i]) {
	      std::cout << "FAILED:[" << i << "]!=0\n";
	      return false;
	    }
	  }

	  return true;
	}

	/**
	 * Runs the three read benchmarks for one file size.
	 *
	 * Prints a header with the size and BUFSIZ, executes every benchmark
	 * regardless of earlier failures, then prints a blank line and flushes.
	 *
	 * @param test_size        Size in bytes of the file each benchmark reads.
	 * @param skip_long_verify Forwarded to test().
	 * @return true only when all three benchmarks passed.
	 */
	bool test_all(size_t test_size, bool skip_long_verify = false) {
	  std::cout << "test_size:" << humanize(test_size);
	  std::cout << ", BUFSIZ:" << BUFSIZ << '\n';

	  const bool string_ok =
	      test("read_stream_into_string", &read_stream_into_string<char>,
	           test_size, skip_long_verify);

	  const bool container_ok =
	      test("read_stream_into_container",
	           &read_stream_into_container<std::string>, test_size,
	           skip_long_verify);

	  const bool deque_ok = test("read_file_into_deque",
	                             read_file_into_deque<char>, test_size,
	                             skip_long_verify);

	  std::cout << std::endl;

	  if (!string_ok) return false;
	  if (!container_ok) return false;
	  return deque_ok;
	}

	/**
	 * Prints the command-line help to standard output.
	 *
	 * @param prog_name Program name shown in the synopsis and in the example.
	 */
	void usage(const std::string& prog_name) {
	  std::ostream& out = std::cout;

	  // Synopsis, followed by an empty line.
	  out << "Usage: " << prog_name << " [test_file_size] [max_size step]\n\n";

	  // Worked example and its explanation.
	  out << " Example: " << prog_name << " 1m 100m 500k\n";
	  out << " Times read algorithms with files of 1M to 100M in steps of 500K\n";

	  // Constraints on the arguments.
	  out << "Minimum size for files is 2 and using step 0 is troublemaking.\n";
	}

	/**
	 * Entry point: runs the read benchmarks for one file size or a range of
	 * sizes.
	 *
	 * Accepted forms: no arguments (1M), `size`, or `size max_size step`.
	 *
	 * @return 0 when every benchmark passed; 1 on invalid arguments, on a failed
	 *         benchmark, or when an exception was reported.
	 */
	int main(int argc, char* argv[]) {
	  const std::string prog_name =
	      (argc > 0 && argv[0] != nullptr) ? argv[0] : "file_reading_speed_test";

	  // The synopsis allows zero, one, or three arguments only.
	  if (argc == 3 || argc > 4) {
	    usage(prog_name);
	    return 1;
	  }

	  try {
	    size_t test_size = dehumanize_number("1M");
	    if (argc >= 2) {
	      test_size = dehumanize_number(argv[1]);
	    }

	    size_t test_size_max = test_size;
	    size_t step = 1;

	    if (argc == 4) {
	      test_size_max = dehumanize_number(argv[2]);
	      step = dehumanize_number(argv[3]);
	    }

	    if ((test_size <= 1) || (test_size > test_size_max) || (step == 0)) {
	      usage(prog_name);
	      return 1;
	    }

	    for (;;) {
	      if (!test_all(test_size))
	        return 1;
	      // Stop before the next size would pass the maximum; phrased as a
	      // subtraction so the addition below can never wrap around.
	      if (test_size_max - test_size < step)
	        break;
	      test_size += step;
	    }
	  } catch (const std::exception& e) {
	    std::cerr << "error: " << e.what() << '\n';
	    return 1;
	  }

	  return 0;
	}