Skip to content

Instantly share code, notes, and snippets.

@andyleejordan
Last active March 20, 2022 15:32
Show Gist options
  • Star 17 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save andyleejordan/d120be76ba8ebd66cf50 to your computer and use it in GitHub Desktop.
Save andyleejordan/d120be76ba8ebd66cf50 to your computer and use it in GitHub Desktop.
Algorithm for Efficient Chunked File Reading in C++
/* Algorithm for Efficient Chunked File Reading in C++
*
* The MIT License (MIT)
*
* Copyright 2014 Andrew Schwartzmeyer
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace std;
/* Reads a file in fixed-size chunks and echoes every chunk to stdout.
 *
 * Usage: input_file [chunk_size]
 *   argv[1]  path of the file to read (opened in binary mode)
 *   argv[2]  optional chunk size in bytes; a value that does not parse to a
 *            positive number falls back to the 16 KiB default
 *
 * Returns 0 on success, and 1 on a usage error, when the file cannot be
 * opened or stat'ed, or when a read delivers fewer bytes than planned.
 */
int main(int argc, char* argv[])
{
    /* basic CLI interface */
    if (argc < 2)
    {
        std::cerr << "usage: input_file [chunk_size]" << std::endl;
        return 1;
    }

    std::ifstream file(argv[1], std::ifstream::binary);
    /* basic sanity check */
    if (not file)
    {
        std::cerr << "file: " << argv[1] << " failed to open" << std::endl;
        return 1;
    }

    /* *NIX way to get file size without seeking to the end and back;
       the result is only meaningful when stat() reports success */
    struct stat filestatus;
    if (stat(argv[1], &filestatus) != 0)
    {
        std::cerr << "file: " << argv[1] << " failed to stat" << std::endl;
        return 1;
    }
    const std::size_t total_size = static_cast<std::size_t>(filestatus.st_size);

    /* Parse the optional chunk size as a signed value first, so that a
       negative argument is rejected instead of wrapping around to a huge
       unsigned size. Unparsable input yields 0 and also keeps the default. */
    std::size_t chunk_size = 16 * 1024;
    if (argc >= 3)
    {
        const long requested = std::strtol(argv[2], nullptr, 10);
        if (requested > 0)
        {
            chunk_size = static_cast<std::size_t>(requested);
        }
    }
    std::cout << "using chunk size: " << chunk_size << std::endl;

    /* on to the actual algorithm */
    std::size_t total_chunks = total_size / chunk_size;
    std::size_t last_chunk_size = total_size % chunk_size;
    if (last_chunk_size != 0) /* if the above division was uneven */
    {
        ++total_chunks; /* add an unfilled final chunk */
    }
    else /* if division was even, last chunk is full */
    {
        last_chunk_size = chunk_size;
    }

    /* One buffer is reused for every chunk; no chunk is ever larger than
       the smaller of chunk_size and total_size. */
    std::vector<char> chunk_data(total_size < chunk_size ? total_size : chunk_size);

    /* the loop of chunking */
    for (std::size_t chunk = 0; chunk < total_chunks; ++chunk)
    {
        const std::size_t this_chunk_size =
            chunk == total_chunks - 1 /* if last chunk */
            ? last_chunk_size         /* then fill chunk with remaining bytes */
            : chunk_size;             /* else fill entire chunk */

        /* if needed, we also have the position of this chunk in the file
           size_t start_of_chunk = chunk * chunk_size; */

        /* adapt this portion as necessary, this is the fast C++ way */
        file.read(chunk_data.data(), /* address of buffer start */
                  static_cast<std::streamsize>(this_chunk_size)); /* bytes to read */
        const std::streamsize bytes_read = file.gcount();

        /* do something with chunk_data before next iteration;
           only the bytes that were actually read are emitted */
        std::cout << "chunk #" << chunk << '\n';
        std::cout.write(chunk_data.data(), bytes_read);
        std::cout << '\n';

        /* The size came from stat() before reading, so the file may have
           shrunk or the read may have failed; stop rather than continue
           with stale buffer contents. */
        if (static_cast<std::size_t>(bytes_read) != this_chunk_size)
        {
            std::cerr << "file: " << argv[1] << " short read in chunk #" << chunk
                      << " (" << bytes_read << " of " << this_chunk_size
                      << " bytes)" << std::endl;
            return 1;
        }
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment