Skip to content

Instantly share code, notes, and snippets.

@abh006
Created August 24, 2022 20:34
Show Gist options
  • Save abh006/05d0171c4f8a378dadf795549bfe877e to your computer and use it in GitHub Desktop.
Save abh006/05d0171c4f8a378dadf795549bfe877e to your computer and use it in GitHub Desktop.
Splits a huge JSON array into chunked files, with one JSON object per line.
#include <iostream>
#include <fstream>
#include <vector>
#include <stack>
using std::cout; using std::cerr;
using std::endl; using std::string;
using std::ifstream; using std::vector;
using std::stack;
using std::to_string;
// Directory that receives the chunk files, and the prefix every chunk file
// name starts with.
string outFilePath = "/Users/hjpotter/output";
string outFilePrefix = "out_";

// Builds the path of the chunk file numbered `chunkCount`, in the form
// <outFilePath>/<outFilePrefix><chunkCount>.txt
string getFileName(int chunkCount)
{
    string path = outFilePath;
    path += "/";
    path += outFilePrefix;
    path += to_string(chunkCount);
    path += ".txt";
    return path;
}
int main()
{
int batchSize = 1000;
string filename("/Users/hjpotter/huge-json-array.json");
vector<char> bytes;
std::ofstream outfile;
FILE* input_file = fopen(filename.c_str(), "r");
if (input_file == nullptr) {
return EXIT_FAILURE;
}
stack<unsigned char> st;
string json_str = "";
int chunkCount = 44;
int lineCount = 0;
outfile.open( getFileName(chunkCount), std::ios_base::app); // append instead of overwrite
unsigned char character = 0;
bool startSkipped = false;
while (!feof(input_file)) {
character = getc(input_file);
if(!startSkipped){
startSkipped = true;
continue;
}
if(st.empty() && character != '{'){
// Next JSON not started yet. Skipping intermediate commas and spaces
continue;
}
json_str += character;
if(character == '{'){
st.push(character);
}else if(character == '}'){
st.pop();
}
if(st.empty()){
outfile << json_str <<endl;
json_str = "";
lineCount++;
}
if(lineCount >= batchSize){
cout << "Closing current file and reopening next. File count: " << chunkCount << endl;
outfile.close();
chunkCount++;
lineCount = 0;
outfile.open( getFileName(chunkCount), std::ios_base::app); // append instead of overwrite
}
}
if(outfile.is_open()){
outfile.close();
}
cout << "Finished" << endl;
fclose(input_file);
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment