Skip to content

Instantly share code, notes, and snippets.

@yiding
Created October 12, 2019 02:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yiding/7aa769fa4d9ac087ae7f3e4e929956b2 to your computer and use it in GitHub Desktop.
Save yiding/7aa769fa4d9ac087ae7f3e4e929956b2 to your computer and use it in GitHub Desktop.
Split a file containing a large json array into a bunch of smaller files.
use clap::clap_app;
use memmap::MmapOptions;
use serde_json::value::RawValue;
use std::fs::File;
use std::io;
/// Splits a file holding one top-level JSON array into several smaller files.
///
/// The array elements are grouped into chunks of roughly `--size` megabytes
/// (see `chunk_to_size`) and each chunk is written as its own JSON array to
/// `<INPUT>.1`, `<INPUT>.2`, ... alongside the input file.
///
/// # Errors
///
/// Returns an error when the size argument is missing, is not a non-negative
/// integer, or overflows when converted to bytes; when the input cannot be
/// opened, mapped or parsed; or when an output file cannot be created or
/// written.
fn main() -> io::Result<()> {
    let matches = clap_app!(myapp =>
    (about: "Split file containing a top level json array into many files.")
    (@arg size: -s --size <Size> "size of each resulting file, in megabytes")
    (@arg INPUT: +required "input json file")
    )
    .get_matches();

    let invalid_input = |msg: String| io::Error::new(io::ErrorKind::InvalidInput, msg);

    // INPUT is declared `+required` above, so its absence is a programming error.
    let input_path = matches
        .value_of("INPUT")
        .expect("INPUT is declared as a required argument");

    // The size is user input: report bad values as errors instead of panicking.
    let size_arg = matches
        .value_of("size")
        .ok_or_else(|| invalid_input("missing --size argument".to_string()))?;
    let chunk_size = size_arg
        .parse::<usize>()
        .ok()
        // Megabytes -> bytes, rejecting values that would overflow `usize`.
        .and_then(|megabytes| megabytes.checked_mul(1024 * 1024))
        .ok_or_else(|| {
            invalid_input(format!(
                "invalid --size value {:?}: expected a whole number of megabytes",
                size_arg
            ))
        })?;

    let f = File::open(input_path)?;
    // SAFETY: the mapping is only ever read through a shared slice. This relies
    // on the input file not being modified or truncated by another process
    // while the program runs.
    let mmap = unsafe { MmapOptions::new().map(&f)? };

    // Borrow each array element as raw JSON text straight out of the mapping,
    // so elements are never re-parsed or copied.
    let arr: Vec<&RawValue> = serde_json::from_slice(&mmap)?;
    let groups = chunk_to_size(chunk_size, &arr);

    // Write out the groups
    println!("writing out {} files", groups.len());
    for (group, fileno) in groups.iter().zip(1..) {
        let out_file = File::create(format!("{}.{}", input_path, fileno))?;
        // Buffer the output: the serializer emits many small writes.
        let mut out = io::BufWriter::new(out_file);
        serde_json::to_writer(&mut out, group)?;
        // Flush explicitly so a failed final write is reported, not lost on drop.
        io::Write::flush(&mut out)?;
    }
    Ok(())
}
/// Partitions `entries`, in order, into consecutive groups of approximately
/// `chunk_size` bytes each.
///
/// A group's size is the summed length of the raw JSON text of its entries.
/// The limit is checked *before* an entry is added, so a group is closed only
/// once its running total already exceeds `chunk_size`; a group may therefore
/// overshoot the limit by part of its last entry.
///
/// Always returns at least one group: a single empty group when `entries` is
/// empty.
fn chunk_to_size<'a>(chunk_size: usize, entries: &[&'a RawValue]) -> Vec<Vec<&'a RawValue>> {
    let mut finished: Vec<Vec<&'a RawValue>> = Vec::new();
    let mut current: Vec<&'a RawValue> = Vec::new();
    let mut current_bytes = 0;

    for &entry in entries {
        // The previous entries already went past the limit: seal that group
        // and start a fresh one for this entry.
        if current_bytes > chunk_size {
            finished.push(std::mem::take(&mut current));
            current_bytes = 0;
        }
        current_bytes += entry.get().len();
        current.push(entry);
    }

    // The group still being filled (possibly empty) is always emitted last.
    finished.push(current);
    finished
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment