Created
May 12, 2019 09:33
-
-
Save red75prime/676ba0f8b8e830861679f03f6824cb77 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Cargo.toml dependencies required to build this file:
// [dependencies]
// serde = "1.0"
// serde_derive = "1.0"
// serde_json = { version = "1.0", features = ["raw_value"] }
// fnv = "1.0.5"
// memmap = "0.7"
use fnv::{FnvHashSet as HashSet, FnvHashMap as HashMap}; | |
use memmap::Mmap; | |
use serde_derive::{Deserialize}; | |
use serde_json::{value::RawValue}; | |
/// Buffer size in bytes. NOTE(review): not referenced anywhere in the visible code;
/// presumably a leftover from a buffered-read implementation — confirm before removing.
const FILE_BUFFER_SIZE: usize = 50_000;
#[derive(Debug, Deserialize)] | |
#[serde(untagged)] | |
enum Company<'a> { | |
Name(&'a str), | |
NameRec{ name: &'a str}, | |
} | |
#[derive(Debug, Deserialize)] | |
struct Rec<'a> { | |
#[serde(borrow)] | |
company: Company<'a>, | |
debt: &'a RawValue, | |
phones: &'a RawValue, | |
phone: Option<&'a RawValue>, | |
} | |
// Source data: a single input record after normalization by `extract_data`.
struct DebtRec<'a> {
    /// Company name, borrowed from the input.
    pub company: &'a str,
    /// All phone values found in the record, borrowed from the input.
    pub phones: Vec<&'a str>,
    /// Debt amount of this record.
    pub debt: f64,
}
// Result data: one aggregated debtor built up from the records that share a phone.
struct Debtor {
    /// Names of every company recorded for this debtor.
    companies: HashSet<String>,
    /// Every phone recorded for this debtor.
    phones: HashSet<String>,
    /// Sum of the debts of all records merged into this debtor.
    debt: f64,
}
struct Debtors { | |
all: Vec<Debtor>, | |
index_by_phone: HashMap<String, usize> | |
} | |
impl Debtor { | |
fn new() -> Debtor { | |
Debtor { | |
companies: HashSet::default(), | |
phones: HashSet::default(), | |
debt: 0.0 | |
} | |
} | |
} | |
impl Debtors { | |
fn new() -> Debtors { | |
Debtors { | |
all: Vec::new(), | |
index_by_phone: HashMap::default() | |
} | |
} | |
} | |
fn main() { | |
let mut res = Debtors::new(); | |
let mut fflag = 0; | |
for arg in std::env::args() { | |
if arg == "-f" { | |
fflag = 1; | |
} | |
else if fflag == 1 { | |
fflag = 2; | |
println!("{}:", &arg); | |
let tbegin = std::time::Instant::now(); | |
let (count, errcount) = process_file(&arg, &mut res); | |
println!("PROCESSED: {} objects in {:?}, {} errors found", count, tbegin.elapsed(), errcount); | |
} | |
} | |
for (di, d) in res.all.iter().enumerate() { | |
println!("-------------------------------"); | |
println!("#{}: debt: {}", di, &d.debt); | |
println!("companies: {:?}\nphones: {:?}", &d.companies, &d.phones); | |
} | |
if fflag < 2 { | |
println!("USAGE: fastpivot -f \"file 1\" -f \"file 2\" ..."); | |
} | |
} | |
fn process_file(fname: &str, res: &mut Debtors) -> (i32, i32) { | |
use std::io::prelude::*; | |
let mut count = 0; | |
let mut errcount = 0; | |
match std::fs::File::open(fname) { | |
Ok(file) => { | |
let mmap = match unsafe{ Mmap::map(&file) } { | |
Ok(mmap) => mmap, | |
Err(e) => { | |
println!("Cannot open '{}': {:?}", fname, e); | |
return (0, 0); | |
} | |
}; | |
let mut braces = 0; | |
let mut start_idx = 0; | |
for (idx, &b) in mmap.iter().enumerate() { | |
if b == b'{' { | |
if braces == 0 { | |
start_idx = idx; | |
} | |
braces += 1; | |
} | |
else if b == b'}' { | |
braces -= 1; | |
if braces == 0 { //object formed ! | |
let obj = &mmap[start_idx ..= idx]; | |
match serde_json::from_slice(obj) { | |
Ok(o) => { | |
process_object(o, res); | |
} | |
Err(e) => { | |
println!("JSON ERROR: {}:\n{}", e, String::from_utf8_lossy(obj)); | |
errcount +=1; | |
} | |
} | |
count += 1; | |
} | |
} | |
} | |
} | |
Err(e) => { | |
println!("ERROR: {}", e); | |
} | |
} | |
return (count, errcount); | |
} | |
fn process_object(o: Rec, res: &mut Debtors) { | |
let dr = extract_data(o); | |
//println!("{} - {:?} - {}", &dr.company, &dr.phones, &dr.debt,); | |
let mut di: Option<usize> = Option::None; //debtor index search result | |
for &p in &dr.phones { | |
if let Some(i) = res.index_by_phone.get(p) { | |
di = Some(*i); | |
break; | |
} | |
} | |
match di { | |
Some(i) => { //existing debtor | |
let d = &mut res.all[i]; | |
d.companies.insert(dr.company.to_string()); | |
for p in &dr.phones { | |
d.phones.insert(p.to_string()); | |
res.index_by_phone.insert(p.to_string(), i); | |
} | |
d.debt += dr.debt; | |
} | |
None => { //new debtor | |
let mut d = Debtor::new(); | |
let i = res.all.len(); | |
d.companies.insert(dr.company.to_string()); | |
for p in &dr.phones { | |
d.phones.insert(p.to_string()); | |
res.index_by_phone.insert(p.to_string(), i); | |
} | |
d.debt = dr.debt; | |
res.all.push(d); | |
} | |
} | |
} | |
fn raw2str(raw: &RawValue) -> &str { | |
let payload = raw.get(); | |
if payload.starts_with('"') { | |
&payload[1 .. payload.len() -1] | |
} else if payload.starts_with(|ch| char::is_digit(ch, 10) ) { | |
payload | |
} else { | |
"" | |
} | |
} | |
fn extract_data(o: Rec) -> DebtRec { | |
use std::str::FromStr; | |
let company = match o.company { | |
Company::Name(c) | Company::NameRec{ name: c} => c, | |
}; | |
let mut phones = vec![]; | |
let payload = o.phones.get(); | |
if payload.starts_with('[') { | |
let ps: Vec<&RawValue> = serde_json::from_str(payload).unwrap_or_else(|_| vec![]); | |
phones.extend(ps.into_iter().map(raw2str)); | |
} else { | |
phones.push(raw2str(o.phones)); | |
}; | |
if let Some(p) = o.phone { | |
phones.push(raw2str(p)); | |
}; | |
let debt = f64::from_str(raw2str(o.debt)).unwrap_or(0.0); | |
DebtRec { | |
company, phones, debt, | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment