Skip to content

Instantly share code, notes, and snippets.

@technic
Last active September 19, 2019 06:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save technic/5f23603ac955d246a90a8421a667c8c2 to your computer and use it in GitHub Desktop.
Save technic/5f23603ac955d246a90a8421a667c8c2 to your computer and use it in GitHub Desktop.
Fast rust implementation for https://habr.com/ru/post/450512, reference C++ and Go implementations also present

Results

  • rust = 636ms
  • gcc = 653ms
  • go = 663ms
  • clang = 799ms

Roughly 10ms deviations observed.

C++ (gcc9)

$ time ./a.out /tmp/1.json 
-------------------------------
#0: debt: 910000000.000000
companies: {Шестерочка, Рога и копыта, Первая коллекторская, }
phones: {234, 789, 123, 456, 788, 2128506, }
-------------------------------
#1: debt: 433200000.000000
companies: {Казачий спас, Святой престол, }
phones: {345678, 666, 234567, }

real	0m0.653s
user	0m0.593s
sys	0m0.060s

C++ (clang8)

 time ./a.out /tmp/1.json 
-------------------------------
#0: debt: 910000000.000000
companies: {Шестерочка, Рога и копыта, Первая коллекторская, }
phones: {234, 789, 123, 456, 788, 2128506, }
-------------------------------
#1: debt: 433200000.000000
companies: {Казачий спас, Святой престол, }
phones: {345678, 666, 234567, }

real	0m0.799s
user	0m0.758s
sys	0m0.040s

Go (golang-go 1.10)

$ time ./go -f /tmp/1.json 
total debtors: 2
total time: 658.819995ms
Первая коллекторская, Рога и копыта, Шестерочка:
	123, 2128506, 234, 456, 788, 789
	910000000
Казачий спас, Святой престол:
	234567, 345678, 666
	433200000

real	0m0.663s
user	0m0.650s
sys	0m0.045s

Rust (rustc 1.35 -C target-cpu=native)

$ time ./target/release/rust -f /tmp/1.json 
Reading json from /tmp/1.json
Processed 1000000
#0: debt: 910000000
companies: {"Рога и копыта", "Первая коллекторская", "Шестерочка"}
phones: {"789", "788", "123", "234", "2128506", "456"}
#1: debt: 433200000
companies: {"Казачий спас", "Святой престол"}
phones: {"345678", "666", "234567"}

real	0m0.636s
user	0m0.632s
sys	0m0.005s
#include <rapidjson/document.h>
#include <filesystem>
#include <sys/mman.h>
#include <fcntl.h>
#include <unordered_map>
#include <charconv>
#include <string>
#include <unordered_set>
#include <list>
#include <array>
#include <vector>
struct rec_t {
std::string company;
std::vector<std::string> phones;
double debt;
};
struct debtor_t {
static inline std::unordered_map<std::string, debtor_t &> by_phone_index;
static inline std::list<debtor_t> all;
std::unordered_set<std::string> companies;
std::unordered_set<std::string> phones;
double debt;
};
void process_object(rec_t & rec) {
auto & index = debtor_t::by_phone_index;
auto op = [&](debtor_t & d) {
d.companies.insert(std::move(rec.company));
for(const auto & phone: rec.phones) {
d.phones.insert(phone);
// if(!index.contains(phone))
if(index.find(phone) == end(debtor_t::by_phone_index))
index.emplace(phone, d);
}
d.debt += rec.debt;
};
auto di = end(index);
for(const auto & phone: rec.phones) {
di = index.find(phone);
if(di != end(index)) break;
}
op((di != end(index)) ? di->second : debtor_t::all.emplace_back());
}
rec_t & extract_data(rapidjson::Document & doc) {
static rec_t rec;
rec.phones.clear();
rec.company.clear();
rec.debt = 0;
for(auto & x: doc.GetObject()) {
std::string_view key{x.name.GetString(), x.name.GetStringLength()};
auto & value = x.value;
auto val2str = [](auto & value) -> std::string {
std::array<char, 32> mem;
switch(value.GetType()) {
case rapidjson::kStringType: return {value.GetString(), value.GetStringLength()};
case rapidjson::kNumberType: {
auto [endp, _] = std::to_chars(begin(mem), end(mem), value.GetUint64());
return {begin(mem), endp};
}
default: return {};
}
};
auto val2num = [](auto & value) -> double {
switch(value.GetType()) {
case rapidjson::kStringType: return strtod(value.GetString(), nullptr);
case rapidjson::kNumberType: return value.GetDouble();
default: return {};
}
};
auto push_phone = [&](auto & value) {
if(auto phone = val2str(value);!phone.empty())
rec.phones.emplace_back(phone);
};
if(key == "company") {
if(value.IsString()) {
rec.company = {value.GetString(), value.GetStringLength()};
} else if(value.IsObject()) {
auto name = value.GetObject().FindMember("name");
if(name != value.GetObject().MemberEnd()) rec.company = val2str(name->value);
}
} else if(key == "phone") {
push_phone(value);
} else if(key == "phones") {
if(value.IsArray()) for(auto & x: value.GetArray()) push_phone(x);
push_phone(value);
} else if(key == "debt") rec.debt = val2num(value);
}
return rec;
}
int main(int argc, char * argv[]) {
if(argc < 2) return -1;
std::filesystem::path path = argv[1];
auto file_size = std::filesystem::file_size(path);
auto file_data = (const char *)mmap(nullptr, file_size + 4096, PROT_READ, MAP_POPULATE|MAP_PRIVATE|MAP_FILE, open(path.c_str(), O_RDONLY), 0);
std::string_view file{file_data, file_size};
auto it = file;
auto next = [&] {
auto p = it.data();
const char * begin = nullptr;
size_t braces = 0;
while(true) {
switch(p = strpbrk(p, "{}"); p ? *p : 0) {
case '{': if(!braces++) begin = p; ++p; break;
case '}': ++p; if(!--braces) {
it = it.substr(size_t(p - it.begin()));
return std::string_view{begin, size_t(p - begin)};
} break;
default: it = it.substr(it.size()); return std::string_view{};
}
};
};
rapidjson::Document doc;
while(true) {
auto str = next();
if(str.empty()) break;
doc.Parse(data(str), size(str));
process_object(extract_data(doc));
}
for(size_t it = 0; auto & d: debtor_t::all) {
printf("-------------------------------\n");
printf("#%lu: debt: %f\n", it++, d.debt);
std::string companies, phones;
for(auto & x: d.companies) companies += (x + ", ");
for(auto & x: d.phones) phones += (x + ", ");
//if(companies.ends_with(", ")) companies.resize(companies.size() - 2);
//if(phones.ends_with(", ")) phones.resize(phones.size() - 2);
printf("companies: {%s}\nphones: {%s}\n", companies.c_str(), phones.c_str());
}
}
package main
import (
"flag"
"fmt"
"log"
"os"
"sort"
"strconv"
"strings"
"time"
j "github.com/json-iterator/go"
)
type Debtor struct {
Companies []string
Phones []string
Debt int64
}
var fileName = flag.String("f", "1.json", "file name to open")
func main() {
flag.Parse()
t := time.Now()
f, err := os.Open(*fileName)
if err != nil {
log.Fatalln("failed to open file:", err)
}
defer f.Close()
cfg := j.ConfigDefault
iter := j.Parse(cfg, f, 5*1024)
debtors := make([]Debtor, 0, 100)
debtorByPhone := make(map[string]int, 100)
iter.ReadArrayCB(func(i *j.Iterator) bool {
var (
company string
debt int64
)
phones := make([]string, 0, 10)
read := i.ReadMapCB(func(i *j.Iterator, k string) bool {
switch k {
case "company":
switch i.WhatIsNext() {
case j.StringValue:
company = i.ReadString()
case j.ObjectValue:
i.ReadMapCB(func(i *j.Iterator, s string) bool {
company = i.ReadString()
return true
})
default:
i.Skip()
}
case "debt":
switch i.WhatIsNext() {
case j.NumberValue:
debt = i.ReadInt64()
case j.StringValue:
v := i.ReadString()
n, err := strconv.ParseInt(v, 10, 64)
if err != nil {
panic(err)
}
debt = n
default:
i.Skip()
}
case "phones", "phone":
switch i.WhatIsNext() {
case j.StringValue:
phones = append(phones, i.ReadString())
case j.NumberValue:
phones = append(phones, i.ReadNumber().String())
case j.ArrayValue:
i.ReadArrayCB(func(i *j.Iterator) bool {
switch i.WhatIsNext() {
case j.StringValue:
phones = append(phones, i.ReadString())
case j.NumberValue:
phones = append(phones, i.ReadNumber().String())
default:
i.Skip()
}
return true
})
default:
i.Skip()
}
default:
i.Skip()
}
return true
})
var (
id int
found bool
)
for _, p := range phones {
id, found = debtorByPhone[p]
if found {
break
}
}
if !found {
// Creating new company.
debtors = append(debtors, Debtor{
Companies: []string{company},
Phones: phones,
})
id = len(debtors) - 1
for _, p := range phones {
debtorByPhone[p] = id
}
}
nameFound := false
for _, name := range debtors[id].Companies {
if name == company {
nameFound = true
break
}
}
if !nameFound {
debtors[id].Companies = append(debtors[id].Companies, company)
}
for _, p := range phones {
found := false
for _, pp := range debtors[id].Phones {
if pp == p {
found = true
break
}
}
if !found {
debtorByPhone[p] = id
debtors[id].Phones = append(debtors[id].Phones, p)
}
}
debtors[id].Debt += debt
return read
})
fmt.Println("total debtors:", len(debtors))
fmt.Println("total time:", time.Since(t))
for _, d := range debtors {
sort.Strings(d.Companies)
sort.Strings(d.Phones)
fmt.Printf("%s:\n", strings.Join(d.Companies, ", "))
fmt.Printf("\t%s\n", strings.Join(d.Phones, ", "))
fmt.Printf("\t%d\n", d.Debt)
}
}
use clap::{App, Arg};
use memmap::Mmap;
use serde::Deserialize;
use serde_json::Value;
use serde::de;
use serde::de::{Deserializer, MapAccess, SeqAccess, Visitor};
// use std::collections::{HashMap, HashSet};
use fnv::{FnvHashMap as HashMap, FnvHashSet as HashSet};
use std::error::Error;
use std::result::Result;
use std::fmt;
// use smallstr::SmallString;
// type S = SmallString<[u8; 16]>;
type S = String;
//source data
#[derive(Default, Debug, Deserialize)]
struct DebtRec {
company: S,
phones: Vec<S>,
debt: f64,
}
#[derive(Default, Debug)]
struct ManualRec {
company: S,
phones: Vec<S>,
debt: f64,
}
//result data
#[derive(Default)]
struct Debtor {
companies: HashSet<S>,
phones: HashSet<S>,
debt: f64,
}
#[derive(Default)]
struct Debtors {
all: Vec<Debtor>,
by_phone: HashMap<S, usize>,
}
use std::arch::x86_64::{
__m128i, _mm_cmpestri, _mm_loadu_si128, _SIDD_CMP_EQUAL_ANY, _SIDD_UBYTE_OPS,
};
const CHUNK_LEN: usize = 16;
struct Pattern {
needle: __m128i,
needle_len: usize,
}
impl Pattern {
fn new(bytes: &[u8]) -> Self {
let mut needle = [0u8; CHUNK_LEN];
needle[..bytes.len()].copy_from_slice(bytes);
Self {
needle: unsafe { _mm_loadu_si128((&needle as *const u8) as *const __m128i) },
needle_len: bytes.len(),
}
}
fn find(&self, buf: &[u8]) -> usize {
let mut chunks = buf.chunks_exact(CHUNK_LEN);
let mut index = 0;
while let Some(c) = chunks.next() {
let i = unsafe { self.find_in_chunk(c) };
if i < CHUNK_LEN {
return i + index;
}
index += CHUNK_LEN;
}
let r = chunks.remainder();
let mut c = [0u8; CHUNK_LEN];
c[..r.len()].copy_from_slice(r);
let i = unsafe { self.find_in_chunk(&c) };
if i < r.len() {
return i + index;
}
buf.len()
}
#[inline]
unsafe fn find_in_chunk(&self, haystack: &[u8]) -> usize {
let p = haystack[..CHUNK_LEN].as_ptr();
let haystack = _mm_loadu_si128(p as *const __m128i);
_mm_cmpestri(
self.needle,
self.needle_len as i32,
haystack,
CHUNK_LEN as i32,
_SIDD_CMP_EQUAL_ANY | _SIDD_UBYTE_OPS,
) as usize
}
}
impl<'de> Deserialize<'de> for ManualRec {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct Phone(S);
impl<'de> Deserialize<'de> for Phone {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct VisitorImpl;
impl<'de> Visitor<'de> for VisitorImpl {
type Value = S;
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("Phone as a number or string")
}
fn visit_u64<E>(self, n: u64) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(S::from(n.to_string()))
}
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(S::from(s))
}
}
deserializer.deserialize_any(VisitorImpl).map(|s| Phone(s))
}
}
struct Phones(Vec<S>);
impl<'de> Deserialize<'de> for Phones {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct VisitorImpl;
impl<'de> Visitor<'de> for VisitorImpl {
type Value = Vec<S>;
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("Phones as one Phone or list of Phone")
}
fn visit_u64<E>(self, n: u64) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(vec![S::from(n.to_string())])
}
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(vec![S::from(s)])
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where
A: SeqAccess<'de>,
{
let mut phones = Vec::new();
while let Some(p) = seq.next_element::<Phone>()? {
phones.push(p.0)
}
Ok(phones)
}
}
deserializer.deserialize_any(VisitorImpl).map(|p| Phones(p))
}
}
struct Company(S);
impl<'de> Deserialize<'de> for Company {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct CompanyVisitor;
impl<'de> Visitor<'de> for CompanyVisitor {
type Value = S;
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("Company")
}
fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error>
where
V: MapAccess<'de>,
{
while let Some(k) = map.next_key::<&str>()? {
if k == "name" {
let value = map.next_value::<S>()?;
return Ok(value);
} else {
map.next_value::<de::IgnoredAny>()?;
}
}
Err(de::Error::missing_field("name"))
}
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(S::from(s))
}
}
deserializer
.deserialize_any(CompanyVisitor)
.map(|s| Company(s))
}
}
struct Debt(f64);
impl<'de> Deserialize<'de> for Debt {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct VisitorImpl;
impl<'de> Visitor<'de> for VisitorImpl {
type Value = Debt;
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("Debt")
}
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
s.parse::<f64>().map(Debt).map_err(de::Error::custom)
}
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E> {
Ok(Debt(v as f64))
}
fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E> {
Ok(Debt(v))
}
}
deserializer.deserialize_any(VisitorImpl)
}
}
struct RecordVisitor;
impl<'de> Visitor<'de> for RecordVisitor {
type Value = ManualRec;
fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("Record")
}
fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error>
where
V: MapAccess<'de>,
{
let mut rec = ManualRec::default();
while let Some(k) = map.next_key()? {
// println!("k={}", k);
match k {
"company" => {
let company = map.next_value::<Company>()?;
rec.company = company.0;
}
"phones" => {
let phones = map.next_value::<Phones>()?;
rec.phones.extend(phones.0);
}
"phone" => {
let phone = map.next_value::<Phone>()?;
rec.phones.push(phone.0);
}
"debt" => {
let debt = map.next_value::<Debt>()?;
rec.debt = debt.0;
}
_ => {
map.next_value::<de::IgnoredAny>()?;
}
}
}
// println!("{:?}", rec);
Ok(rec)
}
}
deserializer.deserialize_map(RecordVisitor)
}
}
fn main() -> Result<(), Box<Error>> {
let matches = App::new("json-parser")
.arg(
Arg::with_name("file")
.short("f")
.takes_value(true)
.required(true),
)
.get_matches();
let file_name = matches.value_of("file").unwrap();
println!("Reading json from {}", file_name);
let result = process_file(&file_name)?;
for (di, d) in result.all.iter().enumerate() {
println!("#{}: debt: {}", di, &d.debt);
println!("companies: {:?}\nphones: {:?}", &d.companies, &d.phones);
}
Ok(())
}
fn process_file(file_name: &str) -> Result<Debtors, Box<Error>> {
let file = std::fs::File::open(file_name)?;
let buf = unsafe { Mmap::map(&file)? };
let mut deb = Debtors::default();
let mut count = 0;
let mut braces = 0;
let mut start_idx = 0;
let mut idx = 0;
let needles = Pattern::new(&[b'{', b'}']);
loop {
idx += unsafe { needles.find(&buf.get_unchecked(idx..)) };
if idx >= buf.len() {
break;
}
let b = buf[idx];
// for (idx, b) in buf.iter().enumerate() {
match b {
b'{' => {
if braces == 0 {
start_idx = idx;
}
braces += 1;
}
b'}' => {
braces -= 1;
if braces == 0 {
let utf = std::str::from_utf8(&buf[start_idx..=idx])?;
// let dom = serde_json::from_str::<Value>(utf)?;
// let rec = serde_json::from_str::<DebtRec>(utf)?;
let rec = serde_json::from_str::<ManualRec>(utf)?;
// println!("{:?}", rec);
// let rec = extract_data(dom)?;
process_object(rec, &mut deb);
count += 1;
}
}
_ => {}
};
idx += 1;
}
println!("Processed {}", count);
Ok(deb)
}
fn process_object(rec: ManualRec, result: &mut Debtors) -> std::option::Option<()> {
let phones = rec.phones;
let di = phones
.iter()
.find_map(|p| result.by_phone.get(p).cloned())
.unwrap_or_else(|| {
result.all.push(Debtor::default());
result.all.len() - 1
});
let d = &mut result.all[di];
use std::collections::hash_map::Entry;
for phone in phones {
match result.by_phone.entry(phone) {
Entry::Vacant(e) => {
d.phones.insert(e.key().to_owned());
e.insert(di);
}
Entry::Occupied(e) => {
// Commenting this out add some ms. Doesn't affect wrong algorithm.
//d.phones.insert(e.key().clone());
}
}
}
d.companies.insert(rec.company);
d.debt += rec.debt;
Some(())
}
fn merge_result(part: Debtors, result: &mut Debtors) {
for dr in part.all.into_iter() {
let di = match dr.phones.iter().find_map(|p| result.by_phone.get(p)) {
Some(i) => *i,
None => {
result.all.push(Debtor::default());
result.all.len() - 1
}
};
let d = &mut result.all[di];
for p in &dr.phones {
result.by_phone.insert(p.to_owned(), di);
}
d.phones.extend(dr.phones);
d.companies.extend(dr.companies);
d.debt += dr.debt;
}
}
fn extract_data(dom: Value) -> serde_json::Result<DebtRec> {
let mut rec = DebtRec::default();
fn val2str(v: Value) -> S {
match v {
Value::String(s) => S::from(s),
Value::Number(n) => S::from(n.to_string()), //n.to_string(),
_ => S::default(),
}
}
if let Value::Object(o) = dom {
for (k, v) in o.into_iter() {
if k == "company" {
rec.company = match v {
Value::Object(mut c) => val2str(c["name"].take()),
Value::String(s) => S::from(s),
_ => S::default(),
};
} else if k == "phones" {
match v {
Value::Array(phones) => {
rec.phones.extend(phones.into_iter().map(|p| val2str(p)))
}
phones => rec.phones.push(val2str(phones)),
}
} else if k == "phone" {
match v {
phone => rec.phones.push(val2str(phone)),
}
} else if k == "debt" {
rec.debt = match v {
Value::Number(d) => d.as_f64().unwrap_or(0.0),
Value::String(d) => d.parse::<f64>().unwrap_or(0.0),
_ => 0.0,
};
}
}
}
// println!("Record {:?}", dr);
Ok(rec)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment