Skip to content

Instantly share code, notes, and snippets.

@lpj145
Created March 20, 2021 23:29
Show Gist options
  • Save lpj145/0f903bfdbac6562a0635089b0f9ef3ea to your computer and use it in GitHub Desktop.
Save lpj145/0f903bfdbac6562a0635089b0f9ef3ea to your computer and use it in GitHub Desktop.
Processar 9 milhões de linhas de CSV com Rust.
use std::{collections::HashMap, fs::File, io::{self, BufRead, BufReader}};
use std::time::{SystemTime, UNIX_EPOCH};
use core::time::Duration;
/// Returns how much time has passed since the Unix epoch according to the system clock.
///
/// Despite the `_ms` suffix, the result is a full [`Duration`], not a millisecond count.
///
/// # Panics
///
/// Panics with the message "Time went problem." when the system clock reports a time
/// earlier than the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    let now = SystemTime::now();
    let since_epoch = now.duration_since(UNIX_EPOCH);
    since_epoch.expect("Time went problem.")
}
/// Returns the current system time as whole milliseconds since the Unix epoch.
///
/// Note that, despite the name, the unit is milliseconds rather than microseconds.
pub fn microtime() -> u64 {
    let now = get_unix_timestamp_ms();
    // Whole seconds scaled to milliseconds, plus the millisecond part of the current second.
    let seconds_as_ms = now.as_secs() * 1000;
    let fraction_as_ms = u64::from(now.subsec_millis());
    seconds_as_ms + fraction_as_ms
}
/// Adds `amount` to the total stored under `key`, creating the entry on first sight.
///
/// The key is copied into an owned `String` only when it is not in the map yet, so rows
/// that repeat an already-known key do not allocate.
fn add_to_total(totals: &mut HashMap<String, u64>, key: &str, amount: u64) {
    match totals.get_mut(key) {
        Some(total) => *total += amount,
        None => {
            totals.insert(key.to_string(), amount);
        }
    }
}

/// Aggregates comma-separated rows of the form `client,food,price` read from `reader`.
///
/// For every row:
/// * the first column (client) is registered in the map with a total of 0 if unseen;
/// * the second column (food), when present, has its counter incremented by 1;
/// * the third column (price), when present, is parsed as `u32` (unparsable values count
///   as 0) and added to the client's total.
///
/// Clients and foods share a single map, exactly as in the original program. Columns past
/// the third are ignored and short rows are tolerated. Totals are accumulated as `u64` so
/// that summing many `u32` prices cannot overflow.
///
/// # Errors
///
/// Returns any I/O error produced while reading lines from `reader`.
fn tally_reviews<R: BufRead>(reader: R) -> io::Result<HashMap<String, u64>> {
    let mut totals: HashMap<String, u64> = HashMap::new();
    for line in reader.lines() {
        let line = line?;
        let mut fields = line.split(',');

        // `split` always yields at least one item (possibly an empty string).
        let client = fields.next().unwrap_or("");
        add_to_total(&mut totals, client, 0);

        if let Some(food) = fields.next() {
            add_to_total(&mut totals, food, 1);
        }

        if let Some(price) = fields.next() {
            let price: u32 = price.parse().unwrap_or(0);
            add_to_total(&mut totals, client, u64::from(price));
        }
    }
    Ok(totals)
}

/// Reads `./a.csv`, prints the aggregated map, the file path and the elapsed time.
///
/// # Errors
///
/// Returns an error (instead of panicking) when the file cannot be opened or read.
fn main() -> io::Result<()> {
    // A monotonic clock is used for timing; the wall clock may jump backwards.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;

    let consumer_reviews = tally_reviews(BufReader::new(file))?;

    println!("Result: {:?}", consumer_reviews);
    println!("File path: {:?}", filepath);
    // Measure once so that both figures describe the same instant.
    let elapsed_ms = started.elapsed().as_millis();
    println!("End in: {} ms or {} seconds.", elapsed_ms, elapsed_ms / 1000);
    Ok(())
}
@dbofmmbt
Copy link

dbofmmbt commented Mar 21, 2021

@lpj145 "brinquei" um pouco com o gist e cheguei a esta versão. Percebi um bom ganho no tempo de execução (ms). Eu queria fazer uma versão que não precisasse ficar clonando String, mas aí a HashMap precisaria guardar &str e, para fazer isso, acho que a gente precisaria manter o input todo na memória de uma vez só, o que deve ficar inviável para inputs maiores.

use core::time::Duration;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{
    collections::HashMap,
    fs::File,
    io::{self, BufRead, BufReader},
};

/// Reads the system clock and returns the time elapsed since the Unix epoch.
///
/// The name says `_ms`, but the value returned is a [`Duration`]; converting it to
/// milliseconds is left to the caller.
///
/// # Panics
///
/// Panics with "Time went problem." if the system clock is set before the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    let current_time = SystemTime::now();
    let elapsed = current_time.duration_since(UNIX_EPOCH);
    elapsed.expect("Time went problem.")
}

/// Returns the current system time in whole milliseconds since the Unix epoch.
///
/// The function name suggests microseconds, but the value is in milliseconds.
pub fn microtime() -> u64 {
    let elapsed = get_unix_timestamp_ms();
    let millis_from_seconds = elapsed.as_secs() * 1000;
    // Sub-second nanoseconds truncated down to whole milliseconds.
    let millis_from_fraction = u64::from(elapsed.subsec_nanos() / 1_000_000);
    millis_from_seconds + millis_from_fraction
}

/// Builds the error reported when a CSV row lacks a required column.
fn missing_column(line_number: usize, column: &str) -> io::Error {
    io::Error::new(
        io::ErrorKind::InvalidData,
        format!("line {}: {} missing", line_number, column),
    )
}

/// Aggregates comma-separated rows of the form `client,food,price` read from `reader`.
///
/// Each row increments the counter stored under its food by 1 and adds its price to the
/// total stored under its client; both kinds of key share one map, as in the original
/// program. A price that does not parse as `u32` counts as 0. Totals are kept as `u64` so
/// that summing many `u32` prices cannot overflow.
///
/// # Errors
///
/// Returns an `InvalidData` error naming the 1-based line number when a row has no food
/// or no price column (this includes blank lines), and forwards any I/O error raised
/// while reading from `reader`.
fn tally_reviews<R: BufRead>(reader: R) -> io::Result<HashMap<String, u64>> {
    let mut consumer_reviews: HashMap<String, u64> = HashMap::new();

    for (index, line) in reader.lines().enumerate() {
        let line = line?;
        let line_number = index + 1;

        let mut fields = line.split(',');

        // `split` always yields at least one item, so the client column cannot be absent.
        let current_client = fields.next().unwrap_or("");
        let food = fields
            .next()
            .ok_or_else(|| missing_column(line_number, "food"))?;
        let price = fields
            .next()
            .ok_or_else(|| missing_column(line_number, "price"))?;
        let price: u32 = price.parse().unwrap_or(0);

        *consumer_reviews.entry(food.to_string()).or_insert(0) += 1;
        *consumer_reviews
            .entry(current_client.to_string())
            .or_insert(0) += u64::from(price);
    }

    Ok(consumer_reviews)
}

/// Reads `./a.csv`, prints the aggregated map, the file path and the elapsed time.
///
/// # Errors
///
/// Returns an error (instead of panicking) when the file cannot be opened or read, or
/// when a row is missing a required column.
fn main() -> io::Result<()> {
    // A monotonic clock is used for timing; the wall clock may jump backwards.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;

    let consumer_reviews = tally_reviews(BufReader::new(file))?;

    println!("Result: {:?}", consumer_reviews);

    println!("File path: {:?}", filepath);
    // Measure once so that both figures describe the same instant.
    let elapsed_ms = started.elapsed().as_millis();
    println!(
        "End in: {} ms or {} seconds.",
        elapsed_ms,
        elapsed_ms / 1000
    );
    Ok(())
}

@dbofmmbt
Copy link

Versão que clona os &str somente quando precisa inserir uma nova entrada na HashMap:

use core::time::Duration;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{
    collections::HashMap,
    fs::File,
    io::{self, BufRead, BufReader},
};

/// Returns the span between the Unix epoch and the current system time.
///
/// The `_ms` suffix is historical: the function hands back a [`Duration`], which callers
/// convert to whatever unit they need.
///
/// # Panics
///
/// Panics with "Time went problem." when the system time lies before the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    let system_now = SystemTime::now();
    let epoch_offset = system_now.duration_since(UNIX_EPOCH);
    epoch_offset.expect("Time went problem.")
}

/// Returns the number of whole milliseconds between the Unix epoch and now.
///
/// The name is misleading: the resolution is milliseconds, not microseconds.
pub fn microtime() -> u64 {
    let since_epoch = get_unix_timestamp_ms();
    // Seconds contribute 1000 ms each; the remainder of the second is added on top.
    let from_seconds = since_epoch.as_secs() * 1000;
    let from_remainder = u64::from(since_epoch.subsec_millis());
    from_seconds + from_remainder
}

/// Adds `amount` to the total stored under `key`.
///
/// When the key is new, it is inserted with `amount` as its initial value, so the first
/// occurrence is counted. The key is copied into an owned `String` only on insertion.
fn add_to_total(totals: &mut HashMap<String, u64>, key: &str, amount: u64) {
    match totals.get_mut(key) {
        Some(value) => *value += amount,
        None => {
            totals.insert(key.to_string(), amount);
        }
    }
}

/// Builds the error reported when a CSV row lacks a required column.
fn missing_column(line_number: usize, column: &str) -> io::Error {
    io::Error::new(
        io::ErrorKind::InvalidData,
        format!("line {}: {} missing", line_number, column),
    )
}

/// Aggregates comma-separated rows of the form `client,food,price` read from `reader`.
///
/// Each row increments the counter stored under its food by 1 and adds its price to the
/// total stored under its client; both kinds of key share one map, as in the original
/// program. A price that does not parse as `u32` counts as 0. Totals are kept as `u64` so
/// that summing many `u32` prices cannot overflow.
///
/// # Errors
///
/// Returns an `InvalidData` error naming the 1-based line number when a row has no food
/// or no price column (this includes blank lines), and forwards any I/O error raised
/// while reading from `reader`.
fn tally_reviews<R: BufRead>(reader: R) -> io::Result<HashMap<String, u64>> {
    let mut consumer_reviews: HashMap<String, u64> = HashMap::new();

    for (index, line) in reader.lines().enumerate() {
        let line = line?;
        let line_number = index + 1;

        let mut fields = line.split(',');

        // `split` always yields at least one item, so the client column cannot be absent.
        let current_client = fields.next().unwrap_or("");
        let food = fields
            .next()
            .ok_or_else(|| missing_column(line_number, "food"))?;
        let price = fields
            .next()
            .ok_or_else(|| missing_column(line_number, "price"))?;
        let price: u32 = price.parse().unwrap_or(0);

        add_to_total(&mut consumer_reviews, food, 1);
        add_to_total(&mut consumer_reviews, current_client, u64::from(price));
    }

    Ok(consumer_reviews)
}

/// Reads `./a.csv`, prints the aggregated map, the file path and the elapsed time.
///
/// # Errors
///
/// Returns an error (instead of panicking) when the file cannot be opened or read, or
/// when a row is missing a required column.
fn main() -> io::Result<()> {
    // A monotonic clock is used for timing; the wall clock may jump backwards.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;

    let consumer_reviews = tally_reviews(BufReader::new(file))?;

    println!("Result: {:?}", consumer_reviews);

    println!("File path: {:?}", filepath);
    // Measure once so that both figures describe the same instant.
    let elapsed_ms = started.elapsed().as_millis();
    println!(
        "End in: {} ms or {} seconds.",
        elapsed_ms,
        elapsed_ms / 1000
    );
    Ok(())
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment