passivedragon/ccwc.rs

## ccwc.rs
#!/usr/bin/env -S cargo +nightly -Zscript

//! ```cargo
//! [package]
//! authors = ["passivedragon"]
//! version = "0.0.1"
//! edition = "2021"
//! [dependencies]
//! clap = { version = "4.2", features = ["derive"] }
//! regex = "1.9.6"
//! unicode-segmentation = "1.10.1"
//! rayon = "1.8"
//! cli-table = "0.4.7"
//! ```

use std::collections::hash_map::HashMap;
use clap::Parser;
use rayon::prelude::*;

// Command-line options for the tool, parsed through clap's derive API.
// NOTE: the `///` lines directly above `struct Args` are consumed by the clap
// derive as the program description, so they are user-facing text rather than
// plain documentation.
#[derive(Parser, Debug)]
#[clap(version, long_about)]
/// a wc replacement written as an exercise for DevOps coding challenges
///
/// the order of numbers returned when requested, is always:
/// newlines, words, characters, bytes
struct Args {
    // #[clap(short, long, help = "Path to config")]
    // config: Option<std::path::PathBuf>,

    // -l / --lines: when set, `count_from_string` tallies '\n' occurrences.
    #[clap(short = 'l', long = "lines", help = "print newline counts")]
    newlines: bool,
    // -w / --words: when set, `count_from_string` tallies whitespace-separated words.
    #[clap(short = 'w', long = "words", help = "print word counts")]
    words: bool,
    // -m / --chars: when set, `count_from_string` tallies Unicode scalar values.
    #[clap(short = 'm', long = "chars", help = "print character counts")]
    characters: bool,
    // -c / --bytes: when set, `count_from_string` tallies the UTF-8 byte length.
    #[clap(short = 'c', long = "bytes", help = "print byte counts")]
    bytes: bool,

    // Positional list of input paths; `main` reads stdin instead when this is `None`.
    #[clap(help = "paths of files to look at")]
    files: Option<Vec<std::path::PathBuf>>
}

/// Kinds of tally that can be used as a key in a [`CountMap`].
#[derive(Debug, PartialEq, Eq, Hash)]
enum CountType {
    /// Number of newline characters.
    NEWLINES,
    /// Number of words.
    WORDS,
    /// Number of characters.
    CHARS,
    /// Number of bytes.
    BYTES,
    /// Word-counting bookkeeping: records whether the most recently inspected
    /// character was a word character.
    WasLastWord,
}

/// Map from a tally kind to its current value.
type CountMap = HashMap<CountType, usize>;


use cli_table::{format::Justify, print_stdout, Table, WithTitle};
/// Tallies gathered for one input (a file or stdin); rendered as one table row.
#[derive(Table, Default)]
struct Count {
    /// Number of `\n` characters seen.
    #[table(title = "newlines", justify = "Justify::Right")]
    newlines: usize,
    /// Number of whitespace-separated words seen.
    #[table(title = "words", justify = "Justify::Right")]
    words: usize,
    /// Number of Unicode scalar values seen.
    #[table(title = "chars", justify = "Justify::Right")]
    chars: usize,
    /// Number of bytes seen.
    #[table(title = "bytes", justify = "Justify::Right")]
    bytes: usize,

    /// Chunk-boundary bookkeeping for word counting: `true` when the previous
    /// chunk ended in the middle of a word. This is internal state, not a
    /// result, so it is kept out of the printed table (it used to appear as a
    /// second, mislabelled "bytes" column).
    #[table(skip)]
    was_last_word: bool,

    /// Human-readable name of the input these tallies belong to.
    #[table(title = "source")]
    origin: String,
}

/// Prints the collected tallies to stdout as a single table, one row per input.
///
/// * `args`  - parsed command-line options. Accepted so callers keep a stable
///   signature; the flags are not consulted here, every column that `Count`
///   exposes to the table derive is printed.
/// * `count` - the rows to print.
///
/// A failure to write the table is reported on stderr instead of being
/// silently discarded.
fn print_results(args: &Args, count: &[Count]){
    // Explicitly mark the parameter as intentionally unused.
    let _ = args;

    if let Err(err) = print_stdout(count.with_title()) {
        eprintln!("ccwc: failed to write results: {err}");
    }
}

/// Streams the file at `path` in fixed-size chunks and tallies it with
/// [`count_from_string`].
///
/// The file is read `LIMIT` bytes at a time. A multi-byte UTF-8 sequence that
/// is split across two reads is carried over to the front of the buffer and
/// completed by the next read. Bytes that are not valid UTF-8 are skipped; they
/// are added to the byte tally (when `args.bytes` is set) but contribute to no
/// other tally, so binary input can neither hang nor panic the counter.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or read.
fn count_from_file(args: &Args, path: &std::path::PathBuf) -> Result<Count, Box<dyn std::error::Error + 'static>> {
    use std::io::{BufReader, Read};
    use std::str::from_utf8;

    const LIMIT: usize = 512 * 8;

    let mut handle = BufReader::new(std::fs::File::open(path)?);
    let mut count: Count = Default::default();
    let mut buf = [0u8; LIMIT];
    // Number of bytes at the start of `buf` that belong to an incomplete UTF-8
    // sequence left over from the previous read (at most 3).
    let mut carried = 0usize;

    loop {
        let read_bytes = handle.read(&mut buf[carried..])?;
        let at_eof = read_bytes == 0;
        let filled = carried + read_bytes;
        carried = 0;

        let mut start = 0;
        while start < filled {
            match from_utf8(&buf[start..filled]) {
                Ok(s) => {
                    count_from_string(args, s, &mut count);
                    start = filled;
                }
                Err(e) => {
                    let valid_end = start + e.valid_up_to();
                    // Never hand an empty chunk to the counter.
                    if valid_end > start {
                        let s = from_utf8(&buf[start..valid_end])
                            .expect("prefix up to valid_up_to() is valid UTF-8");
                        count_from_string(args, s, &mut count);
                    }
                    match e.error_len() {
                        // Sequence cut off by the end of this read: keep its
                        // bytes and let the next read complete it.
                        None if !at_eof => {
                            buf.copy_within(valid_end..filled, 0);
                            carried = filled - valid_end;
                            start = filled;
                        }
                        // Invalid bytes, or a sequence truncated by EOF:
                        // account for them as raw bytes and move past them.
                        bad => {
                            let skipped = bad.unwrap_or(filled - valid_end);
                            if args.bytes {
                                count.bytes += skipped;
                            }
                            start = valid_end + skipped;
                        }
                    }
                }
            }
        }

        if at_eof {
            break;
        }
    }
    Ok(count)
}

fn count_from_string(args: &Args, s: &str, count: &mut Count) {
    if args.newlines {
        count.newlines += s.matches('\n').count();
    }
    if args.words {
        use regex::Regex;
        use unicode_segmentation::UnicodeSegmentation;

        let is_word = Regex::new(r"\S+").unwrap(); // wc doesn't check like "\w+", so this is for compatibility
        let mut words = is_word.find_iter(s).collect::<Vec<_>>().len();
        let graphemes = s.graphemes(true).collect::<Vec<&str>>();
        if is_word.captures(graphemes.first().unwrap()).is_some() && count.was_last_word {
            words -= 1;
        }
        if is_word.captures(&graphemes.last().unwrap()).is_some() {
            count.was_last_word = true;
        } else {
            count.was_last_word = false;
        }
        count.words += words;
    }
    if args.characters {
        count.chars += s.chars().count();
    }
    if args.bytes {
        count.bytes += s.len();
    }
}

/// Entry point: parses the command line, tallies every given file in parallel
/// (or stdin when no file is given) and prints the results as a table.
///
/// When none of the selection flags is given, newlines, words and bytes are
/// counted, mirroring the default of `wc`. Inputs that cannot be read are
/// reported on stderr and make the process exit with status 1; the remaining
/// inputs are still processed.
fn main() {
    let mut args = Args::parse();
    // Without this, no flag means nothing is counted and every column is 0.
    if !(args.newlines || args.words || args.characters || args.bytes) {
        args.newlines = true;
        args.words = true;
        args.bytes = true;
    }
    let args = args;
    let mut failed = false;

    if let Some(ref files) = args.files {
        // Errors are turned into `String`s inside the closure because
        // `Box<dyn Error>` is not `Send` and cannot cross rayon's threads.
        let outcomes: Vec<Result<Count, String>> = files
            .par_iter()
            .map(|path| {
                count_from_file(&args, path)
                    .map(|mut count| {
                        count.origin = format!("{:?}", path);
                        count
                    })
                    .map_err(|err| format!("{}: {}", path.display(), err))
            })
            .collect();

        let mut results: Vec<Count> = Vec::with_capacity(outcomes.len());
        for outcome in outcomes {
            match outcome {
                Ok(count) => results.push(count),
                Err(msg) => {
                    eprintln!("ccwc: {msg}");
                    failed = true;
                }
            }
        }
        if !results.is_empty() {
            print_results(&args, &results);
        }
    } else {
        // No paths given: read the input from stdin, line by line.
        let stdin = std::io::stdin();
        let mut line = String::new();
        let mut count: Count = Default::default();
        loop {
            match stdin.read_line(&mut line) {
                Ok(0) => break, // reached EOF
                Ok(_) => {
                    count_from_string(&args, &line, &mut count);
                    line.clear();
                }
                Err(err) => {
                    // Stop instead of looping forever on a failing stream.
                    eprintln!("ccwc: stdin: {err}");
                    failed = true;
                    break;
                }
            }
        }
        count.origin = "stdin".to_owned();
        print_results(&args, &[count]);
    }

    if failed {
        std::process::exit(1);
    }
}
	#!/usr/bin/env -S cargo +nightly -Zscript

	//! ```cargo
	//! [package]
	//! authors = ["passivedragon"]
	//! version = "0.0.1"
	//! edition = "2021"
	//! [dependencies]
	//! clap = { version = "4.2", features = ["derive"] }
	//! regex = "1.9.6"
	//! unicode-segmentation = "1.10.1"
	//! rayon = "1.8"
	//! cli-table = "0.4.7"
	//! ```

	use std::collections::hash_map::HashMap;
	use clap::Parser;
	use rayon::prelude::*;

	// Command-line options for the tool, parsed through clap's derive API.
	// NOTE: the `///` lines directly above `struct Args` are consumed by the clap
	// derive as the program description, so they are user-facing text rather than
	// plain documentation.
	#[derive(Parser, Debug)]
	#[clap(version, long_about)]
	/// a wc replacement written as an exercise for DevOps coding challenges
	///
	/// the order of numbers returned when requested, is always:
	/// newlines, words, characters, bytes
	struct Args {
	// #[clap(short, long, help = "Path to config")]
	// config: Option<std::path::PathBuf>,

	// -l / --lines: when set, `count_from_string` tallies '\n' occurrences.
	#[clap(short = 'l', long = "lines", help = "print newline counts")]
	newlines: bool,
	// -w / --words: when set, `count_from_string` tallies whitespace-separated words.
	#[clap(short = 'w', long = "words", help = "print word counts")]
	words: bool,
	// -m / --chars: when set, `count_from_string` tallies Unicode scalar values.
	#[clap(short = 'm', long = "chars", help = "print character counts")]
	characters: bool,
	// -c / --bytes: when set, `count_from_string` tallies the UTF-8 byte length.
	#[clap(short = 'c', long = "bytes", help = "print byte counts")]
	bytes: bool,

	// Positional list of input paths; `main` reads stdin instead when this is `None`.
	#[clap(help = "paths of files to look at")]
	files: Option<Vec<std::path::PathBuf>>
	}

	/// Kinds of tally that can be used as a key in a [`CountMap`].
	#[derive(Debug, PartialEq, Eq, Hash)]
	enum CountType {
	    /// Number of newline characters.
	    NEWLINES,
	    /// Number of words.
	    WORDS,
	    /// Number of characters.
	    CHARS,
	    /// Number of bytes.
	    BYTES,
	    /// Word-counting bookkeeping: records whether the most recently inspected
	    /// character was a word character.
	    WasLastWord,
	}

	/// Map from a tally kind to its current value.
	type CountMap = HashMap<CountType, usize>;


	use cli_table::{format::Justify, print_stdout, Table, WithTitle};
	/// Tallies gathered for one input (a file or stdin); rendered as one table row.
	#[derive(Table, Default)]
	struct Count {
	    /// Number of `\n` characters seen.
	    #[table(title = "newlines", justify = "Justify::Right")]
	    newlines: usize,
	    /// Number of whitespace-separated words seen.
	    #[table(title = "words", justify = "Justify::Right")]
	    words: usize,
	    /// Number of Unicode scalar values seen.
	    #[table(title = "chars", justify = "Justify::Right")]
	    chars: usize,
	    /// Number of bytes seen.
	    #[table(title = "bytes", justify = "Justify::Right")]
	    bytes: usize,

	    /// Chunk-boundary bookkeeping for word counting: `true` when the previous
	    /// chunk ended in the middle of a word. This is internal state, not a
	    /// result, so it is kept out of the printed table (it used to appear as a
	    /// second, mislabelled "bytes" column).
	    #[table(skip)]
	    was_last_word: bool,

	    /// Human-readable name of the input these tallies belong to.
	    #[table(title = "source")]
	    origin: String,
	}

	/// Prints the collected tallies to stdout as a single table, one row per input.
	///
	/// * `args`  - parsed command-line options. Accepted so callers keep a stable
	///   signature; the flags are not consulted here, every column that `Count`
	///   exposes to the table derive is printed.
	/// * `count` - the rows to print.
	///
	/// A failure to write the table is reported on stderr instead of being
	/// silently discarded.
	fn print_results(args: &Args, count: &[Count]){
	    // Explicitly mark the parameter as intentionally unused.
	    let _ = args;

	    if let Err(err) = print_stdout(count.with_title()) {
	        eprintln!("ccwc: failed to write results: {err}");
	    }
	}

	/// Streams the file at `path` in fixed-size chunks and tallies it with
	/// [`count_from_string`].
	///
	/// The file is read `LIMIT` bytes at a time. A multi-byte UTF-8 sequence that
	/// is split across two reads is carried over to the front of the buffer and
	/// completed by the next read. Bytes that are not valid UTF-8 are skipped; they
	/// are added to the byte tally (when `args.bytes` is set) but contribute to no
	/// other tally, so binary input can neither hang nor panic the counter.
	///
	/// # Errors
	///
	/// Returns the underlying I/O error if the file cannot be opened or read.
	fn count_from_file(args: &Args, path: &std::path::PathBuf) -> Result<Count, Box<dyn std::error::Error + 'static>> {
	    use std::io::{BufReader, Read};
	    use std::str::from_utf8;

	    const LIMIT: usize = 512 * 8;

	    let mut handle = BufReader::new(std::fs::File::open(path)?);
	    let mut count: Count = Default::default();
	    let mut buf = [0u8; LIMIT];
	    // Number of bytes at the start of `buf` that belong to an incomplete UTF-8
	    // sequence left over from the previous read (at most 3).
	    let mut carried = 0usize;

	    loop {
	        let read_bytes = handle.read(&mut buf[carried..])?;
	        let at_eof = read_bytes == 0;
	        let filled = carried + read_bytes;
	        carried = 0;

	        let mut start = 0;
	        while start < filled {
	            match from_utf8(&buf[start..filled]) {
	                Ok(s) => {
	                    count_from_string(args, s, &mut count);
	                    start = filled;
	                }
	                Err(e) => {
	                    let valid_end = start + e.valid_up_to();
	                    // Never hand an empty chunk to the counter.
	                    if valid_end > start {
	                        let s = from_utf8(&buf[start..valid_end])
	                            .expect("prefix up to valid_up_to() is valid UTF-8");
	                        count_from_string(args, s, &mut count);
	                    }
	                    match e.error_len() {
	                        // Sequence cut off by the end of this read: keep its
	                        // bytes and let the next read complete it.
	                        None if !at_eof => {
	                            buf.copy_within(valid_end..filled, 0);
	                            carried = filled - valid_end;
	                            start = filled;
	                        }
	                        // Invalid bytes, or a sequence truncated by EOF:
	                        // account for them as raw bytes and move past them.
	                        bad => {
	                            let skipped = bad.unwrap_or(filled - valid_end);
	                            if args.bytes {
	                                count.bytes += skipped;
	                            }
	                            start = valid_end + skipped;
	                        }
	                    }
	                }
	            }
	        }

	        if at_eof {
	            break;
	        }
	    }
	    Ok(count)
	}

	fn count_from_string(args: &Args, s: &str, count: &mut Count) {
	if args.newlines {
	count.newlines += s.matches('\n').count();
	}
	if args.words {
	use regex::Regex;
	use unicode_segmentation::UnicodeSegmentation;

	let is_word = Regex::new(r"\S+").unwrap(); // wc doesn't check like "\w+", so this is for compatibility
	let mut words = is_word.find_iter(s).collect::<Vec<_>>().len();
	let graphemes = s.graphemes(true).collect::<Vec<&str>>();
	if is_word.captures(graphemes.first().unwrap()).is_some() && count.was_last_word {
	words -= 1;
	}
	if is_word.captures(&graphemes.last().unwrap()).is_some() {
	count.was_last_word = true;
	} else {
	count.was_last_word = false;
	}
	count.words += words;
	}
	if args.characters {
	count.chars += s.chars().count();
	}
	if args.bytes {
	count.bytes += s.len();
	}
	}

	/// Entry point: parses the command line, tallies every given file in parallel
	/// (or stdin when no file is given) and prints the results as a table.
	///
	/// When none of the selection flags is given, newlines, words and bytes are
	/// counted, mirroring the default of `wc`. Inputs that cannot be read are
	/// reported on stderr and make the process exit with status 1; the remaining
	/// inputs are still processed.
	fn main() {
	    let mut args = Args::parse();
	    // Without this, no flag means nothing is counted and every column is 0.
	    if !(args.newlines || args.words || args.characters || args.bytes) {
	        args.newlines = true;
	        args.words = true;
	        args.bytes = true;
	    }
	    let args = args;
	    let mut failed = false;

	    if let Some(ref files) = args.files {
	        // Errors are turned into `String`s inside the closure because
	        // `Box<dyn Error>` is not `Send` and cannot cross rayon's threads.
	        let outcomes: Vec<Result<Count, String>> = files
	            .par_iter()
	            .map(|path| {
	                count_from_file(&args, path)
	                    .map(|mut count| {
	                        count.origin = format!("{:?}", path);
	                        count
	                    })
	                    .map_err(|err| format!("{}: {}", path.display(), err))
	            })
	            .collect();

	        let mut results: Vec<Count> = Vec::with_capacity(outcomes.len());
	        for outcome in outcomes {
	            match outcome {
	                Ok(count) => results.push(count),
	                Err(msg) => {
	                    eprintln!("ccwc: {msg}");
	                    failed = true;
	                }
	            }
	        }
	        if !results.is_empty() {
	            print_results(&args, &results);
	        }
	    } else {
	        // No paths given: read the input from stdin, line by line.
	        let stdin = std::io::stdin();
	        let mut line = String::new();
	        let mut count: Count = Default::default();
	        loop {
	            match stdin.read_line(&mut line) {
	                Ok(0) => break, // reached EOF
	                Ok(_) => {
	                    count_from_string(&args, &line, &mut count);
	                    line.clear();
	                }
	                Err(err) => {
	                    // Stop instead of looping forever on a failing stream.
	                    eprintln!("ccwc: stdin: {err}");
	                    failed = true;
	                    break;
	                }
	            }
	        }
	        count.origin = "stdin".to_owned();
	        print_results(&args, &[count]);
	    }

	    if failed {
	        std::process::exit(1);
	    }
	}