Skip to content

Instantly share code, notes, and snippets.

@passivedragon
Last active October 14, 2023 20:54
Show Gist options
  • Save passivedragon/673bf6898a12385075d28d5ea96a5f37 to your computer and use it in GitHub Desktop.
Save passivedragon/673bf6898a12385075d28d5ea96a5f37 to your computer and use it in GitHub Desktop.
wc coding challenge DevOps
#!/usr/bin/env -S cargo +nightly -Zscript
//! ```cargo
//! [package]
//! authors = ["passivedragon"]
//! version = "0.0.1"
//! edition = "2021"
//! [dependencies]
//! clap = { version = "4.2", features = ["derive"] }
//! regex = "1.9.6"
//! unicode-segmentation = "1.10.1"
//! rayon = "1.8"
//! cli-table = "0.4.7"
//! ```
use std::collections::hash_map::HashMap;
use clap::Parser;
use rayon::prelude::*;
// Command-line options, parsed by clap's derive API.
//
// NOTE(review): clap's derive presumably turns the `///` lines below into the
// description shown by `--help`, which makes them runtime text rather than plain
// documentation — confirm before rewording them.
#[derive(Parser, Debug)]
#[clap(version, long_about)]
/// a wc replacement written as an exercise for DevOps coding challenges
///
/// the order of numbers returned when requested, is always:
/// newlines, words, characters, bytes
struct Args {
// #[clap(short, long, help = "Path to config")]
// config: Option<std::path::PathBuf>,
// -l / --lines: enables the newline counter in `count_from_string`.
#[clap(short = 'l', long = "lines", help = "print newline counts")]
newlines: bool,
// -w / --words: enables the word counter in `count_from_string`.
#[clap(short = 'w', long = "words", help = "print word counts")]
words: bool,
// -m / --chars: enables the character counter (counts `char`s, not bytes).
#[clap(short = 'm', long = "chars", help = "print character counts")]
characters: bool,
// -c / --bytes: enables the byte counter.
#[clap(short = 'c', long = "bytes", help = "print byte counts")]
bytes: bool,
// Positional file paths. When this is `None`, `main` reads from stdin instead.
#[clap(help = "paths of files to look at")]
files: Option<Vec<std::path::PathBuf>>
}
/// Identifies one kind of counter; used as the key type of `CountMap`.
///
/// In the source visible here it is only referenced by `CountMap` and by
/// commented-out code in `print_results`; the live counting code stores its
/// tallies in the `Count` struct instead.
#[derive(Eq, Hash, PartialEq, Debug)]
enum CountType {
NEWLINES,
WORDS,
CHARS,
BYTES,
// Bookkeeping entry rather than a real counter: records whether the last
// character looked at belonged to a word, so a word is not counted twice.
WasLastWord, // used for word counting, keeps track of if the last looked at character was a word character
}
/// Map from a counter kind to its current value.
///
/// NOTE(review): not referenced by any live code in the visible source — it looks
/// like a leftover from an earlier map-based design; confirm before removing.
type CountMap = HashMap<CountType, usize>;
use cli_table::{format::Justify, print_stdout, Table, WithTitle};
/// Tallies collected for one input (a file or stdin); rendered as one table row.
///
/// Columns appear in the order newlines, words, chars, bytes, source.
#[derive(Table, Default)]
struct Count {
    /// Number of `\n` characters seen.
    #[table(title = "newlines", justify = "Justify::Right")]
    newlines: usize,
    /// Number of whitespace-separated words seen.
    #[table(title = "words", justify = "Justify::Right")]
    words: usize,
    /// Number of Unicode scalar values (`char`s) seen.
    #[table(title = "chars", justify = "Justify::Right")]
    chars: usize,
    /// Number of bytes seen.
    #[table(title = "bytes", justify = "Justify::Right")]
    bytes: usize,
    /// Whether the most recently counted chunk ended inside a word.
    ///
    /// Internal state for counting words across chunk boundaries. It was
    /// previously rendered as a second column titled "bytes"; it is not a
    /// result, so it is excluded from the table.
    #[table(skip)]
    was_last_word: bool,
    /// Label describing where the counted data came from.
    #[table(title = "source")]
    origin: String,
}
/// Prints the collected counts as a table on stdout, one row per input.
///
/// `args` is accepted so call sites stay stable, but it is not consulted yet:
/// every column of `Count` is printed regardless of the flags that were given.
///
/// A failure to write to stdout is reported on stderr instead of being dropped.
fn print_results(args: &Args, count: &[Count]) {
    // Not used yet; see the doc comment above.
    let _ = args;

    if let Err(err) = print_stdout(count.with_title()) {
        eprintln!("failed to print results: {err}");
    }
}
fn count_from_file(args: &Args, path: &std::path::PathBuf) -> Result<Count, Box<dyn std::error::Error + 'static>> {
use std::io::{Read, BufReader};
use std::str::from_utf8;
let file = &std::fs::File::open(path)?;
const LIMIT: usize = 512*8;
// let mut handle = file.take(LIMIT.try_into().unwrap());
let mut handle = BufReader::new(file);
let mut count: Count = Default::default();
let mut buf: [u8; LIMIT] = [0; LIMIT];
loop {
let read_bytes = handle.read(&mut buf[..]).unwrap();
if 0 == read_bytes {
break; // reached EOF
}
let s = match from_utf8(&buf[0..read_bytes]) {
Ok(s) => s,
Err(e) => {
if read_bytes == 0 {
panic!("failed to read");
}
// println!("caught multibyte");
let _ = handle.seek_relative(- <usize as TryInto<i64>>::try_into(LIMIT-e.valid_up_to()).unwrap());
from_utf8(&buf[0..e.valid_up_to()]).unwrap()
}
};
count_from_string(&args, &s, &mut count); // could use from_utf8_unchecked instead
}
return Ok(count);
}
fn count_from_string(args: &Args, s: &str, count: &mut Count) {
if args.newlines {
count.newlines += s.matches('\n').count();
}
if args.words {
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;
let is_word = Regex::new(r"\S+").unwrap(); // wc doesn't check like "\w+", so this is for compatibility
let mut words = is_word.find_iter(s).collect::<Vec<_>>().len();
let graphemes = s.graphemes(true).collect::<Vec<&str>>();
if is_word.captures(graphemes.first().unwrap()).is_some() && count.was_last_word {
words -= 1;
}
if is_word.captures(&graphemes.last().unwrap()).is_some() {
count.was_last_word = true;
} else {
count.was_last_word = false;
}
count.words += words;
}
if args.characters {
count.chars += s.chars().count();
}
if args.bytes {
count.bytes += s.len();
}
}
fn main() {
let args = Args::parse();
// println!("{:?}", args);
if let Some(ref files) = args.files {
let results: Vec<Count> = files.par_iter()
.map(|i|{
let mut count = count_from_file(&args, &i).unwrap();
count.origin = format!("{:?}", i);
count
}).collect();
print_results(&args, &results);
} else {
// might be getting input from stdin
let mut buf = String::new();
let stdin = std::io::stdin();
let mut count: Count = Default::default();
loop {
let res = stdin.read_line(&mut buf);
if res.is_ok() && 0 == res.unwrap() {
// reached EOF
break;
}
count_from_string(&args, &buf, &mut count);
buf.clear();
}
count.origin = "stdin".to_owned();
print_results(&args, &[count]);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment