@Measter · Last active October 19, 2020 10:57

// Because there's no kill like over-kill, right?
use std::{
    collections::HashMap,
    ffi::OsStr,
    fs::File,
    io::{BufRead, BufReader},
    path::Path,
};

// These pesky human languages are messy. This will help a bit.
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

// Rust doesn't have a recursive directory walker in stdlib. While writing an iterator-based one
// probably wouldn't be too complicated, I don't think it would be trivial.
// Plus, I'm already pulling in two dependencies, what's one more?
use walkdir::WalkDir;
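
// For reference, the corresponding Cargo.toml dependencies look roughly like this
// (major versions only):
//
//     [dependencies]
//     unicode-normalization = "0.1"
//     unicode-segmentation = "1"
//     walkdir = "2"
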
#[derive(Ord, PartialOrd, Eq, PartialEq)]
struct WordCount {
    word: String,
    count: u32,
}

#[derive(Default)]
struct Buffers {
    line: String,
    normalise: String,
}

fn process_file(
    file: &Path,
    word_counts: &mut HashMap<String, WordCount>,
    buffers: &mut Buffers,
) -> Result<(), std::io::Error> {
    let file = File::open(file)?;
    let mut reader = BufReader::new(file);

    while reader.read_line(&mut buffers.line)? > 0 {
        for word in buffers.line.split_word_bounds() {
            buffers.normalise.clear();
            buffers.normalise.extend(word.nfc().flat_map(|c| c.to_lowercase()));
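            // E.g. "Cafe\u{301}" (decomposed) and "Café" (precomposed) both end up
            // as "café" here, so variant encodings of a word share one count.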
            if buffers.normalise.graphemes(true).count() < 2
                || buffers
                    .normalise
                    .chars()
                    .any(|c| c.is_numeric() || c.is_control() || c.is_whitespace())
            {
                continue;
            }
            // This gets a little awkward, but the Entry API takes an owned key,
            // which would mean cloning the buffer for every single word. Doing the
            // lookup manually means we only allocate when a new entry is needed.
            let count = if let Some(v) = word_counts.get_mut(&buffers.normalise) {
                v
            } else {
                word_counts.insert(
                    buffers.normalise.clone(),
                    WordCount {
                        word: word.to_owned(),
                        count: 0,
                    },
                );
                word_counts.get_mut(&buffers.normalise).unwrap()
            };
            count.count += 1;
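            // For comparison, the concise Entry version clones the key on every
            // word, even when the word has already been seen:
            //
            //     word_counts
            //         .entry(buffers.normalise.clone())
            //         .or_insert_with(|| WordCount { word: word.to_owned(), count: 0 })
            //         .count += 1;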
        }
        buffers.line.clear();
    }

    Ok(())
}

fn main() {
    let mut word_counts: HashMap<String, WordCount> = HashMap::new();

    // Using these buffers allows us to re-use the allocations when reading a line
    // and when normalising each word.
    // We just need to remember to clear the buffers each time.
    let mut buffers = Buffers::default();

    for entry in WalkDir::new(".") {
        let entry = match entry {
            Ok(e) => e,
            Err(err) => {
                eprintln!("Error querying files: {}", err);
                continue;
            }
        };

        // Should do a case-insensitive extension test. Fortunately, this is
        // limited to ASCII, making it simple.
        if !entry.path().is_file()
            || !matches!(
                entry.path().extension().and_then(OsStr::to_str),
                Some(ext) if ext.eq_ignore_ascii_case("txt")
            )
        {
            continue;
        }

        if let Err(err) = process_file(entry.path(), &mut word_counts, &mut buffers) {
            eprintln!("Error processing file: `{}`", entry.path().display());
            eprintln!("{}", err);
        }
    }

    let mut word_array: Vec<_> = word_counts.into_iter().map(|(_, count)| count).collect();

    // Must remember to sort our array. What kind of pillock would forget to do that?
    word_array.sort_by_key(|w| w.count);
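    // `sort_unstable_by_key` would work just as well and skips the stable sort's
    // temporary allocation; stability buys nothing here, since ties in count have
    // no meaningful order anyway.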

    for word in word_array.iter().rev().take(10) {
        println!("{} {}", word.count, word.word);
    }
}
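
// Quick sanity checks for the Unicode behaviour the counting relies on.
// Run with `cargo test`.
#[cfg(test)]
mod tests {
    use unicode_normalization::UnicodeNormalization;
    use unicode_segmentation::UnicodeSegmentation;

    #[test]
    fn nfc_folds_decomposed_accents() {
        // U+0065 'e' + U+0301 combining acute composes to U+00E9 'é' under NFC,
        // so decomposed and precomposed spellings normalise to the same key.
        let normalised: String = "Cafe\u{301}".nfc().flat_map(|c| c.to_lowercase()).collect();
        assert_eq!(normalised, "café");
    }

    #[test]
    fn word_bounds_separate_punctuation() {
        // UAX #29 word boundaries keep punctuation and spaces as their own
        // segments, which the filters in `process_file` then discard.
        let segments: Vec<&str> = "Hello, world!".split_word_bounds().collect();
        assert_eq!(segments, ["Hello", ",", " ", "world", "!"]);
    }
}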