Created
December 21, 2019 13:23
-
-
Save gibizer/a2a14aa9f70f45b1eda68b576deb343c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//! # Counting word frequency in text
use std::collections::HashMap; | |
use std::fs::File; | |
use std::io::Read; | |
/// Builds a case-insensitive word-frequency table from `input`.
///
/// A word is a maximal run of alphabetic characters; every other character
/// acts as a separator. Words are lowercased before they are counted.
///
/// # Panics
///
/// Panics with the message "non utf-8 content" when reading `input` into a
/// string fails.
fn get_word_freq<R: Read>(input: &mut R) -> HashMap<String, u32> {
    // TODO: stream the input instead of buffering all of it in memory
    // TODO: report unreadable input through a Result rather than panicking
    let mut buffer = String::new();
    input.read_to_string(&mut buffer).expect("non utf-8 content");

    let words = buffer
        .split(|ch: char| !ch.is_alphabetic())
        // adjacent separators produce empty slices, which are not words
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase);

    let mut frequencies = HashMap::new();
    for word in words {
        *frequencies.entry(word).or_insert(0) += 1;
    }
    // returned by move, the map is not copied
    frequencies
}
/// Returns the `count` most frequent words from the `input` | |
/// | |
/// Arguments: | |
/// | |
/// * `input`: readable input that is being consumed by the call | |
/// * `count`: the number of most frequent words returned | |
/// | |
/// Returns: a list of (word, freq) two-tuples in a frequency descending order | |
/// | |
/// Panics: | |
/// * if the input is not utf-8 encoded | |
pub fn most_frequent_words<R: Read>( | |
input: &mut R, | |
count: u32, | |
) -> Vec<(String, u32)> { | |
let counter = get_word_freq(input); | |
// consuming the content of the counter here and moving the content into | |
// the Vec | |
let mut entries: Vec<_> = counter.into_iter().collect(); | |
entries.sort_by(|a, b| b.1.cmp(&a.1)); | |
// TODO: would be better returning an iterator instead of a new Vec | |
entries.into_iter().take(count as usize).collect() | |
} | |
fn main() { | |
let mut f = File::open("/home/gibizer/rust/word-count/sherlock.txt") | |
.expect("cannot open file"); | |
for e in most_frequent_words(&mut f, 20) { | |
println!("{} {}", e.0, e.1); | |
} | |
} | |
/// Unit tests for `most_frequent_words`, driven through in-memory inputs.
#[cfg(test)]
mod tests_most_frequent_words {
    use super::*;
    use std::io::Cursor;

    /// Runs `most_frequent_words` over an in-memory string.
    fn top_words(text: &str, count: u32) -> Vec<(String, u32)> {
        most_frequent_words(&mut Cursor::new(text), count)
    }

    /// Builds an owned `(word, freq)` entry for expected values.
    fn entry(word: &str, freq: u32) -> (String, u32) {
        (word.to_owned(), freq)
    }

    // simply writing vec![] in the test code cannot infer T in Vec<T> hence
    // the helper here
    fn empty() -> Vec<(String, u32)> {
        vec![]
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(empty(), top_words("", 0));
        assert_eq!(empty(), top_words("", 10));
    }

    #[test]
    fn test_asking_for_zero() {
        assert_eq!(empty(), top_words("foo bar", 0));
    }

    #[test]
    fn test_more_words_than_asked() {
        assert_eq!(vec![entry("foo", 2)], top_words("foo bar foo", 1));
    }

    #[test]
    fn test_multiple_whitespace() {
        // runs of spaces, tabs and newlines must not produce empty words
        assert_eq!(
            vec![entry("foo", 2), entry("bar", 1)],
            top_words("  foo \t  bar\n\n foo   ", 2)
        );
    }

    #[test]
    fn test_punctuation() {
        assert_eq!(
            vec![entry("foo", 2), entry("bar", 1)],
            top_words("-foo bar, foo! ", 10)
        );
    }

    #[test]
    #[should_panic(expected = "non utf-8 content")]
    fn test_non_utf8_input() {
        // the call itself must panic; there is no return value to compare
        let _ = most_frequent_words(&mut Cursor::new([0x0u8, 0xFFu8, 0xFFu8]), 10);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment