Skip to content

Instantly share code, notes, and snippets.

@gibizer
Created December 21, 2019 13:23
Show Gist options
  • Save gibizer/a2a14aa9f70f45b1eda68b576deb343c to your computer and use it in GitHub Desktop.
Save gibizer/a2a14aa9f70f45b1eda68b576deb343c to your computer and use it in GitHub Desktop.
//! # Counting word frequency in text
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
/// Counts how often each word occurs in `input`.
///
/// A word is a maximal run of alphabetic characters (as decided by
/// `char::is_alphabetic`); every other character is a separator. Words are
/// lowercased before counting, so `Foo` and `foo` share one entry.
///
/// The input is consumed line by line, so only the current line (plus the
/// reader's internal buffer) is held in memory instead of the whole text.
///
/// # Panics
///
/// Panics with a message starting with `non utf-8 content` if reading from
/// `input` fails, which includes input that is not valid UTF-8.
fn get_word_freq<R: Read>(input: &mut R) -> HashMap<String, u32> {
    // TODO: would be nicer to return a Result<HashMap..., Err> instead of
    // panicking on unreadable input
    let mut reader = std::io::BufReader::new(input);
    let mut line = String::new();
    let mut counter: HashMap<String, u32> = HashMap::new();
    // A line break is never alphabetic, so a word cannot span two lines and
    // counting line by line yields the same totals as counting the full text.
    loop {
        line.clear();
        let bytes_read = std::io::BufRead::read_line(&mut reader, &mut line)
            .expect("non utf-8 content");
        if bytes_read == 0 {
            // read_line reports 0 bytes only at end of input
            break;
        }
        for word in line
            .split(|c: char| !c.is_alphabetic())
            .filter(|w| !w.is_empty())
        {
            *counter.entry(word.to_lowercase()).or_insert(0) += 1;
        }
    }
    // the default move semantic will not copy the HashMap
    counter
}
/// Returns the `count` most frequent words from the `input`.
///
/// Word splitting and counting is delegated to `get_word_freq`.
///
/// Arguments:
///
/// * `input`: readable input that is being consumed by the call
/// * `count`: the maximum number of words returned; fewer are returned when
///   the input holds fewer distinct words
///
/// Returns: a list of (word, freq) two-tuples in descending frequency order.
/// Words with the same frequency are listed alphabetically, so the result is
/// the same on every run.
///
/// Panics:
/// * if the input cannot be read as utf-8 encoded text
pub fn most_frequent_words<R: Read>(
    input: &mut R,
    count: u32,
) -> Vec<(String, u32)> {
    // consuming the content of the counter here and moving the content into
    // the Vec
    let mut entries: Vec<(String, u32)> =
        get_word_freq(input).into_iter().collect();
    // HashMap iteration order is unspecified, so ordering by frequency alone
    // would let equally frequent words change places between runs. The
    // alphabetical tie-break makes the order total; since no two entries
    // compare equal, an unstable sort is sufficient.
    entries.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    // TODO: would be better returning an iterator instead of a new Vec
    // Shrink in place instead of collecting into a second Vec.
    entries.truncate(count as usize);
    entries
}
/// Prints the 20 most frequent words of a text file, one `word count` pair
/// per line.
///
/// The file is named by the first command-line argument. When no argument is
/// given, the original hard-coded sample path is used.
///
/// Panics if the file cannot be opened or cannot be read as utf-8 text.
fn main() {
    // args_os keeps paths that are not valid Unicode usable.
    let path = std::env::args_os().nth(1).unwrap_or_else(|| {
        std::ffi::OsString::from("/home/gibizer/rust/word-count/sherlock.txt")
    });
    let mut f = File::open(&path).expect("cannot open file");
    for (word, freq) in most_frequent_words(&mut f, 20) {
        println!("{} {}", word, freq);
    }
}
#[cfg(test)]
mod tests_most_frequent_words {
    use super::*;
    use std::io::Cursor;

    /// Runs `most_frequent_words` over an in-memory string.
    fn top(text: &str, count: u32) -> Vec<(String, u32)> {
        most_frequent_words(&mut Cursor::new(text), count)
    }

    /// Builds an expected result from borrowed words. Passing `&[]` also
    /// gives the empty result a concrete type, which a bare `vec![]` in an
    /// assertion cannot infer.
    fn pairs(items: &[(&str, u32)]) -> Vec<(String, u32)> {
        items.iter().map(|&(word, freq)| (word.to_owned(), freq)).collect()
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(pairs(&[]), top("", 0));
        assert_eq!(pairs(&[]), top("", 10));
    }

    #[test]
    fn test_asking_for_zero() {
        assert_eq!(pairs(&[]), top("foo bar", 0));
    }

    #[test]
    fn test_more_words_than_asked() {
        assert_eq!(pairs(&[("foo", 2)]), top("foo bar foo", 1));
    }

    #[test]
    fn test_multiple_whitespace() {
        // runs of spaces, tabs and newlines must not produce empty words
        assert_eq!(
            pairs(&[("foo", 2), ("bar", 1)]),
            top("  foo \t bar\n\n foo  ", 2)
        );
    }

    #[test]
    fn test_punctuation() {
        assert_eq!(
            pairs(&[("foo", 2), ("bar", 1)]),
            top("-foo bar, foo! ", 10)
        );
    }

    #[test]
    fn test_case_is_ignored() {
        assert_eq!(
            pairs(&[("foo", 3), ("bar", 1)]),
            top("Foo foo FOO bar", 10)
        );
    }

    #[test]
    #[should_panic(expected = "non utf-8 content")]
    fn test_non_utf8_input() {
        // Only the panic matters here; the call never returns a value that
        // could be compared.
        let _ = most_frequent_words(
            &mut Cursor::new([0x0u8, 0xFFu8, 0xFFu8]),
            10,
        );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment