gibizer/word-freq2.rs

## word-freq2.rs
//! # Counting word frequency in text
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;

/// Counts how often each word occurs in `input`.
///
/// A word is a maximal run of alphabetic characters (as decided by
/// `char::is_alphabetic`); every other character is a separator. Words are
/// lowercased before counting, so "Foo" and "foo" share one entry.
///
/// The input is read line by line through a buffered reader into a single
/// reused line buffer, so the text is never held in memory as a whole. A
/// newline is itself a separator, hence a word can never span two lines and
/// the counts are the same as when splitting the complete text at once.
///
/// # Panics
///
/// Panics with "non utf-8 content" if reading fails, which includes input
/// that is not valid utf-8.
fn get_word_freq<R: Read>(input: &mut R) -> HashMap<String, u32> {
    // TODO: would be nicer to return a Result<HashMap..., Err> instead of
    // panic on non utf-8 input
    let mut reader = std::io::BufReader::new(input);
    let mut line = String::new();
    let mut counter = HashMap::new();
    loop {
        // read_line appends, so the buffer has to be emptied before reuse
        line.clear();
        let bytes_read = std::io::BufRead::read_line(&mut reader, &mut line)
            .expect("non utf-8 content");
        if bytes_read == 0 {
            // zero bytes read means end of input
            break;
        }
        for s in line
            .split(|c: char| !c.is_alphabetic())
            // adjacent separators produce empty slices, those are not words
            .filter(|s| !s.is_empty())
        {
            *counter.entry(s.to_lowercase()).or_insert(0) += 1;
        }
    }
    // the default move semantic will not copy the HashMap
    counter
}

/// Returns the `count` most frequent words from the `input`.
///
/// Words are extracted and lowercased by `get_word_freq`. The result is
/// ordered by descending frequency; words with the same frequency are
/// ordered alphabetically, so the output does not depend on the arbitrary
/// iteration order of the underlying `HashMap`.
///
/// # Arguments
///
/// * `input`: readable input that is being consumed by the call
/// * `count`: the maximum number of words returned; fewer are returned when
///   the input has fewer distinct words
///
/// # Returns
///
/// A list of `(word, freq)` two-tuples in frequency descending order.
///
/// # Panics
///
/// Panics if the input is not utf-8 encoded.
pub fn most_frequent_words<R: Read>(
    input: &mut R,
    count: u32,
) -> Vec<(String, u32)> {
    // consuming the content of the counter here and moving the content into
    // the Vec
    let mut entries: Vec<_> = get_word_freq(input).into_iter().collect();
    // Words are unique map keys, so (frequency desc, word asc) is a total
    // order and the unstable sort still gives a deterministic result.
    entries.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    // TODO: would be better returning an iterator instead of a new Vec
    // Dropping the tail in place avoids building a second Vec.
    entries.truncate(count as usize);
    entries
}

/// Prints the 20 most frequent words of a text file, one `word count` pair
/// per line.
///
/// The file is taken from the first command-line argument. When no argument
/// is given the original hard-coded sample path is used, so running the
/// program without arguments behaves as before.
///
/// # Panics
///
/// Panics if the file cannot be opened or is not utf-8 encoded.
fn main() {
    const DEFAULT_PATH: &str = "/home/gibizer/rust/word-count/sherlock.txt";

    // args_os is used so that a path that is not valid unicode is still
    // accepted instead of aborting while parsing the arguments
    let path = std::env::args_os()
        .nth(1)
        .unwrap_or_else(|| std::ffi::OsString::from(DEFAULT_PATH));
    let mut f = File::open(&path).expect("cannot open file");

    for (word, freq) in most_frequent_words(&mut f, 20) {
        println!("{} {}", word, freq);
    }
}

/// Unit tests for `most_frequent_words`, feeding in-memory input through
/// `Cursor`.
#[cfg(test)]
mod tests_most_frequent_words {
    use super::*;
    use std::io::Cursor;

    // simply writing vec![] in the test code cannot infer T in Vec<T> hence
    // the helper here
    fn empty() -> Vec<(String, u32)> {
        vec![]
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(empty(), most_frequent_words(&mut Cursor::new(""), 0));
        assert_eq!(empty(), most_frequent_words(&mut Cursor::new(""), 10));
    }

    #[test]
    fn test_asking_for_zero() {
        assert_eq!(
            empty(),
            most_frequent_words(&mut Cursor::new("foo bar"), 0)
        );
    }

    #[test]
    fn test_more_words_than_asked() {
        assert_eq!(
            vec![("foo".to_owned(), 2)],
            most_frequent_words(&mut Cursor::new("foo bar foo"), 1)
        );
    }

    #[test]
    fn test_multiple_whitespace() {
        assert_eq!(
            vec![("foo".to_owned(), 2), ("bar".to_owned(), 1)],
            most_frequent_words(&mut Cursor::new(" foo  bar foo  "), 2)
        );
    }

    #[test]
    fn test_punctuation() {
        assert_eq!(
            vec![("foo".to_owned(), 2), ("bar".to_owned(), 1)],
            most_frequent_words(&mut Cursor::new("-foo  bar, foo!  "), 10)
        );
    }

    #[test]
    #[should_panic(expected = "non utf-8 content")]
    fn test_non_utf8_input() {
        // The call itself has to panic, so there is no result to compare
        // against; 0xFF is never a valid byte in utf-8.
        let _ = most_frequent_words(
            &mut Cursor::new([0x0u8, 0xFFu8, 0xFFu8]),
            10,
        );
    }
}
	//! # Counting word frequency in text
	use std::collections::HashMap;
	use std::fs::File;
	use std::io::Read;

	/// Counts how often each word occurs in `input`.
	///
	/// A word is a maximal run of alphabetic characters (as decided by
	/// `char::is_alphabetic`); every other character is a separator. Words are
	/// lowercased before counting, so "Foo" and "foo" share one entry.
	///
	/// The input is read line by line through a buffered reader into a single
	/// reused line buffer, so the text is never held in memory as a whole. A
	/// newline is itself a separator, hence a word can never span two lines and
	/// the counts are the same as when splitting the complete text at once.
	///
	/// # Panics
	///
	/// Panics with "non utf-8 content" if reading fails, which includes input
	/// that is not valid utf-8.
	fn get_word_freq<R: Read>(input: &mut R) -> HashMap<String, u32> {
	    // TODO: would be nicer to return a Result<HashMap..., Err> instead of
	    // panic on non utf-8 input
	    let mut reader = std::io::BufReader::new(input);
	    let mut line = String::new();
	    let mut counter = HashMap::new();
	    loop {
	        // read_line appends, so the buffer has to be emptied before reuse
	        line.clear();
	        let bytes_read = std::io::BufRead::read_line(&mut reader, &mut line)
	            .expect("non utf-8 content");
	        if bytes_read == 0 {
	            // zero bytes read means end of input
	            break;
	        }
	        for s in line
	            .split(|c: char| !c.is_alphabetic())
	            // adjacent separators produce empty slices, those are not words
	            .filter(|s| !s.is_empty())
	        {
	            *counter.entry(s.to_lowercase()).or_insert(0) += 1;
	        }
	    }
	    // the default move semantic will not copy the HashMap
	    counter
	}

	/// Returns the `count` most frequent words from the `input`
	///
	/// Arguments:
	///
	/// * `input`: readable input that is being consumed by the call
	/// * `count`: the number of most frequent words returned
	///
	/// Returns: a list of (word, freq) two-tuples in a frequency descending order
	///
	/// Panics:
	/// * if the input is not utf-8 encoded
	pub fn most_frequent_words<R: Read>(
	input: &mut R,
	count: u32,
	) -> Vec<(String, u32)> {
	let counter = get_word_freq(input);
	// consuming the content of the counter here and moving the content into
	// the Vec
	let mut entries: Vec<_> = counter.into_iter().collect();
	entries.sort_by(\|a, b\| b.1.cmp(&a.1));
	// TODO: would be better returning an iterator instead of a new Vec
	entries.into_iter().take(count as usize).collect()
	}

	/// Prints the 20 most frequent words of a text file, one `word count` pair
	/// per line.
	///
	/// The file is taken from the first command-line argument. When no argument
	/// is given the original hard-coded sample path is used, so running the
	/// program without arguments behaves as before.
	///
	/// # Panics
	///
	/// Panics if the file cannot be opened or is not utf-8 encoded.
	fn main() {
	    const DEFAULT_PATH: &str = "/home/gibizer/rust/word-count/sherlock.txt";

	    // args_os is used so that a path that is not valid unicode is still
	    // accepted instead of aborting while parsing the arguments
	    let path = std::env::args_os()
	        .nth(1)
	        .unwrap_or_else(|| std::ffi::OsString::from(DEFAULT_PATH));
	    let mut f = File::open(&path).expect("cannot open file");

	    for (word, freq) in most_frequent_words(&mut f, 20) {
	        println!("{} {}", word, freq);
	    }
	}

	/// Unit tests for `most_frequent_words`, feeding in-memory input through
	/// `Cursor`.
	#[cfg(test)]
	mod tests_most_frequent_words {
	    use super::*;
	    use std::io::Cursor;

	    // simply writing vec![] in the test code cannot infer T in Vec<T> hence
	    // the helper here
	    fn empty() -> Vec<(String, u32)> {
	        vec![]
	    }

	    #[test]
	    fn test_empty_input() {
	        assert_eq!(empty(), most_frequent_words(&mut Cursor::new(""), 0));
	        assert_eq!(empty(), most_frequent_words(&mut Cursor::new(""), 10));
	    }

	    #[test]
	    fn test_asking_for_zero() {
	        assert_eq!(
	            empty(),
	            most_frequent_words(&mut Cursor::new("foo bar"), 0)
	        );
	    }

	    #[test]
	    fn test_more_words_than_asked() {
	        assert_eq!(
	            vec![("foo".to_owned(), 2)],
	            most_frequent_words(&mut Cursor::new("foo bar foo"), 1)
	        );
	    }

	    #[test]
	    fn test_multiple_whitespace() {
	        // runs of several spaces are the point of this test
	        assert_eq!(
	            vec![("foo".to_owned(), 2), ("bar".to_owned(), 1)],
	            most_frequent_words(&mut Cursor::new(" foo  bar foo  "), 2)
	        );
	    }

	    #[test]
	    fn test_punctuation() {
	        assert_eq!(
	            vec![("foo".to_owned(), 2), ("bar".to_owned(), 1)],
	            most_frequent_words(&mut Cursor::new("-foo bar, foo! "), 10)
	        );
	    }

	    #[test]
	    #[should_panic(expected = "non utf-8 content")]
	    fn test_non_utf8_input() {
	        // The call itself has to panic, so there is no result to compare
	        // against; 0xFF is never a valid byte in utf-8.
	        let _ = most_frequent_words(
	            &mut Cursor::new([0x0u8, 0xFFu8, 0xFFu8]),
	            10,
	        );
	    }
	}