Skip to content

Instantly share code, notes, and snippets.

@svoisen
Last active May 20, 2022 03:03
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save svoisen/c6e9e495392074d20a4187226de10521 to your computer and use it in GitHub Desktop.
Save svoisen/c6e9e495392074d20a4187226de10521 to your computer and use it in GitHub Desktop.
Print histogram of file line lengths using Rust
// Dependencies required in Cargo.toml
// clap = { version = "3.1.7", features = ["derive"] }
// deque = "0.3.2"
// ignore = "0.1.6"
// num_cpus = "1.13.1"
use clap::Parser;
use deque::{Stealer, Stolen};
use ignore::WalkBuilder;
use num_cpus;
use std::cmp::max;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
#[derive(Parser)]
struct Args {
#[clap(parse(from_os_str))]
path: PathBuf
}
// Map of line length (columns) to number of lines with that length.
type LineLenMap = HashMap<usize, usize>;
// Represents a single job in the work queue.
enum Job {
File(String),
Quit
}
// Runs a job on a single thread. This threading model is basically
// stolen from how ripgrep performs threading.
struct Worker {
channel: Stealer<Job>
}
impl Worker {
fn run(self) -> Vec<LineLenMap> {
let mut worker_results: Vec<LineLenMap> = vec![];
loop {
match self.channel.steal() {
Stolen::Empty | Stolen::Abort => continue,
Stolen::Data(Job::Quit) => break,
Stolen::Data(Job::File(file_path)) => {
worker_results.push(measure_line_lengths(&file_path));
}
}
}
worker_results
}
}
fn main() {
let start = Instant::now();
let args = Args::parse();
// Add other file extensions here as needed.
let include_extensions = vec!["js", "ts", "c", "cc", "cpp", "h", "rs"];
let walker = WalkBuilder::new(&args.path)
.ignore(true)
.git_ignore(true)
.hidden(true)
.build();
let files = walker
.filter_map(Result::ok)
.filter(|entry| entry.file_type().expect("No file type").is_file())
.filter_map(|entry| {
let path = entry.path();
let ext = path.extension().and_then(std::ffi::OsStr::to_str);
match ext {
None => None,
Some(ext) => match include_extensions.contains(&ext) {
false => None,
true => Some(String::from(path.to_str().unwrap()))
}
}
});
let num_threads = num_cpus::get();
let mut workers = vec![];
let (work_queue, stealer) = deque::new();
for _ in 0..num_threads {
let worker = Worker { channel: stealer.clone() };
workers.push(std::thread::spawn(|| { worker.run() }));
}
// Add all of the files to the work queue.
let mut num_files = 0;
for file_path in files {
num_files += 1;
work_queue.push(Job::File(file_path));
}
// Ensure the threads terminate when they are done.
for _ in 0..num_threads {
work_queue.push(Job::Quit);
}
let mut all_results = LineLenMap::new();
for worker in workers {
let worker_results = worker.join().unwrap();
for results in worker_results {
merge_line_lengths(&mut all_results, &results);
}
}
print_histogram(&all_results);
println!("Average line length: {:.2}", compute_average(&all_results));
println!("Total files measured: {}", num_files);
let end = Instant::now();
println!("Total time: {}ms", end.duration_since(start).as_millis());
}
fn measure_line_lengths(path: &String) -> LineLenMap {
let content = match std::fs::read_to_string(path) {
Ok(content) => content,
Err(_) => {
return HashMap::new();
}
};
let mut line_lengths = LineLenMap::new();
for line in content.lines() {
let len = line.len();
// Round up to nearest even number.
let len = match len % 2 {
0 => len,
_ => len + 1
};
// Limit to lengths of <= 100. Remove this restriction
// to cover all lines.
if len > 0 && len <= 100 {
match line_lengths.get_mut(&len) {
Some(count) => *count += 1,
None => {
line_lengths.insert(len, 1);
}
}
}
}
line_lengths
}
// Rust doesn't (yet) have extend_with, so this is a use-specific version of
// extending the first HashMap with the values of the second by summing
// them together.
fn merge_line_lengths(first: &mut LineLenMap, second: &LineLenMap) {
for (line_length, num_lines) in second {
match first.get_mut(&line_length) {
Some(count) => *count += num_lines,
None => {
first.insert(*line_length, *num_lines);
}
}
}
}
fn compute_average(line_lengths: &LineLenMap) -> f64 {
let mut total_length: usize = 0;
let mut total_lines: usize = 0;
for (length, num_lines) in line_lengths {
total_length += length * num_lines;
total_lines += num_lines;
}
total_length as f64 / total_lines as f64
}
fn print_histogram(line_lengths: &LineLenMap) {
let bars = vec!["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"];
let mut line_lengths_vec: Vec<_> = line_lengths.iter().collect();
// Sort by largest value in descending order.
// line_lengths_vec.sort_by(|a, b| a.1.cmp(b.1).reverse());
// Sort by line length in asceding order.
line_lengths_vec.sort_by(|a, b| a.0.cmp(b.0));
// Normalize the maximum bar length of the histogram to no more than
// 80 columns.
let max_count = line_lengths_vec[0].1;
let divisor = max(8, max_count / 80 as usize);
println!(" COLS COUNT");
for tuple in line_lengths_vec {
let full_bar_count = tuple.1 / divisor;
let partial_bar_idx = (((tuple.1 % divisor) as f64 / divisor as f64) * 8.0) as usize;
print!("{cols:>width$} {count:>width$} ", cols=tuple.0, count=tuple.1, width=5);
for _ in 0..full_bar_count {
print!("{}", bars[0]);
}
if partial_bar_idx > 0 {
print!("{}", bars[partial_bar_idx]);
}
println!("");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment