Skip to content

Instantly share code, notes, and snippets.

@steverhall
Last active January 11, 2022 19:30
Show Gist options
  • Save steverhall/545e5d3ef0fc7c0e3a68e235fd26f046 to your computer and use it in GitHub Desktop.
Save steverhall/545e5d3ef0fc7c0e3a68e235fd26f046 to your computer and use it in GitHub Desktop.
CSVUTIL Rust version - parses CSV and outputs column widths and info
use std::env;
use std::fs::File;
use std::io::{BufRead, BufReader};
/// Per-column statistics gathered while scanning a CSV file.
struct Column {
/// Column name, taken verbatim from the header line (quotes are not stripped).
name: String,
/// Widest field seen so far in this column.
length: usize,
/// Row counter value of the line where the widest field was seen.
/// The counter starts at 0 on the header line, so the first data line is 1.
largest_line: u32,
/// Inferred type label: starts as "" and is set to "INT", "FLOAT" or
/// "VARCHAR" as data lines are processed.
column_type: String,
}
/// Entry point: profiles the CSV file named by the first command-line argument.
///
/// The first line of the file is treated as the header and defines the
/// columns; every following line updates the per-column width and type
/// statistics. A total line count (header included) and a per-column summary
/// are printed to stdout.
///
/// Exits with status 2 when no file name is given and status 1 when the file
/// cannot be opened or read (including input that is not valid UTF-8).
fn main() {
    let args: Vec<String> = env::args().collect();
    let filename = match args.get(1) {
        Some(path) => path,
        None => {
            let program = args.first().map(String::as_str).unwrap_or("csvutil");
            eprintln!("usage: {} <file.csv>", program);
            std::process::exit(2);
        }
    };
    let f = match File::open(filename) {
        Ok(handle) => handle,
        Err(err) => {
            eprintln!("cannot open {}: {}", filename, err);
            std::process::exit(1);
        }
    };

    let mut columns: Vec<Column> = Vec::new();
    let mut row_number = 0;

    // Read each line of file
    let file = BufReader::new(&f);
    for line in file.lines() {
        let line = match line {
            Ok(text) => text,
            Err(err) => {
                eprintln!(
                    "error reading {} at line {}: {}",
                    filename,
                    row_number + 1,
                    err
                );
                std::process::exit(1);
            }
        };
        // The header parser always yields at least one column, so an empty
        // column list means the header has not been read yet.
        if columns.is_empty() {
            columns = parse_header_columns(&line);
        } else {
            process_line(&line, &mut columns, row_number);
        }
        row_number += 1;
    }

    println!("Total rows: {}", row_number);
    print_summary(columns);
}
/// Updates the per-column statistics in `columns` from one data line.
///
/// The line is split on commas that are not inside double quotes. For every
/// field, the matching column's `length`/`largest_line` are raised when the
/// field is wider than anything seen before, and its `column_type` is
/// recomputed through `calc_column_type`. A field containing any alphabetic
/// character marks its column as "VARCHAR" before that recomputation.
///
/// Field widths are measured in characters (quotes included). Fields beyond
/// the last known column are ignored instead of causing a panic.
///
/// * `line` - one line of the CSV file, without the line terminator.
/// * `columns` - column statistics, in header order.
/// * `row_number` - value stored in `largest_line` when a new widest field is found.
fn process_line(line: &str, columns: &mut Vec<Column>, row_number: u32) {
    let mut field_index = 0;
    // Byte offset of the current field's first character; slicing `line`
    // requires byte offsets, hence `char_indices` rather than `enumerate`.
    let mut field_start = 0;
    let mut in_quotes = false;
    let mut has_alpha = false;

    for (offset, c) in line.char_indices() {
        match c {
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                record_field(columns, field_index, &line[field_start..offset], has_alpha, row_number);
                field_index += 1;
                // ',' is a single byte, so the next field starts right after it.
                field_start = offset + 1;
                has_alpha = false;
            }
            _ => has_alpha = has_alpha || c.is_alphabetic(),
        }
    }

    // The last field has no trailing comma.
    record_field(columns, field_index, &line[field_start..], has_alpha, row_number);
}

/// Folds a single field into the statistics of column `field_index`, if that column exists.
fn record_field(
    columns: &mut [Column],
    field_index: usize,
    field: &str,
    has_alpha: bool,
    row_number: u32,
) {
    let column = match columns.get_mut(field_index) {
        Some(column) => column,
        // More fields than header columns: nothing to record against.
        None => return,
    };

    let width = field.chars().count();
    if width > column.length {
        column.length = width;
        column.largest_line = row_number;
    }

    if has_alpha && column.column_type != "VARCHAR" {
        column.column_type = "VARCHAR".to_string();
    }
    column.column_type = calc_column_type(field, &column.column_type);
}
/// Builds the initial column list from the CSV header line.
///
/// The line is split on commas that are not inside double quotes, the same
/// rule the data-line scanner in this file uses, so header and data rows agree
/// on the number of columns. Names are kept verbatim (surrounding quotes are
/// not stripped). Every column starts with width 0, largest line 0 and an
/// empty type.
///
/// Always returns at least one column; an empty line yields a single column
/// with an empty name.
fn parse_header_columns(line: &str) -> Vec<Column> {
    let mut columns: Vec<Column> = Vec::new();
    // Byte offset where the current name starts; `char_indices` keeps the
    // offsets valid for slicing even when names contain non-ASCII text.
    let mut start = 0;
    let mut in_quotes = false;

    for (offset, c) in line.char_indices() {
        match c {
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                columns.push(create_column(&line[start..offset], 0, 0, ""));
                start = offset + 1;
            }
            _ => {}
        }
    }

    // Last name has no trailing comma.
    columns.push(create_column(&line[start..], 0, 0, ""));
    columns
}
/// Assembles a `Column` from its parts, copying the borrowed name and type
/// label into owned strings.
fn create_column(name: &str, length: usize, largest_line: u32, column_type: &str) -> Column {
    let name = String::from(name);
    let column_type = String::from(column_type);
    Column {
        name,
        length,
        largest_line,
        column_type,
    }
}
/// Returns the type label for a column after seeing one more field.
///
/// Types only ever widen, in the order "INT" -> "FLOAT" -> "VARCHAR", so the
/// result does not depend on the order in which rows are seen:
///
/// * an empty field, or a column already labelled "VARCHAR", gives "VARCHAR";
/// * otherwise a field containing '.', or a column already labelled "FLOAT",
///   gives "FLOAT";
/// * otherwise the result is "INT".
///
/// The field's characters are not validated here beyond the '.' check; the
/// caller is expected to pass "VARCHAR" as `existing_type` for fields it has
/// already identified as text.
///
/// * `field` - raw text of the field.
/// * `existing_type` - the column's current label ("" when none has been assigned yet).
fn calc_column_type(field: &str, existing_type: &str) -> String {
    let column_type = if field.is_empty() || existing_type == "VARCHAR" {
        // VARCHAR is the widest type and must never be downgraded.
        "VARCHAR"
    } else if field.contains('.') || existing_type == "FLOAT" {
        // An integer value fits in a FLOAT column, so FLOAT is kept.
        "FLOAT"
    } else {
        "INT"
    };
    column_type.to_string()
}
fn print_summary(headers: Vec<Column>) {
println!(
"Index Column Name Width Largest Row Type"
);
println!("------|--------------------------------------------------|------|-----------|--------------");
for (idx, header) in headers.iter().enumerate() {
println!(
"{:<7}{:52}{:4}{:12} {:15}",
idx, header.name, header.length, header.largest_line, header.column_type
);
}
}
@steverhall
Copy link
Author

Processed 2052051 rows in 18.47 seconds vs 7.88 seconds with C.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment