Skip to content

Instantly share code, notes, and snippets.

@srgorelik
Last active May 15, 2020 23:10
Show Gist options
  • Save srgorelik/ac215d2e52929e5f4e57590ce326c1af to your computer and use it in GitHub Desktop.
Save srgorelik/ac215d2e52929e5f4e57590ce326c1af to your computer and use it in GitHub Desktop.
A simple R-based linux command-line tool to print a CSV file, a portion thereof, or a summary report, to the terminal.
#!/usr/bin/env Rscript
# csv - a linux CLI for nicely displaying and summarizing CSV files
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(data.table))
file_ext <- function(x) {
pos <- regexpr("\\.([[:alnum:]]+)$", x)
tolower(ifelse(pos > -1L, substring(x, pos + 1L), ""))
}
opt.list <- list(
make_option(c('-a','--all'), action = 'store_true', default = FALSE,
help = 'Print all rows.'),
make_option(c('-t','--tail'), action = 'store_true', default = FALSE,
help = 'Print only tail. By default only the head is printed.\n\t\tIgnored if the [-a|--all] flag is set.'),
make_option(c('-n','--num-rows'), type = 'integer', default = 6, metavar = 'INTEGER',
help = 'Specify number of rows to print (default is %default). Ignored\n\t\tif the [-a|--all] flag is set.'),
make_option(c('-s','--summary'), action = 'store_true', default = FALSE,
help = 'Summarize all rows by column.'),
make_option(c('-c','--col-names'), action = 'store_true', default = FALSE,
help = 'Only print column names.'),
make_option(c('-k','--keep-cols'), type = 'character', default = NULL, metavar = 'STRING',
help = 'Column selection. Provde names or numbers of columns to keep\n\t\tor drop, separated by commas (e.g., "var1,var5", or "1:4,8").\n\t\tUse "!" to drop columns (e.g., "!1,!5" or "!var1,!var5").'),
make_option(c('-d','--drop-cols'), type = 'character', default = NULL, metavar = 'STRING',
help = 'Column selection. Provde names or numbers of columns to keep\n\t\tor drop, separated by commas (e.g., "var1,var5", or "1:4,8").\n\t\tUse "!" to drop columns (e.g., "!1,!5" or "!var1,!var5").'),
make_option(c('-D','--dims'), action = 'store_true', default = FALSE,
help = 'Only print CSV dimensions (i.e., number of rows and columns).'),
make_option(c('-N','--no-header'), action = 'store_true', default = FALSE,
help = 'Flag that CSV has no header row.'),
make_option(c('-i','--remove-index'), action = 'store_true', default = FALSE,
help = 'Do not print row index.'),
make_option(c('-v','--verbose'), action = 'store_true', default = FALSE,
help = 'Prints progress and information messages.')
)
opt.parser <- OptionParser(option_list = opt.list,
usage = 'usage: %prog <input.csv> [options]\n',
description = paste0('A simple command-line tool to print a CSV file to the terminal.\n\n',
'Arguments:\n',
'\t<input.csv>\n',
'\t\tPath to CSV file.'))
arguments <- parse_args(opt.parser, positional_arguments = T)
pos.args <- arguments$args
opt.args <- arguments$options
csv.file <- pos.args[1]
print.all <- opt.args$all
print.tail <- opt.args$tail
print.nrows <- opt.args$`num-rows`
print.sum <- opt.args$summary
print.colnames <- opt.args$`col-names`
keep.cols <- opt.args$`keep-cols`
drop.cols <- opt.args$`drop-cols`
print.dims <- opt.args$dims
hdr.exists <- !opt.args$`no-header`
rem.ind <- opt.args$`remove-index`
verbose <- opt.args$verbose
if (!is.null(keep.cols) & !is.null(drop.cols)) stop('use either "-k|--keep-cols" or "-d|--drop-cols", but not both. ')
if (is.na(csv.file)) {
print_help(opt.parser)
} else if (!file.exists(csv.file)) {
stop('input file does not exist.')
} else if (file_ext(csv.file) != 'csv') {
stop('input file must be a CSV.')
} else {
if (verbose) cat(paste('Input table exists ...'), fill = T)
tmp.df <- fread(csv.file, header = F, nrows = 1, showProgress = F, data.table = F)
if (hdr.exists) {
hdr <- as.character(as.vector(tmp.df[1,]))
} else {
hdr <- colnames(tmp.df)
}
if (print.colnames) {
if (hdr.exists) {
cat(paste(hdr, collapse = ', '), fill = T)
} else {
stop('no header.')
}
} else {
# count total number of rows in CSV
tot.num.rows <- as.numeric(system(paste('sed -n "$="', csv.file), intern = T)) # includes header
if (hdr.exists) tot.num.rows <- tot.num.rows - 1
if (print.dims) {
cat(paste(tot.num.rows, 'rows,', length(hdr), 'columns'), fill = T)
} else {
if (print.nrows > tot.num.rows) {
print.all <- T
}
if (hdr.exists) {
skip.nrows <- 1 # skip header
} else {
skip.nrows <- 0
}
if (print.all | print.sum) {
print.nrows <- Inf
substr <- 'full'
} else if (print.tail) {
skip.nrows <- tot.num.rows - print.nrows + skip.nrows
substr <- paste('last', print.nrows, 'rows of')
} else {
tot.num.rows <- print.nrows
substr <- paste('first', print.nrows, 'rows of')
}
# get total number of columns
tot.num.cols <- length(hdr)
cols.num <- 1:tot.num.cols
if (!is.null(keep.cols) | !is.null(drop.cols)) {
if (!is.null(keep.cols)) user.cols <- keep.cols
if (!is.null(drop.cols)) user.cols <- drop.cols
# split input into vector of strings
cols.splt <- unlist(strsplit(user.cols, ','))
# if a number sequence exists in vector, expand it
cols.seq <- eval(parse(text = cols.splt[grepl(':', cols.splt)]))
# combine sequence back with other elements of vector
cols.str <- c(cols.seq, cols.splt[!grepl(':', cols.splt)])
# convert vector from character to numeric
cols.num <- suppressWarnings(as.numeric(cols.str))
# count number of NAs created when converting from character to numeric
cols.na.cnt <- sum(is.na(cols.num))
# if no NAs, then user provided column numbers, otherwise user provided column names
if (cols.na.cnt == 0) {
if (!all(abs(cols.num) %in% 1:tot.num.cols)) stop('all columns must exist in CSV.')
} else if (cols.na.cnt == length(cols.str)) {
if (!all(gsub('^-', '', cols.str) %in% hdr)) stop('all columns must exist in CSV.')
cols.num <- charmatch(cols.str, hdr) # return column numbers to keep, in ascending order
} else {
stop('must provide either column names or numbers, not both.')
}
# remove duplicate column selections
cols.num <- unique(cols.num)
# invert column selection if user chose to drop columns
if (!is.null(drop.cols)) cols.num <- (1:tot.num.cols)[-cols.num]
# final column name selection
hdr <- hdr[cols.num]
}
if (verbose) cat(paste('Reading', substr, 'table ...'), fill = T)
df <- fread(csv.file, select = cols.num, nrows = print.nrows, skip = skip.nrows, header = F, col.names = hdr, showProgress = verbose, data.table = F, stringsAsFactors = T)
width <- system('tput cols', intern = T)
options(width = width)
if (print.sum) {
if (verbose) cat('Summarizing table ...', fill = T)
print(summary(df))
} else {
if (verbose) cat('Printing table ...', fill = T)
if (!hdr.exists) skip.nrows <- skip.nrows + 1
if (rem.ind) {
row.names <- F
} else {
row.names <- as.character(skip.nrows:tot.num.rows)
}
print.data.frame(df, row.names = row.names)
}
}
}
}
@srgorelik
Copy link
Author

srgorelik commented Mar 28, 2019

Examples

For help, type:

$ csv.R --help

You should see:

Usage: csv.R <input.csv> [options]

A simple command-line tool to print a CSV file to the terminal.

Arguments:
	<input.csv>
		Path to CSV file.

Options:
	-a, --all
		Print all rows.

	-t, --tail
		Print only tail. By default only the head is printed.
		Ignored if the [-a|--all] flag is set.

	-n INTEGER, --num-rows=INTEGER
		Specify number of rows to print (default is 6). Ignored
		if the [-a|--all] flag is set.

	-s, --summary
		Summarize all rows by column.

	-c, --col-names
		Only print column names.

	-k STRING, --keep-cols=STRING
		Column names or numbers to keep, separated by commas
		(e.g., "var1,var5", or "1:4,8").

	-d STRING, --drop-cols=STRING
		Column names or numbers to drop, separated by commas
		(e.g., "var1,var5", or "1:4,8").

	-D, --dims
		Only print CSV dimensions (i.e., number of rows and columns).

	-N, --no-header
		Flag that CSV has no header row.

	-v, --verbose
		Prints progress and information messages.

	-h, --help
		Show this help message and exit

To view the contents of a CSV from the linux command-line, normally one could do:

$ head mtcars.csv

And see:

"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"
21,6,160,110,3.9,2.62,16.46,0,1,4,4
21,6,160,110,3.9,2.875,17.02,0,1,4,4
22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2

But the formatting is not ideal, especially with larger files. Instead, with this tool simply type:

$ csv.R mtcars.csv

And see:

   mpg cyl disp  hp drat    wt  qsec vs am gear carb
1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Much better. Additionally, with this tool it is easy to see a summary of the CSV file right from the command-line:

$ csv.R mtcars.csv -s
      mpg             cyl             disp             hp             drat             wt             qsec             vs               am              gear            carb
 Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0   Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000   Min.   :0.0000   Min.   :3.000   Min.   :1.000
 1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5   1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000
 Median :19.20   Median :6.000   Median :196.3   Median :123.0   Median :3.695   Median :3.325   Median :17.71   Median :0.0000   Median :0.0000   Median :4.000   Median :2.000
 Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7   Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375   Mean   :0.4062   Mean   :3.688   Mean   :2.812
 3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0   3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000
 Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0   Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :8.000

If you just want to know the dimensions of the CSV file, type:

$ csv.R mtcars.csv -D
33 rows, 11 columns

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment