Skip to content

Instantly share code, notes, and snippets.

@sdaza
Last active February 12, 2023 17:59
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
# author: sebastian daza
# web: sdaza.com
# libraries
library(data.table)
library(xtable)
# create descriptive tables
#' @title Latex descriptive stats table
#' @description Generate a Latex table with descriptive custom stats
#' @param datlist List of data.tables
#' @param summary_function Function for creating summaary tats
#' @param column_names Names of columns
#' @param variable_names List of vectors with variable name by dataset
#' @param variable_labels List of vectors with variable labels by dataset
#' @param group_variable Character with the name of a grouping column
#' @param align Character with table alignment, e.g., "lrrrr"
#' @param digits Vector with number of digits per column (length should be
#' number of columns 1 + )
#' @param title Title of the table
#' @param label Label of the table
#' @param fontsize Text defining the font size of the table, e.g., "\\fontsize{9pt}{10pt}\\selectfont"
#' @param note Note to be addded at the bottomm of the table
#' @param notesize Font size of note, e.g., "scriptsize"
#' @param arraystretch Number to control space between rows
#' @param tabcolsep Number to contrl space between columns
#' @param file Path to save table
#' @examples
#' library(data.table)
#' library(xtable)
#' n = 100
#' dat = data.table(
#' var1 = runif(n),
#' var2 = sample(0:1, n, prob = c(0.8, 0.2), replace = TRUE),
#' var3 = sample(1:4, n, replace = TRUE),
#' var4 = rnorm(n, 1000, 100),
#' var5 = factor(sample(1:4, n, prob = c(0.1, 0.4, 0.3, 0.2), replace = TRUE),
#' labels = c("Low", "Medium", "High", "Very high"), ordered = TRUE)
#' )
#' variables = paste0("var", 1:5)
#' labels = paste0("Variable ", 1:5)
#' createDescriptiveTable(dat,
#' summary_function = getDescriptives,
#' variable_names = variables,
#' variable_labels = labels,
#' title = "Descriptive statistics",
#' label = "tab:descriptive",
#' file = "example.tex")
createDescriptiveTable = function(datasets,
summary_function,
column_names = NULL,
variable_names,
variable_labels = NULL,
group_variable = NULL,
align = NULL,
digits = NULL,
title = "Descriptive statistics",
label = "tab:descriptive",
fontsize = "\\fontsize{9pt}{10pt}\\selectfont",
note = NULL,
notesize = "footnotesize",
arraystretch = 1.5,
tabcolsep = 10,
file = "descriptive_table.tex") {
# initial checks
if (is.list(datasets)) {
for (i in seq_along(datasets)) {
if (!is.data.table(datasets[[i]])) {
datasets[[i]] = as.data.table(datasets[[i]])
}
}
}
else {
if (!is.data.table(datasets)) {
datasets = as.data.table(datasets)
}
}
if (any(class(datasets) %in% "data.table")) { datasets = list(datasets) }
if (!is.list(variable_names)) { variable_names = list(variable_names) }
if (!is.list(variable_labels)) { variable_labels = list(variable_labels) }
if (is.null(variable_labels)) { variables_labels = variable_names }
length_lists = unique(length(datasets),
length(variable_names),
length(variable_labels)
)
if (length(length_lists) > 1) {
stop("Datasets, column names, and variable_labels should be lists of same length")
}
org_column_names = column_names
# list to save tabs
tabs = list()
factor_header = list()
sumrows = 0
for (i in seq_along(datasets)) {
if (i > 1) { sumrows = sumrows + nrow(tabs[[i-1]]) }
if (is.null(group_variable)) { vars = variable_names[[i]] }
else { vars = c(group_variable, variable_names[[i]]) }
tabs[[i]] = data.table::copy(datasets[[i]][, ..vars])
factor_dataset_list = list()
if (length(variable_names[[i]]) != length(variable_labels[[i]])) {
stop("Length of variable is different from length of labels!")
}
factor_names = names(tabs[[i]])[sapply(tabs[[i]], is.factor)]
# create dummy variables
for (h in seq_along(factor_names)) {
categories = as.vector(na.omit(sort(unique(tabs[[i]][[factor_names[h]]]))))
for (j in seq_along(categories)) {
tabs[[i]][, (paste0(factor_names[h], "_", j)) :=
ifelse(get(factor_names[h]) == categories[j], 1, 0)]
}
# position and variable name
position = which(variable_names[[i]] == factor_names[h])
if (i == 1) {
adj_position = ifelse(position == 1, 1, position - 1)
} else {
adj_position = sumrows - 1 + position
}
variable_names[[i]] = variable_names[[i]][-position]
temp_position = ifelse(i == 1, adj_position, position - 1)
variable_names[[i]] = append(variable_names[[i]], paste0(factor_names[h], "_",
1:length(categories)), after = temp_position)
lname = variable_labels[[i]][position]
factor_dataset_list[[lname]] = c(adj_position, adj_position + length(categories))
variable_labels[[i]] = variable_labels[[i]][-position]
variable_labels[[i]] = append(variable_labels[[i]], paste0("\\quad ", categories),
after = temp_position)
}
factor_header[[i]] = factor_dataset_list
factor_data_list = list()
# descriptive estimation
if (is.null(group_variable)) {
tabs[[i]] = transpose(tabs[[i]][, lapply(.SD, summary_function),
.SDcols = variable_names[[i]]], keep.names = "variables")
if (is.null(column_names) & i == 1) {
column_names = paste0("C", 1:(dim(tabs[[i]])[2]-1))
}
} else if (length(group_variable) == 1) {
group_variable_values = sort(na.omit(unique(tabs[[i]][[group_variable]])))
setorderv(tabs[[i]], group_variable)
tabs[[i]] = tabs[[i]][, lapply(.SD, summary_function), group_variable,
.SDcols = variable_names[[i]]]
num_cols = tabs[[i]][, .N, group_variable]$N[1]
if (is.null(column_names) & i == 1) {
if (num_cols != length(column_names)) {
stop("Number of column names is not the same as outputs of the summary function!")
}
column_names = paste0("C", 1:num_cols)
org_column_names = column_names
}
tabs[[i]][, summary := rep(org_column_names, length(group_variable_values))]
column_names = as.vector(outer(org_column_names, group_variable_values, paste))
tabs[[i]] = dcast(
melt(tabs[[i]], id.vars = c(group_variable, "summary"), variable.name = "variables"),
variables ~ get(group_variable) + summary
)
} else {
stop("Group variable should have length 1!")
}
if (dim(tabs[[i]])[2] != (length(column_names) + 1)) {
stop("Descriptive function values are not equal to the number of column names")
}
colnames(tabs[[i]]) = c("Variables", column_names)
tabs[[i]][["Variables"]] = variable_labels[[i]]
}
if (length(tabs) == 1) { ftabs = tabs[[1]] }
else { ftabs = rbindlist(tabs) }
total_rows = dim(ftabs)[1]
total_columns = dim(ftabs)[2]
if (is.null(align)) {
align = paste0(c("l", "l", rep("r", total_columns - 1)), collapse = "")
}
if (is.null(digits)) {
digits = rep(2, total_columns + 1)
}
if (!is.null(digits) & length(digits) < (total_columns + 1)) {
digits = c(rep(0, (total_columns + 1) - length(digits)), digits)
}
# create headers
addtorow = list()
positions = -1
command = list()
new_columns = c(" ", column_names)
command[[1]] = paste0(
"\\hline\n\\addlinespace\n", paste0(new_columns, collapse = " & "), " \\\\\n\\addlinespace\n")
name_datasets = names(datasets)
if (is.null(name_datasets)) { name_datasets = rep("", length(datasets)) }
sumrows = 0
for (i in seq_along(datasets)) {
if (i > 1) { sumrows = sumrows + nrow(tabs[[i-1]]) }
# conditions for dataset position
if (i == 1 & name_datasets[i] != "") {
positions = append(positions, tail(positions, 1) + 1)
}
else if (i > 1 & name_datasets[i] != "") {
positions = append(positions, sumrows)
}
if (name_datasets[i] != "") {
command[[length(command) + 1]] = paste0(
"\\addlinespace\n\\multicolumn{", total_columns, "}{l}{\\textbf{\\textit{", name_datasets[i], "}}} \\\\\n\\addlinespace\n"
)
}
if (length(factor_header[[i]]) > 0) {
for (h in seq_along(factor_header[[i]])) {
factor_position = factor_header[[i]][[h]][1]
factor_end_position = factor_header[[i]][[h]][2]
text = paste0(
"\\addlinespace\n\\multicolumn{", total_columns, "}{l}{\\textit{", names(factor_header[[i]])[h], "}} \\\\\n")
text_end = "\\addlinespace\n"
if (factor_position == 0 & !is.null(name_datasets)) {
command[[2]] = paste0(command[[2]], text)
} else {
positions = append(positions, factor_position)
command[[length(command) + 1]] = text
positions = append(positions, factor_end_position)
command[[length(command) + 1]] = text_end
}
}
}
}
positions = append(positions, total_rows)
command[[length(command) + 1]] = "\\addlinespace\n\\hline\n\\addlinespace\n"
addtorow$command = unlist(command)
addtorow$pos = as.list(positions)
# create latex table
latex_table = xtable(ftabs,
caption = title,
label= label,
align = align,
digits = digits)
output = print(latex_table,
hline.after = c(-1),
table.placement = 'htp',
caption.placement = 'top',
add.to.row = addtorow,
include.rownames = FALSE,
include.colnames = FALSE,
type = 'latex',
sanitize.text.function = identity,
size = fontsize,
)
# create header
if (!is.null(group_variable)) {
group_length = rep(length(org_column_names), length(group_variable_values))
# simple header
table_header = paste0("& ", column_names[1], ".+", column_names[length(column_names)], " \\\\")
print(table_header)
# nested header
table_header_line1 = paste0(paste0("& \\\\multicolumn\\{", group_length, "\\}\\{c\\}\\{",
as.character(group_variable_values), "\\}"), collapse = " ")
# positions
rule_position = NULL
for (i in seq(group_variable_values)) {
stat_number = length(org_column_names)
if (i == 1) {
last_value = 2 + stat_number - 1
rule_position[i] = paste0("2", "-", last_value)
} else {
rule_position[i] = paste0(last_value + 1, "-", last_value + stat_number)
last_value = last_value + stat_number
}
}
# rule
table_header_line2 = paste0(paste0("\\\\cmidrule(lr)\\{", rule_position, "\\}"), collapse = " ")
# stats
table_header_line3 = paste0("& ", rep(org_column_names, length(group_variable_values)),
collapse = " ")
header_replacement = paste0(table_header_line1, " \\\\\\\\ \\\n", table_header_line2,
" \\\n", table_header_line3, " \\\\")
output = gsub(table_header, header_replacement, output)
print(output)
}
# adding comment
if (!is.null(note)) {
clean_note = longText(note)
header_replacement = paste0("begin\\{table\\}\\[htp\\]\\\n",
"\\\\setlength\\{\\\\tabcolsep\\}\\{",
tabcolsep,
"pt\\}\\\n\\\\renewcommand\\{\\\\arraystretch\\}\\{",
arraystretch,
"\\}\\\n\\\\centering\\\n\\\\begin\\{threeparttable\\}\\\n")
bottom = "end\\{tabular\\}\\\n\\\\endgroup\\\n"
bottom_replacement = paste0("end\\{tabular\\}\\\n\\\\endgroup\\\n\\\\begin{tablenotes}\\\n\\\\", notesize, "\\\n\\\\item ",
note,
"\\\n\\\\end{tablenotes}")
add_notes_table(output,
comment = clean_note,
header_replacement = header_replacement,
bottom = bottom,
bottom_replacement = bottom_replacement,
closing = "end\\{tablenotes\\}",
closing_replacement = "end\\{tablenotes\\}\\\n\\\\end{threeparttable}\\\n",
filename = file)
} else {
header = "begin\\{table\\}\\[htp\\]\\\n"
header_replacement = paste0("begin\\{table\\}\\[htp\\]\\\n",
"\\\\setlength\\{\\\\tabcolsep\\}\\{",
tabcolsep,
"pt\\}\\\n\\\\renewcommand\\{\\\\arraystretch\\}\\{",
arraystretch, "}\\\n")
output = gsub(header, header_replacement, output)
cat(output, file = file)
}
}
# descriptive function
getDescriptives = function(x) {
x = as.numeric(x)
m = mean(x, na.rm = TRUE)
sd = sd(x, na.rm = TRUE)
pm = mean(is.na(x))*100
n = length(x[!is.na(x)])
return(c(m, sd, pm, n))
}
# clean long text
longText = function(text) {
dtext = gsub("\n", "", text)
dtext = gsub("\\s+", " ", dtext)
return(dtext)
}
# add note to table
add_notes_table = function(tab,
comment = "",
arraystretch = 0.8,
tabcolsep = 10,
filename = "",
header = NULL,
header_replacement = NULL,
bottom = NULL,
bottom_replacement = NULL ,
closing = NULL,
closing_replacement = NULL) {
if (is.null(header)) {
header = "begin\\{table\\}\\[htp\\]\\\n\\\\centering\\\n"
}
if (is.null(header_replacement)) {
header_replacement = paste0("begin\\{table\\}\\[htp\\]\\\n",
"\\\\setlength\\{\\\\tabcolsep\\}\\{",
tabcolsep,
"pt\\}\\\n\\\\renewcommand\\{\\\\arraystretch\\}\\{",
arraystretch,
"\\}\\\n",
"\\\\begin\\{center\\}\\\n",
"\\\\scriptsize\\\n",
"\\\\begin\\{threeparttable\\}\\\n")
}
tab = gsub(header, header_replacement, tab)
if (is.null(bottom)) {
bottom = "end\\{tabular\\}\\n"
}
if (is.null(bottom_replacement)) {
bottom_replacement = paste0("end\\{tabular\\}\\\n\\\\begin{tablenotes}\\\n\\\\scriptsize\\\n\\\\item ",
comment,
"\\\n\\\\end{tablenotes}\\\n")
}
tab = gsub(bottom, bottom_replacement, tab)
if (is.null(closing)) {
closing = "end\\{table\\}\\n"
}
if (is.null(closing_replacement)) {
closing_replacement = paste0("end{threeparttable}\\\n",
"\\\\end\\{center\\}\\\n",
"\\\\end\\{table\\}\\\n"
)
}
tab = gsub(closing, closing_replacement, tab)
cat(tab, file = filename)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment