Skip to content

Instantly share code, notes, and snippets.

@rcdilorenzo
Last active July 14, 2018 19:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcdilorenzo/bf7fc25b91cf6653ff29f4111dc47156 to your computer and use it in GitHub Desktop.
Save rcdilorenzo/bf7fc25b91cf6653ff29f4111dc47156 to your computer and use it in GitHub Desktop.
Collection of helpful EDA functions in R (originally created for M.S. in Data Science assignment work at Regis University)
#' Plot a count histogram with a kernel density curve overlaid
#'
#' The density estimate is rescaled from the probability scale to the count
#' scale of the histogram so that both can share one y-axis.
#'
#' @param data Numeric vector to plot. Missing values are dropped before
#'   plotting.
#' @param xlab Label for the x-axis.
#' @param font.main Font face used for the main title (passed to `plot()`).
#' @param main Main title of the plot.
#' @return `NULL`, invisibly. Called for its side effect of drawing on the
#'   active graphics device.
hist.density <- function(data, xlab = "<x>", font.main = 1,
                         main = "Histogram of data") {
  # hist() silently discards non-finite values but density() stops on NA,
  # so drop missing values up front to keep the two estimates consistent.
  data <- data[!is.na(data)]
  if (length(data) < 2) {
    stop("hist.density() needs at least two non-missing values",
         call. = FALSE)
  }
  # Calculate histogram based on these values
  data.hist <- hist(data, plot = FALSE)
  # Determine scaling factor: count = density * n * bin width. Computing it
  # from the total count and the bin width (default breaks are equally
  # spaced) avoids a 0 / 0 division when the first bin happens to be empty.
  multiplier <- sum(data.hist$counts) * diff(data.hist$breaks)[1]
  # Create density function of the area
  data.density <- density(data)
  # Scale y-axis of density to histogram limits
  data.density$y <- data.density$y * multiplier
  # Plot histogram with labels and limits tall enough for both layers
  plot(data.hist, xlab = xlab, main = main, font.main = font.main,
       ylim = c(0, max(c(data.density$y, data.hist$counts))))
  # Add density function as a line overlay
  lines(data.density)
  invisible(NULL)
}
#' Draw a grid of distribution plots, one cell pair per data frame column
#'
#' Each variable occupies two stacked layout cells. Discrete columns
#' (factor, character, logical) get a title cell above a bar plot of their
#' frequencies. All other columns get a horizontal box plot above a
#' histogram with density overlay drawn by `hist.density()`.
#'
#' @param dataframe Data frame whose columns are plotted in order.
#' @param vdescriptions Titles for the variables, one per column. Defaults
#'   to the column names.
#' @param rows,columns Number of variables per page column / per page row;
#'   the layout matrix has `rows * 2` rows and `columns` columns and is
#'   filled column-wise.
#' @param main Overall title written in the outer margin.
#' @param mar,oma Inner and outer margins handed to `par()`.
#' @return `NULL`, invisibly. Called for its side effect of drawing on the
#'   active graphics device.
dist.summary <- function(dataframe, vdescriptions = names(dataframe),
                         rows = 3, columns = 3,
                         main = 'Distribution of Variables',
                         mar = c(1, 1, 2, 0), oma = c(1, 1, 3, 1)) {
  if (!is.data.frame(dataframe) || ncol(dataframe) == 0) {
    stop("dist.summary() needs a data frame with at least one column",
         call. = FALSE)
  }

  # Remember the settings this function changes and put them back on exit.
  # Restoring `mfrow` also cancels the layout(), and restoring `mar` resets
  # the `mai` values modified inside the loop. Only these parameters are
  # restored, not the full par() list.
  old_par <- par("mfrow", "mar", "oma")
  on.exit(par(old_par), add = TRUE)

  # Setup layout and spacing: per variable a short cell (relative height 2)
  # on top of a taller one (relative height 3)
  total <- rows * columns
  layout(mat = matrix(seq_len(total * 2), rows * 2, columns, byrow = FALSE),
         heights = rep(c(2, 3), rows))
  par(mar = mar, oma = oma)

  for (index in seq_len(ncol(dataframe))) {
    # Positional `[[` always yields the column vector, also for tibbles and
    # for data frames with duplicated column names
    column <- dataframe[[index]]
    # Check for discrete / continuous
    if (is.factor(column) || is.character(column) || is.logical(column)) {
      # Default padding; the upper cell only carries the title
      par(mai = rep(0.3, 4))
      plot.new()
      title(main = vdescriptions[index])
      # Padding (except top)
      par(mai = c(0.3, 0.3, 0, 0.3))
      # Display bar plot of frequencies
      barplot(table(column))
    } else {
      # Padding (expanded for top and none for bottom)
      par(mai = c(0, 0.3, 0.5, 0.3))
      boxplot(column, cex = 0.8, horizontal = TRUE, pch = '.',
              main = vdescriptions[index], outline = FALSE, axes = FALSE)
      # Padding (except top)
      par(mai = c(0.3, 0.3, 0, 0.3))
      # Display histogram and density function
      hist.density(column, font.main = 1, main = '')
    }
  }
  title(main = main, outer = TRUE)
  invisible(NULL)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment