Skip to content

Instantly share code, notes, and snippets.

@jmclawson
Last active September 3, 2020 21:23
Show Gist options
  • Save jmclawson/79cd732fb9e6a4f1dd14f1caaac4ee2d to your computer and use it in GitHub Desktop.
Save jmclawson/79cd732fb9e6a4f1dd14f1caaac4ee2d to your computer and use it in GitHub Desktop.
library(stylo)
library(ggplot2)
library(dendextend)
# Load with this line: devtools::source_gist("79cd732fb9e6a4f1dd14f1caaac4ee2d")
# Use df <- stylo() to save frequency results
# Then use stylo2gg(df) to visualize principal components
# Use stylo2gg(df, viz="hc") to show hierarchical clusters without rerunning stylo
stylo2gg <- function(df,
viz = "pca",
num.features = NULL,
labeling = NULL,
classing = NULL,
scaling = FALSE,
linkage = "ward.D",
distance = "delta",
horiz = TRUE,
invert.x = FALSE,
invert.y = FALSE) {
if (is.null(num.features)) {
num.features <- length(df$features.actually.used)
}
if (viz == "PCR") {
viz <- "pca"
scaling <- TRUE
}
df <- df$table.with.all.freqs %>%
.[,df$features.actually.used[1:num.features]] %>%
as.data.frame()
# df_summary <- rbind(avg = colMeans(df),
# sd = apply(df, 2, sd))
df_means <- colMeans(df)
df_sd <- apply(df, 2, sd)
## create table of z scores
corpus_zscores <- list()
for (row_i in rownames(df)) {
thisrow <- (df[row_i, ] - df_means) / df_sd
corpus_zscores[[row_i]] <- thisrow
}
df_z <- data.frame(matrix(unlist(corpus_zscores),
nrow = length(corpus_zscores),
byrow = T))
rownames(df_z) <- names(corpus_zscores)
colnames(df_z) <- colnames(corpus_zscores[[1]])
if (viz == "PCV") {
viz <- "pca"
df_z <- df
}
if (viz == "pca" || viz == "PCA" || viz == "PCR") {
df_pca <- prcomp(df_z, scale. = scaling)
pc_variance <- summary(df_pca)$importance[2,1:2]
df_pca <- df_pca$x %>%
as.data.frame()
if (invert.x) {
df_pca$PC1 <- df_pca$PC1 * -1
}
if (invert.y) {
df_pca$PC2 <- df_pca$PC2 * -1
}
if (is.null(classing)) {
df_pca$class <- df_pca %>%
rownames() %>%
strsplit("_") %>%
sapply(`[`, 1)
} else {
df_pca$class <- classing
}
df_pca$title <- df_pca %>%
rownames() %>%
strsplit("_") %>%
sapply(`[`, 2)
df_pca$shorttitle <- df_pca$title %>%
gsub(pattern = "[a-z]",
replacement = "",
x = .)
num_shapes <- df_pca$class %>%
unique() %>%
length()
the_plot <- df_pca %>%
ggplot(aes(PC1,
PC2)) +
geom_hline(yintercept = 0, color = "gray") +
geom_vline(xintercept = 0, color = "gray")
if (is.null(labeling)) {
the_plot <- the_plot +
geom_point(aes(shape = class,
color = class)) +
scale_shape_manual(values = rep(c(1, 3:11), length.out = num_shapes))
} else {
the_plot <- the_plot +
geom_text(aes(label = labeling,
color = class), show.legend = FALSE)
}
the_plot <- the_plot +
theme_bw() +
theme(legend.title = element_blank()) +
labs(x = paste0("PC1 (",
round(pc_variance[1]*100,1),
"%)"),
y = paste0("PC2 (",
round(pc_variance[2]*100,1),
"%)"))
} else if (viz == "hc" || viz == "ca" || viz == "CA" || viz == "HC") {
if (!distance == "euclidean") {
df <- df_z
}
if (is.null(classing)){
the_class <- df %>%
rownames() %>%
strsplit("_") %>%
sapply(`[`, 1)
} else {
the_class <- classing
}
gg_color <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
the_colors <- gg_color(length(unique(the_class)))
if (!is.null(labeling)) {
rownames(df) <- labeling
}
rownames(df) <- paste0(" ", rownames(df))
the_dend <- df %>%
as.matrix()
if (distance == "argamon") {
the_dend <- the_dend %>%
dist.argamon()
} else if (distance == "eder") {
the_dend <- the_dend %>%
dist.eder()
} else if (distance == "cosine") {
the_dend <- the_dend %>%
dist.cosine()
} else if (distance == "simple") {
the_dend <- the_dend %>%
dist.simple()
} else if (distance == "delta") {# default to delta
the_dend <- the_dend %>%
dist.delta()
} else {
the_dend <- the_dend %>%
dist(method = distance)
}
the_dend <- the_dend %>%
as.dist() %>%
hclust(method = linkage) %>%
as.dendrogram() %>%
set("branches_lwd", 0.7) %>%
# set("branches_k_color", k = 3) %>%
set("labels_cex", 0.7) %>% set("hang_leaves", 0)
labels_colors(the_dend) <- the_class %>%
as.factor() %>%
.[order.dendrogram(the_dend)] %>%
the_colors[.]
the_plot <- the_dend %>% ggplot(horiz = horiz)
}
return(the_plot)
}
@jmclawson
Copy link
Author

Instead of this gist, please use the the stylo2gg package, which does more and does it better.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment