Created
May 22, 2017 00:31
-
-
Save kunigami/1823af3e289122e1f0e5253d82e15ec8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Boxplot and jittered scatter plot of the number of subsequences per word,
# grouped by word length.
#
# Input: a CSV file with a header row that contains at least the columns
# `subsequences` and `length`.

# Pre-requisites ----
# Install the packages this script uses only when they are missing (avoids a
# re-install on every run), then attach them so `str_trim()` and `ggplot()`
# are actually in scope.
required_packages <- c("stringr", "ggplot2")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(stringr)
library(ggplot2)

# Read the data ----
# Every column is read as character; conversions are done explicitly below.
results <- read.csv(
  "blog-examples/largest-subsequence-sets/words_subsequences.txt",
  colClasses = "character",
  header = TRUE,
  strip.white = TRUE
)

# Keep only the two columns that are plotted. `length` stays a character
# vector here (stringsAsFactors = FALSE makes that explicit for R < 4.0 too)
# and is turned into a factor with numerically ordered levels below.
selected_cols <- data.frame(
  subsequences = as.numeric(results$subsequences),
  length = str_trim(as.character(results$length)),
  stringsAsFactors = FALSE
)

# Lengths sorted as numbers ----
# Extract the distinct values of the length column. `unique()` is used rather
# than `levels()` because the column is a character vector, for which
# `levels()` returns NULL and would leave the factor below with no levels.
all_lengths <- unique(selected_cols$length)
# Sort the numeric strings as numbers (so "10" comes after "9", not after "1").
sorted_lengths <- as.character(sort(as.integer(all_lengths)))
# Convert to a factor with the sorted levels so the x axis is in numeric order.
selected_cols$length <- factor(selected_cols$length, levels = sorted_lengths)

# Box plot ----
box_fill <- "#4271AE"
box_outline <- "#1F3552"

box_plot <- ggplot(selected_cols, aes(x = length, y = subsequences)) +
  geom_boxplot(fill = box_fill, colour = box_outline) +
  scale_x_discrete("Word length") +
  scale_y_continuous("Number of Subsequences") +
  ggtitle("Boxplot of subsequences by word length") +
  theme_bw()
# Print explicitly so the plot is also drawn when the script is run through
# source(), where top-level values are not auto-printed.
print(box_plot)

# Scatter plot ----
# Add jitter because the x-values are discrete and points would overlap.
scatter_plot <- ggplot(selected_cols, aes(x = length, y = subsequences)) +
  geom_jitter() +
  scale_x_discrete("Word length") +
  scale_y_continuous("Number of Subsequences") +
  ggtitle("Scatter plot of subsequences by word length") +
  theme_bw()
print(scatter_plot)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment