# Create a gist now
#
# Instantly share code, notes, and snippets.
#
# What would you like to do?
# Pre-requisites
# Install a package only when it is missing, so that re-running the script does
# not re-install it every time.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
# Attach both packages: the script calls str_trim() and the ggplot2 plotting
# functions without a namespace prefix.
library(stringr)
library(ggplot2)
# Read the data: every column is loaded as text, the first row is the header,
# and surrounding whitespace is stripped from the fields.
results <- read.csv(
  file = "blog-examples/largest-subsequence-sets/words_subsequences.txt",
  header = TRUE,
  strip.white = TRUE,
  colClasses = "character"
)
# Keep the two columns used by the plots: the subsequence count converted to a
# number, and the word length as trimmed text.
selected_cols <- data.frame(
  subsequences = as.numeric(results$subsequences),
  length = str_trim(as.character(results$length))
)
# Lengths sorted as numbers
# Extract the distinct values from the length column. unique() is used instead
# of levels() because the column is only a factor when data.frame() converts
# strings to factors (the default before R 4.0). On a plain character column
# levels() returns NULL, which would leave no levels and turn every length
# into NA below.
all_lengths <- unique(as.character(selected_cols$length))
# Sort the numeric strings as numbers, so that e.g. "10" comes after "9".
# unique() guards against two spellings of the same number producing a
# duplicated level, which factor() rejects.
sorted_lengths <- as.character(unique(sort(as.integer(all_lengths))))
# Re-assign the levels, sorted. This way the plot will have the values in the
# right order.
selected_cols$length <- factor(selected_cols$length, levels = sorted_lengths)
# Boxplot: one box of subsequence counts per word length.
# Fill and outline colours for the boxes.
fill <- "#4271AE"
line <- "#1F3552"
ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
  geom_boxplot(colour = line, fill = fill) +
  ggtitle("Boxplot of subsequences by word length") +
  scale_x_discrete(name = "Word length") +
  scale_y_continuous(name = "Number of Subsequences") +
  theme_bw()
# Scatter plot of the same data. The x-values are discrete, so the points are
# jittered to keep them from piling up on one vertical line per word length.
ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
  geom_jitter() +
  ggtitle("Scatter plot of subsequences by word length") +
  scale_x_discrete(name = "Word length") +
  scale_y_continuous(name = "Number of Subsequences") +
  theme_bw()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment