# kunigami/analysis.r

## analysis.r
# Pre-requisites
# Install stringr only when it is missing, so that repeated runs neither
# re-download the package nor require network access.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
# Attach the packages whose functions this script calls unqualified:
# str_trim() comes from stringr; ggplot(), the geoms, the scales and
# theme_bw() come from ggplot2.
library(stringr)
library(ggplot2)

# Read the data
# Every column is read as character; the columns that are plotted are
# converted explicitly below.
results <- read.csv(
  "blog-examples/largest-subsequence-sets/words_subsequences.txt",
  colClasses = "character",
  header = TRUE,
  strip.white = TRUE
)

# Keep only the two plotted columns: the subsequence count converted to a
# number, and the length as a whitespace-trimmed string.
selected_cols <- data.frame(
  subsequences = as.numeric(results$subsequences),
  length = str_trim(as.character(results$length))
)

# Lengths sorted as numbers

# Extract the distinct values from the length column. Since R 4.0.0,
# data.frame() keeps strings as character vectors, and levels() of a character
# vector is NULL, which would leave no levels and turn every length into NA
# below. unique() on the character values works for character and factor
# columns alike.
all_lengths <- unique(as.character(selected_cols$length))
# Sort the numeric strings as numbers, so that e.g. "10" comes after "9".
sorted_lengths <- as.character(sort(as.integer(all_lengths)))
# Re-assign the levels, sorted. This way the plot will have the values in the
# right order.
selected_cols$length <- factor(selected_cols$length, levels = sorted_lengths)

# Colours passed to geom_boxplot() below: box interior and box outline.
fill <- "#4271AE"
line <- "#1F3552"

# Boxplot of the subsequence counts, one box per length value.
ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
  geom_boxplot(colour = line, fill = fill) +
  scale_x_discrete(name = "Word length") +
  scale_y_continuous(name = "Number of Subsequences") +
  ggtitle(label = "Boxplot of subsequences by word length") +
  theme_bw()

# Scatter plot of the same two columns. The points are jittered because the
# x-values are discrete.
ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
  geom_jitter() +
  scale_x_discrete(name = "Word length") +
  scale_y_continuous(name = "Number of Subsequences") +
  ggtitle(label = "Scatter plot of subsequences by word length") +
  theme_bw()
	# Pre-requisites
	# NOTE(review): from this point on the script repeats, statement for
	# statement, the analysis that precedes it in this file (read, re-level,
	# boxplot, scatter plot). Consider keeping only one copy.
	# Install stringr only when it is missing, so that repeated runs neither
	# re-download the package nor require network access.
	if (!requireNamespace("stringr", quietly = TRUE)) {
		install.packages("stringr")
	}
	# Attach the packages whose functions are called unqualified below:
	# str_trim() comes from stringr; ggplot(), the geoms, the scales and
	# theme_bw() come from ggplot2.
	library(stringr)
	library(ggplot2)

	# Read the data
	# Every column is read as character; the columns that are plotted are
	# converted explicitly below.
	results <- read.csv(
		"blog-examples/largest-subsequence-sets/words_subsequences.txt",
		colClasses = "character",
		header = TRUE,
		strip.white = TRUE
	)

	# Keep only the two plotted columns: the subsequence count converted to a
	# number, and the length as a whitespace-trimmed string.
	selected_cols <- data.frame(
		subsequences = as.numeric(results$subsequences),
		length = str_trim(as.character(results$length))
	)

	# Lengths sorted as numbers

	# Extract the distinct values from the length column. Since R 4.0.0,
	# data.frame() keeps strings as character vectors, and levels() of a
	# character vector is NULL, which would leave no levels and turn every
	# length into NA below. unique() on the character values works for
	# character and factor columns alike.
	all_lengths <- unique(as.character(selected_cols$length))
	# Sort the numeric strings as numbers, so that e.g. "10" comes after "9".
	sorted_lengths <- as.character(sort(as.integer(all_lengths)))
	# Re-assign the levels, sorted. This way the plot will have the values in
	# the right order.
	selected_cols$length <- factor(selected_cols$length, levels = sorted_lengths)

	# Colours passed to geom_boxplot() below: box interior and box outline.
	fill <- "#4271AE"
	line <- "#1F3552"

	# Boxplot of the subsequence counts, one box per length value.
	ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
		geom_boxplot(colour = line, fill = fill) +
		scale_x_discrete(name = "Word length") +
		scale_y_continuous(name = "Number of Subsequences") +
		ggtitle(label = "Boxplot of subsequences by word length") +
		theme_bw()

	# Scatter plot of the same two columns. The points are jittered because
	# the x-values are discrete.
	ggplot(data = selected_cols, mapping = aes(x = length, y = subsequences)) +
		geom_jitter() +
		scale_x_discrete(name = "Word length") +
		scale_y_continuous(name = "Number of Subsequences") +
		ggtitle(label = "Scatter plot of subsequences by word length") +
		theme_bw()