Skip to content

Instantly share code, notes, and snippets.

@ararslan
Created Mar 27, 2016
Embed
What would you like to do?
k-means clustering of code golf languages by size
# Data from https://data.stackexchange.com/codegolf/query/459681
library(stringr)
library(XML)
library(reshape2)
setwd("~/Projects/ppcgdata/")
answers <- read.csv("QueryResults.csv", header = TRUE, stringsAsFactors = FALSE)
# Remove answers to questions with weird scoring
answers <- answers[answers$ParentId != 2078, ]
# Summon the pony
lang_count <- lapply(answers$Body, function(x) {
# Remove struck out text
nostrike <- str_replace_all(x, "<s(trike)?>.*</s(trike)?>", "")
# Parse out the post header
doc <- htmlTreeParse(nostrike, useInternalNodes = TRUE)
header <- xpathApply(doc, "//h1|//h2|//h3|//strong", xmlValue)[[1]]
# Regular expressions
lang_regex <- "^([^-,\\(\\d:â]+)"
bytes_regex <- "(\\d+)\\s*\\w*[\\)]?$"
# If we can get the language and byte count, great, otherwise BOOOOOO
if (str_detect(header, lang_regex) && str_detect(header, bytes_regex)) {
data.frame(language = str_trim(str_match(header, lang_regex)[, 2], "both"),
bytecount = as.integer(str_match(header, bytes_regex)[, 2]))
} else {
NULL
}
})
lang_count <- do.call(rbind, lang_count)
lang_count <- lang_count[complete.cases(lang_count), ]
# Remove cases with ridiculous byte counts as these are nonrepresentative
# of their respective languages (170 is pretty arbitrary tbh)
lang_count <- lang_count[lang_count$bytecount < 170, ]
# Clean up some poorly formatted cases
lang_count$lang <- vapply(lang_count$language, function(x) {
low <- str_to_lower(x)
if (str_detect(low, "\\bpowershell\\b")) {
"powershell"
} else if (str_detect(low, "\\bapl\\b")) {
"apl"
} else if (str_detect(low, "\\bbash\\b") || low %in% c("sh", "shell")) {
"bash"
} else if (str_detect(low, "\\bbatch\\b")) {
"batch"
} else if (str_detect(low, "^d\\b")) {
"d"
} else if (str_detect(low, "^c\\+\\+")) {
"c++"
} else if (str_detect(low, "[+/]") && low != "pari/gp") {
str_trim(str_extract(low, "[^+/]+"), "both")
} else if (str_detect(low, "function") && low != "function") {
str_trim(str_extract(low, regex("^.*(?=function)", ignore_case = TRUE)))
} else if (str_detect(low, "\\bpython\\b")) {
"python"
} else if (str_detect(low, "\\bjava\\b")) {
"java"
} else if (low == "bf" || str_detect(low, "\\bbrainf")) {
"brainfuck"
} else if (low %in% c("js", "es", "ecmascript") ||
str_detect(low, "\\bjavascript\\b")) {
"javascript"
} else if (str_detect(low, "\\bsql") || low == "t") {
"sql"
} else if (low == "ti") {
"ti-basic"
} else if (low == "swi") {
"prolog"
} else if (low == "gnu sed") {
"sed"
} else if (low == "gs") {
"gs2"
} else if (low %in% c("matlab", "octave")) {
"matlab/octave"
} else {
low
}
}, "", USE.NAMES = FALSE)
# Remove languages with low usage (fewer than 10 occurrences)
groups <- table(lang_count$lang)
groups <- groups[groups > 10]
lang_count <- lang_count[lang_count$lang %in% names(groups) &
lang_count$lang != "", ]
# Obtain clusters from k-means
k <- 6
kmean <- kmeans(lang_count$bytecount, centers = k)
lang_count$cluster <- kmean$cluster
crosstab <- as.data.frame(table(lang_count$lang, lang_count$cluster))
colnames(crosstab)[1:2] <- c("lang", "cluster")
crosstab <- dcast(crosstab, lang ~ cluster, fill = 0L)
crosstab$cluster <- vapply(1:dim(crosstab)[1], function(i) {
which.max(crosstab[i, 2:(k+1)])
}, 0L, USE.NAMES = FALSE)
results <- crosstab[, c("lang", "cluster")]
results$lang <- as.character(results$lang)
centers <- data.frame(cluster = 1:k, center = kmean$centers[, 1])
results <- merge(results, centers, by = "cluster")
results <- results[order(results$center, results$lang), ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment