-
-
Save ararslan/f9418ba838a788ea0341 to your computer and use it in GitHub Desktop.
k-means clustering of code golf languages by size
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data from https://data.stackexchange.com/codegolf/query/459681 | |
library(stringr) | |
library(XML) | |
library(reshape2) | |
setwd("~/Projects/ppcgdata/") | |
answers <- read.csv("QueryResults.csv", header = TRUE, stringsAsFactors = FALSE) | |
# Remove answers to questions with weird scoring | |
answers <- answers[answers$ParentId != 2078, ] | |
# Summon the pony | |
lang_count <- lapply(answers$Body, function(x) { | |
# Remove struck out text | |
nostrike <- str_replace_all(x, "<s(trike)?>.*</s(trike)?>", "") | |
# Parse out the post header | |
doc <- htmlTreeParse(nostrike, useInternalNodes = TRUE) | |
header <- xpathApply(doc, "//h1|//h2|//h3|//strong", xmlValue)[[1]] | |
# Regular expressions | |
lang_regex <- "^([^-,\\(\\d:â]+)" | |
bytes_regex <- "(\\d+)\\s*\\w*[\\)]?$" | |
# If we can get the language and byte count, great, otherwise BOOOOOO | |
if (str_detect(header, lang_regex) && str_detect(header, bytes_regex)) { | |
data.frame(language = str_trim(str_match(header, lang_regex)[, 2], "both"), | |
bytecount = as.integer(str_match(header, bytes_regex)[, 2])) | |
} else { | |
NULL | |
} | |
}) | |
lang_count <- do.call(rbind, lang_count) | |
lang_count <- lang_count[complete.cases(lang_count), ] | |
# Remove cases with ridiculous byte counts as these are nonrepresentative | |
# of their respective languages (170 is pretty arbitrary tbh) | |
lang_count <- lang_count[lang_count$bytecount < 170, ] | |
# Clean up some poorly formatted cases | |
lang_count$lang <- vapply(lang_count$language, function(x) { | |
low <- str_to_lower(x) | |
if (str_detect(low, "\\bpowershell\\b")) { | |
"powershell" | |
} else if (str_detect(low, "\\bapl\\b")) { | |
"apl" | |
} else if (str_detect(low, "\\bbash\\b") || low %in% c("sh", "shell")) { | |
"bash" | |
} else if (str_detect(low, "\\bbatch\\b")) { | |
"batch" | |
} else if (str_detect(low, "^d\\b")) { | |
"d" | |
} else if (str_detect(low, "^c\\+\\+")) { | |
"c++" | |
} else if (str_detect(low, "[+/]") && low != "pari/gp") { | |
str_trim(str_extract(low, "[^+/]+"), "both") | |
} else if (str_detect(low, "function") && low != "function") { | |
str_trim(str_extract(low, regex("^.*(?=function)", ignore_case = TRUE))) | |
} else if (str_detect(low, "\\bpython\\b")) { | |
"python" | |
} else if (str_detect(low, "\\bjava\\b")) { | |
"java" | |
} else if (low == "bf" || str_detect(low, "\\bbrainf")) { | |
"brainfuck" | |
} else if (low %in% c("js", "es", "ecmascript") || | |
str_detect(low, "\\bjavascript\\b")) { | |
"javascript" | |
} else if (str_detect(low, "\\bsql") || low == "t") { | |
"sql" | |
} else if (low == "ti") { | |
"ti-basic" | |
} else if (low == "swi") { | |
"prolog" | |
} else if (low == "gnu sed") { | |
"sed" | |
} else if (low == "gs") { | |
"gs2" | |
} else if (low %in% c("matlab", "octave")) { | |
"matlab/octave" | |
} else { | |
low | |
} | |
}, "", USE.NAMES = FALSE) | |
# Remove languages with low usage (fewer than 10 occurrences) | |
groups <- table(lang_count$lang) | |
groups <- groups[groups > 10] | |
lang_count <- lang_count[lang_count$lang %in% names(groups) & | |
lang_count$lang != "", ] | |
# Obtain clusters from k-means | |
k <- 6 | |
kmean <- kmeans(lang_count$bytecount, centers = k) | |
lang_count$cluster <- kmean$cluster | |
crosstab <- as.data.frame(table(lang_count$lang, lang_count$cluster)) | |
colnames(crosstab)[1:2] <- c("lang", "cluster") | |
crosstab <- dcast(crosstab, lang ~ cluster, fill = 0L) | |
crosstab$cluster <- vapply(1:dim(crosstab)[1], function(i) { | |
which.max(crosstab[i, 2:(k+1)]) | |
}, 0L, USE.NAMES = FALSE) | |
results <- crosstab[, c("lang", "cluster")] | |
results$lang <- as.character(results$lang) | |
centers <- data.frame(cluster = 1:k, center = kmean$centers[, 1]) | |
results <- merge(results, centers, by = "cluster") | |
results <- results[order(results$center, results$lang), ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment