ararslan/LanguageClustering.R Secret

## LanguageClustering.R
# Data from https://data.stackexchange.com/codegolf/query/459681

library(stringr)
library(XML)
library(reshape2)

setwd("~/Projects/ppcgdata/")

answers <- read.csv("QueryResults.csv", header = TRUE, stringsAsFactors = FALSE)

# Remove answers to questions with weird scoring
answers <- answers[answers$ParentId != 2078, ]

# Summon the pony
lang_count <- lapply(answers$Body, function(x) {
    # Remove struck out text
    nostrike <- str_replace_all(x, "<s(trike)?>.*</s(trike)?>", "")

    # Parse out the post header
    doc <- htmlTreeParse(nostrike, useInternalNodes = TRUE)
    header <- xpathApply(doc, "//h1|//h2|//h3|//strong", xmlValue)[[1]]

    # Regular expressions
    lang_regex <- "^([^-,\\(\\d:â]+)"
    bytes_regex <- "(\\d+)\\s*\\w*[\\)]?$"

    # If we can get the language and byte count, great, otherwise BOOOOOO
    if (str_detect(header, lang_regex) && str_detect(header, bytes_regex)) {
        data.frame(language = str_trim(str_match(header, lang_regex)[, 2], "both"),
                   bytecount = as.integer(str_match(header, bytes_regex)[, 2]))
    } else {
        NULL
    }
})

lang_count <- do.call(rbind, lang_count)
lang_count <- lang_count[complete.cases(lang_count), ]

# Remove cases with ridiculous byte counts as these are nonrepresentative
# of their respective languages (170 is pretty arbitrary tbh)
lang_count <- lang_count[lang_count$bytecount < 170, ]

# Clean up some poorly formatted cases
lang_count$lang <- vapply(lang_count$language, function(x) {
    low <- str_to_lower(x)
    if (str_detect(low, "\\bpowershell\\b")) {
        "powershell"
    } else if (str_detect(low, "\\bapl\\b")) {
        "apl"
    } else if (str_detect(low, "\\bbash\\b") || low %in% c("sh", "shell")) {
        "bash"
    } else if (str_detect(low, "\\bbatch\\b")) {
        "batch"
    } else if (str_detect(low, "^d\\b")) {
        "d"
    } else if (str_detect(low, "^c\\+\\+")) {
        "c++"
    } else if (str_detect(low, "[+/]") && low != "pari/gp") {
        str_trim(str_extract(low, "[^+/]+"), "both")
    } else if (str_detect(low, "function") && low != "function") {
        str_trim(str_extract(low, regex("^.*(?=function)", ignore_case = TRUE)))
    } else if (str_detect(low, "\\bpython\\b")) {
        "python"
    } else if (str_detect(low, "\\bjava\\b")) {
        "java"
    } else if (low == "bf" || str_detect(low, "\\bbrainf")) {
        "brainfuck"
    } else if (low %in% c("js", "es", "ecmascript") ||
               str_detect(low, "\\bjavascript\\b")) {
        "javascript"
    } else if (str_detect(low, "\\bsql") || low == "t") {
        "sql"
    } else if (low == "ti") {
        "ti-basic"
    } else if (low == "swi") {
        "prolog"
    } else if (low == "gnu sed") {
        "sed"
    } else if (low == "gs") {
        "gs2"
    } else if (low %in% c("matlab", "octave")) {
        "matlab/octave"
    } else {
        low
    }
}, "", USE.NAMES = FALSE)

# Remove languages with low usage (fewer than 10 occurrences)
groups <- table(lang_count$lang)
groups <- groups[groups > 10]
lang_count <- lang_count[lang_count$lang %in% names(groups) &
                         lang_count$lang != "", ]

# Obtain clusters from k-means
k <- 6
kmean <- kmeans(lang_count$bytecount, centers = k)
lang_count$cluster <- kmean$cluster

crosstab <- as.data.frame(table(lang_count$lang, lang_count$cluster))
colnames(crosstab)[1:2] <- c("lang", "cluster")
crosstab <- dcast(crosstab, lang ~ cluster, fill = 0L)

crosstab$cluster <- vapply(1:dim(crosstab)[1], function(i) {
    which.max(crosstab[i, 2:(k+1)])
}, 0L, USE.NAMES = FALSE)

results <- crosstab[, c("lang", "cluster")]
results$lang <- as.character(results$lang)

centers <- data.frame(cluster = 1:k, center = kmean$centers[, 1])
results <- merge(results, centers, by = "cluster")
results <- results[order(results$center, results$lang), ]
	# Data from https://data.stackexchange.com/codegolf/query/459681

	library(stringr)
	library(XML)
	library(reshape2)

	setwd("~/Projects/ppcgdata/")

	answers <- read.csv("QueryResults.csv", header = TRUE, stringsAsFactors = FALSE)

	# Remove answers to questions with weird scoring
	answers <- answers[answers$ParentId != 2078, ]

	# Summon the pony
	lang_count <- lapply(answers$Body, function(x) {
	# Remove struck out text
	nostrike <- str_replace_all(x, "<s(trike)?>.*</s(trike)?>", "")

	# Parse out the post header
	doc <- htmlTreeParse(nostrike, useInternalNodes = TRUE)
	header <- xpathApply(doc, "//h1\|//h2\|//h3\|//strong", xmlValue)[[1]]

	# Regular expressions
	lang_regex <- "^([^-,\\(\\d:â]+)"
	bytes_regex <- "(\\d+)\\s\\w[\\)]?$"

	# If we can get the language and byte count, great, otherwise BOOOOOO
	if (str_detect(header, lang_regex) && str_detect(header, bytes_regex)) {
	data.frame(language = str_trim(str_match(header, lang_regex)[, 2], "both"),
	bytecount = as.integer(str_match(header, bytes_regex)[, 2]))
	} else {
	NULL
	}
	})

	lang_count <- do.call(rbind, lang_count)
	lang_count <- lang_count[complete.cases(lang_count), ]

	# Remove cases with ridiculous byte counts as these are nonrepresentative
	# of their respective languages (170 is pretty arbitrary tbh)
	lang_count <- lang_count[lang_count$bytecount < 170, ]

	# Clean up some poorly formatted cases
	lang_count$lang <- vapply(lang_count$language, function(x) {
	low <- str_to_lower(x)
	if (str_detect(low, "\\bpowershell\\b")) {
	"powershell"
	} else if (str_detect(low, "\\bapl\\b")) {
	"apl"
	} else if (str_detect(low, "\\bbash\\b") \|\| low %in% c("sh", "shell")) {
	"bash"
	} else if (str_detect(low, "\\bbatch\\b")) {
	"batch"
	} else if (str_detect(low, "^d\\b")) {
	"d"
	} else if (str_detect(low, "^c\\+\\+")) {
	"c++"
	} else if (str_detect(low, "[+/]") && low != "pari/gp") {
	str_trim(str_extract(low, "[^+/]+"), "both")
	} else if (str_detect(low, "function") && low != "function") {
	str_trim(str_extract(low, regex("^.*(?=function)", ignore_case = TRUE)))
	} else if (str_detect(low, "\\bpython\\b")) {
	"python"
	} else if (str_detect(low, "\\bjava\\b")) {
	"java"
	} else if (low == "bf" \|\| str_detect(low, "\\bbrainf")) {
	"brainfuck"
	} else if (low %in% c("js", "es", "ecmascript") \|\|
	str_detect(low, "\\bjavascript\\b")) {
	"javascript"
	} else if (str_detect(low, "\\bsql") \|\| low == "t") {
	"sql"
	} else if (low == "ti") {
	"ti-basic"
	} else if (low == "swi") {
	"prolog"
	} else if (low == "gnu sed") {
	"sed"
	} else if (low == "gs") {
	"gs2"
	} else if (low %in% c("matlab", "octave")) {
	"matlab/octave"
	} else {
	low
	}
	}, "", USE.NAMES = FALSE)

	# Remove languages with low usage (fewer than 10 occurrences)
	groups <- table(lang_count$lang)
	groups <- groups[groups > 10]
	lang_count <- lang_count[lang_count$lang %in% names(groups) &
	lang_count$lang != "", ]

	# Obtain clusters from k-means
	k <- 6
	kmean <- kmeans(lang_count$bytecount, centers = k)
	lang_count$cluster <- kmean$cluster

	crosstab <- as.data.frame(table(lang_count$lang, lang_count$cluster))
	colnames(crosstab)[1:2] <- c("lang", "cluster")
	crosstab <- dcast(crosstab, lang ~ cluster, fill = 0L)

	crosstab$cluster <- vapply(1:dim(crosstab)[1], function(i) {
	which.max(crosstab[i, 2:(k+1)])
	}, 0L, USE.NAMES = FALSE)

	results <- crosstab[, c("lang", "cluster")]
	results$lang <- as.character(results$lang)

	centers <- data.frame(cluster = 1:k, center = kmean$centers[, 1])
	results <- merge(results, centers, by = "cluster")
	results <- results[order(results$center, results$lang), ]