# Source: geoffjentry/gist:8620150

## gistfile1.txt
# Load the saved tweet collection; the rest of the script reads it as `code2013`.
load("code2013.rda") # 6028 tweets

# Drop retweets, then pull the text out of every remaining status object.
# NOTE(review): strip_retweets() and the $getText() method are not defined in
# this file -- presumably they come from the twitteR package, which would have
# to be attached before running this script; confirm.
filtered_tweets <- strip_retweets(code2013) # 5006 tweets
# vapply() always returns a character vector, even when no tweets are left
# (sapply() would return an empty list in that case).
statuses <- vapply(filtered_tweets, function(tweet) tweet$getText(), character(1))

# Read in the TIOBE data; language names are lower-cased for matching.
tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
tiobe_langs <- tolower(tiobe[, "lang"])

# Looking at the TIOBE listings and some of the tweet data, massage some of the
# entries here. This won't be perfect but will help a little bit.

# Replace every case-insensitive match of a pattern in a set of statuses.
#
# statuses: character vector of tweet texts
# was:      regular expression to look for
# is:       replacement text
# Returns a character vector the same length as `statuses`.
replace_statuses <- function(statuses, was, is) {
    gsub(was, is, statuses, ignore.case = TRUE)
}

# (pattern, replacement) pairs, applied one after another in list order.
# The delphi rule swallows an optional "/object pascal" suffix and runs before
# the plain "object pascal" rule. A bare "delphi" pattern applied after
# "object pascal" would match inside the text that rule had just inserted and
# produce "delphi/object-pascal/object-pascal".
replacements <- list(
    c("objective c", "objective-c"),
    c("visual basic", "visual-basic"),
    c("emacs lisp", "emacs-lisp"),
    c("delphi(/object[ -]pascal)?", "delphi/object-pascal"),
    c("object pascal", "delphi/object-pascal"),
    c("common lisp", "common-lisp"),
    c("elisp", "emacs-lisp")
)

# Fold the (pattern, replacement) pairs over the statuses, in list order.
statuses <- Reduce(
    function(texts, rule) replace_statuses(texts, rule[1], rule[2]),
    replacements,
    statuses
)

# Rename selected TIOBE entries to the hyphenated spellings that the status
# replacements produce. The numbers are positions in the list read from
# tiobe.csv.
tiobe_langs[c(7, 11)] <- "visual-basic"
tiobe_langs[20] <- "delphi/object-pascal"
tiobe_langs[41] <- "common-lisp"
tiobe_langs[46] <- "emacs-lisp"

tiobe[["lang"]] <- tiobe_langs
# Rows 7 and 11 now both read "visual-basic": add row 11's rating to row 7 and
# drop row 11 from the data frame (tiobe_langs itself keeps both entries).
tiobe[7, "rating"] <- tiobe[7, "rating"] + tiobe[11, "rating"]
tiobe <- tiobe[-11, ]

# I want to convert this all to lowercase but there are 67 with weird encodings.
# tolower() raises an error on those, so lower-case each status on its own and
# mark the failures with NULL (no vectors are grown inside a loop).
lowered <- lapply(statuses, function(status) {
    tryCatch(tolower(status), error = function(e) NULL)
})
is_bad <- vapply(lowered, is.null, logical(1))

# Positions (within `statuses`) of the statuses that could not be converted.
bad_statuses <- unname(which(is_bad))
# as.character() keeps this a character vector even when nothing survives.
lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

# Drop the same positions from filtered_tweets so both stay aligned.
if (length(bad_statuses) > 0) {
    filtered_tweets <- filtered_tweets[-bad_statuses]
}

statuses <- lowercase_statuses

# Tokenize each status: split on a comma, a period, or a run of whitespace.
status_tokens <- strsplit(statuses, ",|\\.|\\s+")

# For every status keep only the tokens that are TIOBE language names.
# lapply() always yields a list, whereas sapply() could collapse the result to
# a vector or matrix depending on how many matches each status has.
matching_tokens <- lapply(status_tokens, function(tokens) {
    tokens[tokens %in% tiobe_langs]
})

# Now have the languages mentioned in #code2013 which are in TIOBE
code2013_langs <- unlist(matching_tokens)

# Mention counts, most frequent first. The data frame is built explicitly:
# as.data.frame() on a sorted table yields a two-column (level, Freq) frame on
# current R versions, while the code below needs a single Count column with
# the language names as row names.
lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
new_code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
    row.names = names(lang_counts))
lang_ranks <- seq_len(nrow(new_code2013_lang_table))

# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the rank, so any number of rows works; every
# rank above 25 falls in the last tier, which keeps the label "26-40".
new_code2013_lang_table$code2013_tier <- cut(lang_ranks,
    breaks = c(0, 5, 10, 15, 25, Inf),
    labels = c("1-5", "6-10", "11-15", "16-25", "26-40"),
    ordered_result = TRUE)
# Order by the TIOBE rankings
new_code2013_lang_table$code2013_langs <- ordered(rownames(new_code2013_lang_table),
    levels = rev(tiobe[, "lang"]))
new_code2013_lang_table$code2013_rank <- lang_ranks
new_code2013_lang_table$tiobe_rank <- match(new_code2013_lang_table$code2013_langs,
    tiobe[, "lang"])

# orig_rank is looked up in code2013_lang_table, which must already exist in
# the workspace when this runs; fail with a clear message if it does not.
if (!exists("code2013_lang_table")) {
    stop("code2013_lang_table not found; build the original ranking table first",
        call. = FALSE)
}
new_code2013_lang_table$orig_rank <- code2013_lang_table[
    match(rownames(new_code2013_lang_table), rownames(code2013_lang_table)),
    "code2013_rank"]


library(ggplot2)

## Compare new vs old
# Writes into the "update" directory, which is not created here.
png(file = "update/new_vs_old.png", width = 640, height = 640)
# print() is needed so the plot is actually drawn when this file is source()d;
# ggplot objects are only auto-printed at the interactive top level.
# x carries the updated rank and y the original rank, so the axis labels and
# the title say exactly that.
print(
    ggplot(new_code2013_lang_table,
        aes(x = code2013_rank, y = orig_rank, color = code2013_tier)) +
        geom_text(aes(label = code2013_langs), size = 3.5) +
        xlab("Updated #code2013 Rank") + ylab("Original #code2013 rank") +
        ggtitle("Updated vs Original #code2013 Rankings")
)
dev.off()

## Compare new to tiobe
# Writes into the "update" directory, which is not created here.
png(file = "update/update_vs_tiobe.png", width = 640, height = 640)
# print() is needed so the plot is actually drawn when this file is source()d;
# ggplot objects are only auto-printed at the interactive top level.
print(
    ggplot(new_code2013_lang_table,
        aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
        geom_text(aes(label = code2013_langs), size = 3.5) +
        ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
        ggtitle("Updated #code2013 vs TIOBE rankings")
)
dev.off()


## Compare only new to tiobe - yes, there's a lot of ugly reused code and
## overwriting of variables. I'm lazy, hungry and want lunch.
# Keep at most the first 1404 tweets. head() does not pad with NULL entries
# when fewer are available, unlike indexing with 1:1404.
code2013 <- head(code2013, 1404)
# NOTE(review): strip_retweets() and $getText() are not defined in this file --
# presumably from the twitteR package; confirm.
filtered_tweets <- strip_retweets(code2013)
# vapply() always returns a character vector, even when no tweets are left.
statuses <- vapply(filtered_tweets, function(tweet) tweet$getText(), character(1))

# Lower-case every status individually: tolower() raises an error on badly
# encoded text, and a NULL marks those failures (no vectors are grown inside a
# loop).
lowered <- lapply(statuses, function(status) {
    tryCatch(tolower(status), error = function(e) NULL)
})
is_bad <- vapply(lowered, is.null, logical(1))

# Positions (within `statuses`) of the statuses that could not be converted.
bad_statuses <- unname(which(is_bad))
# as.character() keeps this a character vector even when nothing survives.
lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

# Drop the same positions from filtered_tweets so both stay aligned.
if (length(bad_statuses) > 0) {
    filtered_tweets <- filtered_tweets[-bad_statuses]
}

statuses <- lowercase_statuses

# Apply the (pattern, replacement) pairs to the lower-cased text, in list order.
for (pair in replacements) {
    statuses <- replace_statuses(statuses, pair[1], pair[2])
}

# Tokenize each status: split on a comma, a period, or a run of whitespace.
status_tokens <- strsplit(statuses, ",|\\.|\\s+")

# For every status keep only the tokens that are TIOBE language names.
# lapply() always yields a list, whereas sapply() could collapse the result to
# a vector or matrix depending on how many matches each status has.
matching_tokens <- lapply(status_tokens, function(tokens) {
    tokens[tokens %in% tiobe_langs]
})

# All TIOBE language tokens found in the statuses, one entry per mention.
code2013_langs <- unlist(matching_tokens)

# Mention counts, most frequent first. The data frame is built explicitly:
# as.data.frame() on a sorted table yields a two-column (level, Freq) frame on
# current R versions, while the code below needs a single Count column with
# the language names as row names.
lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
    row.names = names(lang_counts))
lang_ranks <- seq_len(nrow(code2013_lang_table))

# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the rank, so any number of rows works; every
# rank above 25 falls in the last tier, which keeps the label "26-35".
code2013_lang_table$code2013_tier <- cut(lang_ranks,
    breaks = c(0, 5, 10, 15, 25, Inf),
    labels = c("1-5", "6-10", "11-15", "16-25", "26-35"),
    ordered_result = TRUE)
# Order by the TIOBE rankings
code2013_lang_table$code2013_langs <- ordered(rownames(code2013_lang_table),
    levels = rev(tiobe[, "lang"]))
code2013_lang_table$code2013_rank <- lang_ranks
code2013_lang_table$tiobe_rank <- match(code2013_lang_table$code2013_langs,
    tiobe[, "lang"])

# Scatter of #code2013 rank against TIOBE rank, written to a PNG file.
png(file = "updated_code2013_tiobe_scatter.png", width = 640, height = 640)
# print() is needed so the plot is actually drawn when this file is source()d;
# ggplot objects are only auto-printed at the interactive top level.
print(
    ggplot(code2013_lang_table,
        aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
        geom_text(aes(label = code2013_langs), size = 3.5) +
        ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
        ggtitle("Updated #code2013 vs TIOBE rankings")
)
dev.off()
	# Load the saved tweet collection; the rest of the script reads it as `code2013`.
	load("code2013.rda") # 6028 tweets

	# Drop retweets, then pull the text out of every remaining status object.
	# NOTE(review): strip_retweets() and the $getText() method are not defined in
	# this file -- presumably they come from the twitteR package, which would have
	# to be attached before running this script; confirm.
	filtered_tweets <- strip_retweets(code2013) # 5006 tweets
	# vapply() always returns a character vector, even when no tweets are left
	# (sapply() would return an empty list in that case).
	statuses <- vapply(filtered_tweets, function(tweet) tweet$getText(), character(1))

	# Read in the TIOBE data; language names are lower-cased for matching.
	tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
	tiobe_langs <- tolower(tiobe[, "lang"])

	# Looking at the TIOBE listings and some of the tweet data, massage some of the
	# entries here. This won't be perfect but will help a little bit.

	# Replace every case-insensitive match of a pattern in a set of statuses.
	#
	# statuses: character vector of tweet texts
	# was:      regular expression to look for
	# is:       replacement text
	# Returns a character vector the same length as `statuses`.
	replace_statuses <- function(statuses, was, is) {
	    gsub(was, is, statuses, ignore.case = TRUE)
	}

	# (pattern, replacement) pairs, applied one after another in list order.
	# The delphi rule swallows an optional "/object pascal" suffix and runs before
	# the plain "object pascal" rule. A bare "delphi" pattern applied after
	# "object pascal" would match inside the text that rule had just inserted and
	# produce "delphi/object-pascal/object-pascal".
	replacements <- list(
	    c("objective c", "objective-c"),
	    c("visual basic", "visual-basic"),
	    c("emacs lisp", "emacs-lisp"),
	    c("delphi(/object[ -]pascal)?", "delphi/object-pascal"),
	    c("object pascal", "delphi/object-pascal"),
	    c("common lisp", "common-lisp"),
	    c("elisp", "emacs-lisp")
	)

	# Fold the (pattern, replacement) pairs over the statuses, in list order.
	statuses <- Reduce(
	    function(texts, rule) replace_statuses(texts, rule[1], rule[2]),
	    replacements,
	    statuses
	)

	# Rename selected TIOBE entries to the hyphenated spellings that the status
	# replacements produce. The numbers are positions in the list read from
	# tiobe.csv.
	tiobe_langs[c(7, 11)] <- "visual-basic"
	tiobe_langs[20] <- "delphi/object-pascal"
	tiobe_langs[41] <- "common-lisp"
	tiobe_langs[46] <- "emacs-lisp"

	tiobe[["lang"]] <- tiobe_langs
	# Rows 7 and 11 now both read "visual-basic": add row 11's rating to row 7 and
	# drop row 11 from the data frame (tiobe_langs itself keeps both entries).
	tiobe[7, "rating"] <- tiobe[7, "rating"] + tiobe[11, "rating"]
	tiobe <- tiobe[-11, ]

	# I want to convert this all to lowercase but there are 67 with weird encodings.
	# tolower() raises an error on those, so lower-case each status on its own and
	# mark the failures with NULL (no vectors are grown inside a loop).
	lowered <- lapply(statuses, function(status) {
	    tryCatch(tolower(status), error = function(e) NULL)
	})
	is_bad <- vapply(lowered, is.null, logical(1))

	# Positions (within `statuses`) of the statuses that could not be converted.
	bad_statuses <- unname(which(is_bad))
	# as.character() keeps this a character vector even when nothing survives.
	lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

	# Drop the same positions from filtered_tweets so both stay aligned.
	if (length(bad_statuses) > 0) {
	    filtered_tweets <- filtered_tweets[-bad_statuses]
	}

	statuses <- lowercase_statuses

	# Tokenize each status: split on a comma, a period, or a run of whitespace.
	# The alternation bars must be plain "|": "\|" is not a valid escape in an R
	# string literal and stops the file from parsing.
	status_tokens <- strsplit(statuses, ",|\\.|\\s+")

	# For every status keep only the tokens that are TIOBE language names.
	# lapply() always yields a list, whereas sapply() could collapse the result to
	# a vector or matrix depending on how many matches each status has.
	matching_tokens <- lapply(status_tokens, function(tokens) {
	    tokens[tokens %in% tiobe_langs]
	})

	# Now have the languages mentioned in #code2013 which are in TIOBE
	code2013_langs <- unlist(matching_tokens)

	# Mention counts, most frequent first. The data frame is built explicitly:
	# as.data.frame() on a sorted table yields a two-column (level, Freq) frame on
	# current R versions, while the code below needs a single Count column with
	# the language names as row names.
	lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
	new_code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
	    row.names = names(lang_counts))
	lang_ranks <- seq_len(nrow(new_code2013_lang_table))

	# Create a column describing the rough place of the code2013 langs.
	# cut() derives the tier from the rank, so any number of rows works; every
	# rank above 25 falls in the last tier, which keeps the label "26-40".
	new_code2013_lang_table$code2013_tier <- cut(lang_ranks,
	    breaks = c(0, 5, 10, 15, 25, Inf),
	    labels = c("1-5", "6-10", "11-15", "16-25", "26-40"),
	    ordered_result = TRUE)
	# Order by the TIOBE rankings
	new_code2013_lang_table$code2013_langs <- ordered(rownames(new_code2013_lang_table),
	    levels = rev(tiobe[, "lang"]))
	new_code2013_lang_table$code2013_rank <- lang_ranks
	new_code2013_lang_table$tiobe_rank <- match(new_code2013_lang_table$code2013_langs,
	    tiobe[, "lang"])

	# orig_rank is looked up in code2013_lang_table, which must already exist in
	# the workspace when this runs; fail with a clear message if it does not.
	if (!exists("code2013_lang_table")) {
	    stop("code2013_lang_table not found; build the original ranking table first",
	        call. = FALSE)
	}
	new_code2013_lang_table$orig_rank <- code2013_lang_table[
	    match(rownames(new_code2013_lang_table), rownames(code2013_lang_table)),
	    "code2013_rank"]


	library(ggplot2)

	## Compare new vs old
	# Writes into the "update" directory, which is not created here.
	png(file = "update/new_vs_old.png", width = 640, height = 640)
	# print() is needed so the plot is actually drawn when this file is source()d;
	# ggplot objects are only auto-printed at the interactive top level.
	# x carries the updated rank and y the original rank, so the axis labels and
	# the title say exactly that.
	print(
	    ggplot(new_code2013_lang_table,
	        aes(x = code2013_rank, y = orig_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        xlab("Updated #code2013 Rank") + ylab("Original #code2013 rank") +
	        ggtitle("Updated vs Original #code2013 Rankings")
	)
	dev.off()

	## Compare new to tiobe
	# Writes into the "update" directory, which is not created here.
	png(file = "update/update_vs_tiobe.png", width = 640, height = 640)
	# print() is needed so the plot is actually drawn when this file is source()d;
	# ggplot objects are only auto-printed at the interactive top level.
	print(
	    ggplot(new_code2013_lang_table,
	        aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
	        ggtitle("Updated #code2013 vs TIOBE rankings")
	)
	dev.off()


	## Compare only new to tiobe - yes, there's a lot of ugly reused code and
	## overwriting of variables. I'm lazy, hungry and want lunch.
	# Keep at most the first 1404 tweets. head() does not pad with NULL entries
	# when fewer are available, unlike indexing with 1:1404.
	code2013 <- head(code2013, 1404)
	# NOTE(review): strip_retweets() and $getText() are not defined in this file --
	# presumably from the twitteR package; confirm.
	filtered_tweets <- strip_retweets(code2013)
	# vapply() always returns a character vector, even when no tweets are left.
	statuses <- vapply(filtered_tweets, function(tweet) tweet$getText(), character(1))

	# Lower-case every status individually: tolower() raises an error on badly
	# encoded text, and a NULL marks those failures (no vectors are grown inside a
	# loop).
	lowered <- lapply(statuses, function(status) {
	    tryCatch(tolower(status), error = function(e) NULL)
	})
	is_bad <- vapply(lowered, is.null, logical(1))

	# Positions (within `statuses`) of the statuses that could not be converted.
	bad_statuses <- unname(which(is_bad))
	# as.character() keeps this a character vector even when nothing survives.
	lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

	# Drop the same positions from filtered_tweets so both stay aligned.
	if (length(bad_statuses) > 0) {
	    filtered_tweets <- filtered_tweets[-bad_statuses]
	}

	statuses <- lowercase_statuses

	# Apply the (pattern, replacement) pairs to the lower-cased text, in list order.
	for (pair in replacements) {
	    statuses <- replace_statuses(statuses, pair[1], pair[2])
	}

	# Tokenize each status: split on a comma, a period, or a run of whitespace.
	# The alternation bars must be plain "|": "\|" is not a valid escape in an R
	# string literal and stops the file from parsing.
	status_tokens <- strsplit(statuses, ",|\\.|\\s+")

	# For every status keep only the tokens that are TIOBE language names.
	# lapply() always yields a list, whereas sapply() could collapse the result to
	# a vector or matrix depending on how many matches each status has.
	matching_tokens <- lapply(status_tokens, function(tokens) {
	    tokens[tokens %in% tiobe_langs]
	})

	# All TIOBE language tokens found in the statuses, one entry per mention.
	code2013_langs <- unlist(matching_tokens)

	# Mention counts, most frequent first. The data frame is built explicitly:
	# as.data.frame() on a sorted table yields a two-column (level, Freq) frame on
	# current R versions, while the code below needs a single Count column with
	# the language names as row names.
	lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
	code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
	    row.names = names(lang_counts))
	lang_ranks <- seq_len(nrow(code2013_lang_table))

	# Create a column describing the rough place of the code2013 langs.
	# cut() derives the tier from the rank, so any number of rows works; every
	# rank above 25 falls in the last tier, which keeps the label "26-35".
	code2013_lang_table$code2013_tier <- cut(lang_ranks,
	    breaks = c(0, 5, 10, 15, 25, Inf),
	    labels = c("1-5", "6-10", "11-15", "16-25", "26-35"),
	    ordered_result = TRUE)
	# Order by the TIOBE rankings
	code2013_lang_table$code2013_langs <- ordered(rownames(code2013_lang_table),
	    levels = rev(tiobe[, "lang"]))
	code2013_lang_table$code2013_rank <- lang_ranks
	code2013_lang_table$tiobe_rank <- match(code2013_lang_table$code2013_langs,
	    tiobe[, "lang"])

	# Scatter of #code2013 rank against TIOBE rank, written to a PNG file.
	png(file = "updated_code2013_tiobe_scatter.png", width = 640, height = 640)
	# print() is needed so the plot is actually drawn when this file is source()d;
	# ggplot objects are only auto-printed at the interactive top level.
	print(
	    ggplot(code2013_lang_table,
	        aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
	        ggtitle("Updated #code2013 vs TIOBE rankings")
	)
	dev.off()