Skip to content

Instantly share code, notes, and snippets.

Created January 25, 2014 17:33
Show Gist options
  • Save geoffjentry/8620150 to your computer and use it in GitHub Desktop.
Save geoffjentry/8620150 to your computer and use it in GitHub Desktop.
# Load the saved tweet collection; the original run contained 6028 tweets.
# NOTE(review): presumably this .rda file defines `code2013` -- confirm.
load("code2013.rda")

# Drop retweets; 5006 tweets were left in the original run.
filtered_tweets = strip_retweets(code2013)

# Extract the text of every remaining tweet object.
statuses = sapply(filtered_tweets, function(tweet) tweet$getText())

# Read in the TIOBE data and keep a lowercase copy of the language names.
tiobe = read.csv("tiobe.csv", stringsAsFactors = FALSE)
tiobe_langs = tolower(tiobe[, "lang"])
# Looking at the TIOBE listings and some of the tweet data, massage some of
# the entries here. This won't be perfect but will help a little bit.

# Replace every match of the pattern `was` with `is` in each status.
#
# statuses: character vector of tweet texts.
# was:      regular expression to look for.
# is:       replacement text.
# Returns a character vector of the same length as `statuses`.
#
# Matching ignores case because the patterns are written in lowercase while
# the first pass of this script runs the replacements before the statuses
# have been lowercased.
replace_statuses = function(statuses, was, is) {
  gsub(was, is, statuses, ignore.case = TRUE)
}
# Each entry is c(pattern, canonical name). Multi-word language names are
# rewritten to a single token so they survive the later whitespace split.
replacements = list(
  c("objective c", "objective-c"),
  c("visual basic", "visual-basic"),
  c("emacs lisp", "emacs-lisp"),
  # "delphi" has to run before "object pascal": the replacement text for
  # "object pascal" itself contains "delphi", so the reverse order would
  # rewrite it a second time into "delphi/object-pascal/object-pascal".
  c("delphi", "delphi/object-pascal"),
  c("object pascal", "delphi/object-pascal"),
  c("common lisp", "common-lisp"),
  c("elisp", "emacs-lisp")
)

# Apply the pairs one after another, in list order.
for (pair in replacements) {
  statuses = replace_statuses(statuses, pair[1], pair[2])
}
# Overwrite selected TIOBE names with the canonical tokens used in the tweet
# replacements. The positions are hard-coded row numbers of tiobe.csv.
canonical_positions = c(7, 11, 20, 46, 41)
canonical_names = c("visual-basic", "visual-basic", "delphi/object-pascal",
                    "emacs-lisp", "common-lisp")
tiobe_langs[canonical_positions] = canonical_names
tiobe$lang = tiobe_langs

# we've got two visual-basic entries: fold the rating of row 11 into row 7,
# then drop row 11.
combined_rating = tiobe[7, "rating"] + tiobe[11, "rating"]
tiobe[7, "rating"] = combined_rating
tiobe = tiobe[-11, ]
# I want to convert this all to lowercase but there are 67 with weird
# encodings. tolower() is wrapped in try() so that a status which cannot be
# converted is recorded instead of aborting the run.
lowered = lapply(statuses, function(status) try(tolower(status), silent = TRUE))
failed = vapply(lowered, function(res) inherits(res, "try-error"), logical(1))

# Positions of the statuses that could not be lowercased.
bad_statuses = unname(which(failed))
# Lowercased text of everything else, in the original order.
lowercase_statuses = as.character(unlist(lowered[!failed], use.names = FALSE))

# Keep the tweet list aligned with the surviving statuses. The guard matters:
# indexing with an empty negative index would drop every tweet.
if (length(bad_statuses) > 0) {
  filtered_tweets = filtered_tweets[-bad_statuses]
}
statuses = lowercase_statuses
# Tokenize each status: split on a comma, a period, or a run of whitespace.
status_tokens = strsplit(statuses, ",|\\.|\\s+")

# For every status keep only the tokens that are TIOBE language names.
# lapply() always returns a list, however many tokens match per status.
matching_tokens = lapply(status_tokens, function(tokens) {
  tokens[tokens %in% tiobe_langs]
})
# Now have the languages mentioned in #code2013 which are in TIOBE.
code2013_langs = unlist(matching_tokens)

# Count the mentions per language, most-mentioned first, and store them as a
# one-column data frame ("Count") whose row names are the language names.
lang_counts = sort(table(code2013_langs), decreasing = TRUE)
new_code2013_lang_table = data.frame(Count = as.vector(lang_counts),
                                     row.names = names(lang_counts))

# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the row position, so the table does not have to
# contain exactly 40 rows; positions past 40 get NA.
new_code2013_lang_table$code2013_tier = cut(
  seq_len(nrow(new_code2013_lang_table)),
  breaks = c(0, 5, 10, 15, 25, 40),
  labels = c("1-5", "6-10", "11-15", "16-25", "26-40"),
  ordered_result = TRUE
)

# Order by the TIOBE rankings (levels are the TIOBE names in reverse order).
new_code2013_lang_table$code2013_langs = ordered(
  rownames(new_code2013_lang_table),
  levels = rev(tiobe[, "lang"])
)
# Rank within #code2013 is the row position, since rows are sorted by count.
new_code2013_lang_table$code2013_rank = seq_len(nrow(new_code2013_lang_table))
# Position of each language in the TIOBE listing.
new_code2013_lang_table$tiobe_rank = match(new_code2013_lang_table$code2013_langs,
                                           tiobe[, "lang"])
# Rank of each language in the earlier table; NA when it was not present.
# NOTE(review): `code2013_lang_table` is not created before this point in the
# visible script -- presumably it is left over from the original analysis in
# the same session. Confirm before running this from a clean workspace.
new_code2013_lang_table$orig_rank = code2013_lang_table[
  match(rownames(new_code2013_lang_table), rownames(code2013_lang_table)),
  "code2013_rank"
]
## Compare new vs old
# x is the rank in the updated table and y is the rank looked up from the
# original table, so the axis labels follow that mapping.
png(file = "update/new_vs_old.png", width = 640, height = 640)
# print() is needed for the plot to be drawn when the script is source()d.
print(
  ggplot(new_code2013_lang_table,
         aes(x = code2013_rank, y = orig_rank, color = code2013_tier)) +
    geom_text(aes(label = code2013_langs), size = 3.5) +
    xlab("Updated #code2013 Rank") + ylab("Original #code2013 rank") +
    ggtitle("Updated vs Original #code2013 Rankings")
)
# Close the device so the PNG is actually written and the handle released.
invisible(dev.off())
## Compare new to tiobe
# x is the rank in the updated #code2013 table, y is the TIOBE position.
png(file = "update/update_vs_tiobe.png", width = 640, height = 640)
# print() is needed for the plot to be drawn when the script is source()d.
print(
  ggplot(new_code2013_lang_table,
         aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
    geom_text(aes(label = code2013_langs), size = 3.5) +
    ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
    ggtitle("Updated #code2013 vs TIOBE rankings")
)
# Close the device so the PNG is actually written and the handle released.
invisible(dev.off())
## Compare only new to tiobe. This pass repeats the steps above and
## overwrites the same variables (the author acknowledged the duplication).
# Keep only the first 1404 entries of the collection.
# NOTE(review): presumably these are the newer tweets -- confirm.
code2013 = code2013[seq_len(1404)]

# Drop retweets again and pull the text out of each remaining tweet.
filtered_tweets = strip_retweets(code2013)
statuses = sapply(filtered_tweets, function(tweet) tweet$getText())
# Lowercase every status. tolower() is wrapped in try() so that a status
# which cannot be converted is recorded instead of aborting the run.
lowered = lapply(statuses, function(status) try(tolower(status), silent = TRUE))
failed = vapply(lowered, function(res) inherits(res, "try-error"), logical(1))

# Positions of the statuses that could not be lowercased.
bad_statuses = unname(which(failed))
# Lowercased text of everything else, in the original order.
lowercase_statuses = as.character(unlist(lowered[!failed], use.names = FALSE))

# Keep the tweet list aligned with the surviving statuses. The guard matters:
# indexing with an empty negative index would drop every tweet.
if (length(bad_statuses) > 0) {
  filtered_tweets = filtered_tweets[-bad_statuses]
}
statuses = lowercase_statuses
# Rewrite multi-word language names to their single-token form, applying the
# (pattern, canonical name) pairs one after another in list order.
for (pair in replacements) {
  statuses = replace_statuses(statuses, pair[1], pair[2])
}
# Tokenize each status: split on a comma, a period, or a run of whitespace.
status_tokens = strsplit(statuses, ",|\\.|\\s+")

# For every status keep only the tokens that are TIOBE language names.
# lapply() always returns a list, however many tokens match per status.
matching_tokens = lapply(status_tokens, function(tokens) {
  tokens[tokens %in% tiobe_langs]
})
# Languages mentioned in this subset of tweets which are in TIOBE.
code2013_langs = unlist(matching_tokens)

# Count the mentions per language, most-mentioned first, and store them as a
# one-column data frame ("Count") whose row names are the language names.
lang_counts = sort(table(code2013_langs), decreasing = TRUE)
code2013_lang_table = data.frame(Count = as.vector(lang_counts),
                                 row.names = names(lang_counts))

# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the row position, so the table does not have to
# contain exactly 35 rows; positions past 35 get NA.
code2013_lang_table$code2013_tier = cut(
  seq_len(nrow(code2013_lang_table)),
  breaks = c(0, 5, 10, 15, 25, 35),
  labels = c("1-5", "6-10", "11-15", "16-25", "26-35"),
  ordered_result = TRUE
)

# Order by the TIOBE rankings (levels are the TIOBE names in reverse order).
code2013_lang_table$code2013_langs = ordered(
  rownames(code2013_lang_table),
  levels = rev(tiobe[, "lang"])
)
# Rank within #code2013 is the row position, since rows are sorted by count.
code2013_lang_table$code2013_rank = seq_len(nrow(code2013_lang_table))
# Position of each language in the TIOBE listing.
code2013_lang_table$tiobe_rank = match(code2013_lang_table$code2013_langs,
                                       tiobe[, "lang"])
# Scatter of #code2013 rank (x) against TIOBE position (y), one text label
# per language, coloured by #code2013 tier.
png(file = "updated_code2013_tiobe_scatter.png", width = 640, height = 640)
# print() is needed for the plot to be drawn when the script is source()d.
print(
  ggplot(code2013_lang_table,
         aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
    geom_text(aes(label = code2013_langs), size = 3.5) +
    ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
    ggtitle("Updated #code2013 vs TIOBE rankings")
)
# Close the device so the PNG is actually written and the handle released.
invisible(dev.off())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment