## Author: Jeff Gentry (geoffjentry)

## gist:9178556
# Call load_tweets_db() and keep whatever it returns in from_db.
from_db <- load_tweets_db()

## gist:9178541
# Run a "#rstats" search (requesting n = 500 results) and pass the result
# straight to store_tweets_db().
tweets <- searchTwitter("#rstats", n = 500)

store_tweets_db(tweets)

## gist:9178523
# Register a database backend. The three calls below are written as
# alternatives (see the "or" comments), not as steps to run together.
# NOTE(review): dbi_connection is not created anywhere in this snippet; it
# must already exist before this line runs.
register_db_backend(dbi_connection)

# or create a sqlite connection

# The argument is a path to the sqlite file (placeholder value here).
register_sqlite_backend("/path/to/sqlite/file")

# or create a mysql connection

# NOTE(review): the four string arguments appear to be placeholders for the
# database name, host, user and password - replace before use.
register_mysql_backend("my_database", "hostname", "username", "password")

## gist:8620150
# Load the saved tweets; the .rda file is expected to provide `code2013`.
load("code2013.rda") # 6028 tweets

filtered_tweets <- strip_retweets(code2013) # 5006 tweets

# Extract the text of every remaining tweet. vapply() guarantees a character
# vector even when filtered_tweets is empty (sapply() would return list()).
statuses <- vapply(filtered_tweets, function(x) x$getText(), character(1))

# Read in the TIOBE data and keep a lower-cased copy of the language names
tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
tiobe_langs <- tolower(tiobe[, "lang"])

# Looking at the TIOBE listings and some of the tweet data, massage some of the entries

## gist:8230289
# ggplot2 is needed for the plot below; attaching it again is harmless.
library(ggplot2)

# Rank of each language within the #code2013 table is its row position.
# NOTE(review): assumes the rows are already ordered by descending count.
# seq_len() (rather than 1:nrow()) keeps this valid for an empty table.
code2013_lang_table$code2013_rank <- seq_len(nrow(code2013_lang_table))

# TIOBE rank is the row position of the language in the TIOBE listing.
# NOTE(review): assumes tiobe.csv rows are in rank order - confirm.
# The names are lower-cased before matching because the #code2013 language
# tokens were selected by comparison against lower-cased TIOBE names.
code2013_lang_table$tiobe_rank <- match(
    code2013_lang_table$code2013_langs,
    tolower(tiobe[, "lang"])
)

# Make a scatterplot of the ranking differences.
# The plot is print()ed explicitly so it is drawn on the png device even when
# this script is run through source(), where top-level values do not autoprint.
png(file = "code2013_tiobe_scatter.png", width = 640, height = 640)
print(
    ggplot(code2013_lang_table,
           aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
        geom_text(aes(label = code2013_langs), size = 3) +
        ylab("TIOBE Rank") + xlab("#code2013 rank") +
        ggtitle("#code2013 vs TIOBE rankings")
)
dev.off()

## gist:8226975
library(ggplot2)

# Horizontal bar chart of mention counts per language, filled by tier,
# written to code2013_tiobe.png.
# The plot is print()ed explicitly so it is drawn on the png device even when
# this script is run through source(), where top-level values do not autoprint.
png(file = "code2013_tiobe.png", width = 640, height = 640)
print(
    ggplot(code2013_lang_table,
           aes(x = code2013_langs, y = Count, fill = code2013_tier)) +
        geom_bar(stat = "identity") +
        xlab("Language") + ylab("Count") +
        ggtitle("#code2013 Languages Sorted By TIOBE Rankings") +
        coord_flip()
)
dev.off()

## gist:8226512
# Tokenize each status: split on a comma, a period, or a run of whitespace.
status_tokens <- strsplit(statuses, ",|\\.|\\s+")

# For every status keep only the tokens that exactly equal a TIOBE language
# name. lapply() always yields a list (sapply() may simplify to a vector or
# matrix depending on the data); %in% never returns NA, so no which() needed.
# NOTE(review): the comparison is case-sensitive and tiobe_langs is
# lower-cased, so this presumably expects lower-cased statuses - confirm.
matching_tokens <- lapply(status_tokens, function(tokens) {
    tokens[tokens %in% tiobe_langs]
})

# Now have the languages mentioned in #code2013 which are in TIOBE:
# flatten to one vector and tabulate, most frequently mentioned first.
code2013_langs <- unlist(matching_tokens)
code2013_lang_table <- as.data.frame(
    sort(table(code2013_langs), decreasing = TRUE)
)

## gist:8226425
# I want to convert this all to lowercase but there are 67 with weird encodings.
# tolower() is therefore applied one status at a time inside try(), so a
# status that makes it fail is recorded instead of aborting the whole run.
#   bad_statuses       - indices into statuses where tolower() failed
#   lowercase_statuses - lower-cased text of the remaining statuses, in order
#                        (so it can be shorter than statuses)
bad_statuses <- numeric()
lowercase_statuses <- character()
for (i in seq_along(statuses)) {
    tl <- try(tolower(statuses[[i]]), silent = TRUE)
    if (inherits(tl, "try-error")) {
        bad_statuses <- c(bad_statuses, i)
    } else {
        lowercase_statuses <- c(lowercase_statuses, tl)
    }
}

## gist:8226310
# Load the TIOBE listing from tiobe.csv (strings kept as character, not
# factor) and derive a lower-cased vector of its "lang" column.
tiobe <- read.csv(file = "tiobe.csv", stringsAsFactors = FALSE)
tiobe_langs <- tolower(tiobe[, "lang"])

# Looking at the TIOBE listings and some of the tweet data, massage some of the
# entries here. This won't be perfect but will help a little bit.
#
# replace_statuses: case-insensitive find-and-replace over a set of statuses.
#   statuses - character vector of texts to rewrite
#   was      - regular expression to look for (matched ignoring case)
#   is       - replacement text for every match
# Returns the statuses with all matches of `was` replaced by `is`.
replace_statuses <- function(statuses, was, is) {
    gsub(pattern = was, replacement = is, x = statuses, ignore.case = TRUE)
}

## gist:8226180
# Load the saved tweets; the .rda file is expected to provide `code2013`.
load("code2013.rda")

# Find/remove the tweets flagged as retweets.
# vapply() always returns a logical vector, so which() also works when
# code2013 is empty (sapply() would return list() and which() would fail).
is_retweets <- which(
    vapply(code2013, function(x) x$getIsRetweet(), logical(1))
)

# Guard the negative index: code2013[-integer(0)] would select nothing
# rather than everything.
if (length(is_retweets) > 0) {
    filtered_tweets <- code2013[-is_retweets]
} else {
    filtered_tweets <- code2013
}
	# Run a "#rstats" search (requesting n = 500 results) and pass the result
	# straight to store_tweets_db().
	tweets <- searchTwitter("#rstats", n = 500)

	store_tweets_db(tweets)
	# Register a database backend. The three calls below are written as
	# alternatives (see the "or" comments), not as steps to run together.
	# NOTE(review): dbi_connection is not created anywhere in this snippet; it
	# must already exist before this line runs.
	register_db_backend(dbi_connection)

	# or create a sqlite connection

	# The argument is a path to the sqlite file (placeholder value here).
	register_sqlite_backend("/path/to/sqlite/file")

	# or create a mysql connection

	# NOTE(review): the four string arguments appear to be placeholders for the
	# database name, host, user and password - replace before use.
	register_mysql_backend("my_database", "hostname", "username", "password")
	# Load the saved tweets; the .rda file is expected to provide `code2013`.
	load("code2013.rda") # 6028 tweets

	filtered_tweets <- strip_retweets(code2013) # 5006 tweets

	# Extract the text of every remaining tweet. vapply() guarantees a character
	# vector even when filtered_tweets is empty (sapply() would return list()).
	statuses <- vapply(filtered_tweets, function(x) x$getText(), character(1))

	# Read in the TIOBE data and keep a lower-cased copy of the language names
	tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
	tiobe_langs <- tolower(tiobe[, "lang"])

	# Looking at the TIOBE listings and some of the tweet data, massage some of the entries
	# ggplot2 is needed for the plot below; attaching it again is harmless.
	library(ggplot2)

	# Rank of each language within the #code2013 table is its row position.
	# NOTE(review): assumes the rows are already ordered by descending count.
	# seq_len() (rather than 1:nrow()) keeps this valid for an empty table.
	code2013_lang_table$code2013_rank <- seq_len(nrow(code2013_lang_table))

	# TIOBE rank is the row position of the language in the TIOBE listing.
	# NOTE(review): assumes tiobe.csv rows are in rank order - confirm.
	# The names are lower-cased before matching because the #code2013 language
	# tokens were selected by comparison against lower-cased TIOBE names.
	code2013_lang_table$tiobe_rank <- match(
	    code2013_lang_table$code2013_langs,
	    tolower(tiobe[, "lang"])
	)

	# Make a scatterplot of the ranking differences.
	# The plot is print()ed explicitly so it is drawn on the png device even
	# when this script is run through source(), where top-level values do not
	# autoprint.
	png(file = "code2013_tiobe_scatter.png", width = 640, height = 640)
	print(
	    ggplot(code2013_lang_table,
	           aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3) +
	        ylab("TIOBE Rank") + xlab("#code2013 rank") +
	        ggtitle("#code2013 vs TIOBE rankings")
	)
	dev.off()
	library(ggplot2)

	# Horizontal bar chart of mention counts per language, filled by tier,
	# written to code2013_tiobe.png.
	# The plot is print()ed explicitly so it is drawn on the png device even
	# when this script is run through source(), where top-level values do not
	# autoprint.
	png(file = "code2013_tiobe.png", width = 640, height = 640)
	print(
	    ggplot(code2013_lang_table,
	           aes(x = code2013_langs, y = Count, fill = code2013_tier)) +
	        geom_bar(stat = "identity") +
	        xlab("Language") + ylab("Count") +
	        ggtitle("#code2013 Languages Sorted By TIOBE Rankings") +
	        coord_flip()
	)
	dev.off()
	# Tokenize each status: split on a comma, a period, or a run of whitespace.
	# The alternation bars must be plain "|": "\|" is not a valid escape in an
	# R string literal and stops the script from parsing.
	status_tokens <- strsplit(statuses, ",|\\.|\\s+")

	# For every status keep only the tokens that exactly equal a TIOBE language
	# name. lapply() always yields a list (sapply() may simplify to a vector or
	# matrix depending on the data); %in% never returns NA, so no which() needed.
	# NOTE(review): the comparison is case-sensitive and tiobe_langs is
	# lower-cased, so this presumably expects lower-cased statuses - confirm.
	matching_tokens <- lapply(status_tokens, function(tokens) {
	    tokens[tokens %in% tiobe_langs]
	})

	# Now have the languages mentioned in #code2013 which are in TIOBE:
	# flatten to one vector and tabulate, most frequently mentioned first.
	code2013_langs <- unlist(matching_tokens)
	code2013_lang_table <- as.data.frame(
	    sort(table(code2013_langs), decreasing = TRUE)
	)
	# I want to convert this all to lowercase but there are 67 with weird encodings.
	# tolower() is therefore applied one status at a time inside try(), so a
	# status that makes it fail is recorded instead of aborting the whole run.
	#   bad_statuses       - indices into statuses where tolower() failed
	#   lowercase_statuses - lower-cased text of the remaining statuses, in
	#                        order (so it can be shorter than statuses)
	bad_statuses <- numeric()
	lowercase_statuses <- character()
	for (i in seq_along(statuses)) {
	    tl <- try(tolower(statuses[[i]]), silent = TRUE)
	    if (inherits(tl, "try-error")) {
	        bad_statuses <- c(bad_statuses, i)
	    } else {
	        lowercase_statuses <- c(lowercase_statuses, tl)
	    }
	}
	# Load the saved tweets; the .rda file is expected to provide `code2013`.
	load("code2013.rda")

	# Find/remove the tweets flagged as retweets.
	# vapply() always returns a logical vector, so which() also works when
	# code2013 is empty (sapply() would return list() and which() would fail).
	is_retweets <- which(
	    vapply(code2013, function(x) x$getIsRetweet(), logical(1))
	)

	# Guard the negative index: code2013[-integer(0)] would select nothing
	# rather than everything.
	if (length(is_retweets) > 0) {
	    filtered_tweets <- code2013[-is_retweets]
	} else {
	    filtered_tweets <- code2013
	}