# jm3/jm3-redis-stats.R

## jm3-redis-stats.R
# aggregate stats from redis from our tweet research

# util: render big numbers with thousands separators and never in
# scientific notation, so large counts stay readable
pp <- function(x) {
  with_commas <- format(x, scientific = FALSE, big.mark = ",")
  with_commas
}

# attach the rredis client and open the redis connection that every query
# below goes through (redisConnect() is called with the package defaults)
library("rredis")
redisConnect()

# key names of the core sets (queried with redisSCard below)...
sets <- c(
  "tweets:hashtags",
  "tweets:links",
  "tweets:mentions",
  "user:is_public"
)
# ...and of the core sorted sets (queried with redisZCard below)
zsets <- c(
  "words",
  "user:followers",
  "user:num_tweets"
)

# 8 language sets + 121 country sets
# (codes are the suffixes of the "user:lang:" / "user:country:" keys)
langs <- c("DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU")

# country codes, alphabetical, one row per leading letter
countries <- c(
  "AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
  "BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
  "CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY",
  "DE", "DK", "DO", "DZ",
  "EC", "EE", "EG", "ES", "ET",
  "FI", "FJ", "FK", "FR",
  "GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU",
  "HK", "HN", "HR", "HU",
  "ID", "IE", "IL", "IN", "IR", "IT",
  "JM", "JO", "JP",
  "KE", "KH", "KP", "KR", "KW",
  "LB", "LK", "LT", "LU", "LV",
  "MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
  "NG", "NI", "NL", "NO", "NP", "NZ",
  "OM",
  "PA", "PE", "PH", "PK", "PL", "PT", "PY",
  "QA",
  "RO", "RS", "RU", "RW",
  "SA", "SE", "SG", "SI", "SN", "SV",
  "TH", "TR", "TT", "TW", "TZ",
  "UA", "UG", "US", "UY",
  "VA", "VE", "VI", "VN",
  "XK",
  "ZA", "ZW"
)

# walk the list of set key names and pretty-print the cardinality of each
# seq_along() rather than 1:length(): an empty key list then runs zero
# iterations instead of looping over c(1, 0)
# (the index stays named i because the cleanup step at the end removes it)
for (i in seq_along(sets)) {
  print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
}

# ...and the same for each sorted set, using the zset cardinality call
# seq_along() rather than 1:length(): an empty key list then runs zero
# iterations instead of looping over c(1, 0)
for (i in seq_along(zsets)) {
  print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
}

# emit basic cardinality for all languages...
# lang_stats starts as one NA placeholder per language and is filled in by
# the loop, so an interrupted run cannot leave index numbers posing as counts
lang_stats <- rep(NA_integer_, length(langs))
# seq_along() runs zero iterations on an empty list, unlike 1:length();
# i, key and card stay plain globals: the cleanup step at the end removes them
for (i in seq_along(langs)) {
  key <- paste0("user:lang:", langs[i])  # redis key of this language's set
  card <- redisSCard(key)                # cardinality of that set
  lang_stats[i] <- card
  print(paste(key, pp(card)))
}
# pair each language code with its count and label the two columns
lang_stats <- data.frame(langs, lang_stats)
names(lang_stats) <- c("tweet language", "occurrences")

# ...and countries
# country_stats starts as one NA placeholder per country and is filled in by
# the loop, so an interrupted run cannot leave index numbers posing as counts
country_stats <- rep(NA_integer_, length(countries))
# seq_along() runs zero iterations on an empty list, unlike 1:length();
# i, key and card stay plain globals: the cleanup step at the end removes them
for (i in seq_along(countries)) {
  key <- paste0("user:country:", countries[i])  # redis key of this country's set
  card <- redisSCard(key)                       # cardinality of that set
  country_stats[i] <- card
  print(paste(key, pp(card)))
}
# pair each country code with its count and label the two columns
country_stats <- data.frame(countries, country_stats)
names(country_stats) <- c("tweet country", "occurrences")

# drop the loop scratch variables so only the results stay in the workspace
rm(list = c("i", "card", "key"))

# after the run, the stats are held in 2 data.frames: lang_stats + country_stats
#
# sample output from one run (per-country lines not shown):

# "tweets:hashtags : 458,640
# "tweets:links : 270,319
# "tweets:mentions : 1,086,466
# "user:is_public : 1,812,923
#
# "words : 503,999
# "user:followers : 1,711,305
# "user:num_tweets : 1,207,538
#
# "user:lang:DE 9,369
# "user:lang:EN 1,622,940
# "user:lang:ES 62,800
# "user:lang:FA 932
# "user:lang:FR 166,233
# "user:lang:NL 3,361
# "user:lang:PT 5,109
# "user:lang:RU 124,741
	# aggregate stats from redis from our tweet research

	# util: render big numbers with thousands separators and never in
	# scientific notation, so large counts stay readable
	pp <- function(x) {
		with_commas <- format(x, scientific = FALSE, big.mark = ",")
		with_commas
	}

	# attach the rredis client and open the redis connection that every query
	# below goes through (redisConnect() is called with the package defaults)
	library("rredis")
	redisConnect()

	# key names of the core sets (queried with redisSCard below)...
	sets <- c(
		"tweets:hashtags",
		"tweets:links",
		"tweets:mentions",
		"user:is_public"
	)
	# ...and of the core sorted sets (queried with redisZCard below)
	zsets <- c(
		"words",
		"user:followers",
		"user:num_tweets"
	)

	# 8 language sets + 121 country sets
	# (codes are the suffixes of the "user:lang:" / "user:country:" keys)
	langs <- c("DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU")

	# country codes, alphabetical, one row per leading letter
	countries <- c(
		"AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
		"BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
		"CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY",
		"DE", "DK", "DO", "DZ",
		"EC", "EE", "EG", "ES", "ET",
		"FI", "FJ", "FK", "FR",
		"GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU",
		"HK", "HN", "HR", "HU",
		"ID", "IE", "IL", "IN", "IR", "IT",
		"JM", "JO", "JP",
		"KE", "KH", "KP", "KR", "KW",
		"LB", "LK", "LT", "LU", "LV",
		"MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
		"NG", "NI", "NL", "NO", "NP", "NZ",
		"OM",
		"PA", "PE", "PH", "PK", "PL", "PT", "PY",
		"QA",
		"RO", "RS", "RU", "RW",
		"SA", "SE", "SG", "SI", "SN", "SV",
		"TH", "TR", "TT", "TW", "TZ",
		"UA", "UG", "US", "UY",
		"VA", "VE", "VI", "VN",
		"XK",
		"ZA", "ZW"
	)

	# walk the list of set key names and pretty-print the cardinality of each
	# seq_along() rather than 1:length(): an empty key list then runs zero
	# iterations instead of looping over c(1, 0)
	# (the index stays named i because the cleanup step at the end removes it)
	for (i in seq_along(sets)) {
		print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
	}

	# ...and the same for each sorted set, using the zset cardinality call
	# seq_along() rather than 1:length(): an empty key list then runs zero
	# iterations instead of looping over c(1, 0)
	for (i in seq_along(zsets)) {
		print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
	}

	# emit basic cardinality for all languages...
	# lang_stats starts as one NA placeholder per language and is filled in by
	# the loop, so an interrupted run cannot leave index numbers posing as counts
	lang_stats <- rep(NA_integer_, length(langs))
	# seq_along() runs zero iterations on an empty list, unlike 1:length();
	# i, key and card stay plain globals: the cleanup step at the end removes them
	for (i in seq_along(langs)) {
		key <- paste0("user:lang:", langs[i])  # redis key of this language's set
		card <- redisSCard(key)                # cardinality of that set
		lang_stats[i] <- card
		print(paste(key, pp(card)))
	}
	# pair each language code with its count and label the two columns
	lang_stats <- data.frame(langs, lang_stats)
	names(lang_stats) <- c("tweet language", "occurrences")

	# ...and countries
	# country_stats starts as one NA placeholder per country and is filled in by
	# the loop, so an interrupted run cannot leave index numbers posing as counts
	country_stats <- rep(NA_integer_, length(countries))
	# seq_along() runs zero iterations on an empty list, unlike 1:length();
	# i, key and card stay plain globals: the cleanup step at the end removes them
	for (i in seq_along(countries)) {
		key <- paste0("user:country:", countries[i])  # redis key of this country's set
		card <- redisSCard(key)                       # cardinality of that set
		country_stats[i] <- card
		print(paste(key, pp(card)))
	}
	# pair each country code with its count and label the two columns
	country_stats <- data.frame(countries, country_stats)
	names(country_stats) <- c("tweet country", "occurrences")

	# drop the loop scratch variables so only the results stay in the workspace
	rm(list = c("i", "card", "key"))

	# after the run, the stats are held in 2 data.frames: lang_stats + country_stats
	#
	# sample output from one run (per-country lines not shown):

	# "tweets:hashtags : 458,640
	# "tweets:links : 270,319
	# "tweets:mentions : 1,086,466
	# "user:is_public : 1,812,923
	#
	# "words : 503,999
	# "user:followers : 1,711,305
	# "user:num_tweets : 1,207,538
	#
	# "user:lang:DE 9,369
	# "user:lang:EN 1,622,940
	# "user:lang:ES 62,800
	# "user:lang:FA 932
	# "user:lang:FR 166,233
	# "user:lang:NL 3,361
	# "user:lang:PT 5,109
	# "user:lang:RU 124,741