Skip to content

Instantly share code, notes, and snippets.

@jm3
Created May 12, 2012 22:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jm3/2669385 to your computer and use it in GitHub Desktop.
Save jm3/2669385 to your computer and use it in GitHub Desktop.
jm3-redis-stats
# aggregate stats from redis from our tweet research
# util: render numbers with thousands separators and without scientific
# notation, e.g. 1086466 -> "1,086,466"
#   x: the value(s) to format; handed straight to format()
#   returns: whatever format() produces for x (character)
pp <- function(x) {
  format(x, scientific = FALSE, big.mark = ",")
}
# open up access to redis data
# rredis supplies the redisConnect / redisSCard / redisZCard calls used below
library(rredis)
# called with no arguments, so rredis's default connection settings apply
# (presumably a redis server on localhost at the standard port -- confirm)
redisConnect()
# core redis keys summarised by this script
# plain sets: their size is read with redisSCard
sets <- c(
  "tweets:hashtags",
  "tweets:links",
  "tweets:mentions",
  "user:is_public"
)
# sorted sets (zsets): their size is read with redisZCard
zsets <- c(
  "words",
  "user:followers",
  "user:num_tweets"
)
# 8 language codes + 121 country codes; each code is the suffix of one redis
# set key ("user:lang:<code>" / "user:country:<code>")
langs <- c(
  "DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU"
)
# laid out one leading letter per row; alphabetical order is preserved
countries <- c(
  "AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
  "BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
  "CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY",
  "DE", "DK", "DO", "DZ",
  "EC", "EE", "EG", "ES", "ET",
  "FI", "FJ", "FK", "FR",
  "GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU",
  "HK", "HN", "HR", "HU",
  "ID", "IE", "IL", "IN", "IR", "IT",
  "JM", "JO", "JP",
  "KE", "KH", "KP", "KR", "KW",
  "LB", "LK", "LT", "LU", "LV",
  "MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
  "NG", "NI", "NL", "NO", "NP", "NZ",
  "OM",
  "PA", "PE", "PH", "PK", "PL", "PT", "PY",
  "QA",
  "RO", "RS", "RU", "RW",
  "SA", "SE", "SG", "SI", "SN", "SV",
  "TH", "TR", "TT", "TW", "TZ",
  "UA", "UG", "US", "UY",
  "VA", "VE", "VI", "VN",
  "XK",
  "ZA", "ZW"
)
# walk the list of key names and pretty-print the cardinality of each set
# (seq_along instead of 1:length, so an empty key list performs no lookups
# rather than iterating over c(1, 0))
for (i in seq_along(sets)) {
  print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
}
# ...and of each sorted set (zset)
for (i in seq_along(zsets)) {
  print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
}
# emit basic cardinality for all languages...
# NA placeholders: a slot that never gets filled stays visibly missing rather
# than holding an index number that could be mistaken for a count
lang_stats <- rep(NA, length(langs))
# i, key and card are intentionally plain top-level variables; the cleanup
# step at the end of the script removes them
for (i in seq_along(langs)) {
  key <- paste0("user:lang:", langs[i])
  card <- redisSCard(key)
  lang_stats[i] <- card
  print(paste(key, pp(card)))
}
# pair every language code with its count: one row per language
lang_stats <- data.frame(langs, lang_stats)
names(lang_stats) <- c("tweet language", "occurrences")
# ...and countries
# NA placeholders: a slot that never gets filled stays visibly missing rather
# than holding an index number that could be mistaken for a count
country_stats <- rep(NA, length(countries))
# i, key and card are intentionally plain top-level variables; the cleanup
# step at the end of the script removes them
for (i in seq_along(countries)) {
  key <- paste0("user:country:", countries[i])
  card <- redisSCard(key)
  country_stats[i] <- card
  print(paste(key, pp(card)))
}
# pair every country code with its count: one row per country
country_stats <- data.frame(countries, country_stats)
names(country_stats) <- c("tweet country", "occurrences")
# clean up the workspace: drop the scratch variables left behind by the loops
rm(list = c("i", "card", "key"))
# after the run, the stats are left in 2 data.frames: lang_stats + country_stats
# sample output from a run (print's "[1]" prefix and closing quotes omitted):
# "tweets:hashtags : 458,640
# "tweets:links : 270,319
# "tweets:mentions : 1,086,466
# "user:is_public : 1,812,923
#
# "words : 503,999
# "user:followers : 1,711,305
# "user:num_tweets : 1,207,538
#
# "user:lang:DE 9,369
# "user:lang:EN 1,622,940
# "user:lang:ES 62,800
# "user:lang:FA 932
# "user:lang:FR 166,233
# "user:lang:NL 3,361
# "user:lang:PT 5,109
# "user:lang:RU 124,741
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment