Skip to content

Instantly share code, notes, and snippets.

@drewconway
drewconway / get_scores.R
Created February 9, 2011 04:22
Function returns quarter scores from Wikipedia Super Bowl pages
# Function returns quarter scores from Wikipedia Super Bowl pages
#
# Args:
#   numeral: identifier appended to ".../wiki/Super_Bowl_" to form the page
#            URL (presumably the game's Roman numeral, e.g. "XLV").
#
# Returns:
#   A data frame read from the first table on the page whose style attribute
#   is exactly 'background-color:transparent;', with an extra column SB set
#   to `numeral`.
#
# Stops with an error if no such table is found on the page.
#
# Relies on getURL, htmlTreeParse, getNodeSet and readHTMLTable being
# available in the calling environment (they are not loaded here).
get.scores<-function(numeral) {
# Download the raw HTML of the Wikipedia page for this Super Bowl
wp.html<-getURL(paste("http://en.wikipedia.org/wiki/Super_Bowl_",numeral,sep=""))
wp.data<-htmlTreeParse(wp.html, useInternalNodes=TRUE)
# Candidate tables are located purely by their inline style attribute
score.html<-getNodeSet(wp.data,"//table[@style='background-color:transparent;']")
# Fail with a clear message instead of "subscript out of bounds" when the
# page has no matching table (wrong numeral, changed page layout, ...)
if(length(score.html) == 0) {
stop("No score table found on the Wikipedia page for Super Bowl ", numeral, call.=FALSE)
}
score.table<-readHTMLTable(score.html[[1]])
# Tag every row with the game it came from
score.table<-transform(score.table, SB=numeral)
return(score.table)
}
@drewconway
drewconway / to.RomanNumeral.R
Created February 8, 2011 04:35
# A function that converts a given integer into its Roman Numeral equivalent
# A function that converts a given integer into its Roman Numeral equivalent
to.RomanNumeral<-function(x) {
if(0 < x & x < 5000) {
x<-as.integer(x)
digits<-c(1000,900,500,400,100,90,50,40,10,9,5,4,1)
numerals<-c("M","CM","D","CD","C","XC","L","XL","X","IX","V","IV","I")
digits.numerals<-as.data.frame(cbind(digits,numerals), stringsAsFactors=FALSE)
numeral<-""
for(i in 1:nrow(digits.numerals)) {
while(x >= as.numeric(digits.numerals[i,1])) {
@drewconway
drewconway / twitter_word_cloud.R
Created January 31, 2011 00:16
R function to create a comparative word cloud of two twitter hashtags, as introduced here http://www.drewconway.com/zia/?p=2624
# File-Name: twitter_word_cloud.R
# Date: 2011-01-30
# Author: Drew Conway
# Email: drew.conway@nyu.edu
# Purpose: Create a comparative word cloud of two twitter hashtags
# Data Used:
# Packages Used: twitteR, tm, ggplot2
# Output File: Hashtag word cloud
# Data Output:
# Machine: Drew Conway's MacBook Pro
@drewconway
drewconway / google_counts.R
Created January 22, 2011 22:26
Function takes a string as parameter and returns the approximate number of Google search results containing that string
require(RCurl)
require(XML)
google.counts<-function(s){
search.url<-paste("http://www.google.com/search?q=",gsub(" ","+",s),sep="")
search.html<-getURL(search.url)
parse.search<-htmlTreeParse(search.html,useInternalNodes = TRUE)
search.nodes<-getNodeSet(parse.search,"//div[@id='resultStats']")
search.value<-strsplit(xmlValue(search.nodes[[1]])," ",fixed=TRUE)[[1]][2]
return(as.numeric(gsub(",","",search.value,fixed=TRUE)))
@drewconway
drewconway / twitter_network.py
Created January 17, 2011 18:31
A function, which given a list of Twitter users, creates NetworkX object of relationships.
def twitter_network(users, api, user_type="search", alt_type="friend"):
"""
Given a list of Twitter users, create NetworkX object of relationships.
args: users List of Twitter users as strings
user_types Type string for entries in 'users'
"""
twitter_network=nx.DiGraph()
# Iteratively create network with appropriate type data
users=list(users)
for u in users:
@drewconway
drewconway / get.stack.R
Created December 9, 2010 22:21
Returns the number of tags for a given token on StackOverflow.com
# Get StackOverflow data
get.stack<-function(tok) {
# Must check for XML install, thanks onertipaday!
if (!require(XML)) install.packages('XML')
library(XML)
# Enter a SO tag as character string, and number of tags are returned
tok<-gsub("(/| )","-",tok)
tok<-gsub("#","%23",tok,fixed=TRUE)
base.stack<-"http://stackoverflow.com/questions/tagged/"
stack.tree<-htmlTreeParse(paste(base.stack,tok,sep=""),useInternalNodes=TRUE)
#### The following code produces a crash of ggplot2 ####
# NOTE: this is a crash reproduction, so the statements below are kept exactly
# as written; only comments have been added.
# Load data
# The hashtag doubles as the prefix of the input CSV and the output PNG names.
hashtag<-"rstats"
# Reads "<hashtag>_infochimps.csv" via a relative path.
infochimps<-read.csv(paste(hashtag,"_infochimps.csv",sep=""))
# Produce plot
# Opens an 800x800 PNG device at res=100; no dev.off() is visible in this
# excerpt.
png(paste(hashtag,"_infochimps_metric.png",sep=""),height=800,width=800,res=100)
# Text plot: x = log(followers/friends), y = trstrank; each point is drawn as
# the screen_name, colored by tweet.hash and sized by replies_out/replies_in.
# NOTE(review): either ratio is non-finite for rows where friends_count or
# replies_in is 0 -- verify against the data.
ic.plot<-ggplot(infochimps,aes(x=log(followers_count/friends_count),y=trstrank))+geom_text(aes(label=screen_name,color=tweet.hash,size=replies_out/replies_in))
# Axis labels and title.
# NOTE(review): with sep="" the title has no space between "for" and the
# hashtag ("Key Actor Analysis forrstats with ...").
# NOTE(review): opts() appears to have been removed from later ggplot2
# releases -- confirm the target ggplot2 version.
ic.plot<-ic.plot+xlab(expression(log[frac(Followers,Friends)]))+ylab("Infochimps.org trstrank")+opts(title=paste("Key Actor Analysis for",hashtag," with Infochimps.org Data",sep=""))
### Perform analysis and make pretty pictures ###
# Draws one survival curve per event type into a single PNG.
# surv_data (columns time, censor, type) is not defined in this excerpt.
# Opens a 1000x1000 PNG device at res=100; no dev.off() is visible here.
png("surv_plot.png",height=1000,width=1000,res=100)
# 2x2 panel layout; only three panels are filled in the visible code.
par(mfrow=c(2,2))
# Each panel fits survfit() on the rows of surv_data for one type and plots
# the survival function with confidence bands.
survplot(survfit(Surv(time,censor)~type,data=subset(surv_data,type=="lonely")),what="survival",conf="bands",xlab="Minutes")
title("Survival function for seeing a lonely guy")
# NOTE(review): this call and the next pass main= AND are followed by title()
# with the same text, while the first panel uses title() only -- confirm
# whether main= is redundant or causes the title to be drawn twice.
survplot(survfit(Surv(time,censor)~type,data=subset(surv_data,type=="penis")),what="survival",conf="bands",xlab="Minutes",main="Survival function for seeing a penis")
title("Survival function for seeing a penis")
survplot(survfit(Surv(time,censor)~type,data=subset(surv_data,type=="drunk")),what="survival",conf="bands",xlab="Minutes",main="Survival function for seeing two or more drunk people")
title("Survival function for seeing two or more drunk people")
### Generate our data ###
# Simulated data: times are random draws, so values differ between runs (no
# set.seed() call is visible in this excerpt).
# Time to seeing a penis
# 30 draws from a chi-square distribution with 2 degrees of freedom, rounded
# to whole numbers.
penis<-round(rchisq(30,2))
# Constant 1 for every observation -- presumably the censoring/event
# indicator; confirm against how it is used later.
p_cens<-rep(1,30)
# Type label repeated once per observation.
p_type<-rep("penis",30)
# Column-bind the three vectors; because p_type is character, cbind() yields
# a character matrix, so the numeric columns are stored as strings.
p_bind<-cbind(penis,p_cens,p_type)
# Time to seeing a lonely dude
# 40 draws from a chi-square distribution with 1 degree of freedom, rounded.
lonely<-round(rchisq(40,1))
library(survival)
library(ggplot2)
library(Design)
### Test data generation ###
# Chi-square test
# Number of random values to draw.
count<-1000
# `count` random draws from a chi-square distribution with 1 degree of
# freedom; values differ between runs (no set.seed() is visible here).
cs1<-rchisq(count,1)