Skip to content

Instantly share code, notes, and snippets.

@daviosa
Forked from rpietro/data_management.r
Created August 14, 2013 12:05
Show Gist options
  • Save daviosa/6230465 to your computer and use it in GitHub Desktop.
Save daviosa/6230465 to your computer and use it in GitHub Desktop.
# reading data into R: http://goo.gl/bAUmj
library("ISwR")
#factors
pain <- c(0,3,2,2,1)
fpain <- factor(pain,levels=0:3)
levels(fpain) <- c("none","mild","medium","severe")
fpain
# lists
intake.pre <- c(5260,5470,5640,6180,6390,6515,6805,7515,7515,8230,8770)
intake.post <- c(3910,4220,3885,5160,5645,4680,5265,5975,6790,6900,7335)
mylist <- list(before=intake.pre,after=intake.post)
mylist
mylist$before
# data frames
d <- data.frame(intake.pre,intake.post)
d
d$intake.pre
# indexing
intake.pre[5]
intake.pre[c(3,5,7)]
v <- c(3,5,7)
intake.pre[v]
intake.pre[1:5]
intake.pre[-c(3,5,7)]
intake.post[intake.pre > 7000]
intake.post[intake.pre > 7000 & intake.pre <= 8000]
intake.pre > 7000 & intake.pre <= 8000
# indexing and dataframes
d <- data.frame(intake.pre,intake.post)
d[5,1]
d[5,]
d[d$intake.pre>7000,]
sel <- d$intake.pre>7000
sel
d[sel,]
d[1:2,]
head(d)
#grouped data and dataframes
energy
exp.lean <- energy$expend[energy$stature=="lean"]
exp.obese <- energy$expend[energy$stature=="obese"]
l <- split(energy$expend, energy$stature)
l
# recoding
age <- subset(juul, age >= 10 & age <= 16)$age
range(age)
agegr <- cut(age, seq(10,16,2), right=F, include.lowest=T)
agegr
length(age)
table(agegr)
q <- quantile(age, c(0, .25, .50, .75, 1))
q
ageQ <- cut(age, q, include.lowest=T)
table(ageQ)
levels(ageQ) <- c("1st", "2nd", "3rd", "4th")
levels(agegr) <- c("10-11", "12-13", "14-15")
table(ageQ)
table(agegr)
# manipulating factor levels
pain <- c(0,3,2,2,1)
fpain <- factor(pain,levels=0:3,
labels=c("none","mild","medium","severe"))
text.pain <- c("none","severe", "medium", "medium", "mild")
factor(text.pain)
ftpain <- factor(text.pain)
ftpain
ftpain2 <- factor(ftpain, levels=c("none", "mild", "medium", "severe"))
ftpain2
# dates
stroke <- read.csv2(system.file("rawdata","stroke.csv", package="ISwR"), na.strings=".")
names(stroke)
names(stroke) <- tolower(names(stroke))
names(stroke)
head(stroke)
stroke <- transform(stroke,
died = as.Date(died, format="%d.%m.%Y"),
dstr = as.Date(dstr, format="%d.%m.%Y"))
summary(stroke$died)
summary(stroke$dstr)
summary(stroke$died - stroke$dstr)
head(stroke$died - stroke$dstr)
stroke <- transform(stroke,
end = pmin(died, as.Date("1996-1-1"), na.rm = T),
dead = !is.na(died) & died < as.Date("1996-1-1"))
head(stroke)
stroke <- transform(stroke,
obstime = as.numeric(end - dstr, units="days")/365.25)
head(stroke)
# data merge and data frame restructuring
juulgrl <- subset(juul, sex==2, select=-c(testvol,sex))
head(juulgrl)
juulboy <- subset(juul, sex==1, select=-c(menarche,sex))
head(jullboy)
juulgrl$sex <- factor("F")
juulgrl$testvol <- NA
juulboy$sex <- factor("M")
juulboy$menarche <- NA
juulall <- rbind(juulboy, juulgrl)
names(juulall)
levels(juulall$sex)
#sqldf
# Load the package
# install.packages("sqldf")
# install.packages("PASWR")
library(sqldf)
# Use the titanic data set
data(titanic3, package="PASWR")
colnames(titanic3)
head(titanic3)
sqldf('select age, count(*) from titanic3 where age is not null group by age')
library(ggplot2)
DF=sqldf('select age from titanic3 where age != "NA"')
qplot(DF$age,data=DF, geom="histogram")
DF=sqldf('select count(*) total from titanic3 where age=29 group by survived')
DF2=t(DF)
colnames(DF2)=c('Died','Survived')
head(DF2)
# character data
x = 7
y = 10
cat('x should be greater than y, but x=',x,'and y=',y,'\n')
cat('Long strings can','be displayed over',
'several lines using','the fill= argument',
fill=40)
paste('one',2,'three',4,'five')
paste(c('one','two','three','four'))
paste(c('one','two','three','four'),collapse=' ')
paste('X',1:5,sep='')
paste(c('X','Y'),1:5,sep='')
paste(c('X','Y'),1:5,sep='_',collapse='|')
paste(c('X','Y'),1:5,'^',c('a','b'),sep='_',collapse='|')
paste(c('X','Y'),1:5,'^',c('a','b'),sep='_')
head(state.name)
substring(state.name,2,6)
mystring = 'dog cat duck'
substring(mystring,c(1,5,9),c(3,7,12))
state = 'Mississippi'
ll = nchar(state)
ll
ltrs = substring(state,1:ll,1:ll)
ltrs
which(ltrs == 's')
# loops and repeats
for (i in 1:5) print(i^2)
j<-k<-0
for (i in 1:5) {
j<- j+1
k<-k+i*j
print(i+j+k)
}
fac2<-function(x) { f <- 1
t <- x
while(t>1) {
f <- f*t
t <- t-1
}
return(f)
}
sapply(0:5,fac2)
fac3 <- function(x) {
f <- 1
t <- x
repeat {
if (t<2) break
f <- f*t
t <- t-1 }
return(f) }
sapply(0:5,fac3)
cumprod(1:5)
fac4<-function(x) max(cumprod(1:x))
sapply(0:5,fac4)
fibonacci<-function(n) {
a<-1
b<-0
while(n>0)
{swap<-a
a<-a+b
b<-swap
n<-n-1 }
b}
sapply(1:10,fibonacci)
# avoiding loops
for (i in 1:length(y)) { if(y[i] < 0) y[i] <- 0 }
y [y < 0] <- 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment