Skip to content

Instantly share code, notes, and snippets.

@justgrimes
Created December 10, 2014 16:24
Show Gist options
  • Save justgrimes/07079a03c66f2ad054d3 to your computer and use it in GitHub Desktop.
Save justgrimes/07079a03c66f2ad054d3 to your computer and use it in GitHub Desktop.
Introduction to R - SDC Annual Meeting 2014
#SDC Annual Meeting 2014 Introduction to R, R Studio, and Statistical Computing
# Load downloaded data from PLS 2012 survey, CSV format (comma seperated value)
pupld12a_csv <- read.csv("pupld12a_csv/pupld12a.csv", header = TRUE, sep=",", quote = "\"")
puout12a_csv <- read.csv("pupld12a_csv/puout12a.csv", header = TRUE, sep=",", quote = "\"")
pusum12a_csv <- read.csv("pupld12a_csv/pusum12a.csv", header = TRUE, sep=",", quote = "\"")
# Load downloaded data from PLS 2012 survey, Excel format (xls)
#install.packages("gdata")
library(gdata) #library to load xls data; if loading xlsx files use XLConnect library
pupld12a_xls <- read.xls("pupld12a_xls/pupld12a.xls", sheet = 1, header = TRUE)
puout12a_xls <- read.xls("pupld12a_xls/puout12a.xls", sheet = 1, header = TRUE)
pusum12a_xls <- read.xls("pupld12a_xls/pusum12a.xls", sheet = 1, header = TRUE)
# Load downloaded data from PLS 2012 survey, SAS format (sas7bdat)
#install.packages("sas7bdat")
library(sas7bdat)
pupld12a_sas <- read.sas7bdat("pupld12a_sas/pupld12a.sas7bdat")
puout12a_sas <- read.sas7bdat("pupld12a_sas/puout12a.sas7bdat")
pusum12a_sas <- read.sas7bdat("pupld12a_sas/pusum12a.sas7bdat")
# Load PLS 2012 directly from the web
temp <- tempfile()
download.file("http://www.imls.gov/assets/1/AssetManager/pupld12a_csv.zip", temp, mode="wb")
unzip(temp, "Pupld12a.csv")
unzip(temp, "Puout12a.csv")
unzip(temp, "Pusum12a.csv")
pupld12a <- read.csv("Pupld12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
puout12a <- read.csv("Puout12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
pusum12a <- read.csv("Pusum12a.csv", header = TRUE, sep=",", quote = "\"", na.strings=c(".","-1","-3"))
#Keep columns (i.e. drop imputed columns)
keepme <- c("FSCSKEY","STABR","LIBID","LIBNAME","ADDRESS","CITY","ZIP","ZIP4","ADDRES_M","CITY_M","ZIP_M","ZIP4_M",
"CNTY","PHONE","C_RELATN","C_LEGBAS","C_ADMIN","C_FSCS","GEOCODE","LSABOUND","STARTDAT","ENDDATE",
"POPU_LSA","POPU_UND","CENTLIB","BRANLIB","BKMOB","MASTER","LIBRARIA","OTHPAID","TOTSTAFF","LOCGVT"
,"STGVT","FEDGVT","OTHINCM","TOTINCM","SALARIES","BENEFIT","STAFFEXP","PRMATEXP",
"ELMATEXP","OTHMATEX","TOTEXPCO","OTHOPEXP","TOTOPEXP","LCAP_REV","SCAP_REV","OCAP_REV","CAP_REV",
"CAPITAL","BKVOL","EBOOK","AUDIO_PH","AUDIO_DL","VIDEO_PH","VIDEO_DL","DB_LO_OT","DB_ST","DATABASE",
"SUBSCRIP","HRS_OPEN","VISITS","REFERENC","REGBOR","TOTCIR","KIDCIRCL","LOANTO","LOANFM",
"TOTPRO","KIDPRO","YAPRO","TOTATTEN","KIDATTEN","YAATTEN","GPTERMS","PITUSR","YR_SUB","OBEREG",
"RSTATUS","STATSTRU","STATNAME","STATADDR","LONGITUD","LATITUDE","FIPSST","FIPSCO","FIPSPLAC",
"CNTYPOP","LOCALE","CENTRACT","CENBLOCK","CDCODE","CBSA","MICROF","GAL","GALMS","POSTMS")
mydf <- subset(pupld12a, select = keepme)
#pupld12a <- pupld12a[keepme]
#remove terroritories
mydf <- subset(mydf, STABR!="MP")
mydf <- subset(mydf, STABR!="VI")
mydf <- subset(mydf, STABR!="GU")
mydf <- subset(mydf, STABR!="MH")
mydf <- subset(mydf, STABR!="PW")
mydf <- subset(mydf, STABR!="PR")
mydf <- subset(mydf, STABR!="AS")
#remove non fscs data
mydf <- subset(mydf, C_FSCS!="N")
#remove closed or temporarily closed libraries
mydf <- subset(mydf, STATSTRU!=3)
mydf <- subset(mydf, STATSTRU!=23)
# You can also select data from clipboard
#read.table("clipboard")
#write/export data as csv
write.csv(pusum12a, file="export.csv")
#view data
mydf
head(mydf,5)
tail(mydf,5)
#count number of rows/columns
nrow(mydf) #count row
ncol(mydf) #count cols
dim(mydf) #get dimensions
#get column names
colnames(mydf)
#select an element of a data frame
mydf[1,1]
#select a row of a data frame
mydf[10,]
#select multiple rows of a data frame
mydf[c(1,3),]
mydf[1:7,]
#select a column
mydf[,1]
mydf$STABR
#count
length(mydf$VISIT)
#find missing
which(is.na(mydf$VISIT))
#count missing
length(which(is.na(mydf$VISIT)))
#sum
sum(mydf$VISIT)
sum(mydf$VISIT,na.rm=TRUE)
#basic statistics
mean(mydf$VISIT,na.rm=TRUE)
median(mydf$VISIT,na.rm=TRUE)
var(mydf$VISIT,na.rm=TRUE)
#summarize data
summary(mydf)
#Sort asc
mydf2 <- mydf[order(mydf$VISITS),]
#Sort desc
mydf2 <- mydf[order(-mydf$VISITS),]
#Create a new variable
mydf$VISITSPERCAP <- mydf$VISITS/mydf$POPU_UND
mydf$CIRCPERCAP <- mydf$TOTCIR/mydf$POPU_UND
sum(mydf$VISITS,na.rm=TRUE)/sum(mydf$POPU_UND,na.rm=TRUE)
sum(as.numeric(mydf$TOTCIR,na.rm=TRUE))/sum(mydf$POPU_UND,na.rm=TRUE)
#Round a value
round(sum(mydf$VISITS,na.rm=TRUE)/sum(mydf$POPU_UND,na.rm=TRUE),2)
#Recode a value
mydf$LOCALE4[mydf$LOCALE == 11 | mydf$LOCALE == 12 | mydf$LOCALE == 13] <- 1
mydf$LOCALE4[mydf$LOCALE == 21 | mydf$LOCALE == 22 | mydf$LOCALE == 23] <- 2
mydf$LOCALE4[mydf$LOCALE == 31 | mydf$LOCALE == 32 | mydf$LOCALE == 33] <- 3
mydf$LOCALE4[mydf$LOCALE == 41 | mydf$LOCALE == 42 | mydf$LOCALE == 43] <- 4
mydf$LOCALE4[mydf$LOCALE4==1] <- "City"
mydf$LOCALE4[mydf$LOCALE4==2] <- "Suburb"
mydf$LOCALE4[mydf$LOCALE4==3] <- "Town"
mydf$LOCALE4[mydf$LOCALE4==4] <- "Rural"
mydf$LOCALE4 <- as.factor(mydf$LOCALE4)
#Rename a column
names(mydf)[names(mydf)=="LOCALE"] <- "LOCALE12"
# List types of columns in data frame
str(mydf)
#Summarize data
ptab <- table(mydf$STABR, mydf$LOCALE4)
#Save table as data frame
df <- as.data.frame(ptab)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment