Created
January 29, 2014 17:19
-
-
Save cddesja/8692613 to your computer and use it in GitHub Desktop.
IRT 30 January 2014 R code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------- # | |
# IRT workshop 30/1/2014 # | |
# ---------------------- # | |
# The pound sign is used to indicate a comment. | |
# You should use comments a lot! Especially if you don't use | |
# R very often as it will help you remember what you did last time | |
# and why. | |
# You should also use scripts too! This will save you a ton of time | |
# next time and equally as important they will ensure reproducibility. | |
# Install some important IRT libraries | |
# install.packages("ltm") | |
# install.packages("irtoys") | |
# install.packages("lme4") | |
# install.packages("plyr") | |
# install.packages("ggplot2") | |
# Load these libraries | |
library(ltm) | |
library(irtoys) | |
library(lme4) | |
library(reshape) | |
library(ggplot2) | |
library(MASS) | |
# List the functions in a particular package | |
lsf.str("package:ltm") | |
# To find out information about a function: | |
?ltm | |
# To search for a keyword | |
??item | |
# What data sets are available? | |
data() | |
# Which data sets belong to ltm, irtoys, and lme4? | |
data(package=c("lme4","ltm","irtoys")) | |
# ----- # | |
# Data # | |
# ----- # | |
# Examine state.x77 data sets | |
data(state.x77) | |
# Find out information about the data set | |
?state.x77 | |
# Show the first few cases | |
head(state.x77) | |
# and the last few | |
tail(state.x77) | |
# If you want to see a specific observation, say #20, and you want to see responses from variables 2 through 3 | |
state.x77[20,2:3] | |
# Or if you want to see variables 1, 3, and 6 for observations 19 through 21. | |
state.x77[19:21, c(1,3,6)] | |
# QUESTION: How would you show observation 14 and 26 for variables 3, 7 | |
# Can also call them by name | |
state.x77["Alabama",] | |
# To write the data to a text file | |
write.table(state.x77,file = "(statex77.txt") | |
# To read it back in | |
read.table(file="(statex77.txt") | |
# But we need to save it as an object | |
state <- read.table(file="(statex77.txt") | |
# Similar functions exist to read data directly from SPSS, CSV files, etc # | |
# Check out the foreign package # | |
# To summarize the data there are various commands | |
summary(state.x77) | |
# To find out the type of variables they are | |
str(state.x77) | |
# Says this is a list | |
is.list(state.x77) | |
# Convert to a data.frame | |
state.df <- as.data.frame(state.x77) | |
str(state.df) | |
# Examine one variable as a table | |
table(state.df$Life) | |
table(state.x77$Life) | |
# Data.frame is also a list | |
is.list(state.df) | |
# To call a specific variable from a data.frame you need to use the $ command | |
state.df$Population | |
# QUESTION: How would you list the contains of the Income variable? | |
# Say that Alabama should have had a Population score of 3700 instead of 3615 | |
state.df[1,] | |
state.df[1,1] <- 3700 | |
state.df[1,] | |
# QUESTION: What if we found out that California should have a Illiteracy of 3.6. How can we change this? | |
# Conditional summaries | |
table(state.df$Population,state.df$Income) | |
# Not terribly useful. More useful for factors | |
# How many different HS Graduation are there? | |
table(state.df$HS) | |
# Which state has the lowest HS graduation rate? | |
which.min(state.df$HS) | |
# Which state? | |
state.df[which.min(state.df$HS),] | |
# Notice we can nest functions | |
# QUESTION: Which state has the highest murder rate? | |
# Let's examine Income a littler more | |
summary(state.df$Income) | |
# Let's create a factor for income, that seperates low and high income depending on whether they fall below or above the median | |
state.df$Income.f <- ifelse(state.df$Income > median(state.df$Income), "High","Low") | |
state.df$Income.f <- as.factor(state.df$Income.f) | |
str(state.df) | |
# QUESTION: How would we divide the states into high and low life expectancy based on the mean? | |
# How many levels of factor are there | |
nlevels(state.df$Income.f) | |
# Create lowest, low, high, higher income based on quantile splits | |
Quants <- quantile(state.df$Income) | |
state.df$Income.f2 <- ifelse(state.df$Income < Quants[2],"Lowest", | |
ifelse(state.df$Income > Quants[2] & state.df$Income < Quants[3], "Low", | |
ifelse(state.df$Income > Quants[3] & state.df$Income < Quants[4], "High","Highest"))) | |
# Remove variables created | |
state.df$Income.f <- NULL | |
state.df$Income.f2 <- NULL | |
# ---------- # | |
# Regression # | |
# ---------- # | |
# Scatterplot matrix | |
pairs(state.df) | |
# Correlation matrix | |
cor(state.df) | |
# Try to predict murder rate from Income | |
m0 <- lm(Murder ~ Income, data = state.df) | |
# Summary information | |
summary(m0) | |
anova(m0) | |
# This also contain information | |
str(summary(m0)) | |
str(anova(m0)) | |
# So if you just want the F-value for Gender | |
f <- anova(m0)$F[1] | |
# And the t-test from Gender | |
t <- summary(m0)$coef[6] | |
# Aren't these two related? | |
round(t^2) == round(f) | |
# You can get a lot of diagnostic information from plotting model | |
plot(m0) | |
# Box-Cox transformation | |
boxcox(m0) | |
# Plot Murder vs. Income | |
qplot(y =state.df$Murder, x = state.df$Income) + geom_smooth() # Looks like there is one outlier perhaps? Alaska | |
# Let's drop Alaska | |
m0a <- lm(Murder ~ Income, data = state.df[-2,]) | |
plot(m0a) | |
# You can extract residuals and predicted values | |
residuals(m0) | |
# To specify the type of residuals see ?residuals | |
# Get predicted values | |
predict(m0) | |
# To get the r squared | |
summary(m0)$r.squared | |
# Another option would be to create that factor of Income again | |
state.df$Income.f2 <- ifelse(state.df$Income < Quants[2],"Lowest", | |
ifelse(state.df$Income > Quants[2] & state.df$Income < Quants[3], "Low", | |
ifelse(state.df$Income > Quants[3] & state.df$Income < Quants[4], "High","Highest"))) | |
# Multiple regression | |
m1 <- lm(Murder ~ Income.f2 + Illiteracy, data = state.df) | |
summary(m1)$r.squared | |
# QUESTION: What kind of model is m1? | |
# Examine our ANCOVA | |
anova(m1) | |
summary(m1) | |
# Strange, none of the differences are significant. And our baseline group is high, let's make it highest | |
state.df$Income.f3 <- factor(state.df$Income.f2, levels = c("Highest","High","Low","Lowest")) | |
# Wait if we want to throw all everything in there, except id! | |
m2 <- lm(Murder ~ . - Income -Income.f3, data = state.df) | |
summary(m2) | |
# Want to do stepwise | |
stepAIC(m2) | |
# ----- # | |
# Plots # | |
# ----- # | |
# Can attach data frame | |
attach(state.df) | |
# Plot Murder vs. Life Expectancy | |
(g0 <- qplot(x = `Life Exp`, y = Murder) + stat_smooth(size = 2)) | |
# Notice the cumbersome `Life Exp` ... don't use spaces in your coding | |
# Let's add in the Income Factor into this | |
(g1 <- g0 + geom_point(aes(color = Income.f3))) | |
# Throw in gender | |
(g2 <- g1 + stat_summary(aes(group=Income.f3, color =Income.f3 ), fun.y=mean, geom="line", size=1)) | |
# Change the background colors | |
(g3 <- g2 + theme_bw()) | |
# Remove the legends and change the x-label | |
(g4 <- g3 + theme(legend.position="none") + xlab("Life Expectancy")) | |
# QUESTION: How would you change the y-label to say Murder Rate? | |
# Create histogram | |
qplot(x = `Life Exp`) | |
# Create a box plot | |
qplot(y = Murder, x = Income.f3, geom = "boxplot") | |
# Explore in R with another data set or explore with this one further! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment