Created
November 15, 2012 03:56
-
-
Save michelleboisson/4076558 to your computer and use it in GitHub Desktop.
Data Without Borders - Assignment 8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Describing Distributions ----

# Load the stop-and-frisk extract. as.is = TRUE keeps text columns as
# character vectors instead of converting them to factors.
snf <- read.csv(
  "http://jakeporway.com/teaching/data/snf_4.csv",
  header = TRUE,
  as.is = TRUE
)

# Make a "height" column for your data that is the total number of inches tall
# each person is.
# The arithmetic is vectorised over whole columns rather than apply()-ing over
# rows (apply() would first coerce every row of the data frame to character).
# as.numeric() keeps the result a double whatever type the columns were read as.
heights <- as.numeric(snf[["feet"]]) * 12 + as.numeric(snf[["inches"]])
snf$fullheight <- heights
# Centre of the height distribution (heights are in inches).
mean(heights)
#[1] 68.57884
median(heights)
#[1] 69

# Most frequent heights first, i.e. the empirical mode and its neighbours.
head(sort(table(heights), decreasing = TRUE))
#heights
#68 69 70 67 66 72
#7976 7522 6951 6761 5819 4885

# Quartiles and range, then the spread.
summary(heights)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#36.00 67.00 69.00 68.58 71.00 95.00
sd(heights)
#[1] 3.151631
# Quartiles and range of the remaining numeric fields.
summary(snf$weight)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#1.0 150.0 165.0 169.3 180.0 999.0
summary(snf$period_obs)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.000 1.000 1.000 2.369 2.000 856.000
summary(snf$period_stop)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.00 3.00 5.00 5.59 5.00 999.00

# Both period_obs and period_stop are strongly right-skewed: their maxima
# (856 and 999) fall far beyond their third quartiles (2 and 5).
plot(snf$period_stop)
# Slippery Slopes ----

# 1. Create a subset of the data where period_obs and period_stop are less
#    than 40.
# The condition selects rows, so it goes before the comma, and it must use the
# element-wise `&`. `&&` yields a single TRUE/FALSE (and is an error for
# vectors longer than one since R 4.3). The result is not called `subset`, to
# avoid masking base::subset().
short.periods <- snf[snf$period_obs < 40 & snf$period_stop < 40, ]

# 2. Create a jittered scatterplot of the data. What do you see?
# The predictor (period_obs) is on the x axis and the response (period_stop)
# on the y axis, so the fitted line below can be drawn on the same plot.
plot(
  jitter(short.periods$period_obs),
  jitter(short.periods$period_stop),
  xlab = "period_obs (minutes)",
  ylab = "period_stop (minutes)"
)
# There's a clutter of points towards the lower-left corner, under 10 mins,
# but also a group near 30 mins of observation and a 10 min stop.

# Build a linear model predicting the period_stop variable from period_obs.
# What is the slope of your model? Based on your intuition, would you say this
# is a good model?

# Overlay the mean period_stop for each distinct period_obs value. (When first
# plotted, the means did not seem to make a straight line.)
mean.stop.by.obs <- tapply(
  short.periods$period_stop,
  short.periods$period_obs,
  mean
)
points(
  as.numeric(names(mean.stop.by.obs)),
  mean.stop.by.obs,
  col = 2,
  pch = 18
)

# Let's draw the linear model. The response goes on the left of `~`, and the
# fit uses only the trimmed data, as the assignment asks.
linear.model <- lm(period_stop ~ period_obs, data = short.periods)
abline(linear.model)
summary(linear.model)
# NOTE: the output recorded below came from the earlier fit
# lm(snf$period_obs ~ snf$period_stop) on the untrimmed data, i.e. with the
# response and predictor swapped. Re-run summary() above to refresh it.
#The slope is 0.008806
#Call:
# lm(formula = snf$period_obs ~ snf$period_stop)
#Residuals:
# Min 1Q Median 3Q Max
#-10.12 -1.36 -1.34 -0.34 853.64
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 2.319811 0.024328 95.355 < 2e-16 ***
# snf$period_stop 0.008806 0.001687 5.218 1.81e-07 ***
#---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Residual standard error: 5.405 on 58087 degrees of freedom
#Multiple R-squared: 0.0004686, Adjusted R-squared: 0.0004514
#F-statistic: 27.23 on 1 and 58087 DF, p-value: 1.811e-07

# Using your model, predict how long you expect someone to be stopped if
# they're observed for 5 minutes.
slope <- coef(linear.model)[["period_obs"]]
intercept <- coef(linear.model)[["(Intercept)"]]
person.obs <- 5
predicted.stop <- slope * person.obs + intercept
predicted.stop
#[1] 2.36384   (recorded from the earlier, swapped model; re-run to refresh)

# Using your model, predict how long you expect someone to be stopped if
# they're observed for 60 minutes. Even though we built the model only on data
# for periods < 40, we do have some data for when people were observed for 60
# minutes. Compute the mean for those period_stops where period_obs = 60.
newsub <- snf[snf$period_obs == 60, ]
plot(newsub$period_stop) # out of curiosity
mean(newsub$period_stop)
#[1] 7.631579

# slope and intercept are unchanged from above; only the input differs.
person.obs <- 60
slope * person.obs + intercept
#[1] 60   (recorded earlier)
# I'm getting 60mins... I think I did something wrong...
# NOTE(review): that 60 cannot have come from this formula with the
# coefficients recorded above (0.008806 * 60 + 2.319811 is about 2.85), so it
# was presumably a console slip. Re-run and compare the prediction with the
# observed mean of 7.63.
# Create a scatterplot of the height and weight variables. Jitter() or use
# transparency so we can see where the bulk of the data lies.
# There is no transparency() function in base graphics; a semi-transparent
# colour from adjustcolor() does the job, so overlapping points look darker.
# Height is on the x axis because it is the predictor in the weight ~ height
# regression fitted next.
plot(
  jitter(snf$fullheight),
  jitter(snf$weight),
  col = adjustcolor("black", alpha.f = 0.2),
  pch = 16,
  xlab = "height (inches)",
  ylab = "weight"
)
# Trim your data to exclude extreme height or weight values. Write down what
# threshold you used.
# Threshold used: keep only rows with weight below 450.
# NOTE(review): height is not trimmed, and the summaries above show heights
# from 36 to 95 inches and weights as low as 1. Consider bounding those too.
weight.threshold <- 450
clean.snf <- snf[snf$weight < weight.threshold, ]

# Regress weight on height. The model is not stored in a variable called `lm`,
# which would mask stats::lm().
height.weight.model <- lm(weight ~ fullheight, data = clean.snf)

# Plot with the predictor (height) on x and the response (weight) on y.
# abline() draws weight = intercept + slope * height, so with the axes the
# other way round (as first attempted) the line never crosses the points.
plot(
  jitter(clean.snf$fullheight),
  jitter(clean.snf$weight),
  xlab = "height (inches)",
  ylab = "weight"
)
abline(height.weight.model, col = "red")

slope <- coef(height.weight.model)[["fullheight"]]
intercept <- coef(height.weight.model)[["(Intercept)"]]
slope
#[1] 4.282307

# How much do you expect someone who's 6' 0" to weigh?
person <- 6 * 12
predicted.weight <- slope * person + intercept
predicted.weight
#183.4005, ok this does actually make sense, so the model itself was fine;
# only the plot axes were swapped.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment