lecy/Regression_in_R.Rmd

## Regression_in_R.Rmd
---
title: "Regression in R"
output: html_notebook: default
---


# create some fake data

```{r}

x1 <- 1:100

x2 <- -0.1*x1 + rnorm(100)

x3 <- 0.05*x2 + rnorm(100)

y <- 2*x1 + 10*rnorm(100) + 10*x2

dat <- data.frame( y, x1, x2, x3 )

head( dat )

```


# descriptive statistics

```{r}

library( pastecs )

print( t( stat.desc( dat ) ), digits=3 )

# To copy and paste into Excel:
#
# descriptives <- t( stat.desc(dat) )
#
# write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )

```


```{r}

# To create nicely formatted tables for markdown documents use the kable() function

library( knitr )

kable( t( stat.desc( dat )[ c(1,4,5,8,9,13), ] ), format="markdown", digits=3 )

```


|   | nbr.val|     min|     max| median|   mean| std.dev|
|:--|-------:|-------:|-------:|------:|------:|-------:|
|y  |     100| -11.210| 118.497| 50.591| 52.395|  31.342|
|x1 |     100|   1.000| 100.000| 50.500| 50.500|  29.011|
|x2 |     100| -10.851|   1.071| -5.097| -4.964|   3.135|
|x3 |     100|  -2.548|   1.857| -0.409| -0.380|   1.060|


# pretty pairs plot

Convenient visual descriptives:

```{r}

pairs( dat )
```

This one is better:

```{r}

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)

    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))

    text(0.5, 0.5, txt, cex = 1.5 )
    text(.7, .8, Signif, cex=cex, col=2)
}


pairs( dat, lower.panel=panel.smooth, upper.panel=panel.cor)

```


# create some fake regression data

```{r}

x <- 1:100
y <- 2*x + rnorm(100,0,20)

plot( x, y )

dum <- sample( c("NJ","NY","MA","PA"), 100, replace=T )


```


# basic regression syntax

```{r}
lm( y ~ x )

m.01 <- lm( y ~ x )

summary( m.01 )
```


# nice visual diagnostics

```{r}
par( mfrow=c(2,2) )
plot( m.01 )
```


# useful model fit functions

```{r}


coefficients( m.01 ) # model coefficients
confint( m.01, level=0.95) # CIs for model parameters

# not run because of long output

# anova( m.01 ) # anova table
# fitted( m.01 ) # predicted values
# residuals( m.01 ) # residuals
# influence( m.01 ) # regression diagnostics


```


```{r, warning=F }

library( coefplot )

m.02 <-  lm( y ~ x1 + I(x1^2) + x2 + x3 )

coefplot(m.02)

```


# pretty output

```{r, warning=FALSE}

# install.packages( "memisc" )

library( memisc )

m.02 <- lm( y ~ x + I(x^2) )  # quadratic term
m.03 <- lm( y ~ x - 1 )       # no intercept term


pretty.table <- mtable("Model 1"=m.01,"Model 2"=m.02,"Model 3"=m.03,
                  summary.stats=c("R-squared","F","p","N"))

pretty.table

```


# specification

```{r}

summary( lm( y ~ x1 + x2 + x3 ) )

# add different functional forms

# square x1

summary( lm( y ~ x1 + x1^2 + x2 + x3 ) )  # not right

summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) )  # like this

summary( lm( y ~ log(x1) + x2 + x3 ) )  # log of x1 in formula works fine


```

```{r}
# interactions

summary( lm( y ~ x1 + x2 ) )

summary( lm( y ~ x1 + x2 + I(x1*x2) ) )

summary( lm( y ~ x1*x2 ) ) # shortcut
```

```{r}
# dummy variables

summary( lm( y ~ x1 + x2 + x3 + dum ) ) # drop one level

summary( lm( y ~ x1 + x2 + x3 + dum - 1) ) # keep all, drop intercept
```


# standardized regression coefficients (beta)


```{r}
# install.packages( "lm.beta" )

library( lm.beta )

m.01.beta <- lm.beta( m.01 )

summary( m.01.beta )

# coef( m.01.beta )


```

```{r}
# note the standard error is not standardized - describes regular coefficients not standardized

summary( m.01 )
```


# or just use the formula:
```{r}
lm.beta <- function( my.mod )
{
    b <- summary(my.mod)$coef[-1, 1]
    sx <- sd( my.mod$model[,-1] )
    sy <- sd( my.mod$model[,1] )
    beta <- b * sx/sy
    return(beta)
}


coefficients( m.01 )

lm.beta( m.01 )
```


# robust standard errors


```{r, warning=FALSE}

# install.packages( "sandwhich" )
# install.packages( "lmtest" )

library(sandwich)
library(lmtest)

m.01 <- lm( y ~ x )


# REGULAR STANDARD ERRORS
summary( m.01 ) # not-robust


```

```{r}
# ROBUST STANDARD ERRORS

# reproduce the Stata default
coeftest( m.01, vcov=vcovHC(m.01,"HC1") )      # robust; HC1 (Stata default)
```


```{r}
# ROBUST STANDARD ERRORS

# check that "sandwich" returns HC0
coeftest(m.01, vcov = sandwich)                # robust; sandwich
coeftest(m.01, vcov = vcovHC(m.01, "HC0"))     # robust; HC0
```


```{r}
# ROBUST STANDARD ERRORS

# check that the default robust var-cov matrix is HC3
coeftest(m.01, vcov = vcovHC(m.01))            # robust; HC3
coeftest(m.01, vcov = vcovHC(m.01, "HC3"))     # robust; HC3 (default)
```
	---
	title: "Regression in R"
	output: html_notebook: default
	---




	# create some fake data

	```{r}

	x1 <- 1:100

	x2 <- -0.1*x1 + rnorm(100)

	x3 <- 0.05*x2 + rnorm(100)

	y <- 2x1 + 10rnorm(100) + 10*x2

	dat <- data.frame( y, x1, x2, x3 )

	head( dat )

	```


	# descriptive statistics

	```{r}

	library( pastecs )

	print( t( stat.desc( dat ) ), digits=3 )

	# To copy and paste into Excel:
	#
	# descriptives <- t( stat.desc(dat) )
	#
	# write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )

	```


	```{r}

	# To create nicely formatted tables for markdown documents use the kable() function

	library( knitr )

	kable( t( stat.desc( dat )[ c(1,4,5,8,9,13), ] ), format="markdown", digits=3 )

	```



	\| \| nbr.val\| min\| max\| median\| mean\| std.dev\|
	\|:--\|-------:\|-------:\|-------:\|------:\|------:\|-------:\|
	\|y \| 100\| -11.210\| 118.497\| 50.591\| 52.395\| 31.342\|
	\|x1 \| 100\| 1.000\| 100.000\| 50.500\| 50.500\| 29.011\|
	\|x2 \| 100\| -10.851\| 1.071\| -5.097\| -4.964\| 3.135\|
	\|x3 \| 100\| -2.548\| 1.857\| -0.409\| -0.380\| 1.060\|













	# pretty pairs plot

	Convenient visual descriptives:

	```{r}

	pairs( dat )
	```

	This one is better:

	```{r}

	panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
	{
	usr <- par("usr"); on.exit(par(usr))
	par(usr = c(0, 1, 0, 1))
	r <- abs(cor(x, y))
	txt <- format(c(r, 0.123456789), digits=digits)[1]
	txt <- paste(prefix, txt, sep="")
	if(missing(cex.cor)) cex <- 0.8/strwidth(txt)

	test <- cor.test(x,y)
	# borrowed from printCoefmat
	Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
	cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
	symbols = c("*", "", "*", ".", " "))

	text(0.5, 0.5, txt, cex = 1.5 )
	text(.7, .8, Signif, cex=cex, col=2)
	}


	pairs( dat, lower.panel=panel.smooth, upper.panel=panel.cor)

	```






	# create some fake regression data

	```{r}

	x <- 1:100
	y <- 2*x + rnorm(100,0,20)

	plot( x, y )

	dum <- sample( c("NJ","NY","MA","PA"), 100, replace=T )


	```






	# basic regression syntax

	```{r}
	lm( y ~ x )

	m.01 <- lm( y ~ x )

	summary( m.01 )
	```




	# nice visual diagnostics

	```{r}
	par( mfrow=c(2,2) )
	plot( m.01 )
	```




	# useful model fit functions

	```{r}


	coefficients( m.01 ) # model coefficients
	confint( m.01, level=0.95) # CIs for model parameters

	# not run because of long output

	# anova( m.01 ) # anova table
	# fitted( m.01 ) # predicted values
	# residuals( m.01 ) # residuals
	# influence( m.01 ) # regression diagnostics


	```



	```{r, warning=F }

	library( coefplot )

	m.02 <- lm( y ~ x1 + I(x1^2) + x2 + x3 )

	coefplot(m.02)

	```




	# pretty output

	```{r, warning=FALSE}

	# install.packages( "memisc" )

	library( memisc )

	m.02 <- lm( y ~ x + I(x^2) ) # quadratic term
	m.03 <- lm( y ~ x - 1 ) # no intercept term


	pretty.table <- mtable("Model 1"=m.01,"Model 2"=m.02,"Model 3"=m.03,
	summary.stats=c("R-squared","F","p","N"))

	pretty.table

	```














	# specification

	```{r}

	summary( lm( y ~ x1 + x2 + x3 ) )

	# add different functional forms

	# square x1

	summary( lm( y ~ x1 + x1^2 + x2 + x3 ) ) # not right

	summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) ) # like this

	summary( lm( y ~ log(x1) + x2 + x3 ) ) # log of x1 in formula works fine


	```

	```{r}
	# interactions

	summary( lm( y ~ x1 + x2 ) )

	summary( lm( y ~ x1 + x2 + I(x1*x2) ) )

	summary( lm( y ~ x1*x2 ) ) # shortcut
	```

	```{r}
	# dummy variables

	summary( lm( y ~ x1 + x2 + x3 + dum ) ) # drop one level

	summary( lm( y ~ x1 + x2 + x3 + dum - 1) ) # keep all, drop intercept
	```












	# standardized regression coefficients (beta)



	```{r}
	# install.packages( "lm.beta" )

	library( lm.beta )

	m.01.beta <- lm.beta( m.01 )

	summary( m.01.beta )

	# coef( m.01.beta )


	```

	```{r}
	# note the standard error is not standardized - describes regular coefficients not standardized

	summary( m.01 )
	```





	# or just use the formula:
	```{r}
	lm.beta <- function( my.mod )
	{
	b <- summary(my.mod)$coef[-1, 1]
	sx <- sd( my.mod$model[,-1] )
	sy <- sd( my.mod$model[,1] )
	beta <- b * sx/sy
	return(beta)
	}


	coefficients( m.01 )

	lm.beta( m.01 )
	```












	# robust standard errors


	```{r, warning=FALSE}

	# install.packages( "sandwhich" )
	# install.packages( "lmtest" )

	library(sandwich)
	library(lmtest)

	m.01 <- lm( y ~ x )


	# REGULAR STANDARD ERRORS
	summary( m.01 ) # not-robust





	```

	```{r}
	# ROBUST STANDARD ERRORS

	# reproduce the Stata default
	coeftest( m.01, vcov=vcovHC(m.01,"HC1") ) # robust; HC1 (Stata default)
	```



	```{r}
	# ROBUST STANDARD ERRORS

	# check that "sandwich" returns HC0
	coeftest(m.01, vcov = sandwich) # robust; sandwich
	coeftest(m.01, vcov = vcovHC(m.01, "HC0")) # robust; HC0
	```


	```{r}
	# ROBUST STANDARD ERRORS

	# check that the default robust var-cov matrix is HC3
	coeftest(m.01, vcov = vcovHC(m.01)) # robust; HC3
	coeftest(m.01, vcov = vcovHC(m.01, "HC3")) # robust; HC3 (default)
	```