Skip to content

Instantly share code, notes, and snippets.

@apoorvalal
Created June 30, 2017 17:27
Show Gist options
  • Save apoorvalal/42b19136143e78152358cfcb1aff5981 to your computer and use it in GitHub Desktop.
Save apoorvalal/42b19136143e78152358cfcb1aff5981 to your computer and use it in GitHub Desktop.
lasso
# command_for_r
# $r_exec --no-save --no-restore --verbose
# N:/.../script.R
# inp_file = 'input.dta'
# inp_dir - 'D:/input/'
# working = 'D:/working/'
# dep_var = 'tot_grouped_cost'
# rhs_vars = c('age ')
# cmd: 2>&1
setwd(code)
args(commandArgs(TRUE))
if(lenght(args)==0){
print('no arguments supplied. Using default arguments')
q()
}else{
for(i in 1:length(args)){
print(args[i])
eval(parse(text=args[[i]]))
}
input_data = paste0(inp_dir,'/',inp_file,'.dta')
fvars = FALSE
}
# source('1_lasso.R',print.eval=TRUE)
pkg = c('haven','readr','dplyr','ggplot2','glmnet','stargazer')
lapply(pkg,require,character.only=TRUE)
raw = haven::read_dta(input_data)
setwd(working)
## stitches together formula
if (fvars == TRUE) {
lapply(factors,as.factor)
raw$depvar = raw$tot_grouped_cost
fml <- as.formula(paste('depvar~',
paste((rhs_vars),collapse='+'),'+',
paste('factor(',factors,')',collapse='+',sep = ''),
sep=''))
} else {
raw$depvar = raw$tot_grouped_cost
fml <- as.formula(paste('depvar~',
paste((rhs_vars),collapse='+')))
}
lm1 = lm(fml,data=raw)
stargazer(lm1,type='text')
fv_lm <- lm1$fitted.values
l <- length(lm1$coefficients)
coef_lm <- lm1$coefficients
rmse_lm <- mean((raw$depvar-fv_lm)^2)
########################################
x = model.matrix(fml,raw)
y = as.matrix(raw['depvar'])
set.seed(1)
train = sample(1:nrow(x),nrow(x)/2)
test = (-train)
y.test=y[test]
########################################
# GLMNET fit
grid = 10^seq(10,-2,length=100)
fit = glmnet(x[train,],y[train],alpha=1,lambda=grid)
pdf(file="99_ls_l1vcof.pdf")
plot(fit)
dev.off()
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train,],alpha=1) # choose tuning parameter by cross-validation
pdf(file="99_ls_lbvMSE.pdf")
plot(cv.out)
dev.off()
bestlam=cv.out$lambda.min # find lambda that minimizes cross-validation error
out = glmnet(x,y,alpha=1,lambda=grid) # fit final model with above value of lambda
coef_lasso = predict(out,type='coefficients',s=bestlam)
fv_lasso = as.numeric(predict(out,s=bestlam,type='response',newx=x))
## Clean up coefficient vectors for export
temp = as.matrix(coef_lasso)
colnames(temp) = 'coef'
names = rownames(test)
temp2 = as.data.frame(cbind(names,as.numeric(temp)),rownames=NULL)
temp2 %>%
rename(coef=V2) %>%
filter(coef!=0) ->
nonzero_lasso_coeffs
temp = as.matrix(coef_lm)
colnames(temp) = 'coef'
names = rownames(test)
temp2 = as.data.frame(cbind(names,as.numeric(temp)),rownames=NULL)
temp2 %>%
rename(coef=V2) %>%
filter(coef!=0) ->
nonzero_lm_coeffs
######################################
costs <- as.data.frame(cbind(y,fv_lm,fv_lasso))
costs = costs[costs$depvar < quantile(costs$depvar,0.995),] # winsorized
df.m = reshape2::melt(costs)
p1 = ggplot(df.m,)
ggplot(data = df.m,mapping = aes(x = value, colour = variable)) +
geom_density()
(cor(costs))^2
length(coef_lm)
colSums(nonzero_lasso_coeffs['coef']!=0)
lasso_selection = as.character(nonzero_lasso_coeffs$names)
all_coeffs = merge(nonzero_lm_coeffs,nonzero_lasso_coeffs,by='names',all.x=TRUE)
all_coeffs
write_csv(all_coeffs,paste0(working,'coeffs.csv'))
dropped_covars = as.character(all_coeffs[is.na(all_coefs$coef.y),]$names)
target = file(paste0(working,'droplist.txt'))
writeLines(dropped_covars,target,sep='\n')
close(target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment