#!/usr/bin/Rscript
# Can title predict log wages?
# Author: John Horton <http://john-joseph-horton.com>

library(JJHmisc)      # available on Github
library(oDeskTools)   # oDesk-specific package
library(ggplot2)
library(scales)
library(RTextTools)
library(caret)
library(glmnet)       # needed for glmnet() / cv.glmnet() below
# Get a dataset of 20K titles & earned wages
REFRESH <- TRUE
if (REFRESH) {
    connectDB(get_credentials())
    DT <- na.omit(subset(dbGetQuery(
        con, "select * from titles_and_wages"), total_charge > 0))
    DT <- within(DT, {
        wage <- total_charge / hours
        lwage <- log(wage)
    })
    df <- subset(DT, wage > 0.50 & wage < 100)
    saveRDS(df, file = "../../data/titles_and_wages.rds")
} else {
    df <- readRDS("../../data/titles_and_wages.rds")
}
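
# Optional sanity check (a sketch, not part of the original script): confirm
# how many openings survive the filters and what the hourly wage
# distribution looks like
nrow(df)
summary(df$wage)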
# Use the RTextTools package to create the document-term
# matrix for opening titles
doc_matrix <- create_matrix(df$title,
                            language = "english",
                            removeNumbers = TRUE,
                            stemWords = TRUE,
                            removeSparseTerms = 0.999)

# Inspect terms & create a matrix of indicators
str(doc_matrix)
x <- as.matrix(doc_matrix)
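
# Optional sanity check (sketch): dimensions of the indicator matrix and a
# few of the surviving (stemmed) terms
dim(x)
head(colnames(x), 20)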
# Create a training and testing dataset
set.seed(20130615)
inTrain <- createDataPartition(y = df$lwage, list = FALSE)
x.train <- x[inTrain, ]
x.test  <- x[-inTrain, ]

# Normalize (standardize) the outcome variable
y <- scale(df$lwage)

# Create training & test outcome variables
y.train <- y[inTrain]
y.test  <- y[-inTrain]

# See mean word counts per document
mean(rowSums(x.train))
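
# Optional sanity check (sketch): confirm the split sizes;
# createDataPartition() defaults to roughly a 50/50 split when p is omitted
nrow(x.train)
nrow(x.test)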
############
# Fit models
############

# Use the lasso to fit the model & use cross-validation to find a suitable
# regularization parameter
fit <- glmnet(x = x.train, y = y.train, family = "gaussian")
plot(fit)

cv.fit <- cv.glmnet(x = x.train, y = y.train, family = "gaussian")
plot(cv.fit)
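
# Optional check (sketch): inspect the cross-validated lambdas and how many
# coefficients survive at the error-minimizing one; lambda.1se gives a
# sparser, more conservative model
cv.fit$lambda.min
cv.fit$lambda.1se
sum(coef(cv.fit, s = "lambda.min") != 0)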
######################################################
# Extract coefficients from best cross-validated model
######################################################
Coefficients <- coef(fit, s = cv.fit$lambda.min)
Active.Index <- which(Coefficients != 0)
Active.Coefficients <- Coefficients[Active.Index]
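
# Optional check (sketch; df.active is just an illustrative helper): which
# words have the largest positive and negative estimated effects on
# (standardized) log wages
df.active <- data.frame(word = rownames(Coefficients)[Active.Index],
                        coef = Active.Coefficients)
head(df.active[order(-df.active$coef), ], 10)  # highest-paying words
head(df.active[order(df.active$coef), ], 10)   # lowest-paying words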
# Visualize predictor effects; the leading 1 in `n` lines up with the
# intercept row of the coefficient vector
df.predictor <- data.frame(word = rownames(Coefficients),
                           coef = as.vector(Coefficients),
                           n = c(1, colSums(x.train)))
# Order words from largest to smallest coefficient
df.predictor$word <- with(df.predictor, reorder(word, coef, mean))

g <- qplot(coef, y = word, size = log(n),
           data = subset(df.predictor, n > 80 & coef != 0)) +
    xlab("Coefficient on word indicator in log wage model") +
    ylab("De-stemmed word from opening title") +
    theme_bw() +
    geom_vline(aes(xintercept = 0), colour = "red", linetype = "dashed") +
    theme(axis.text.y = element_text(size = 12))
print(g)

writeImage(g, "title_log_wage_prediction", height = 12, width = 6)
# Need to prepend a column of 1s to the model matrix, as the
# coefficient vector includes an intercept
y.train.hat <- as.vector(cbind(1, x.train) %*% Coefficients)

# How does it do in-sample?
qplot(y.train, y.train.hat)
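
# Optional check (sketch): in-sample R-squared (y.train is standardized,
# so its mean is close to zero)
1 - sum((y.train - y.train.hat)^2) / sum((y.train - mean(y.train))^2)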
######################################
# Make predictions on the testing data
######################################
y.test.hat <- as.vector(cbind(1, x.test) %*% Coefficients)
resid <- y.test - y.test.hat

# Compute out-of-sample R-squared (y.test is standardized, so its mean is
# approximately zero and sum(y.test^2) approximates the total sum of squares)
1 - sum(resid^2) / sum(y.test^2)
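
# Optional check (sketch): out-of-sample RMSE and the correlation between
# actual and predicted standardized log wages
sqrt(mean(resid^2))
cor(y.test, y.test.hat)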