#!/usr/bin/Rscript
# Can title predict log wages?
# Author: John Horton <http://john-joseph-horton.com>

library(JJHmisc)      # available on Github
library(oDeskTools)   # oDesk-specific package
library(ggplot2)
library(scales)
library(RTextTools)
library(caret)
library(glmnet)       # needed for glmnet() / cv.glmnet() below
# Get a dataset of 20K titles & earned wages
REFRESH <- TRUE
if (REFRESH) {
    connectDB(get_credentials())
    DT <- na.omit(subset(dbGetQuery(
        con, "select * from titles_and_wages"), total_charge > 0))
    DT <- within(DT, {
        wage <- total_charge / hours
        lwage <- log(wage)
    })
    df <- subset(DT, wage > 0.50 & wage < 100)
    saveRDS(df, file = "../../data/titles_and_wages.rds")
} else {
    df <- readRDS("../../data/titles_and_wages.rds")
}
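
# Optional sanity check (a sketch, not part of the original script): confirm
# how many openings survive the filters and what the hourly wage
# distribution looks like
nrow(df)
summary(df$wage)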
# Use the RTextTools package to create the document-term
# matrix for opening titles
doc_matrix <- create_matrix(df$title,
                            language = "english",
                            removeNumbers = TRUE,
                            stemWords = TRUE,
                            removeSparseTerms = 0.999)

# Inspect terms & create a matrix of indicators
str(doc_matrix)
x <- as.matrix(doc_matrix)
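
# Optional sanity check (sketch): dimensions of the indicator matrix and a
# few of the surviving (stemmed) terms
dim(x)
head(colnames(x), 20)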
# Create a training and testing dataset
set.seed(20130615)
inTrain <- createDataPartition(y = df$lwage, list = FALSE)
x.train <- x[inTrain, ]
x.test  <- x[-inTrain, ]

# Normalize (standardize) the outcome variable
y <- scale(df$lwage)

# Create training & test outcome variables
y.train <- y[inTrain]
y.test  <- y[-inTrain]

# See mean word counts per document
mean(rowSums(x.train))
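
# Optional sanity check (sketch): confirm the split sizes;
# createDataPartition() defaults to roughly a 50/50 split when p is omitted
nrow(x.train)
nrow(x.test)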
############
# Fit models
############

# Use the lasso to fit the model & use cross-validation to find a suitable
# regularization parameter
fit <- glmnet(x = x.train, y = y.train, family = "gaussian")
plot(fit)

cv.fit <- cv.glmnet(x = x.train, y = y.train, family = "gaussian")
plot(cv.fit)
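
# Optional check (sketch): inspect the cross-validated lambdas and how many
# coefficients survive at the error-minimizing one; lambda.1se gives a
# sparser, more conservative model
cv.fit$lambda.min
cv.fit$lambda.1se
sum(coef(cv.fit, s = "lambda.min") != 0)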
######################################################
# Extract coefficients from best cross-validated model
######################################################
Coefficients <- coef(fit, s = cv.fit$lambda.min)
Active.Index <- which(Coefficients != 0)
Active.Coefficients <- Coefficients[Active.Index]
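
# Optional check (sketch; df.active is just an illustrative helper): which
# words have the largest positive and negative estimated effects on
# (standardized) log wages
df.active <- data.frame(word = rownames(Coefficients)[Active.Index],
                        coef = Active.Coefficients)
head(df.active[order(-df.active$coef), ], 10)  # highest-paying words
head(df.active[order(df.active$coef), ], 10)   # lowest-paying words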
# Visualize predictor effects; the leading 1 in `n` lines up with the
# intercept row of the coefficient vector
df.predictor <- data.frame(word = rownames(Coefficients),
                           coef = as.vector(Coefficients),
                           n = c(1, colSums(x.train)))
# Order words from largest to smallest coefficient
df.predictor$word <- with(df.predictor, reorder(word, coef, mean))

g <- qplot(coef, y = word, size = log(n),
           data = subset(df.predictor, n > 80 & coef != 0)) +
    xlab("Coefficient on word indicator in log wage model") +
    ylab("De-stemmed word from opening title") +
    theme_bw() +
    geom_vline(aes(xintercept = 0), colour = "red", linetype = "dashed") +
    theme(axis.text.y = element_text(size = 12))
print(g)

writeImage(g, "title_log_wage_prediction", height = 12, width = 6)
# Need to prepend a column of 1s to the model matrix, as the
# coefficient vector includes an intercept
y.train.hat <- as.vector(cbind(1, x.train) %*% Coefficients)

# How does it do in-sample?
qplot(y.train, y.train.hat)
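
# Optional check (sketch): in-sample R-squared (y.train is standardized,
# so its mean is close to zero)
1 - sum((y.train - y.train.hat)^2) / sum((y.train - mean(y.train))^2)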
######################################
# Make predictions on the testing data
######################################
y.test.hat <- as.vector(cbind(1, x.test) %*% Coefficients)
resid <- y.test - y.test.hat

# Compute out-of-sample R-squared (y.test is standardized, so its mean is
# approximately zero and sum(y.test^2) approximates the total sum of squares)
1 - sum(resid^2) / sum(y.test^2)
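
# Optional check (sketch): out-of-sample RMSE and the correlation between
# actual and predicted standardized log wages
sqrt(mean(resid^2))
cor(y.test, y.test.hat)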