Skip to content

Instantly share code, notes, and snippets.

@johnjosephhorton
Created November 30, 2013 15:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnjosephhorton/7720452 to your computer and use it in GitHub Desktop.
Save johnjosephhorton/7720452 to your computer and use it in GitHub Desktop.
#!/usr/bin/Rscript
# Can title predicto log wages?
# Author: John Horton <http://john-joseph-horton.com>
library(JJHmisc) # available on Github
library(oDeskTools) #oDesk-specific package
library(ggplot2)
library(scales)
library(RTextTools)
library(caret)
# Get a dataset of 20K titles & earned wages
REFRESH <- TRUE
if (REFRESH) {
connectDB(get_credentials())
DT <- na.omit(subset(dbGetQuery(
con, "select * from titles_and_wages"), total_charge > 0))
DT <- within(DT,{
wage = total_charge/hours
lwage = log(total_charge/hours)})
df <- subset(DT, wage > 0.50 & wage < 100)
saveRDS(df, file = "../../data/titles_and_wages.rds")
} else {
df <- readRDS("../../data/titles_and_wages.rds")
}
# use the RTextTools package to create the document
# term matrix for opening titles
doc_matrix <- create_matrix(df$title,
language = "english",
removeNumbers = TRUE,
stemWords = TRUE,
removeSparseTerms = 0.999)
# inspect terms & create a matrix of indicators
str(doc_matrix)
x <- as.matrix(doc_matrix)
# create a training and testing dataset
set.seed(20130615)
inTrain <- createDataPartition(y = df$lwage, list = FALSE)
x.train <- x[inTrain, ]
x.test <- x[-inTrain, ]
# normalize the outcome variable
y <- scale(df$lwage)
# create training & test outcome variables
y.train <- y[ inTrain]
y.test <- y[ -inTrain]
# see mean counts by document
mean(rowSums(x.train))
############
# Fit models
############
# use the lasso to fit the model & use cross-validation to find suitable
# regularization parameter
fit <- glmnet(x = x.train, y = y.train, family = "gaussian")
plot(fit)
cv.fit <- cv.glmnet(x = x.train, y = y.train, family = "gaussian")
plot(cv.fit)
######################################################
# Extract Coefficients from best cross-validated model
######################################################
Coefficients <- coef(fit, s = cv.fit$lambda.min)
Active.Index <- which(Coefficients != 0)
Active.Coefficients <- Coefficients[Active.Index]
# Visualize predictor effects
df.predictor <- data.frame(word = rownames(Coefficients),
coef = as.vector(Coefficients),
n = c(1,colSums(x.train))
)
# order from largest to smallest
df.predictor$word <- with(df.predictor, reorder(word, coef, mean))
g <- qplot(coef, y = word, size = log(n), data = subset(df.predictor,
n > 80 &
coef != 0)) +
xlab("Coefficient on word indicator in log wage model") +
ylab("De-stemmed word from opening title") +
theme_bw() +
geom_vline(aes(xintercept = 0), colour = "red", linetype = "dashed") +
theme(axis.text.y = element_text(size = 12))
print(g)
writeImage(g, "title_log_wage_prediction", height = 12, width = 6)
# need to prepend a column of 1s to the model matrix, as the
# coefficient vector includes an intercept
y.train.hat <- as.vector(cbind(1, x.train) %*% Coefficients)
# how does it do?
qplot(y.train, y.train.hat)
######################################
# Make predictions on the testing data
######################################
y.test.hat <- as.vector(cbind(1, x.test) %*% Coefficients)
resid <- y.test - y.test.hat
# Comput R-squared
with(df, 1 - sum(resid**2)/sum(y.test**2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment