public
Last active

R code to operate MALLET entirely from within R. Set variables, send commands to Windows' command console and get MALLET's result back into R for further analysis.

  • Download Gist
R2MALLET.r
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
# Set working directory
dir <- "C:\\" # adjust to suit
setwd(dir)

# Configure variables and filenames for MALLET
## here using MALLET's built-in example data and
## variables from http://programminghistorian.org/lessons/topic-modeling-and-mallet

# folder containing txt files for MALLET to work on
importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
# name of the file that import-dir writes and train-topics then reads
output <- "tutorial.mallet"
# number of topics for MALLET to use
ntopics <- 20
# optimisation interval for MALLET to use
optint <- 20
# file names for output of model, extensions must be as shown
outputstate <- "topic-state.gz"
outputtopickeys <- "tutorial_keys.txt"
outputdoctopics <- "tutorial_composition.txt"

# Combine variables into strings ready for the Windows command line.
# `cd /d` changes the drive as well as the directory, so the relative
# `bin\mallet` calls still resolve when the working directory set above
# is on a drive other than C:.
cd <- "cd /d C:\\mallet-2.0.7" # location of the bin directory
# import step: convert the text files in importdir into a MALLET file
import <- paste(
  "bin\\mallet import-dir --input", importdir,
  "--output", output,
  "--keep-sequence --remove-stopwords"
)
# training step: fit the topic model and write the three output files
train <- paste(
  "bin\\mallet train-topics --input", output,
  "--num-topics", ntopics,
  "--optimize-interval", optint,
  "--output-state", outputstate,
  "--output-topic-keys", outputtopickeys,
  "--output-doc-topics", outputdoctopics
)
 
# Set up the system environment for R
MALLET_HOME <- "c:/mallet-2.0.7" # location of the bin directory
Sys.setenv("MALLET_HOME" = MALLET_HOME)
# Put the Java runtime at the front of PATH rather than replacing PATH
# outright, so everything else already on the search path stays
# reachable for the rest of the R session
java_bin <- "c:/Program Files (x86)/Java/jre7/bin" # adjust to suit
Sys.setenv(
  PATH = paste(java_bin, Sys.getenv("PATH"), sep = .Platform$path.sep)
)

# Send the commands to the Windows command prompt as a single `&&` chain
# (change directory, import, train)
# watch results scroll by in R console...
# mustWork = TRUE turns a failed run into an R error, so the later steps
# do not silently read output files left over from an earlier run
shell(shQuote(paste(cd, import, train, sep = " && ")),
  invisible = FALSE, mustWork = TRUE
)
# Inspect results
# The output file names are relative and the MALLET commands were run
# from the MALLET directory, so move there before reading them
setwd(MALLET_HOME)
# outputstateresult <-
# quote = "" stops apostrophes or double quotes inside topic words and
# document names from being treated as string delimiters; the default
# comment.char ("#") is kept, so lines beginning with "#" are skipped
outputtopickeysresult <- read.table(
  outputtopickeys,
  header = FALSE, sep = "\t", quote = ""
)
outputdoctopicsresult <- read.table(
  outputdoctopics,
  header = FALSE, sep = "\t", quote = ""
)
 
library(reshape2)

# Reshape outputdoctopicsresult into a table with one row per value of
# V2 and one column per value of V3
dat <- outputdoctopicsresult
# Columns 1-2 are used as ids; from column 3 onwards the columns are
# treated as alternating pairs (odd positions -> "topics", even
# positions -> "props").
# Index colnames(dat) directly: dat[, idx] collapses to a plain vector
# when idx selects a single column, which would make colnames() NULL.
topic_cols <- colnames(dat)[seq(3, ncol(dat) - 1, 2)]
prop_cols <- colnames(dat)[seq(4, ncol(dat), 2)]
l_dat <- reshape(dat,
  idvar = 1:2,
  varying = list(topics = topic_cols, props = prop_cols),
  direction = "long"
)
# reshape() names each stacked column after the first column of its set,
# so the "topics" set ends up in V3 and the "props" set in V4. Naming
# value.var explicitly avoids relying on dcast's guess of the value column.
w_dat <- dcast(l_dat, V2 ~ V3, value.var = "V4")
rm(l_dat) # because this is very big but no longer needed
 
# Save the reshaped table as a CSV file in the current working
# directory so it can be examined more closely
write.csv(x = w_dat, file = "topic_model_table.csv")
# Open the current working directory (the folder the CSV was just
# written to); on Windows this should pop open a folder window
shell.exec(file = getwd())

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.