Enron Corpus Processing
library(stringr)
library(plyr)
library(tm)
library(tm.plugin.mail)
library(SnowballC)
library(topicmodels)
# At this point, the python script should have been run,
# creating about 126 thousand txt files. I was very much afraid
# to import that many txt files into the tm package in R (my computer only
# has 8GB of RAM), so I decided to mark 60k of them as a sample and move
# the rest into a separate directory.
email_txts = list.files('data/')
email_txts_sample = sample(email_txts, 60000)
# fixed=TRUE treats ".txt" literally rather than as a regex;
# stringsAsFactors=F keeps the filenames as plain character vectors
email_rename = data.frame(orig=email_txts_sample,
                          new=sub(".txt", ".rxr", email_txts_sample, fixed=TRUE),
                          stringsAsFactors=F)
file.rename(str_c('data/',email_rename$orig), str_c('data/',email_rename$new))
# At this point, all of the non-sampled emails (still labelled .txt, not .rxr)
# need to go into a different directory. I created a directory that I called
# nonsampled/ and moved the files there via the terminal command "mv *.txt nonsampled/".
# It's very important that you don't try to do this via a file explorer, Windows or Linux,
# as rendering that many file icons at once will grind a regular machine to a halt.
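# If you'd rather do the move from within R, here's a minimal sketch, assuming
# nonsampled/ sits inside data/ (the layout the mv command above implies). It's
# a no-op if the files have already been moved by hand:
if (!file.exists('data/nonsampled')) dir.create('data/nonsampled')
leftover = list.files('data/', pattern='\\.txt$')
file.rename(str_c('data/', leftover), str_c('data/nonsampled/', leftover))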
# Read the 60k sampled emails into a tm corpus
enron = Corpus(DirSource("/home/inkhorn/enron/data"))
# Extra stopwords on top of tm's built-in English list. (Note: entries containing
# a space, like "MIME Version", will never match a single token.)
extendedstopwords = c("a","about","above","across","after","MIME Version","forwarded","again","against","all","almost","alone","along","already","also","although","always","am","among","an","and","another","any","anybody","anyone","anything","anywhere","are","area","areas","aren't","around","as","ask","asked","asking","asks","at","away",
  "b","back","backed","backing","backs","be","became","because","become","becomes","been","before","began","behind","being","beings","below","best","better","between","big","both","but","by",
  "c","came","can","cannot","can't","case","cases","certain","certainly","clear","clearly","come","could","couldn't",
  "d","did","didn't","differ","different","differently","do","does","doesn't","doing","done","don't","down","downed","downing","downs","during",
  "e","each","early","either","end","ended","ending","ends","enough","even","evenly","ever","every","everybody","everyone","everything","everywhere",
  "f","face","faces","fact","facts","far","felt","few","find","finds","first","for","four","from","full","fully","further","furthered","furthering","furthers",
  "g","gave","general","generally","get","gets","give","given","gives","go","going","good","goods","got","great","greater","greatest","group","grouped","grouping","groups",
  "h","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","her","here","here's","hers","herself","he's","high","higher","highest","him","himself","his","how","however","how's",
  "i","i'd","if","i'll","i'm","important","in","interest","interested","interesting","interests","into","is","isn't","it","its","it's","itself","i've",
  "j","just","k","keep","keeps","kind","knew","know","known","knows",
  "l","large","largely","last","later","latest","least","less","let","lets","let's","like","likely","long","longer","longest",
  "m","made","make","making","man","many","may","me","member","members","men","might","more","most","mostly","mr","mrs","much","must","mustn't","my","myself",
  "n","necessary","need","needed","needing","needs","never","new","newer","newest","next","no","nobody","non","noone","nor","not","nothing","now","nowhere","number","numbers",
  "o","of","off","often","old","older","oldest","on","once","one","only","open","opened","opening","opens","or","order","ordered","ordering","orders","other","others","ought","our","ours","ourselves","out","over","own",
  "p","part","parted","parting","parts","per","perhaps","place","places","point","pointed","pointing","points","possible","present","presented","presenting","presents","problem","problems","put","puts",
  "q","quite","r","rather","really","right","room","rooms",
  "s","said","same","saw","say","says","second","seconds","see","seem","seemed","seeming","seems","sees","several","shall","shan't","she","she'd","she'll","she's","should","shouldn't","show","showed","showing","shows","side","sides","since","small","smaller","smallest","so","some","somebody","someone","something","somewhere","state","states","still","such","sure",
  "t","take","taken","than","that","that's","the","their","theirs","them","themselves","then","there","therefore","there's","these","they","they'd","they'll","they're","they've","thing","things","think","thinks","this","those","though","thought","thoughts","three","through","thus","to","today","together","too","took","toward","turn","turned","turning","turns","two",
  "u","under","until","up","upon","us","use","used","uses","v","very",
  "w","want","wanted","wanting","wants","was","wasn't","way","ways","we","we'd","well","we'll","wells","went","were","we're","weren't","we've","what","what's","when","when's","where","where's","whether","which","while","who","whole","whom","who's","whose","why","why's","will","with","within","without","won't","work","worked","working","works","would","wouldn't",
  "x","y","year","years","yes","yet","you","you'd","you'll","young","younger","youngest","your","you're","yours","yourself","yourselves","you've","z")
dtm.control = list(
  tolower = T,
  removePunctuation = T,
  removeNumbers = T,
  stopwords = c(stopwords("english"), extendedstopwords),
  stemming = T,
  wordLengths = c(3, Inf),
  weighting = weightTf
)
dtm = DocumentTermMatrix(enron, control=dtm.control)
# Drop very sparse terms: 0.999 keeps only terms appearing in at least ~0.1% of documents
dtm = removeSparseTerms(dtm,0.999)
# LDA can't handle documents with no remaining terms, so drop the empty rows
dtm = dtm[rowSums(as.matrix(dtm))>0,]
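# Note: as.matrix() above densifies the whole document-term matrix, which can
# get memory-hungry with ~60k documents. An equivalent filter that stays sparse
# uses the slam package (already a dependency of tm):
# dtm = dtm[slam::row_sums(dtm) > 0,]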
k = 4  # number of topics to fit
# Beware: this step takes a lot of patience! My computer was chugging along
# for probably 10 or so minutes before the LDA finished.
lda.model = LDA(dtm, k)
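# If you want reproducible topics across runs, the seed can be fixed through
# the control list, e.g. (831 here is an arbitrary choice):
# lda.model = LDA(dtm, k, control=list(seed=831))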
# This lets you examine the words that make up each topic that was calculated.
# Bear in mind that I've chosen to stem all words possible in this corpus,
# so some of the words in the output will look a little weird.
terms(lda.model,20)
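# The complementary view: topics() returns the single most likely topic for
# each document. Here's a peek at the first 10:
topics(lda.model)[1:10]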
# Here I construct a dataframe that scores each document according to how closely
# its content matches up with each topic. The closer the score is to 1, the more
# likely its content matches up with that topic.
emails.topics = posterior(lda.model, dtm)$topics
df.emails.topics = as.data.frame(emails.topics)
df.emails.topics = cbind(email=as.character(rownames(df.emails.topics)),
                         df.emails.topics, stringsAsFactors=F)
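# A quick way to put these scores to use: rank the emails by their affinity for
# the first topic and peek at the strongest matches. Column 1 is the email name;
# columns 2 through 5 hold the scores for the 4 topics.
head(df.emails.topics[order(df.emails.topics[,2], decreasing=TRUE),])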