## Gist by @josep2, created April 7, 2014
## Load necessary packages
library(maps)        # base map outlines for the USA
library(RCurl)       # fetch the Google Spreadsheet over HTTPS
library(RgoogleMaps) # getGeoCode() for turning city names into lat/lon
library(stringr)     # string cleanup
library(plotly)      # map visualization
library(tm)          # text mining / corpus tools
library(rJava)       # Java backend used by Snowball
library(Snowball)    # word stemming
## Get Google Spreadsheet data into R
Linkedin<-getURL('https://docs.google.com/spreadsheet/pub?key=0Avgy7vmlxX7VdGNiZUx3U3d0ODMxRFZ4NEFrSHRsSnc&single=true&gid=0&output=csv')
Linkedin<-read.csv(textConnection(Linkedin))
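## Optional sanity check on the download; assumes the spreadsheet has the
## Location, Industry, and Description columns used below
str(Linkedin)
head(Linkedin$Location)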
## Figure out which locations have 10 or more jobs in the corpus
TopSpots<-as.data.frame(table(Linkedin$Location))
TopSpots<-TopSpots[which(TopSpots$Freq > 9), ]
##Sort it descending
TopSpots<-TopSpots[order(-TopSpots$Freq), ]
## Now we need to strip the words "Area" and "Greater" out of these location strings
NewNames <- str_replace_all(string = as.character(TopSpots$Var1), pattern = "Area", repl = "")
FinalNames <- str_replace_all(string = NewNames, pattern = "Greater", repl = "")
TopSpots$City <- FinalNames
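## Stripping "Area" and "Greater" leaves stray whitespace around the city names;
## an optional str_trim() pass (stringr is already loaded) tidies them up
TopSpots$City <- str_trim(TopSpots$City)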
## Now that we have cities, let's use getGeoCode() to look up longitude and latitude for each one
TopSpots1 <- cbind.data.frame(TopSpots, lat = NA, lon = NA)
TopSpots1 <- with(TopSpots1,
                  data.frame(City, t(sapply(TopSpots1$City, getGeoCode)))) # hits the Google geocoding API once per city
TopSpots <- merge(TopSpots, TopSpots1, by = "City")
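## Optional check: getGeoCode() can come back with NAs when a lookup fails,
## so it's worth a quick look before plotting
sum(is.na(TopSpots$lat) | is.na(TopSpots$lon)) # how many cities failed to geocode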
## Great, let's use plotly to make a map visualization
p <- plotly(username = "yourname", key = "yourkey")
## US outline as the base layer
trace1 <- list(x = map(regions = "usa")$x,
               y = map(regions = "usa")$y)
# Create the plottable city data
trace2 <- list(x = TopSpots$lon,
               y = TopSpots$lat,
               text = TopSpots$City,
               type = "scatter",
               mode = "markers",
               marker = list("size" = TopSpots$Freq,
                             "opacity" = 0.5))
p$plotly(trace1, trace2)
## Alright, let's take a look at the industries that show up more than twice:
Top_Industry<-as.data.frame(table(Linkedin$Industry))
Top_Industry<-Top_Industry[which(Top_Industry$Freq > 2), ]
##Sort it descending
Top_Industry<-Top_Industry[order(-Top_Industry$Freq),]
colnames(Top_Industry)<-c("Industry","Frequency")
## Put this in a table and embed it
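## One possible way to get the table out of R for embedding, e.g. as a CSV
## (the filename here is just a placeholder)
write.csv(Top_Industry, "top_industries.csv", row.names = FALSE)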
## Manage description data
Descriptions <- as.character(Linkedin$Description) # keep this as a character vector so each posting becomes its own document in the corpus
## ------------------------------------------------------------------
##Natural Language Processing Code Used from https://github.com/benmarwick/AAA2011-Tweets/blob/master/AAA2011.R
## Check it out if you want a better understanding of how this works
## First things first, let's build a corpus and do the NLP preprocessing
LinkedIn_Corpus<-Corpus(VectorSource(Descriptions))
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, tolower)           # convert all text to lower case
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, removePunctuation) # get rid of punctuation
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, removeNumbers)     # drop numbers
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removeWords,
                          stopwords("english"))               # remove common English stopwords
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          stemDocument,
                          language = "english")               # stem words to their root form
LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus,
                                          control = list(minWordLength = 3)) # create a term-document matrix, keeping only tokens longer than three characters, since shorter tokens are very hard to interpret
findFreqTerms(LinkedIn_Corpus.tdm, lowfreq = 30) # have a look at the common words (here, those appearing at least 30 times); any uninformative high-frequency words can be added to the stopword list and the TDM re-made
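## A sketch of the "add to stopword list and re-make" step mentioned above;
## the words below are placeholders, swap in whatever filler terms
## findFreqTerms() actually surfaces for this corpus
custom_stops <- c("experience", "work", "team") # placeholder high-frequency filler words
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, removeWords, custom_stops)
LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus,
                                          control = list(minWordLength = 3)) # re-make the TDM without the extra stopwords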
## Cool, now let's get topics out of this data
LinkedIn_Corpus.tdm.sp <- removeSparseTerms(LinkedIn_Corpus.tdm, sparse = 0.989) ## Get sparse terms out
LinkedIn_Corpus.tdm.sp.df <- as.data.frame(inspect(LinkedIn_Corpus.tdm.sp))
## Gotta transpose this to prepare it for analysis
LinkedIn_Corpus.tdm.sp.df.sc.t <- t(scale(LinkedIn_Corpus.tdm.sp.df))
require(slam)
LinkedIn_Corpus.tdm.sp.t <- t(LinkedIn_Corpus.tdm.sp)
summary(col_sums(LinkedIn_Corpus.tdm.sp.t))
term_tfidf <- tapply(LinkedIn_Corpus.tdm.sp.t$v / row_sums(LinkedIn_Corpus.tdm.sp.t)[LinkedIn_Corpus.tdm.sp.t$i],
                     LinkedIn_Corpus.tdm.sp.t$j,
                     mean) *
  log2(nDocs(LinkedIn_Corpus.tdm.sp.t) / col_sums(LinkedIn_Corpus.tdm.sp.t > 0)) # calculate tf-idf values for each term
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t[, term_tfidf >= 1.0] # keep only the more informative terms, those with mean tf-idf of at least 1.0
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t.tdif[row_sums(LinkedIn_Corpus.tdm.sp.t.tdif) > 0, ] # and drop any documents left with no terms
summary(col_sums(LinkedIn_Corpus.tdm.sp.t.tdif)) # have a look
require(topicmodels)
best.model <- lapply(seq(2, 50, by = 1),
                     function(d) { LDA(LinkedIn_Corpus.tdm.sp.t.tdif, d) }) # this will fit a topic model for every number of topics between 2 and 50... it will take some time!
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
require(ggplot2)
#plot it
best.model.logLik.df <- data.frame(topics=c(2:50), LL = as.numeric(as.matrix(best.model.logLik)))
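## ggplot2 is loaded above for plotting; a minimal sketch of log-likelihood
## against the number of topics
ggplot(best.model.logLik.df, aes(x = topics, y = LL)) +
  geom_line() +
  xlab("Number of topics") +
  ylab("Log likelihood")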
best.model.logLik.df.sort <- best.model.logLik.df[order(-best.model.logLik.df$LL), ] # sort to find out which number of topics has the highest loglik, in this case 23 topics.
best.model.logLik.df.sort # have a look to see what's at the top of the list, the one with the highest score
ntop <- best.model.logLik.df.sort[1,]$topics
lda <- LDA(LinkedIn_Corpus.tdm.sp.t.tdif, ntop) # generate an LDA model with the optimum number of topics
get_terms(lda, 5) # get keywords for each topic, just for a quick look
get_topics(lda, 5) # get the 5 most likely topic numbers for each document
lda_topics <- get_topics(lda, 5)
beta <- lda@beta # create object containing parameters of the word distribution for each topic
gamma <- lda@gamma # create object containing posterior topic distribution for each document
terms <- lda@terms # create object containing terms (words) that can be used to line up with beta and gamma
colnames(beta) <- terms # puts the terms (or words) as the column names for the topic weights.
id <- t(apply(beta, 1, order)) # order the beta values
beta_ranked <- lapply(1:nrow(id), function(i) beta[i, id[i, ]]) # for each topic, its term weights sorted in ascending order, with terms as names
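## Since each element of beta_ranked holds a topic's term weights in ascending
## order with terms as names, the last entries are that topic's strongest terms;
## a quick way to eyeball them
lapply(beta_ranked, function(x) names(tail(x, 10))) # top 10 terms per topic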