## Load Necessary Packages
library(maps)
require(RCurl)
library(RgoogleMaps)
library(stringr)
library(plotly)
library(tm)
library(rJava)
library(Snowball)
## Get Google Spreadsheet data into R
Linkedin <- getURL('https://docs.google.com/spreadsheet/pub?key=0Avgy7vmlxX7VdGNiZUx3U3d0ODMxRFZ4NEFrSHRsSnc&single=true&gid=0&output=csv')
Linkedin <- read.csv(textConnection(Linkedin))
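## Quick sanity check: the rest of the script assumes the sheet has (at least)
## Location, Industry and Description columns
str(Linkedin)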
## Okay, figure out which locations have 10 or more jobs in the corpus
TopSpots <- as.data.frame(table(Linkedin$Location))
TopSpots <- TopSpots[which(TopSpots$Freq > 9), ]
## Sort it descending
TopSpots <- TopSpots[order(-TopSpots$Freq), ]
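## Have a quick look at the top of the list
head(TopSpots)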
## Now we need to get the words "Area" and "Greater" out of these strings
NewNames <- str_replace_all(string=TopSpots$Var1,
                            pattern="Area",
                            repl="")
Temp <- as.data.frame(NewNames)
FinalNames <- str_replace_all(string=Temp$NewNames,
                              pattern="Greater",
                              repl="")
FinalNames <- as.data.frame(FinalNames)
TopSpots$City <- FinalNames$FinalNames
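## (Equivalently, both filler words could be stripped in one pass with a single
## regular expression; a sketch of an alternative to the two-step version above)
## TopSpots$City <- str_trim(str_replace_all(as.character(TopSpots$Var1), "Greater|Area", ""))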
## Now that we have cities, let's build a data frame that gives us longitude and latitude
TopSpots1 <- cbind.data.frame(TopSpots,
                              lat=NA,
                              lon=NA)
TopSpots1 <- with(TopSpots1,
                  data.frame(City,
                             t(sapply(TopSpots1$City,
                                      getGeoCode))))
TopSpots <- merge(TopSpots,
                  TopSpots1,
                  by="City")
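## getGeoCode() from RgoogleMaps geocodes a place name into a named lat/lon vector,
## which is what fills the columns above; a quick one-off check (needs network access):
## getGeoCode("San Francisco Bay")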
## Great, let's use plotly to make a cool map visualization
p <- plotly(username="yourname",
            key="yourkey")
trace1 <- list(x=map(regions="usa")$x,
               y=map(regions="usa")$y)
# Create the plottable city data
trace2 <- list(x=TopSpots$lon,
               y=TopSpots$lat,
               text=TopSpots$City,
               type="scatter",
               mode="markers",
               marker=list("size"=TopSpots$Freq,
                           "opacity"=0.5))
p$plotly(trace1,
         trace2)
## Alright, let's take a look at industries that show up three or more times:
Top_Industry <- as.data.frame(table(Linkedin$Industry))
Top_Industry <- Top_Industry[which(Top_Industry$Freq > 2), ]
## Sort it descending
Top_Industry <- Top_Industry[order(-Top_Industry$Freq), ]
colnames(Top_Industry) <- c("Industry", "Frequency")
## Put this in a table and embed
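## A minimal sketch of one way to get an embeddable table, assuming the knitr
## package is available (kable() prints a markdown/HTML-friendly table):
## require(knitr)
## kable(Top_Industry, row.names = FALSE)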
## Manage Description Data
Descriptions <- as.data.frame(Linkedin$Description)
## -------------------------------------------------------------------
## Natural Language Processing code adapted from https://github.com/benmarwick/AAA2011-Tweets/blob/master/AAA2011.R
## Check it out if you want a better understanding of how this works
## First things first, let's get a corpus and do a ton of work for NLP
LinkedIn_Corpus <- Corpus(VectorSource(Descriptions))
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          tolower) # convert all text to lower case
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removePunctuation) # gets rid of punctuation
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removeNumbers)
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removeWords,
                          stopwords("english"))
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          stemDocument,
                          language = "english") # stem terms down to their word roots
LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus,
                                          control = list(minWordLength = 3)) # create a term-document matrix, keeping only tokens longer than three characters, since shorter tokens are very hard to interpret
findFreqTerms(LinkedIn_Corpus.tdm,
              lowfreq=30) # have a look at common words, in this case those that appear at least 30 times; high-frequency but uninformative words can be added to the stopword list and the matrix re-made
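## If the frequent-term check above surfaces common but uninformative words, they can
## be added as extra stopwords and the matrix rebuilt; the word list below is only a
## placeholder, substitute whatever findFreqTerms() actually returns:
## extra_stopwords <- c("experience", "work", "team")
## LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, removeWords, extra_stopwords)
## LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus, control = list(minWordLength = 3))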
## Cool, now let's get to pulling topics out of this data
LinkedIn_Corpus.tdm.sp <- removeSparseTerms(LinkedIn_Corpus.tdm,
                                            sparse=0.989) ## Get sparse terms out
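## How many terms and documents survive the sparsity filter
dim(LinkedIn_Corpus.tdm.sp)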
LinkedIn_Corpus.tdm.sp.df <- as.data.frame(inspect(LinkedIn_Corpus.tdm.sp))
## Gotta transpose this to prepare it for analysis
LinkedIn_Corpus.tdm.sp.df.sc.t <- t(scale(LinkedIn_Corpus.tdm.sp.df))
require(slam)
LinkedIn_Corpus.tdm.sp.t <- t(LinkedIn_Corpus.tdm.sp)
summary(col_sums(LinkedIn_Corpus.tdm.sp.t))
term_tfidf <- tapply(LinkedIn_Corpus.tdm.sp.t$v / row_sums(LinkedIn_Corpus.tdm.sp.t)[LinkedIn_Corpus.tdm.sp.t$i],
                     LinkedIn_Corpus.tdm.sp.t$j,
                     mean) * log2(nDocs(LinkedIn_Corpus.tdm.sp.t) / col_sums(LinkedIn_Corpus.tdm.sp.t > 0)) # calculate tf-idf values
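## Looking at the distribution of tf-idf scores helps justify the 1.0 cutoff used below
summary(term_tfidf)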
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t[, term_tfidf >= 1.0] # keep only those terms that are slightly less frequent than the median
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t.tdif[row_sums(LinkedIn_Corpus.tdm.sp.t.tdif) > 0, ] # drop documents left with no terms after the filter
summary(col_sums(LinkedIn_Corpus.tdm.sp.t.tdif)) # have a look
require(topicmodels)
best.model <- lapply(seq(2, 50, by = 1),
                     function(d){LDA(LinkedIn_Corpus.tdm.sp.t.tdif, d)}) # this will fit a topic model for every number of topics between 2 and 50... it will take some time!
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
require(ggplot2)
# Plot the log-likelihood for each number of topics
best.model.logLik.df <- data.frame(topics=c(2:50), LL=as.numeric(as.matrix(best.model.logLik)))
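## A minimal ggplot2 sketch of log-likelihood against the number of topics,
## which makes the best-scoring model easy to spot
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) +
  geom_line() +
  xlab("Number of topics") +
  ylab("Log likelihood of the model")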
best.model.logLik.df.sort <- best.model.logLik.df[order(-best.model.logLik.df$LL), ] # sort to find out which number of topics has the highest loglik, in this case 23 topics
best.model.logLik.df.sort # have a look to see what's at the top of the list, the one with the highest score
ntop <- best.model.logLik.df.sort[1, ]$topics
lda <- LDA(LinkedIn_Corpus.tdm.sp.t.tdif, ntop) # generate an LDA model with the optimum number of topics
get_terms(lda, 5) # get keywords for each topic, just for a quick look
get_topics(lda, 5) # get the five most likely topics per document
lda_topics <- get_topics(lda, 5)
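## Quick distribution check: how many documents land in each topic
## (get_topics() with k = 1 returns the single most likely topic per document)
table(get_topics(lda, 1))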
beta <- lda@beta # logged parameters of the word distribution for each topic
gamma <- lda@gamma # posterior topic distribution for each document
terms <- lda@terms # the terms (words) that line up with beta and gamma
colnames(beta) <- terms # puts the terms (or words) as the column names for the topic weights
id <- t(apply(beta, 1, order)) # order the beta values
beta_ranked <- lapply(1:nrow(id), function(i) beta[i, id[i, ]]) # per-topic term weights sorted from lowest to highest
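## The strongest words for a topic sit at the tail of its ranked vector; a quick
## look at, say, the top 10 terms of topic 1:
tail(beta_ranked[[1]], 10)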