## Load Necessary Packages
library(maps)
require(RCurl)
library(RgoogleMaps)
library(stringr)
library(plotly)
library(tm)
library(rJava)
library(Snowball)
## Get Google Spreadsheet data into R
Linkedin <- getURL('https://docs.google.com/spreadsheet/pub?key=0Avgy7vmlxX7VdGNiZUx3U3d0ODMxRFZ4NEFrSHRsSnc&single=true&gid=0&output=csv')
Linkedin <- read.csv(textConnection(Linkedin))
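## Quick sanity check: the rest of the script assumes the sheet has (at least)
## Location, Industry and Description columns
str(Linkedin)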
## Okay, figure out which locations have 10 or more jobs in the corpus
TopSpots <- as.data.frame(table(Linkedin$Location))
TopSpots <- TopSpots[which(TopSpots$Freq > 9), ]
## Sort it descending
TopSpots <- TopSpots[order(-TopSpots$Freq), ]
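## Have a quick look at the top of the list
head(TopSpots)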
## Now we need to get the words "Area" and "Greater" out of these strings
NewNames <- str_replace_all(string=TopSpots$Var1,
                            pattern="Area",
                            repl="")
Temp <- as.data.frame(NewNames)
FinalNames <- str_replace_all(string=Temp$NewNames,
                              pattern="Greater",
                              repl="")
FinalNames <- as.data.frame(FinalNames)
TopSpots$City <- FinalNames$FinalNames
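## (Equivalently, both filler words could be stripped in one pass with a single
## regular expression; a sketch of an alternative to the two-step version above)
## TopSpots$City <- str_trim(str_replace_all(as.character(TopSpots$Var1), "Greater|Area", ""))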
## Now that we have cities, let's build a data frame that gives us longitude and latitude
TopSpots1 <- cbind.data.frame(TopSpots,
                              lat=NA,
                              lon=NA)
TopSpots1 <- with(TopSpots1,
                  data.frame(City,
                             t(sapply(TopSpots1$City,
                                      getGeoCode))))
TopSpots <- merge(TopSpots,
                  TopSpots1,
                  by="City")
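## getGeoCode() from RgoogleMaps geocodes a place name into a named lat/lon vector,
## which is what fills the columns above; a quick one-off check (needs network access):
## getGeoCode("San Francisco Bay")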
## Great, let's use plotly to make a cool map visualization
p <- plotly(username="yourname",
            key="yourkey")
trace1 <- list(x=map(regions="usa")$x,
               y=map(regions="usa")$y)
# Create the plottable city data
trace2 <- list(x=TopSpots$lon,
               y=TopSpots$lat,
               text=TopSpots$City,
               type="scatter",
               mode="markers",
               marker=list("size"=TopSpots$Freq,
                           "opacity"=0.5))
p$plotly(trace1,
         trace2)
## Alright, let's take a look at industries that show up three or more times:
Top_Industry <- as.data.frame(table(Linkedin$Industry))
Top_Industry <- Top_Industry[which(Top_Industry$Freq > 2), ]
## Sort it descending
Top_Industry <- Top_Industry[order(-Top_Industry$Freq), ]
colnames(Top_Industry) <- c("Industry", "Frequency")
## Put this in a table and embed
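## A minimal sketch of one way to get an embeddable table, assuming the knitr
## package is available (kable() prints a markdown/HTML-friendly table):
## require(knitr)
## kable(Top_Industry, row.names = FALSE)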
## Manage Description Data
Descriptions <- as.data.frame(Linkedin$Description)
## -------------------------------------------------------------------
## Natural Language Processing code adapted from https://github.com/benmarwick/AAA2011-Tweets/blob/master/AAA2011.R
## Check it out if you want a better understanding of how this works
## First things first, let's get a corpus and do a ton of work for NLP
LinkedIn_Corpus <- Corpus(VectorSource(Descriptions))
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          tolower) # convert all text to lower case
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removePunctuation) # gets rid of punctuation
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removeNumbers)
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          removeWords,
                          stopwords("english"))
LinkedIn_Corpus <- tm_map(LinkedIn_Corpus,
                          stemDocument,
                          language = "english") # stem terms down to their word roots
LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus,
                                          control = list(minWordLength = 3)) # create a term-document matrix, keeping only tokens longer than three characters, since shorter tokens are very hard to interpret
findFreqTerms(LinkedIn_Corpus.tdm,
              lowfreq=30) # have a look at common words, in this case those that appear at least 30 times; high-frequency but uninformative words can be added to the stopword list and the matrix re-made
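## If the frequent-term check above surfaces common but uninformative words, they can
## be added as extra stopwords and the matrix rebuilt; the word list below is only a
## placeholder, substitute whatever findFreqTerms() actually returns:
## extra_stopwords <- c("experience", "work", "team")
## LinkedIn_Corpus <- tm_map(LinkedIn_Corpus, removeWords, extra_stopwords)
## LinkedIn_Corpus.tdm <- TermDocumentMatrix(LinkedIn_Corpus, control = list(minWordLength = 3))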
## Cool, now let's get to pulling topics out of this data
LinkedIn_Corpus.tdm.sp <- removeSparseTerms(LinkedIn_Corpus.tdm,
                                            sparse=0.989) ## Get sparse terms out
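## How many terms and documents survive the sparsity filter
dim(LinkedIn_Corpus.tdm.sp)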
LinkedIn_Corpus.tdm.sp.df <- as.data.frame(inspect(LinkedIn_Corpus.tdm.sp))
## Gotta transpose this to prepare it for analysis
LinkedIn_Corpus.tdm.sp.df.sc.t <- t(scale(LinkedIn_Corpus.tdm.sp.df))
require(slam)
LinkedIn_Corpus.tdm.sp.t <- t(LinkedIn_Corpus.tdm.sp)
summary(col_sums(LinkedIn_Corpus.tdm.sp.t))
term_tfidf <- tapply(LinkedIn_Corpus.tdm.sp.t$v / row_sums(LinkedIn_Corpus.tdm.sp.t)[LinkedIn_Corpus.tdm.sp.t$i],
                     LinkedIn_Corpus.tdm.sp.t$j,
                     mean) * log2(nDocs(LinkedIn_Corpus.tdm.sp.t) / col_sums(LinkedIn_Corpus.tdm.sp.t > 0)) # calculate tf-idf values
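## Looking at the distribution of tf-idf scores helps justify the 1.0 cutoff used below
summary(term_tfidf)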
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t[, term_tfidf >= 1.0] # keep only those terms that are slightly less frequent than the median
LinkedIn_Corpus.tdm.sp.t.tdif <- LinkedIn_Corpus.tdm.sp.t.tdif[row_sums(LinkedIn_Corpus.tdm.sp.t.tdif) > 0, ] # drop documents left with no terms after the filter
summary(col_sums(LinkedIn_Corpus.tdm.sp.t.tdif)) # have a look
require(topicmodels)
best.model <- lapply(seq(2, 50, by = 1),
                     function(d){LDA(LinkedIn_Corpus.tdm.sp.t.tdif, d)}) # this will fit a topic model for every number of topics between 2 and 50... it will take some time!
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
require(ggplot2)
# Plot the log-likelihood for each number of topics
best.model.logLik.df <- data.frame(topics=c(2:50), LL=as.numeric(as.matrix(best.model.logLik)))
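## A minimal ggplot2 sketch of log-likelihood against the number of topics,
## which makes the best-scoring model easy to spot
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) +
  geom_line() +
  xlab("Number of topics") +
  ylab("Log likelihood of the model")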
best.model.logLik.df.sort <- best.model.logLik.df[order(-best.model.logLik.df$LL), ] # sort to find out which number of topics has the highest loglik, in this case 23 topics
best.model.logLik.df.sort # have a look to see what's at the top of the list, the one with the highest score
ntop <- best.model.logLik.df.sort[1, ]$topics
lda <- LDA(LinkedIn_Corpus.tdm.sp.t.tdif, ntop) # generate an LDA model with the optimum number of topics
get_terms(lda, 5) # get keywords for each topic, just for a quick look
get_topics(lda, 5) # get the five most likely topics per document
lda_topics <- get_topics(lda, 5)
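## Quick distribution check: how many documents land in each topic
## (get_topics() with k = 1 returns the single most likely topic per document)
table(get_topics(lda, 1))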
beta <- lda@beta # logged parameters of the word distribution for each topic
gamma <- lda@gamma # posterior topic distribution for each document
terms <- lda@terms # the terms (words) that line up with beta and gamma
colnames(beta) <- terms # puts the terms (or words) as the column names for the topic weights
id <- t(apply(beta, 1, order)) # order the beta values
beta_ranked <- lapply(1:nrow(id), function(i) beta[i, id[i, ]]) # per-topic term weights sorted from lowest to highest
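## The strongest words for a topic sit at the tail of its ranked vector; a quick
## look at, say, the top 10 terms of topic 1:
tail(beta_ranked[[1]], 10)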