primaryobjects/1-word2vec.R

## 1-word2vec.R
###
### Example of using word2vec.
### Kory Becker, August 16, 2017
###

library(devtools)
library(httr)
library(tm)

# Route httr requests through a proxy.
# NOTE(review): the proxy host and port are hard-coded for one network;
# confirm they apply before running this script elsewhere.
set_config(
  use_proxy(url = "proxy.bloomberg.com", port = 80)
)

# SECURITY NOTE(review): ssl_verifypeer = 0L turns off TLS certificate
# verification for httr requests. It is left in place because the proxy
# setup above may depend on it; trusting the proxy's CA certificate would be
# the safer alternative.
set_config(config(ssl_verifypeer = 0L))

# Setup RTools path (optional).
#Sys.setenv(PATH = paste("C:/Rtools/bin", Sys.getenv("PATH"), sep=";"))
#Sys.setenv(BINPREF = "C:/Rtools/mingw_$(WIN)/bin/")

# See tutorial: https://github.com/bmschmidt/wordVectors/blob/master/vignettes/introduction.Rmd
# Install wordVectors from GitHub only when it is not already installed, so
# that re-running the script does not hit the network and reinstall each time.
if (!requireNamespace("wordVectors", quietly = TRUE)) {
  install_github("bmschmidt/wordVectors")
}
library(wordVectors)

#
# Train a word2vec model from a text file, or load a previously trained one.
#
# fileName: path to the training text. The model file name is derived from
#   it: a trailing ".txt" is replaced by ".bin", otherwise ".bin" is appended
#   (test.txt -> test.bin, text8 -> text8.bin).
#
# Returns the value of read.vectors() when the .bin file already exists,
# otherwise the value of train_word2vec() after training.
#
word2vec <- function(fileName) {
  # Only a trailing ".txt" is an extension; a ".txt" elsewhere in the name
  # (e.g. "notes.txt.d") must be left untouched.
  if (endsWith(fileName, ".txt")) {
    binaryFileName <- sub("\\.txt$", ".bin", fileName)
  } else {
    binaryFileName <- paste0(fileName, ".bin")
  }

  # Reuse an existing model instead of retraining.
  if (file.exists(binaryFileName)) {
    return(read.vectors(binaryFileName))
  }

  # Lowercase and setup ngrams. The intermediate file is removed when the
  # function exits, including when preparation or training fails.
  prepFileName <- "temp.prep"
  on.exit(unlink(prepFileName), add = TRUE)
  prep_word2vec(
    origin = fileName,
    destination = prepFileName,
    lowercase = TRUE,
    bundle_ngrams = 2
  )

  # Train word2vec model.
  train_word2vec(
    prepFileName,
    binaryFileName,
    vectors = 200,
    threads = 4,
    window = 12,
    iter = 5,
    negative_samples = 0
  )
}

# Download and unzip the document corpus. Source: http://mattmahoney.net/dc/
# The corpus file 'text8' is what the training step reads, so its absence is
# what triggers work. A local 'text8.zip' is extracted if present; otherwise
# the archive is downloaded to a temporary file, extracted, and discarded.
if (!file.exists("text8")) {
  if (file.exists("text8.zip")) {
    unzip("text8.zip")
  } else {
    temp <- tempfile()
    # mode = "wb" keeps the archive byte-exact on platforms that distinguish
    # text and binary transfers.
    download.file("http://mattmahoney.net/dc/text8.zip", temp, mode = "wb")
    unzip(temp)
    unlink(temp)
  }
}

###
### Example 1: Simple text file.
###

# Read text file.
doc <- readChar("article.txt", file.info("article.txt")$size)

# Remove stop-words.
# Longer entries are placed first in the alternation so that contractions
# such as "i'm" are removed whole instead of losing only their "i" prefix.
# Matching ignores case because the text is lowercased only later, during
# word2vec preparation; otherwise "The" or "He" would survive.
stop_terms <- stopwords("en")
stop_terms <- stop_terms[order(-nchar(stop_terms))]
stopwords_regex <- paste(stop_terms, collapse = "\\b|\\b")
stopwords_regex <- paste0("\\b", stopwords_regex, "\\b")
doc <- stringr::str_replace_all(
  doc,
  stringr::regex(stopwords_regex, ignore_case = TRUE),
  ""
)

# Write text file with stop-words removed. Overwrite rather than append so a
# leftover article2.txt from an interrupted run is not duplicated into the
# training text.
cat(doc, file = "article2.txt", sep = "\n", append = FALSE)

# Train word2vec model and explore.
model <- word2vec("article2.txt")
closest_to(model, "president")

# Cleanup.
unlink("article2.txt")

###
### Example 2: Large document.
###

# Train word2vec model and explore.
model <- word2vec("text8")
closest_to(model, "communism")

# Plot similar terms to 'computer' and 'internet'.
# average = FALSE keeps one vector per word instead of a single mean vector.
computers <- model[[c("computer", "internet"), average = FALSE]]

# Restrict to the first 3000 rows of the model (the most common words in the
# set, per the original note), or to all rows when the vocabulary is smaller.
top_rows <- seq_len(min(3000L, nrow(model)))
computer_and_internet <- cosineSimilarity(model[top_rows, ], computers)

# Filter to the top 20 terms for either word. rank(-x) is 1 for the most
# similar term, so "<= 20" keeps exactly twenty per column ("< 20" kept 19).
computer_and_internet <- computer_and_internet[
  rank(-computer_and_internet[, 1]) <= 20 |
    rank(-computer_and_internet[, 2]) <= 20,
  ]

# Empty plot to set up the axes, then draw each term at its coordinates.
plot(computer_and_internet, type = "n")
text(computer_and_internet, labels = rownames(computer_and_internet))

## article.txt
Now the CEO of America's largest employer is taking President Donald Trump to task for his response to the violence in Charlottesville this past weekend.
On Monday, Walmart (WMT) CEO Doug McMillon said in a note to employees that the president "missed a critical opportunity" in his initial reaction to the "tragic events."
"As we watched the events and the response from President Trump over the weekend, we too felt that he missed a critical opportunity to help bring our country together by unequivocally rejecting the appalling actions of white supremacists," McMillon said.
He added that Trump's remarks on Monday, in which he denounced white supremacists by name, "were a step in the right direction."
"We need that clarity and consistency in the future," McMillon said.
McMillon sits on Trump's economic advisory council, formally known as the Strategic and Policy Forum. Walmart said he will not resign.
"I will continue to strongly advocate on behalf of our associates and customers, and urge our elected officials to do their part to promote a more just, tolerant and diverse society," McMillon said in his memo.
Trump was asked about McMillon's statement when he took questions from reporters at Trump Tower on Tuesday afternoon, but the president didn't say much.
Related: CEOs under fire to dump Trump
"The head of Walmart -- who I know, who's a very nice guy -- was making a political statement," Trump said.
The number of top executives who have come out against the president continues to grow.
The CEOs of Under Armour (UA), Intel (INTC, Tech30) and Merck (MRK) all quit Trump's manufacturing council on Monday to protest his failure to immediately denounce white supremacists. The president of the Alliance for American Manufacturing announced his resignation on Tuesday.
Other members of the Strategic and Policy Forum have also weighed in on what happened in Charlottesville. But none have resigned.
Stephen Schwarzman, the CEO Blackstone who chairs the initiative, said, "Bigotry, hatred and extremism are an affront to core American values and have no place in this country."
And PepsiCo (PEP) CEO Indra Nooyi tweeted Sunday that she was "heartbroken by the violence in #Charlottesville."
"Hate and intolerance are a betrayal of what we stand for as Americans," she said.

## output.txt
              word similarity to "communism"
1        communism                 1.0000000
2        socialism                 0.8233832
3          marxism                 0.7711451
4        communist                 0.7571373
5       capitalism                 0.7531698
6          marxist                 0.7440766
7  totalitarianism                 0.7290054
8       capitalist                 0.7189616
9         leninism                 0.7129190
10       socialist                 0.7125336

## terms.png

      
    Raw
  

              terms.png
	###
	### Example of using word2vec.
	### Kory Becker, August 16, 2017
	###

	library(devtools)
	library(httr)
	library(tm)

	# Route httr requests through a proxy.
	# NOTE(review): the proxy host and port are hard-coded for one network;
	# confirm they apply before running this script elsewhere.
	set_config(
	  use_proxy(url = "proxy.bloomberg.com", port = 80)
	)

	# SECURITY NOTE(review): ssl_verifypeer = 0L turns off TLS certificate
	# verification for httr requests. It is left in place because the proxy
	# setup above may depend on it; trusting the proxy's CA certificate would
	# be the safer alternative.
	set_config(config(ssl_verifypeer = 0L))

	# Setup RTools path (optional).
	#Sys.setenv(PATH = paste("C:/Rtools/bin", Sys.getenv("PATH"), sep=";"))
	#Sys.setenv(BINPREF = "C:/Rtools/mingw_$(WIN)/bin/")

	# See tutorial: https://github.com/bmschmidt/wordVectors/blob/master/vignettes/introduction.Rmd
	# Install wordVectors from GitHub only when it is not already installed, so
	# that re-running the script does not hit the network and reinstall each time.
	if (!requireNamespace("wordVectors", quietly = TRUE)) {
	  install_github("bmschmidt/wordVectors")
	}
	library(wordVectors)

	#
	# Train a word2vec model from a text file, or load a previously trained one.
	#
	# fileName: path to the training text. The model file name is derived from
	#   it: a trailing ".txt" is replaced by ".bin", otherwise ".bin" is appended
	#   (test.txt -> test.bin, text8 -> text8.bin).
	#
	# Returns the value of read.vectors() when the .bin file already exists,
	# otherwise the value of train_word2vec() after training.
	#
	word2vec <- function(fileName) {
	  # Only a trailing ".txt" is an extension; a ".txt" elsewhere in the name
	  # (e.g. "notes.txt.d") must be left untouched.
	  if (endsWith(fileName, ".txt")) {
	    binaryFileName <- sub("\\.txt$", ".bin", fileName)
	  } else {
	    binaryFileName <- paste0(fileName, ".bin")
	  }

	  # Reuse an existing model instead of retraining.
	  if (file.exists(binaryFileName)) {
	    return(read.vectors(binaryFileName))
	  }

	  # Lowercase and setup ngrams. The intermediate file is removed when the
	  # function exits, including when preparation or training fails.
	  prepFileName <- "temp.prep"
	  on.exit(unlink(prepFileName), add = TRUE)
	  prep_word2vec(
	    origin = fileName,
	    destination = prepFileName,
	    lowercase = TRUE,
	    bundle_ngrams = 2
	  )

	  # Train word2vec model.
	  train_word2vec(
	    prepFileName,
	    binaryFileName,
	    vectors = 200,
	    threads = 4,
	    window = 12,
	    iter = 5,
	    negative_samples = 0
	  )
	}

	# Download and unzip the document corpus. Source: http://mattmahoney.net/dc/
	# The corpus file 'text8' is what the training step reads, so its absence is
	# what triggers work. A local 'text8.zip' is extracted if present; otherwise
	# the archive is downloaded to a temporary file, extracted, and discarded.
	if (!file.exists("text8")) {
	  if (file.exists("text8.zip")) {
	    unzip("text8.zip")
	  } else {
	    temp <- tempfile()
	    # mode = "wb" keeps the archive byte-exact on platforms that distinguish
	    # text and binary transfers.
	    download.file("http://mattmahoney.net/dc/text8.zip", temp, mode = "wb")
	    unzip(temp)
	    unlink(temp)
	  }
	}

	###
	### Example 1: Simple text file.
	###

	# Read text file.
	doc <- readChar("article.txt", file.info("article.txt")$size)

	# Remove stop-words.
	# The alternation separator is a plain "|"; the "\|" form is not a valid
	# escape in an R string. Longer entries are placed first so contractions
	# such as "i'm" are removed whole instead of losing only their "i" prefix.
	# Matching ignores case because the text is lowercased only later, during
	# word2vec preparation; otherwise "The" or "He" would survive.
	stop_terms <- stopwords("en")
	stop_terms <- stop_terms[order(-nchar(stop_terms))]
	stopwords_regex <- paste(stop_terms, collapse = "\\b|\\b")
	stopwords_regex <- paste0("\\b", stopwords_regex, "\\b")
	doc <- stringr::str_replace_all(
	  doc,
	  stringr::regex(stopwords_regex, ignore_case = TRUE),
	  ""
	)

	# Write text file with stop-words removed. Overwrite rather than append so a
	# leftover article2.txt from an interrupted run is not duplicated into the
	# training text.
	cat(doc, file = "article2.txt", sep = "\n", append = FALSE)

	# Train word2vec model and explore.
	model <- word2vec("article2.txt")
	closest_to(model, "president")

	# Cleanup.
	unlink("article2.txt")

	###
	### Example 2: Large document.
	###

	# Train word2vec model and explore.
	model <- word2vec("text8")
	closest_to(model, "communism")

	# Plot similar terms to 'computer' and 'internet'.
	# average = FALSE keeps one vector per word instead of a single mean vector.
	computers <- model[[c("computer", "internet"), average = FALSE]]

	# Restrict to the first 3000 rows of the model (the most common words in the
	# set, per the original note), or to all rows when the vocabulary is smaller.
	top_rows <- seq_len(min(3000L, nrow(model)))
	computer_and_internet <- cosineSimilarity(model[top_rows, ], computers)

	# Filter to the top 20 terms for either word. The element-wise OR is a plain
	# "|" (the "\|" form is not valid R). rank(-x) is 1 for the most similar
	# term, so "<= 20" keeps exactly twenty per column ("< 20" kept 19).
	computer_and_internet <- computer_and_internet[
	  rank(-computer_and_internet[, 1]) <= 20 |
	    rank(-computer_and_internet[, 2]) <= 20,
	  ]

	# Empty plot to set up the axes, then draw each term at its coordinates.
	plot(computer_and_internet, type = "n")
	text(computer_and_internet, labels = rownames(computer_and_internet))
	Now the CEO of America's largest employer is taking President Donald Trump to task for his response to the violence in Charlottesville this past weekend.
	On Monday, Walmart (WMT) CEO Doug McMillon said in a note to employees that the president "missed a critical opportunity" in his initial reaction to the "tragic events."
	"As we watched the events and the response from President Trump over the weekend, we too felt that he missed a critical opportunity to help bring our country together by unequivocally rejecting the appalling actions of white supremacists," McMillon said.
	He added that Trump's remarks on Monday, in which he denounced white supremacists by name, "were a step in the right direction."
	"We need that clarity and consistency in the future," McMillon said.
	McMillon sits on Trump's economic advisory council, formally known as the Strategic and Policy Forum. Walmart said he will not resign.
	"I will continue to strongly advocate on behalf of our associates and customers, and urge our elected officials to do their part to promote a more just, tolerant and diverse society," McMillon said in his memo.
	Trump was asked about McMillon's statement when he took questions from reporters at Trump Tower on Tuesday afternoon, but the president didn't say much.
	Related: CEOs under fire to dump Trump
	"The head of Walmart -- who I know, who's a very nice guy -- was making a political statement," Trump said.
	The number of top executives who have come out against the president continues to grow.
	The CEOs of Under Armour (UA), Intel (INTC, Tech30) and Merck (MRK) all quit Trump's manufacturing council on Monday to protest his failure to immediately denounce white supremacists. The president of the Alliance for American Manufacturing announced his resignation on Tuesday.
	Other members of the Strategic and Policy Forum have also weighed in on what happened in Charlottesville. But none have resigned.
	Stephen Schwarzman, the CEO Blackstone who chairs the initiative, said, "Bigotry, hatred and extremism are an affront to core American values and have no place in this country."
	And PepsiCo (PEP) CEO Indra Nooyi tweeted Sunday that she was "heartbroken by the violence in #Charlottesville."
	"Hate and intolerance are a betrayal of what we stand for as Americans," she said.
	word similarity to "communism"
	1 communism 1.0000000
	2 socialism 0.8233832
	3 marxism 0.7711451
	4 communist 0.7571373
	5 capitalism 0.7531698
	6 marxist 0.7440766
	7 totalitarianism 0.7290054
	8 capitalist 0.7189616
	9 leninism 0.7129190
	10 socialist 0.7125336