# zimana/tf_idf_example.R

## tf_idf_example.R
# import tidytext for the TF-IDF function.
library(tidytext)
# import dplyr for the count function.
library(dplyr)
# import readtext for reading text.
library(readtext)
#
# Load the text source. readtext() is given the path of the file to read;
# replace the placeholder string below with the location of your own text file.
#
web_content <- readtext("TEXT FILE GOES HERE")
# Show what was loaded.
web_content
#
# The raw text that will be broken into a corpus sits in the column named text.
#
web_content[["text"]]
#
# Next, split the words in the $text column to create the corpus, the "bag of
# words". unnest_tokens() flattens the text into one token (word) per row and
# keeps the other columns, so every token stays paired with its doc_id.
# count() then tallies each word within its own document; sort = TRUE ranks
# the rows from the largest count to the smallest.
#
# doc_id is part of the count() call on purpose: counting by word alone drops
# the document id, and re-attaching web_content$doc_id afterwards only lines
# up when exactly one file was read.
#
content_words <- web_content |>
  unnest_tokens(word, text, token = "words") |>
  count(doc_id, word, sort = TRUE)
#
# Build the corpus as a plain data frame with one row per term per document,
# the layout bind_tf_idf() expects. Every column is taken from content_words,
# so the rows are aligned by construction. The column names are assigned here
# to keep the tf-idf call simple.
#
new_corpus <- data.frame(
  doc_id = content_words$doc_id,
  text = content_words$word,
  number = content_words$n
)

#
# With the words separated, TF-IDF can be applied via bind_tf_idf(). The input
# should be a tidy dataset with one row per term per document.
#
# bind_tf_idf() takes its columns in the order term, document, n, so the word
# column (text) comes first, then the document id, then the count.
#
content_sentiment <- new_corpus |>
  bind_tf_idf(text, doc_id, number)
#
# Printing the object shows the table with the word counts plus the added
# tf, idf, and tf_idf score columns.
# NOTE: idf is ln(documents / documents containing the term), so with a single
# document every idf (and therefore tf_idf) is 0; read in several documents to
# get informative scores.
content_sentiment
#
	# import tidytext for the TF-IDF function.
	library(tidytext)
	# import dplyr for the count function.
	library(dplyr)
	# import readtext for reading text.
	library(readtext)
	#
	# Load the text source. readtext() is given the path of the file to read;
	# replace the placeholder string below with the location of your own text file.
	#
	web_content <- readtext("TEXT FILE GOES HERE")
	# Show what was loaded.
	web_content
	#
	# The raw text that will be broken into a corpus sits in the column named text.
	#
	web_content[["text"]]
	#
	# Next, split the words in the $text column to create the corpus, the "bag of
	# words". unnest_tokens() flattens the text into one token (word) per row and
	# keeps the other columns, so every token stays paired with its doc_id.
	# count() then tallies each word within its own document; sort = TRUE ranks
	# the rows from the largest count to the smallest.
	#
	# doc_id is part of the count() call on purpose: counting by word alone drops
	# the document id, and re-attaching web_content$doc_id afterwards only lines
	# up when exactly one file was read.
	#
	content_words <- web_content |>
	  unnest_tokens(word, text, token = "words") |>
	  count(doc_id, word, sort = TRUE)
	#
	# Build the corpus as a plain data frame with one row per term per document,
	# the layout bind_tf_idf() expects. Every column is taken from content_words,
	# so the rows are aligned by construction. The column names are assigned here
	# to keep the tf-idf call simple.
	#
	new_corpus <- data.frame(
	  doc_id = content_words$doc_id,
	  text = content_words$word,
	  number = content_words$n
	)

	#
	# With the words separated, TF-IDF can be applied via bind_tf_idf(). The input
	# should be a tidy dataset with one row per term per document.
	#
	# bind_tf_idf() takes its columns in the order term, document, n, so the word
	# column (text) comes first, then the document id, then the count.
	#
	content_sentiment <- new_corpus |>
	  bind_tf_idf(text, doc_id, number)
	#
	# Printing the object shows the table with the word counts plus the added
	# tf, idf, and tf_idf score columns.
	# NOTE: idf is ln(documents / documents containing the term), so with a single
	# document every idf (and therefore tf_idf) is 0; read in several documents to
	# get informative scores.
	content_sentiment
	#