# yvlau92/tf-idf.R

## tf-idf.R
# tf-idf implementation in R

#### TF-IDF for Products ###
# Count how often each word occurs in the reviews of each skincare product.
# `data` must provide the columns `Product` and `ReviewContent`.
product_words <- data %>%
  select(Product, ReviewContent) %>%
  tidytext::unnest_tokens(word, ReviewContent) %>%
  count(Product, word, sort = TRUE) %>%
  ungroup()

# Total number of word occurrences per product (the length of each "document").
total_words <- product_words %>%
  group_by(Product) %>%
  summarize(total_words = sum(n))

# Attach each product's total to its per-word counts. The key is spelled out
# so the join does not depend on guessed common columns.
product_words <- left_join(product_words, total_words, by = "Product")

# Add the tf, idf and tf_idf columns, treating each product as one document.
product_words <- product_words %>%
  tidytext::bind_tf_idf(word, Product, n)

# Show the words ranked by tf_idf, highest first; the result is not stored.
product_words %>%
  select(-total_words) %>%
  arrange(desc(tf_idf))

### TF-IDF for "Query" user input ###
# Function to compute tf_idf
#' Compute tf-idf weights for the terms of a user query
#'
#' Every query term gets the same term frequency, `1 / length(tags_list)`,
#' and is paired with the inverse document frequency already stored for that
#' word in `words_data`.
#'
#' @param tags_list Query terms: a character vector, or a list of strings.
#' @param words_data Data frame with at least the columns `word` and `idf`,
#'   e.g. the output of `tidytext::bind_tf_idf()`.
#' @return A data frame with the columns `word`, `tf`, `idf` and `tf_idf`,
#'   one row per query term, sorted by `word`. Terms that do not occur in
#'   `words_data` have `NA` in `idf` and `tf_idf`.
query_tf_idf <- function(tags_list, words_data) {
  stopifnot(all(c("word", "idf") %in% names(words_data)))

  # Flatten so that a list of strings behaves like a character vector.
  tags <- as.character(unlist(tags_list, use.names = FALSE))
  n <- length(tags)
  dt <- data.frame(
    word = tags,
    tf = rep(1 / n, n),
    stringsAsFactors = FALSE
  )

  # idf is a property of the word alone, so the first matching row is enough;
  # unmatched terms get NA, like an unmatched row in a left join.
  hit <- match(dt$word, as.character(words_data$word))
  dt$idf <- words_data$idf[hit]
  dt$tf_idf <- dt$tf * dt$idf

  # Sort on the column of the result itself. Ordering by the helper data
  # frame instead would ask order() to rank a whole data frame.
  dt[order(dt$word), , drop = FALSE]
}
	# tf-idf implementation in R

	#### TF-IDF for Products ###
	# Count how often each word occurs in the reviews of each skincare product.
	# `data` must provide the columns `Product` and `ReviewContent`.
	product_words <- data %>%
	  select(Product, ReviewContent) %>%
	  tidytext::unnest_tokens(word, ReviewContent) %>%
	  count(Product, word, sort = TRUE) %>%
	  ungroup()

	# Total number of word occurrences per product (the length of each "document").
	total_words <- product_words %>%
	  group_by(Product) %>%
	  summarize(total_words = sum(n))

	# Attach each product's total to its per-word counts. The key is spelled out
	# so the join does not depend on guessed common columns.
	product_words <- left_join(product_words, total_words, by = "Product")

	# Add the tf, idf and tf_idf columns, treating each product as one document.
	product_words <- product_words %>%
	  tidytext::bind_tf_idf(word, Product, n)

	# Show the words ranked by tf_idf, highest first; the result is not stored.
	product_words %>%
	  select(-total_words) %>%
	  arrange(desc(tf_idf))

	### TF-IDF for "Query" user input ###
	# Function to compute tf_idf
	#' Compute tf-idf weights for the terms of a user query
	#'
	#' Every query term gets the same term frequency, `1 / length(tags_list)`,
	#' and is paired with the inverse document frequency already stored for that
	#' word in `words_data`.
	#'
	#' @param tags_list Query terms: a character vector, or a list of strings.
	#' @param words_data Data frame with at least the columns `word` and `idf`,
	#'   e.g. the output of `tidytext::bind_tf_idf()`.
	#' @return A data frame with the columns `word`, `tf`, `idf` and `tf_idf`,
	#'   one row per query term, sorted by `word`. Terms that do not occur in
	#'   `words_data` have `NA` in `idf` and `tf_idf`.
	query_tf_idf <- function(tags_list, words_data) {
	  stopifnot(all(c("word", "idf") %in% names(words_data)))

	  # Flatten so that a list of strings behaves like a character vector.
	  tags <- as.character(unlist(tags_list, use.names = FALSE))
	  n <- length(tags)
	  dt <- data.frame(
	    word = tags,
	    tf = rep(1 / n, n),
	    stringsAsFactors = FALSE
	  )

	  # idf is a property of the word alone, so the first matching row is enough;
	  # unmatched terms get NA, like an unmatched row in a left join.
	  hit <- match(dt$word, as.character(words_data$word))
	  dt$idf <- words_data$idf[hit]
	  dt$tf_idf <- dt$tf * dt$idf

	  # Sort on the column of the result itself. Ordering by the helper data
	  # frame instead would ask order() to rank a whole data frame.
	  dt[order(dt$word), , drop = FALSE]
	}