Skip to content

Instantly share code, notes, and snippets.

@yvlau92
Last active February 21, 2017 03:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yvlau92/6a1b9a6241486af38bfeed152a34406c to your computer and use it in GitHub Desktop.
Save yvlau92/6a1b9a6241486af38bfeed152a34406c to your computer and use it in GitHub Desktop.
# tf-idf implementation in R
#### TF-IDF for Products ###
# Count how often each word occurs in the reviews of each skincare product.
# unnest_tokens() splits ReviewContent into one row per token (column `word`);
# count() then yields one row per (Product, word) pair with its frequency in
# column n, most frequent first.
product_words <- data %>%
  select(Product, ReviewContent) %>%
  tidytext::unnest_tokens(word, ReviewContent) %>%
  count(Product, word, sort = TRUE) %>%
  ungroup()

# Total number of words per product (the sum of n over that product's rows).
total_words <- product_words %>%
  group_by(Product) %>%
  summarize(total_words = sum(n))

# Attach each product's total to its word rows. The key is spelled out so the
# join does not print a "Joining, by" message and cannot silently pick up an
# extra key if another shared column is ever added.
product_words <- left_join(product_words, total_words, by = "Product")

# Add the tf, idf and tf_idf columns, treating each Product as one document.
product_words <- product_words %>%
  tidytext::bind_tf_idf(word, Product, n)

# View the words ranked by tf_idf, highest first. The result is not assigned,
# so product_words itself keeps its total_words column and its row order.
product_words %>%
  select(-total_words) %>%
  arrange(desc(tf_idf))
### TF-IDF for "Query" user input ###
# Compute tf-idf weights for the tags of a user query.
#
# Every tag is given the same term frequency, 1 / (number of tags); a tag that
# is supplied twice therefore appears as two rows, each with tf = 1 / n.
# The idf of each tag is looked up in `words_data`.
#
# Args:
#   tags_list:  the query tags; a character vector, or a list of strings
#               (flattened before use).
#   words_data: a data frame with at least the columns `word` and `idf`.
#               NOTE(review): presumably the output of tidytext::bind_tf_idf(),
#               where idf is constant per word - confirm against callers. If a
#               word is listed with several idf values, the first one is used.
#
# Returns:
#   A data.frame with columns word, tf, idf and tf_idf, one row per supplied
#   tag, sorted by word. Tags that do not occur in `words_data` have NA in
#   idf and tf_idf.
query_tf_idf <- function(tags_list, words_data) {
  stopifnot(all(c("word", "idf") %in% names(words_data)))

  # Flatten to a plain character vector so that lists, factors and NULL all
  # behave like the equivalent character vector.
  tags <- as.character(unlist(tags_list, use.names = FALSE))
  n <- length(tags)

  dt <- data.frame(
    word = tags,
    tf = rep(1 / n, n),
    stringsAsFactors = FALSE
  )

  # match() gives the first row of words_data holding each tag (NA if absent),
  # which acts as a left join on `word` without needing a join key.
  dt$idf <- words_data$idf[match(dt$word, as.character(words_data$word))]
  dt$tf_idf <- dt$tf * dt$idf

  # Sort by the word column of the result itself.
  dt[order(dt$word), ]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment