Last active
February 21, 2017 03:12
-
-
Save yvlau92/6a1b9a6241486af38bfeed152a34406c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tf-idf implementation in R

#### TF-IDF for Products ###

# Count how often each word occurs in the reviews of each skincare product:
# one row per Product/word pair, with the count in column `n`.
product_words <- data %>%
  select(Product, ReviewContent) %>%
  tidytext::unnest_tokens(word, ReviewContent) %>%
  count(Product, word, sort = TRUE) %>%
  ungroup()

# Total number of word occurrences per product (sum of `n` within a product).
total_words <- product_words %>%
  group_by(Product) %>%
  summarize(total_words = sum(n))

# Attach the per-product total to every Product/word row. The join key is
# spelled out so the join does not depend on whichever column names the two
# tables happen to share.
product_words <- left_join(product_words, total_words, by = "Product")

# Add the tf, idf and tf_idf columns, treating each product as one document.
product_words <- product_words %>%
  tidytext::bind_tf_idf(word, Product, n)

# Rows ordered by descending tf_idf, without the helper total column.
# The result is not assigned; `product_words` itself is left unchanged.
product_words %>%
  select(-total_words) %>%
  arrange(desc(tf_idf))
### TF-IDF for "Query" user input ###

#' Compute tf-idf weights for a set of query tags
#'
#' Every tag receives the same term frequency, `1 / n`, where `n` is the
#' number of tags supplied. The idf of each tag is looked up by `word` in
#' `words_data`, and `tf_idf` is the product of the two.
#'
#' @param tags_list Vector (or list) of query terms.
#' @param words_data Data frame with at least the columns `word` and `idf`,
#'   e.g. the output of `tidytext::bind_tf_idf()`.
#' @return A data frame with the columns `word`, `tf`, `idf` and `tf_idf`,
#'   one row per supplied tag, sorted by `word`. Tags that do not occur in
#'   `words_data` get `NA` for `idf` and `tf_idf`.
query_tf_idf <- function(tags_list, words_data) {
  stopifnot(all(c("word", "idf") %in% names(words_data)))

  # Flatten so that both atomic vectors and lists of tags are accepted;
  # NULL / empty input yields a zero-row result.
  tags <- as.character(unlist(tags_list))
  n <- length(tags)
  dt <- data.frame(word = tags, tf = rep(1 / n, n), stringsAsFactors = FALSE)

  # Look up each tag's idf. match() picks the first row for a word, so the
  # repeated word rows of words_data (one per document) cannot multiply the
  # rows of the result, and unknown tags simply become NA.
  known_words <- as.character(words_data[["word"]])
  dt$idf <- words_data[["idf"]][match(dt$word, known_words)]
  dt$tf_idf <- dt$tf * dt$idf

  # Sort on the result's own `word` column.
  dt[order(dt$word), , drop = FALSE]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment