Last active
June 16, 2024 22:38
-
-
Save Jong-Sig/444bcc532ea48d9f09aa4aecd5940dd9 to your computer and use it in GitHub Desktop.
LDA Topic Modeling (R version)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| title: "Instagram Advertisement Text Analysis" | |
| author: "Sik Chung" | |
| --- | |
| ```{r global_options, include = FALSE} | |
| set.seed(1234) | |
| library(knitr) | |
| library(markdown) | |
| library(ggplot2) | |
| library(tidyverse) | |
| library(tidytext) | |
| library(textclean) | |
| library(SnowballC) | |
| library(topicmodels) | |
| library(ldatuning) | |
| library(RMySQL) | |
| ``` | |
| ## Step 1. Prepare Data | |
| ***Step 1-1. Connect to SQL*** | |
| ```{r, message = FALSE} | |
# Open a MySQL connection; credentials are intentionally blanked placeholders.
# Fix: `port = ,` (an empty argument) is a syntax error in R — use the MySQL
# default port 3306 as a placeholder; fill in real credentials before running.
connection <- dbConnect(RMySQL::MySQL(),
                        dbname = '',
                        host = '',
                        port = 3306,
                        user = '',
                        password = '')
| ``` | |
| ***Step 1-2. Import Table*** | |
| ```{r, message = FALSE} | |
| # Pull every advertisement record from MySQL into a data frame. | |
| # NOTE(review): SELECT * ties this to the table schema; consider listing | |
| # only the columns used downstream. | |
| query <- "SELECT * FROM advertisements" | |
| insta <- dbGetQuery(connection, query) | |
| ``` | |
| ***Step 1-3. Disconnect*** | |
| ```{r, message = FALSE} | |
| dbDisconnect(connection) | |
| ``` | |
| ***Step 1-4. Identify Advertising Posts*** | |
| ```{r, message = FALSE} | |
# Keep only posts that are (a) in English, (b) posted by an influencer
# account or posted directly, and (c) flagged as an official or unofficial
# advertisement. Multiple conditions in one filter() are AND-ed.
# NOTE(review): column semantics inferred from names — confirm with schema.
insta2 <- insta %>%
  filter(EnglishTextDum == 1,
         Influencer2500 == 1 | PostDirect3 == 1,
         OfficialAD == 1 | UnofficialAD == 1)
| ``` | |
| ## Step 2. Data Cleaning | |
| ***Step 2-1. lower case*** | |
| ```{r} | |
# Normalise case so token matching downstream is case-insensitive.
insta2 <- insta2 %>%
  mutate(PostText = tolower(PostText))
| ``` | |
| ***Step 2-2. Check URLs and remove if exists*** | |
| URLs are not necessary for clustering texts. | |
| ```{r} | |
# URLs carry no topical signal for clustering, so strip them out.
# Fix: in the original pattern 'co.uk' the dot was unescaped, so it matched
# any character (e.g. "co9uk"); dots are now escaped. The stray '+' after
# the TLD group (which allowed runs like ".comnet") is dropped.
insta2 <- insta2 %>%
  mutate(PostText = gsub('\\s*http\\S+\\s*', ' ', PostText)) %>%
  mutate(PostText = gsub('\\s*www\\S+\\s*', ' ', PostText)) %>%
  mutate(PostText = gsub('\\S*\\.(com|net|org|co\\.uk|us|eu|at|it|ly|html|ee)\\b',
                         ' ', PostText))
| ``` | |
| ***Step 2-3. Remove user names*** | |
| ```{r} | |
# Drop @mentions (user handles): they identify accounts, not topics.
insta2 <- insta2 %>%
  mutate(PostText = gsub('@\\S+\\s*', '', PostText))
| ``` | |
| ***Step 2-4. Remove non-ASCII(emojis, spanish characters, etc.)*** | |
| ```{r} | |
# Normalise typographic quote variants to a plain apostrophe so that
# contraction handling works, then blank out every remaining non-ASCII
# character (emoji, accented letters, etc.).
# (ignore.case is irrelevant for punctuation patterns and is omitted.)
insta2 <- insta2 %>%
  mutate(PostText = gsub("’", "'", PostText)) %>%
  mutate(PostText = gsub("‘", "'", PostText)) %>%
  mutate(PostText = gsub("´", "'", PostText)) %>%
  mutate(PostText = gsub('[^\x01-\x7F]', ' ', PostText))
| ``` | |
| ***Step 2-5. Strip new lines, tabs, and white spaces*** | |
| ```{r} | |
# Normalise whitespace: turn newlines/carriage returns/tabs into spaces,
# collapse runs of spaces, and trim leading/trailing space.
# Fixes: the original class '[ |\t]+' also swallowed literal '|' characters
# ('|' is not an alternation operator inside [...]); '\r' was unhandled.
# trimws() replaces the hand-rolled leading/trailing gsub pair.
insta2 <- insta2 %>%
  mutate(PostText = gsub('[\r\n\t]', ' ', PostText)) %>%
  mutate(PostText = gsub(' +', ' ', PostText)) %>%
  mutate(PostText = trimws(PostText))
| ``` | |
| ## Step 3. LDA Topic Modeling | |
| ***Step 3.1. Tokenizing and Further data wrangling pre LDA TM*** | |
| Numbers, stop words, and disclosing words contain no information. Remove them. | |
| ```{r} | |
| #Append disclosing words to a list of stopwords | |
| patterns <- c('ad', 'ads', 'advertisement', 'advertisements', 'advertising', 'paidad', 'sp', 'spon', 'sponsored', 'sponsoredpost', 'collab', 'partner', 'ambassador') | |
| custom_stop <- data.frame(word = patterns, | |
| lexicon = 'custom') %>% | |
| rbind(stop_words) | |
| ``` | |
| ```{r} | |
| #change column order to make post_id as first column | |
| #insta_ad <- insta_ad[,c(4,3,5:ncol(insta_ad))] | |
| ``` | |
| ```{r} | |
| # Build the document-term matrix for LDA: | |
| # normalise text -> tokenize -> filter -> stem -> count -> cast to DTM. | |
| insta_ad_tokenized <- insta2 %>% | |
| # drop any residual dplyr grouping before tokenizing | |
| ungroup() %>% | |
| # expand contractions (e.g. "don't" -> "do not") before tokenizing | |
| mutate(PostText = replace_contraction(PostText)) %>% | |
| # translate internet slang into standard words | |
| mutate(PostText = replace_internet_slang(PostText)) %>% | |
| # collapse word elongation (e.g. "sooo" -> "so") | |
| mutate(PostText = replace_word_elongation(PostText)) %>% | |
| # underscores are often used in place of spaces; make them spaces | |
| mutate(PostText = gsub("_", " ", PostText)) %>% | |
| # one token (word) per row; unnest_tokens also strips punctuation | |
| unnest_tokens(output = word, input = PostText, token = 'words') %>% | |
| # drop standard + custom (ad-disclosure) stop words | |
| anti_join(custom_stop, by = 'word') %>% | |
| # drop tokens containing digits | |
| filter(!str_detect(word, '[0-9]+')) %>% | |
| # strip any remaining non-alphanumeric characters from tokens | |
| mutate(word = gsub("[^[:alnum:]]", "", word)) %>% | |
| # keep only tokens of 4+ characters; shorter ones (ya, uk, p) carry | |
| # little information or are typos (this also drops "" tokens) | |
| mutate(char_num = nchar(word)) %>% | |
| filter(char_num > 3) %>% # threshold 3 (i.e. >= 4 chars) worked best | |
| # stem with Porter's algorithm (SnowballC) | |
| mutate(word = wordStem(word)) %>% | |
| # empty tokens already removed by the length filter above | |
| #filter(!(word=="")) %>% | |
| # rare-word floor: keep words appearing at least 50 times (~0.05%) | |
| add_count(word) %>% | |
| filter(n > 49) %>% # 0.05% | |
| select(-n) %>% | |
| # per-post word counts | |
| count(PostID, word) %>% | |
| # cast counts into a document-term matrix for topicmodels | |
| cast_dtm(PostID, word, n) | |
| insta_ad_tokenized | |
| ``` | |
| ```{r} | |
| insta_ad_tokenized | |
| ``` | |
| The feature of text data is that often 99-100% of document-word pairs are zero (sparsity). | |
| In this case, removing words using a lower threshold could help generalization and prevent overfitting. | |
| ***Step 3-2. Finding the best K using four metrics*** | |
| * The metrics I utilized are as follows: | |
| 1. Griffiths, T. L., & Steyvers, M. (2004). Finding scientific topics. Proceedings of the National academy of Sciences, 101(suppl 1), 5228-5235. | |
| 2. Cao, J., Xia, T., Li, J., Zhang, Y., & Tang, S. (2009). A density-based method for adaptive LDA model selection. Neurocomputing, 72(7-9), 1775-1781. | |
| 3. Arun, R., Suresh, V., Madhavan, C. V., & Murthy, M. N. (2010, June). On finding the natural number of topics with latent dirichlet allocation: Some observations. In Pacific-Asia conference on knowledge discovery and data mining (pp. 391-402). Springer, Berlin, Heidelberg. | |
| 4. Deveaud, R., SanJuan, E., & Bellot, P. (2014). Accurate and effective latent concept modeling for ad hoc information retrieval. Document numérique, 17(1), 61-84. | |
| * Minimization: | |
| - Arun2010 | |
| - CaoJuan2009 | |
| * Maximization: | |
| - Deveaud2014 | |
| - Griffiths2004 | |
| ```{r} | |
| # Filter N = 4 | |
# Search k = 2..35, scoring each candidate with four model-selection metrics.
# Fix: the original passed alpha = 50/sequence (a length-34 vector) inside
# `control`; that is not a valid scalar alpha and cannot vary per candidate k.
# topicmodels' Gibbs default is already alpha = 50/k — exactly the intended
# Griffiths & Steyvers prior — so alpha is simply omitted here.
SEED <- 1234
start_time <- Sys.time()
sequence <- seq(from = 2, to = 35, by = 1)
result_2 <- FindTopicsNumber(
  insta_ad_tokenized,
  topics = sequence,
  metrics = c("Deveaud2014", "Griffiths2004", "CaoJuan2009", "Arun2010"),
  method = 'Gibbs',
  # delta is the beta hyperparameter in Griffiths & Steyvers (2004)
  control = list(seed = SEED, delta = 0.1, iter = 1000),
  mc.cores = 8L,  # machine has 12 CPU cores
  verbose = FALSE
)
Sys.time() - start_time  # report wall-clock time of the search
| ``` | |
| ```{r} | |
| FindTopicsNumber_plot(result_2) | |
| ``` | |
| ```{r} | |
# Rescale a numeric vector to [0, 1] (min-max normalisation).
# A constant vector yields NaN (0/0), matching the original behaviour.
min_max_norm <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Put all four metrics on a common [0, 1] scale (higher = better) and sum
# them into one composite score per k.
# Fix: the original piped from `result_1`, which is never created; the
# tuning results above are stored in `result_2`.
result_2 %>%
  mutate(Deveaud2014 = min_max_norm(Deveaud2014),
         Griffiths2004 = min_max_norm(Griffiths2004),
         # minimisation metrics are flipped so that higher is better
         CaoJuan2009 = 1 - min_max_norm(CaoJuan2009),
         Arun2010 = 1 - min_max_norm(Arun2010)) %>%
  # column 1 is `topics`; columns 2:5 hold the four rescaled metrics
  mutate(sum = rowSums(.[2:5]))
| ``` | |
| ***Step 3-3. LDA Topic Modeling: Main Analysis*** | |
| ```{r} | |
# Fit the final LDA model with k = 18 topics via Gibbs sampling.
gc()  # reclaim memory before the (large) model fit
SEED <- 1234
topic_num <- 18
gibbs_control <- list(seed = SEED,
                      alpha = 50 / topic_num,  # Griffiths & Steyvers prior
                      delta = 0.1,             # beta in G&S (2004)
                      iter = 1000)
insta_ad_lda <- topicmodels::LDA(insta_ad_tokenized,
                                 k = topic_num,
                                 method = 'Gibbs',
                                 control = gibbs_control)
| ``` | |
| ***Step 3-4. LDA Topic Modeling: beta*** | |
| beta indicates how strongly each word loads on each topic. | |
| ```{r, fig.height = 10, fig.width = 15} | |
| #extract beta information by word | |
| topics <- tidy(insta_ad_lda, matrix = 'beta') | |
| #extract top 20 | |
| top_terms <- topics %>% | |
| #group by topic | |
| group_by(topic) %>% | |
| #select the words with top 20 beta scores | |
| top_n(15, beta) %>% | |
| #ungroup the topic | |
| ungroup() | |
| #plot the result | |
| top_terms %>% | |
| ggplot(aes(x = reorder_within(term, beta, topic, sep = '_'), | |
| y = beta, | |
| fill = factor(topic))) + | |
| geom_bar(stat = 'identity', show.legend = FALSE) + | |
| facet_wrap(.~topic, nrow = 2, scale = 'free', | |
| labeller = as_labeller(c('1' = 'Cooking', | |
| '2' = 'Dayoff', | |
| '3' = 'Travel', | |
| '4' = 'Watch', | |
| '5' = 'Diet', | |
| '6' = 'Community', | |
| '7' = 'Promotion', | |
| '8' = 'Holiday', | |
| '9' = 'Workout', | |
| '10' = 'Fashion', | |
| '11' = 'Food', | |
| '12' = 'Interior', | |
| '13' = 'Parenting', | |
| '14' = 'Foreign Beauty', | |
| '15' = 'Makeup', | |
| '16' = 'Outfit', | |
| '17' = 'Skincare', | |
| '18' = 'Local Shop'))) + | |
| scale_x_discrete(labels = function(x) gsub('_.+$', '', x)) + | |
| labs(x = 'Words are ranked by the beta score', | |
| y = 'top 15 keywords in each topic', | |
| title = '21 topics captured by LDA') + | |
| coord_flip() | |
| ``` | |
| ***Discussion*** | |
| As we can see, some of the topics are similar to each other, which could be combined. | |
| However, many of the topics are very different from each other. This indicates that brands from various product areas advertise through influencers on Instagram. | |
| * Based on the results I could provide labels to topics as follows: | |
| - 'Bake & Cook' | |
| - 'Dietary Products' | |
| - 'Fashion & Giveaway' | |
| - 'Scooter & Riding' | |
| - 'Skincare' | |
| - 'Lifestyle & Relationships' | |
| - 'Home Decor & Interior' | |
| - 'Style & Outfit' | |
| - 'Workout' | |
| - 'Family & Travel' | |
| - 'Makeup' | |
| For example, topic 4 seems to focus on watches while topic 10 focuses on fashion more broadly. However, we also see several topics that are very similar (for example, topics 14, 15, and 17 all concern beauty, makeup, and skincare). Results such as this suggest that some topics could be combined. | |
| ***Step 3-5. LDA Topic Modeling: topic ordering — Dendrogram using Hellinger distance*** | |
| ```{r} | |
# Order topics by similarity: pairwise Hellinger distance between the
# topic-word distributions, clustered with Ward linkage.
topic_word_dist <- posterior(insta_ad_lda)$terms
hellinger <- distHellinger(topic_word_dist, topic_word_dist)
topic_tree <- hclust(as.dist(hellinger), "ward.D")
plot(topic_tree)
| ``` | |
| ***Step 3-6. LDA Topic Modeling: gamma*** | |
| gamma indicates the degree to which each document is relevant to each topic. | |
| each document often has multiple gamma scores because there are multiple topics. | |
| I will assign one topic to each document based on the gamma score. | |
| ```{r} | |
| #extract gamma information by document | |
| topics_doc <- tidy(insta_ad_lda, matrix = 'gamma') | |
| ``` | |
| ```{r} | |
| # Reshape to one row per document with one gamma column per topic. | |
| topics_wide <- topics_doc %>% | |
| pivot_wider(names_from = topic, | |
| values_from = gamma) | |
| #view(topics_wide) | |
| ``` | |
| ```{r} | |
# Prefix the numeric gamma columns ("1".."18") with "PostTopic" and rename
# the document id column to PostID for the join below.
# Fix: the original used seq(1, 19), but the model was fit with
# topic_num = 18 topics, so column "19" does not exist; derive the range
# from topic_num so the two stay in sync.
cols <- seq_len(topic_num)
add <- "PostTopic"
topics_wide <- topics_wide %>%
  rename_with(.fn = ~ paste0(add, .),
              .cols = num_range("", cols)) %>%
  rename(PostID = document)
| ``` | |
| ```{r} | |
| # subset rows with the largest gamma per document | |
| toptopics <- topics_doc %>% | |
| group_by(document) %>% | |
| slice_max(gamma) %>% | |
| ungroup() %>% | |
| rename(PostID = document, | |
| PostTopic = topic) | |
| ``` | |
| ```{r} | |
| #Joing the dataset and save | |
| insta_fin <- full_join(insta2, topics_wide, by = 'PostID') | |
| # insta_fin <- full_join(insta_fin, toptopics, by = 'PostID') | |
| # | |
| # # see which topics have been assigned to the most posts | |
| # table(insta_fin$PostTopic) %>% | |
| # as.data.frame() %>% | |
| # ggplot(aes(x = Var1, y = Freq)) + | |
| # geom_bar(stat = 'identity') + | |
| # labs(x = 'Topics', | |
| # y = 'Number of Documents') | |
| ``` | |
| ```{r} | |
| insta_fin | |
| ``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment