Skip to content

Instantly share code, notes, and snippets.

@katerinabc
Created February 2, 2019 21:45
Show Gist options
  • Save katerinabc/d0396ad1bec0498fbd876b9fc0e0ed40 to your computer and use it in GitHub Desktop.
Save katerinabc/d0396ad1bec0498fbd876b9fc0e0ed40 to your computer and use it in GitHub Desktop.
library(tidytext)
# Tokenise each mission text into one word per row and count how often every
# word occurs per country (most frequent first).
# `ms` is defined outside this snippet; it must provide `country` and
# `mission` columns.
mission_words <- ms %>%
  unnest_tokens(word, mission) %>%
  count(country, word, sort = TRUE) %>%
  ungroup()

# Total number of tokens per country. This is computed before stop-word
# removal, so stop words are included in `total`.
total_words <- mission_words %>%
  group_by(country) %>%
  summarize(total = sum(n))

# Attach the per-country totals; the join key is spelled out so that a later
# added column cannot silently change the join.
mission_words <- left_join(mission_words, total_words, by = "country")

# Remove stop words. `anti_join()` has no `sort` argument (the original call
# passed one), so it is dropped; row order is inherited from the count above.
# NOTE(review): `stop_words` is presumably the data set shipped with tidytext,
# which is keyed by `word` -- confirm it is not masked elsewhere.
mission_words <- mission_words %>%
  anti_join(stop_words, by = "word")
# Build a country-by-word document-feature matrix from the raw counts.
# (No tf-idf weighting is applied here, despite what the old comment said.)
# `cast_dfm()` creates the quanteda dfm directly from the tidy counts, which
# avoids expanding the sparse matrix into a dense one first.
mission_td <- mission_words %>% cast_dfm(country, word, n)

# Quick inspection: most frequent words overall and the first few features.
quanteda::topfeatures(mission_td, 25)
# Guard against vocabularies with fewer than five features.
mission_td[, seq_len(min(5L, ncol(mission_td)))]

# Pairwise similarity between countries, using textstat_simil()'s defaults.
# NOTE(review): in quanteda >= 3 this function lives in the
# `quanteda.textstats` package -- confirm that package is attached.
mission_dist <- textstat_simil(mission_td)

# Long format: one row per (Var1, Var2) pair with its similarity in `value`.
mission_dist_m <- reshape2::melt(as.matrix(mission_dist))

# Drop self-comparisons (the matrix diagonal). A logical mask is used instead
# of `-which(...)`, because a negative index built from an empty `which()`
# result would drop every row.
is_self_pair <- as.character(mission_dist_m$Var1) ==
  as.character(mission_dist_m$Var2)
mission_dist_m <- mission_dist_m[!(is_self_pair %in% TRUE), ]
# Scatter plot of the two-dimensional layout of the countries, labelled by
# row name.
# NOTE(review): `mission_dist_2d` is not created anywhere in this snippet. The
# caption suggests a two-column principal-coordinate solution with columns
# V1/V2 and countries as row names -- confirm where it is computed.
mission_plot <- ggplot(
  as.data.frame(mission_dist_2d),
  aes(V1, V2, label = rownames(mission_dist_2d))
) +
  geom_point() +
  geom_label() +
  # The title must be passed by name; an unnamed argument to labs() is
  # silently ignored and the plot ends up without a title.
  labs(
    title = 'Similarity of mission Texts',
    caption = 'Principal coordinate analysis on Euclidian distance based on words in mission text ',
    x = 'Component 1',
    y = 'Component 2'
  ) +
  theme_classic() +
  # Axis values of the components carry no direct meaning, so hide them.
  theme(
    axis.ticks = element_blank(),
    axis.text = element_blank()
  )

# Show the plot explicitly so it also appears when the script is sourced.
print(mission_plot)

# Save exactly this plot rather than relying on last_plot().
# `mypath` (the output directory) is defined outside this snippet.
ggsave('Similarity_index_mission.png', plot = mission_plot, path = mypath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment