Skip to content

Instantly share code, notes, and snippets.

@phileas-condemine
Last active November 4, 2019 18:18
Show Gist options
  • Save phileas-condemine/8d160e55f86aac17f95a74ecbd5f9ea7 to your computer and use it in GitHub Desktop.
Save phileas-condemine/8d160e55f86aac17f95a74ecbd5f9ea7 to your computer and use it in GitHub Desktop.
Le Monde articles => text preparation => stemming => pruning of low-frequency words => DTM | co-occurrence | cosine similarity => SVD => t-SNE => plotly
# https://joparga3.github.io/Udemy_text_analysis/#document-similarity-cosine-similarity-and-latent-semantic-analysis
library(data.table)
library(tm)
library(SnowballC)
library(Rtsne)
library(irlba)
library(plotly)
# articles =fread("data_text_mining/lemonde_csv_formation.csv",encoding='UTF-8')
# Data scraped with the "scraping_lemonde" gist: every file found in
# lemonde_scraping/ is read with load() and must contain an object named `data`.
scrape_dir <- "lemonde_scraping"
scrape <- pbapply::pblapply(list.files(scrape_dir), function(x) {
  path <- file.path(scrape_dir, x)
  # Load into a private, parent-less environment: if the file holds no `data`
  # object we fail here with a clear message instead of silently picking up
  # the utils::data function further up the search path.
  file_env <- new.env(parent = emptyenv())
  load(path, envir = file_env)
  if (!exists("data", envir = file_env, inherits = FALSE)) {
    stop("No object named 'data' found in ", path, call. = FALSE)
  }
  get("data", envir = file_env, inherits = FALSE)
})
# Stack the per-file tables and drop exact duplicate rows.
scrape_dt <- rbindlist(scrape)
scrape_dt <- unique(scrape_dt)
# One corpus document per row: title and abstract pasted together.
lemonde <- Corpus(VectorSource(paste(scrape_dt$title, scrape_dt$abstract)))
lemonde
stpwords <- stopwords("fr")
# PREPROCESSING
# Lower-case first: the stopword list is lower-case and removeWords() matches
# case-sensitively, so capitalised stopwords ("Le", "Les", ...) would otherwise
# survive; the Snowball stemmer used below is also designed for lower-cased
# input, so this keeps the stems of "Mot" and "mot" identical.
lemonde <- tm_map(lemonde, content_transformer(tolower))
lemonde <- tm_map(lemonde, removeWords, stpwords)
# Numbers
lemonde <- tm_map(lemonde, content_transformer(removeNumbers))
# Punctuation
lemonde <- tm_map(lemonde, content_transformer(removePunctuation))
# Whitespace: turn newlines and typographic quotes into spaces, then collapse
# runs of whitespace.
# lemonde = tm_map(lemonde, content_transformer(function(x){stringr::str_wrap(x)}))
lemonde <- tm_map(lemonde, content_transformer(function(x) {
  stringr::str_replace_all(x, "\n|’|“", " ")
}))
lemonde <- tm_map(lemonde, stripWhitespace)
# Stemming (French Snowball stemmer)
lemonde <- tm_map(lemonde, stemDocument, "french")
# DOCUMENT TERM MATRIX
# Binary weighting: each cell is 1 when the term occurs in the document.
dtm <- DocumentTermMatrix(
  lemonde,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = stpwords,
    weighting = weightBin
  )
)
# Row/column sums are taken directly on the sparse triplet representation
# (slam is the package tm builds its DTM class on); as.matrix() on the full,
# unpruned DTM would allocate a dense documents x vocabulary matrix.
mj <- slam::col_sums(dtm) # per term: number of documents containing it
word.types <- names(mj)
seuil <- 50
dtm <- dtm[, seuil <= mj] # keep terms present in at least `seuil` documents
dim(dtm)
ni <- slam::row_sums(dtm) # per document: number of distinct retained terms
sum(mj >= seuil)
dtm <- dtm[5 <= ni, ] # keep documents with at least 5 retained terms
dim(dtm)
# Shuffle the document order and densify the pruned DTM
# (rows = documents, columns = terms).
i.svd <- sample(nrow(dtm), nrow(dtm))
dtm.svd <- as.matrix(dtm)[i.svd, , drop = FALSE]
approaches <- c("raw_DTM_SVD", "cosSIM_SVD")
approach <- approaches[2]

# Pearson correlation between the columns of a (sparse) matrix.
# https://stackoverflow.com/questions/5888287/running-cor-or-any-variant-over-a-sparse-matrix-in-r
#
# x: numeric matrix (base or Matrix class) with observations in rows.
# Returns the ncol(x) x ncol(x) correlation matrix. Columns with zero variance
# have an undefined correlation (0 / 0); they are given correlation 0 with
# every column (with a warning) so that the result contains no NaN.
sparse.cor2 <- function(x) {
  n <- nrow(x)
  col_means <- colMeans(x)
  # cov = (X'X - n * mu mu') / (n - 1)
  covmat <- (crossprod(x) - n * (col_means %o% col_means)) / (n - 1)
  # Standard deviations of the columns; pmax() absorbs tiny negative
  # variances caused by floating-point cancellation.
  sdvec <- sqrt(pmax(diag(covmat), 0))
  constant_cols <- sdvec == 0
  if (any(constant_cols)) {
    warning(sum(constant_cols),
            " constant column(s): their correlations are set to 0",
            call. = FALSE)
    # Dividing by Inf turns the (zero) covariances of these columns into 0
    # instead of NaN.
    sdvec[constant_cols] <- Inf
  }
  covmat / crossprod(t(sdvec)) # correlation matrix
}

if (approach == "raw_DTM_SVD") {
  # SVD on the DTM itself, after scaling rows and columns.
  ni.svd <- rowSums(dtm.svd) # number of words in a document, its length
  mj.svd <- pmax(1, colSums(dtm.svd)) # term frequency (avoid 0 divisor)
  # Column-major recycling: row i is divided by sqrt(ni.svd[i]).
  dtm.svd <- dtm.svd / sqrt(ni.svd)
  dtm.svd <- t(t(dtm.svd) / sqrt(mj.svd))
  # irlba needs a rank strictly below both dimensions: cap the 100 requested
  # components accordingly.
  udv <- irlba(dtm.svd, min(100, min(dim(dtm.svd)) - 1)) # returns u, d, v
} else if (approach == "cosSIM_SVD") {
  # With weighting = weightBin, crossprod() of the DTM is the term
  # co-occurrence matrix; the SVD is run on the term-term correlation matrix.
  dtm.svd <- as(dtm.svd, "dgCMatrix")
  corr_mat <- sparse.cor2(dtm.svd)
  udv <- irlba(corr_mat, min(100, ncol(corr_mat) - 1)) # returns u, d, v
} else {
  # Fail loudly rather than leaving `udv` undefined for the following steps.
  stop("Unknown approach: ", approach, call. = FALSE)
}
# Right singular vectors: one row per term.
word_embedding <- udv$v
# udv <- svd(dtm.svd) # returns u, d, v
# names(udv)
# cooc_2D=Rtsne(word_embedding,initial_dims = 100,pca = F,pca_center = F,pca_scale = F,theta = .1,perplexity = 100)
# Rtsne refuses to run unless nrow - 1 >= 3 * perplexity, so cap the usual 30
# for small vocabularies.
tsne_perplexity <- min(30, floor((nrow(word_embedding) - 1) / 3))
cooc_2D <- Rtsne(
  word_embedding,
  initial_dims = 100,
  pca = FALSE,
  pca_center = FALSE,
  pca_scale = FALSE,
  theta = .5,
  perplexity = tsne_perplexity
)
# Term weights = number of documents containing the term, read from the binary
# DTM. dtm.svd is not used here because the "raw_DTM_SVD" approach overwrites
# it with a row/column-normalised matrix; its columns are in the same order
# as those of dtm.
words <- slam::col_sums(dtm)
stopifnot(length(words) == nrow(word_embedding))
word_embedding2D <- data.table(cooc_2D$Y)
names(word_embedding2D) <- c("X", "Y")
word_embedding2D$word <- names(words)
word_embedding2D$weight <- words
# Regular expression selecting the terms to highlight; leave empty to
# highlight nothing (an empty pattern would otherwise match every term).
one_word <- ""
is_focus <- nzchar(one_word) & grepl(one_word, word_embedding2D$word)
sum(is_focus)
word_embedding2D$size <- 1
word_embedding2D$type <- "std"
word_embedding2D[is_focus, c("size", "type") := .(20, "focus")]
plot_ly(
  unique(word_embedding2D),
  x = ~X, y = ~Y, text = ~word, color = ~type, size = ~size,
  hoverinfo = 'text'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment