Skip to content

Instantly share code, notes, and snippets.

@fedeisas
Last active October 4, 2017 20:03
Show Gist options
  • Save fedeisas/2562bee6354bbcaab2e48cdf62c789b8 to your computer and use it in GitHub Desktop.
Save fedeisas/2562bee6354bbcaab2e48cdf62c789b8 to your computer and use it in GitHub Desktop.
Wordcloud de letras de canciones
# Load the required libraries
library('rvest') # Web page scraping
library('tm') # Text mining
library('wordcloud')
library('RColorBrewer') # Color palettes
library('SnowballC') # Stemming
library('GetoptLong') # qq() allows string interpolation
# Set the character-type locale to UTF-8 to avoid problems with accented
# characters.
# NOTE(review): locale names are platform-specific; 'en_US.UTF-8' may not be
# available on every system (e.g. Windows) -- confirm on the target machine.
Sys.setlocale('LC_CTYPE', 'en_US.UTF-8')
# Artist slug as it appears in the letras.com URL path
artist <- 'damas-gratis'
#artist <- 'patricio-rey-y-sus-redonditos-de-ricota'
# Fetch the artist page and collect the href of every link in the song list
url <- qq('https://www.letras.com/@{artist}/')
webpage <- read_html(url)
links <- html_nodes(webpage, '.cnt-list li a')
hrefs <- html_attr(links, 'href')
# Keep only relative links that belong to this artist. Anchors without an
# href yield NA, which startsWith() propagates, so they are excluded
# explicitly instead of leaking through as "...comNA" URLs.
artist_prefix <- qq('/@{artist}/')
is_song_link <- !is.na(hrefs) & startsWith(hrefs, artist_prefix)
# Keep unique paths only
song_paths <- unique(hrefs[is_song_link])
# Fail early: pasting the domain onto a zero-length vector would silently
# produce the bare domain as the only "song" URL.
if (length(song_paths) == 0) {
  stop('No song links found for artist: ', artist, call. = FALSE)
}
# Prepend the domain to the relative URLs
urls <- paste0('https://www.letras.com', song_paths)
# Take a random sample of at most `max_songs` songs; sample() errors when
# asked for more elements than are available, so cap the size.
max_songs <- 50
urls <- sample(urls, min(max_songs, length(urls)))
# Download the documents. A single failed request is reported as a warning
# and skipped instead of aborting the whole run.
documents <- lapply(urls, function(u) {
  tryCatch(
    read_html(u),
    error = function(e) {
      warning('Could not download ', u, ': ', conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
})
documents <- Filter(Negate(is.null), documents)
# Extract the lyrics text from each document. vapply() guarantees a
# character vector even when `documents` is empty (sapply() would return a
# list in that case).
# NOTE(review): html_text() does not convert <br> into whitespace; if the
# lyrics lines are separated by <br>, the last word of a line may be glued
# to the first word of the next one. Consider html_text2() -- confirm
# against the page markup and the installed rvest version.
clean_articles <- vapply(documents, function(d) {
  html_text(html_node(d, '.cnt-letra article'), trim = TRUE)
}, character(1))
# Pages where the selector matched nothing produce NA; drop them so they do
# not end up in the corpus.
clean_articles <- clean_articles[!is.na(clean_articles)]
if (length(clean_articles) == 0) {
  stop('No lyrics could be extracted from the downloaded pages',
       call. = FALSE)
}
# Build a corpus with one document per song
lyrics_source <- VectorSource(clean_articles)
corpus <- Corpus(lyrics_source)
# Cleaning pipeline, applied in order. Each entry holds the transformation
# followed by any extra arguments that tm_map() forwards to it.
cleaning_steps <- list(
  list(content_transformer(tolower)),      # lower-case everything
  list(removeNumbers),                     # drop digits
  list(removeWords, stopwords('spanish')), # drop common Spanish words
  list(removePunctuation),                 # drop punctuation
  list(stripWhitespace)                    # collapse runs of whitespace
)
for (step in cleaning_steps) {
  corpus <- do.call(tm_map, c(list(corpus), step))
}
# Stemming (disabled)
#corpus <- tm_map(corpus, stemDocument)
# Build the term-document matrix (terms in rows, documents in columns) and
# convert it to a dense matrix
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
# Total frequency of each term across all documents, most frequent first
v <- sort(rowSums(m), decreasing = TRUE)
# Data frame with one row per word and its frequency
d <- data.frame(word = names(v), freq = v)
# Draw the word cloud: most frequent words are placed first, at most 200
# words are shown, and 35% of them are rotated
cloud_colors <- brewer.pal(8, 'Dark2')
wordcloud(
  words = d[['word']],
  freq = d[['freq']],
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = cloud_colors
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment