Skip to content

Instantly share code, notes, and snippets.

@hliang
Created June 6, 2018 01:54
Show Gist options
  • Save hliang/be8d50817cb789d69003ed4f0e0a5853 to your computer and use it in GitHub Desktop.
Save hliang/be8d50817cb789d69003ed4f0e0a5853 to your computer and use it in GitHub Desktop.
NLP toys: analysis of song lyrics
---
title: "NLP toys"
author: "hliang"
date: "8/24/2016"
output: html_document
---
Use NLP tools to analyze song lyrics.
Load the required packages:
```{r}
# One-time installation (uncomment as needed):
# install.packages("jiebaR")
# install.packages("devtools")
# devtools::install_github("qinwf/jiebaRD")
# devtools::install_github("qinwf/jiebaR")
library(jiebaRD)
library(jiebaR)
library(jsonlite)
# Work from the lyric project directory; the relative "./lrc" paths used
# further down are resolved against it. A second, machine-specific setwd()
# that was immediately overridden by this one has been dropped, since it only
# made the chunk fail on machines where that directory does not exist.
setwd("~/lyric/")
```
Download the song details and lyrics:
```{r}
# Create the output directory. showWarnings = FALSE keeps re-runs quiet when
# the directory already exists.
dir.create("lrc", showWarnings = FALSE)
# setwd("./lrc")
# download.file("http://www.cnlyric.com/LrcDown/3098/185575.lrc", destfile = "./lrc/185575.lrc")
# download.file("http://www.cnlyric.com/LrcDown/3098/118126.lrc", destfile = "./lrc/118126.lrc")
# download.file("http://www.cnlyric.com/LrcDown/2778/228924.lrc", destfile = "./lrc/228924.lrc")
# download.file("http://music.163.com/api/song/media?id=96661", destfile = "./lrc/96661.lrc")

# Song ids on music.163.com; the responses are JSON and are parsed later with
# jsonlite. Titles noted by the original author:
#   186331   "The Coldest Day"
#   5260326  "Good Person"
#   25638702 "Fairy Tale"
#   5274094  "We Still Have Fish Here"
#   28680228 a talk-show episode ("Episode 50 - Red Alert: military strength
#            showdown of China's neighbours, part 1"), i.e. not a song
# NOTE(review): "99" and "99999" look like deliberately invalid ids used to
# exercise the missing-data handling downstream -- confirm.
s_ids <- c("186331", "5260326", "25638702", "99", "99999", "5274094", "28680228")

# For every id, save the song-detail response and the media (lyric) response
# as ./lrc/<id>.detail.json and ./lrc/<id>.media.json.
for (id163 in s_ids) {
  uri_detail <- paste0("http://music.163.com/api/song/detail/?ids=[", id163, "]")
  uri_media <- paste0("http://music.163.com/api/song/media?id=", id163)
  download.file(uri_detail, destfile = paste0("./lrc/", id163, ".detail.json"))
  download.file(uri_media, destfile = paste0("./lrc/", id163, ".media.json"))
}
```
Parse the downloaded JSON files and extract the song name, artist, and lyrics:
```{r}
# Work from the lyric project directory so that "./lrc" resolves correctly.
# (A preceding setwd() to another machine-specific path was dead code, as it
# was immediately overridden, and has been removed.)
setwd("~/lyric/")

# One row per requested song id. name / artist / lyric start as NA and are
# only filled in when the downloaded JSON actually contains them.
lrc <- data.frame(
  id = paste0("sid_", s_ids),
  name = NA,
  artist = NA,
  lyric = NA,
  stringsAsFactors = FALSE
)

# `artist` holds the artist name(s) per song as a list keyed by "sid_<id>".
# It must exist before the loop assigns into it; previously it was never
# created, so the first assignment failed with "object 'artist' not found".
artist <- setNames(as.list(rep(NA, length(s_ids))), lrc$id)

# Parse the lyric files.
for (i in seq_len(nrow(lrc))) {
  id163 <- s_ids[i]
  s_detail <- fromJSON(txt = paste0("./lrc/", id163, ".detail.json"))
  s_media <- fromJSON(txt = paste0("./lrc/", id163, ".media.json"))

  song_name <- s_detail$songs$name
  song_lyric <- s_media$lyric

  # Require BOTH the name and the lyric. The old test,
  # !(is.null(name) & is.null(lyric)), also let through songs where only one
  # of the two was present, and assigning the missing (NULL) one then failed.
  if (length(song_name) > 0 && length(song_lyric) > 0) {
    lrc$name[i] <- song_name[1]
    # Strip LRC time tags such as "[00:12.34]" from the lyric text.
    lrc$lyric[i] <- gsub("\\[[0-9.:]*\\]", "", song_lyric[1])

    # NOTE(review): assumes `songs$artists` is a list whose first element
    # carries a `name` field (the shape the original code indexed) -- confirm
    # against the API response.
    artist_names <- s_detail$songs$artists[[1]]$name
    if (length(artist_names) > 0) {
      # Fill the data-frame column too; it used to stay NA for every row,
      # which made the NA filter below discard all songs.
      lrc$artist[i] <- paste(artist_names, collapse = ", ")
      artist[[i]] <- artist_names
    }
  }
}

# Remove entries with any missing field (name, artist or lyric), keeping
# `artist` aligned with the rows of `lrc`.
keep <- complete.cases(lrc[, -1])
lrc <- lrc[keep, ]
artist <- artist[keep]
```
Segment the lyrics into words:
```{r}
# Build a jiebaR segmentation worker (bylines = TRUE, write = FALSE) and run
# it over every lyric; the result is labelled with the matching song ids.
cutter <- worker(bylines = TRUE, write = FALSE)
segs <- setNames(cutter[lrc$lyric], lrc$id)
str(segs)
```
Compute a simhash for each song, build the pairwise distance matrix, and cluster the songs hierarchically:
```{r}
# Simhash worker (topn = 10), applied to each song's segmented words.
simhasher <- worker("simhash", topn = 10)
sims <- lapply(segs, function(words) vector_simhash(words, jiebar = simhasher))

# Pull the "simhash" entry out of every result and flatten to one vector;
# the element names come out as "<song id>.simhash".
x <- unlist(lapply(sims, "[", "simhash"))

# Pairwise simhash distances, labelled on both axes with the names of `x`.
distmat <- simhash_dist_mat(x, x)
dimnames(distmat) <- list(names(x), names(x))

# Hierarchical clustering on the distance matrix.
d <- as.dist(distmat)
fit <- hclust(d)

# Swap the "<song id>.simhash" labels for the song names so that the
# clustering is labelled with readable titles.
tmpid <- gsub("(sid_.*)\\.simhash", "\\1", fit$labels)
tmpname <- lrc$name[match(tmpid, lrc$id)]
fit$labels <- tmpname
```
Plot the resulting clustering of the songs as a dendrogram:
```{r, echo=FALSE}
# Plot the hierarchical clustering result `fit` (labelled with song names).
plot(fit)
```
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment