Skip to content

Instantly share code, notes, and snippets.

@chenpanliao
Last active June 24, 2020 20:06
Show Gist options
  • Save chenpanliao/789f7984003719ea4bcc to your computer and use it in GitHub Desktop.
Save chenpanliao/789f7984003719ea4bcc to your computer and use it in GitHub Desktop.
# To the extent possible under law, Chen-Pan Liao has waived all
# copyright and related or neighboring rights to PTT-R_Language_Praising.R.
# This work is published from: Taiwan.
Sys.setlocale(locale = "C") # for Windows users
library("RCurl")
library("XML")

# Board to scrape, the index page to start from, the loop flag, and the
# per-page result list.
board <- "R_Language"
i <- 1
st <- TRUE
dat <- list()

# Walk index1.html, index2.html, ... and stop at the first page URL that
# does not exist.
while (st) {
  myurl <- paste0("https://www.ptt.cc/bbs/", board, "/index", i, ".html")
  if (url.exists(myurl)) {
    text <- getURL(myurl)
    tree <- htmlTreeParse(text, asText = TRUE)$children$html[[2]][[2]][[2]]

    date <- unlist(lapply(xpathApply(tree, "//div[@class='date']"), xmlValue))
    author <- unlist(
      lapply(xpathApply(tree, "//div[@class='author']"), xmlValue)
    )

    # Take the title from the <div> itself rather than from its <a> child.
    # An entry whose title has no link (presumably a deleted post -- confirm
    # against the live markup) would otherwise be dropped here while still
    # contributing a date and an author, leaving the columns misaligned.
    title <- vapply(
      xpathApply(tree, "//div[@class='title']"),
      function(node) {
        value <- xmlValue(node)
        if (length(value) == 0) "" else trimws(value)
      },
      character(1)
    )

    # Push count: an empty cell yields a zero-length value and counts as 0.
    # Non-numeric labels become NA through as.numeric().
    nrec <- lapply(xpathApply(tree, "//div[@class='nrec']"), xmlValue)
    nrec <- lapply(nrec, as.numeric)
    nrec <- lapply(nrec, function(x) if (length(x) == 0) 0 else x)

    # Mark flag: TRUE when the mark cell has any content at all.
    mark <- lapply(xpathApply(tree, "//div[@class='mark']"), xmlValue)
    mark <- lapply(mark, function(x) length(x) != 0)

    # One entry per page, in the order: date, author, title, nrec, mark.
    dat[[i]] <- list(date, author, title, nrec, mark)
    cat("Praising", myurl, "\n")
    i <- i + 1
  } else {
    st <- FALSE
  }
}
# Flatten the per-page lists into a single data frame, one row per post.
# Each page entry holds five components, read here by position.
dat <- local({
  pages <- dat
  # Collect component `k` from every page and flatten it into one vector.
  field <- function(k) {
    unlist(lapply(pages, function(page) page[[k]]))
  }
  data.frame(
    date = field(1),
    author = field(2),
    title = field(3),
    nrce = field(4),
    isMark = field(5)
  )
})
# Return up to the ten largest entries of `x`, in decreasing order.
# A plain `[1:10]` pads the result with NA entries when `x` has fewer than
# ten elements; bounding the index by the actual length avoids that.
top_ten <- function(x) {
  ranked <- sort(x, decreasing = TRUE)
  ranked[seq_len(min(10L, length(ranked)))]
}

# Top 10 authors by number of posts
n1 <- table(dat$author)
top_ten(n1)

# Top 10 authors by total pushes received
n2 <- tapply(dat$nrce, dat$author, sum)
top_ten(n2)

# Top 10 authors by average pushes per post
n3 <- n2 / n1
top_ten(n3)

# Top 10 authors by number of marked (M) posts
n4 <- tapply(dat$isMark, dat$author, sum)
top_ten(n4)

# Top 10 authors by share of posts that were marked
n5 <- n4 / n1
top_ten(n5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment