Skip to content

Instantly share code, notes, and snippets.

@nezuQ
Created May 25, 2014 01:27
Show Gist options
  • Save nezuQ/c8dbe4fb9021cf568b14 to your computer and use it in GitHub Desktop.
Save nezuQ/c8dbe4fb9021cf568b14 to your computer and use it in GitHub Desktop.
Rでクローリング。FirefoxをR経由で自動操作する。 ref: http://qiita.com/nezuq/items/550ed145421a5c551eaa
sudo update-java-alternatives -s java-7-oracle
sudo R CMD javareconf
#GetSyosetuList.R
#Copyright (c) 2014 nezuq
#This software is released under the MIT License.
#http://opensource.org/licenses/mit-license.php
#依存パッケージのインストール
install.packages("rJava")
install.packages("devtools")
library(devtools)
install_github('seleniumJars', 'LluisRamon')
install_github('relenium', 'LluisRamon')
#依存パッケージの使用宣言
library(relenium)
#検索キーワード
keywords = "ガールズラブ シリアス -ガールズラブ? -シリアス? -ボーイズラブ -R15 -ファンタジー"
#Firefoxを起動
firefox <- firefoxClass$new()
#検索結果画面を表示
firefox$get(paste("http://yomou.syosetu.com/search.php?&order=new&genre=&type=&word=", keywords, sep=""))
#検索データセット
novellist.df <- data.frame()
getTextAndHref <- function(x){c(x$getText(), x$getAttribute("href"))}
getSysTime <- function(){format(Sys.time(), "%Y年%m月%d日 %H時%M分%S秒")}
while(T){
Sys.sleep(3)
#検索作品の情報を取得
cat(paste("題名取得中...\n", getSysTime(), "\n"))
titles <- firefox$findElementsByCssSelector(".searchkekka_box > .novel_h > a")
titles.df <- sapply(titles, getTextAndHref)
cat(paste("筆者情報取得中...\n", getSysTime(), "\n"))
writers <- firefox$findElementsByCssSelector(".searchkekka_box > a:nth-of-type(1)")
writers.df <- sapply(writers, getTextAndHref)
cat(paste("小説情報取得中...\n", getSysTime(), "\n"))
novel.info <- firefox$findElementsByCssSelector(".searchkekka_box > a:nth-of-type(2)")
novel.info.df <- sapply(novel.info, getTextAndHref)
cat(paste("小説ステータス取得中...\n", getSysTime(), "\n"))
novel.status <- firefox$findElementsByCssSelector(".searchkekka_box > table > tbody > tr > td:nth-of-type(1)")
novel.status.df <- sapply(novel.status, function(x){
x.c <- c(strsplit(x$getText(), "\n")[[1]], 0)
x.c[2] <- gsub("[^0-9]+", "", x.c[2])
x.c[1:2]
})
cat(paste("あらすじ取得中...\n", getSysTime(), "\n"))
novel.ex <- firefox$findElementsByClassName("ex")
cat(paste("小説関連情報取得中...\n", getSysTime(), "\n"))
novel.opt <- firefox$findElementsByCssSelector(".searchkekka_box > table > tbody > tr > td:nth-of-type(2)")
novel.opt.df <- sapply(novel.opt, function(x){
x.c <- gsub("^[ ]+|[\n ]|[ ]+$","",strsplit(x$getText(), "[^[:space:]]+[:]{1}")[[1]])
n <- length(x.c)
x.c <- x.c[(n-10):n]
x.c[4] <- gsub(" ", "", x.c[4])
x.c[5] <- gsub("約[[:digit:]]{1,4}分(|,|文字)", "", x.c[5])
x.c[7] <- gsub("件", "", x.c[7])
x.c[8] <- gsub(" pt", "", x.c[8])
x.c[9] <- gsub(" 人", "", x.c[9])
x.c[10] <- gsub(" pt", "", x.c[10])
x.c[11] <- gsub("件", "", x.c[11])
x.c
})
#データセットに格納
novellist.df <- rbind(novellist.df, data.frame(
novel.id=sapply(strsplit(titles.df[2,],"/"), function(x){x[4]}),
novel.name=titles.df[1,],
novel.status=novel.status.df[1,],
novel.npara=novel.status.df[2,],
novel.genre=novel.opt.df[2,],
novel.ex=sapply(novel.ex, function(x){x$getText()}),
novel.info.url=novel.info.df[2,],
novel.url=titles.df[2,],
novel.keywords=novel.opt.df[3,],
writer.id=sapply(strsplit(writers.df[2,],"/"), function(x){x[4]}),
writer.name=writers.df[1,],
writer.url=writers.df[2,],
novel.updated=novel.opt.df[4,],
novel.nchar=novel.opt.df[5,],
novel.nuser=novel.opt.df[6,],
novel.nreview=novel.opt.df[7,],
novel.nval=novel.opt.df[8,],
novel.nfav=novel.opt.df[9,],
created=sapply(1:length(titles), function(x){getSysTime()})
))
cat(paste("次ページリンク取得中...\n", getSysTime(), "\n"))
nextlink <- firefox$findElementByClassName("nextlink")
if (is.null(nextlink)) break
nextlink$click()
}
#ファイルへ出力
write.table(novellist.df, "./INPUT.txt")
#Firefoxを停止
firefox$close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment