Created
May 15, 2011 11:59
-
-
Save masaha03/973085 to your computer and use it in GitHub Desktop.
KAKENから研究者ネットワーク作成
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
# 社会学者一覧 | |
# 研究種目が社会学の研究者を検索→研究者番号を取得→検索2ページ目以降の研究者の研究者番号をループで所得 | |
# 取得の所要時間は5分程度 | |
search.result <- xmlParse(enc2utf8('http://kaken.nii.ac.jp/opensearchr.cgi?count=200&q1="社会学"')) # 研究者検索 | |
search.result <- xmlToList(search.result) | |
search.result <- rapply(search.result, function(x){iconv(x,"UTF-8","CP932")}, how="replace") | |
entries <- search.result[which(names(search.result)=="entry")] | |
r.names <- sapply(entries, function(x){x$title$text}) # 研究者名 | |
links <- sapply(entries, function(x){x$link}) # 研究者ページへのリンク | |
numbers <- sub("http://kaken.nii.ac.jp/ja/r/", "", links) # 研究者番号 | |
names(numbers) <- r.names | |
selfURL <- search.result[[11]][[2]] # 検索の現在のページ | |
nextURL <- search.result[[13]][[2]] # 検索の次のページ | |
lastURL <- search.result[[14]][[2]] # 検索の最後のページ | |
continue <- TRUE | |
while(continue) { # 検索の2ページ目以降の研究者をループで取得 | |
search.result <- xmlParse(enc2utf8(nextURL)) | |
search.result <- xmlToList(search.result) | |
search.result <- rapply(search.result, function(x){iconv(x,"UTF-8","CP932")}, how="replace") | |
entries <- search.result[which(names(search.result)=="entry")] | |
r.names <- sapply(entries, function(x){x$title$text}) | |
links <- sapply(entries, function(x){x$link}) | |
numbers.tmp <- sub("http://kaken.nii.ac.jp/ja/r/", "", links) | |
names(numbers.tmp) <- r.names | |
numbers <- c(numbers, numbers.tmp) | |
selfURL <- search.result[[11]][[2]] | |
nextURL <- search.result[[14]][[2]] | |
continue <- selfURL != lastURL # 現在のページのURLと最後のページのURLが同じならば終了 | |
} | |
length(numbers) | |
# 研究者ネットワーク | |
# 研究者番号から研究者ネットワークを作成(https://gist.github.com/973078) | |
# 100人分で8分程度 | |
numbers.sample <- sample(numbers, 100) # 研究者をサンプリング | |
researcher.network <- list() | |
for (number in numbers.sample){ | |
url <- sprintf("http://kaken.nii.ac.jp/opensearchk.cgi?count=200&q15=%s", number) | |
Sys.sleep(0.5) # サーバー負荷対策 | |
search.result <- xmlParse(enc2utf8(url)) | |
search.result <- xmlToList(search.result) | |
search.result <- rapply(search.result, function(x){iconv(x,"UTF-8","CP932")}, how="replace") | |
entries <- search.result[which(names(search.result)=="entry")] | |
links <- sapply(entries, function(x){x$link}) | |
links <- sub("ja","rdf", links) | |
membersNum <- list() | |
for (i in seq_along(links)){ | |
Sys.sleep(0.5) # サーバー負荷対策 | |
kaken.page <- xmlParse(links[i]) | |
kaken.page <- xmlToList(kaken.page) | |
kaken.page <- rapply(kaken.page, function(x){iconv(x,"UTF-8","CP932")}, how="replace") | |
project <- kaken.page$Project | |
members <- project[which(names(project)=="member")] | |
membersNum[[i]] <- sapply(members, function(x){x$Researcher$researcherNumber}) | |
} | |
membersNum <- list(unique(unlist(membersNum))) | |
names(membersNum) <- number | |
researcher.network <- c(researcher.network, membersNum) | |
} | |
# 隣接行列作成 | |
row <- unique(unlist(researcher.network)) | |
r.names <- names(researcher.network) | |
mat <- matrix(0, nrow=length(numbers), ncol=length(numbers), dimnames=list(numbers, numbers)) | |
for (i in seq_along(researcher.network)){ | |
col <- names(researcher.network)[i] | |
row <- researcher.network[[i]] | |
row <- row[row %in% numbers] # 研究種目が社会学でない研究者を除去 | |
mat[row, col] <- 1 | |
} | |
mat <- mat[r.names, r.names] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment