Last active
July 29, 2017 18:35
-
-
Save youngjoon5/f43a4692eb15ee816f460cfe731f06ec to your computer and use it in GitHub Desktop.
Renaissance network
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#############################
# Making Renaissance network
# Young Joon Oh
# https://www.youtube.com/watch?v=FOLTJ-Tg8dg
# https://sites.google.com/site/youngjoon5/
#############################
###
# Scraping
###
library(rvest)

# Download the Wikipedia list page that enumerates Renaissance figures.
figures <- read_html("https://en.wikipedia.org/wiki/List_of_Renaissance_figures")

# All list-item anchors on the page. Named `link_nodes` rather than `list`
# so we do not mask base::list().
link_nodes <- html_nodes(figures, "li a")
link_nodes

## Create Label ##
# Entries 8:182 are the actual person links; everything else is page chrome.
# NOTE(review): these positions are layout-dependent -- re-check the slice
# if the Wikipedia page structure changes.
temp_list <- html_text(link_nodes)
list_d <- temp_list
Re_list <- list_d[8:182]
Re_list <- unique(Re_list)            # drop duplicated names
Re_list_d <- data.frame(Re_list)
colnames(Re_list_d) <- "label"

# nodes (character vector of person names, used as graph labels)
label <- Re_list

## Creating nodes ##
# The relative href of each person link serves as the node id.
temp_list <- link_nodes[8:182]
temp_list <- html_attr(temp_list, "href")
temp_list <- unique(temp_list)
nodes <- temp_list
#####
## For web crawling
####
## Creating complete html links ##
# Base URL prepended to each relative href scraped from the page.
ht <- "https://en.wikipedia.org"

# completelink: build an absolute Wikipedia URL from a relative href.
#
# a : a relative link, e.g. "/wiki/Leonardo_da_Vinci".
# Returns the absolute URL as a character string (vectorized over `a`).
completelink <- function(a) {
  # paste0() is the idiomatic form of paste(..., sep = "").
  paste0(ht, a)
}
# Turn every relative href into an absolute URL, one per node.
links <- unlist(lapply(temp_list, completelink))
## create edge data through exploring connected web pages
# Inspect the three parallel vectors (same length, same order).
print(label)
print(nodes)
print(links)

# check two columns with dataframe
temp <- cbind(nodes, label, links)
temp_d <- as.data.frame(temp)
names(temp_d)[2] <- "label"
# Create function: find_text -> finds mentions of a node in the target web page.
# node -> find_text -> the other artist's web page
# find_text: scan one target page for links that mention a source node.
#
# so    : the source node's relative href, used as a search pattern.
# ta_li : absolute URL of the target page to download and scan.
# Returns the unique hrefs found inside "p a" elements of the target page
# that match `so` (an empty character vector when there is no mention).
find_text <- function(so, ta_li) {
  page <- read_html(ta_li)
  hrefs <- html_attr(html_nodes(page, "p a"), "href")
  # value = TRUE (never the reassignable alias T) returns the matching
  # hrefs themselves rather than their indices.
  # NOTE(review): `so` is interpreted as a regex; hrefs contain "/" and
  # other metacharacters -- consider fixed = TRUE if false matches appear.
  unique(grep(so, hrefs, value = TRUE))
}
# Loop: for every node, scan every *other* node's page for a mention.
# An edge Source -> Target is recorded when the Target page links to the
# Source node's href. Edges are collected in a list and bound once at the
# end -- rbind() inside the loop grows quadratically and needed a dummy
# first row that then had to be deleted.
edge_list <- list()
for (i in seq_along(nodes)) {           # seq_along instead of hard-coded 1:173
  so <- nodes[i]
  temp_link <- links[-i]                # except the node's own link
  cat("\n")
  cat("1st loop, i= ", i)               # For check
  for (j in seq_along(temp_link)) {     # seq_along instead of hard-coded 1:172
    target_li <- temp_link[j]
    temp_find <- find_text(so, target_li)
    cat("\n")
    cat(" j=", j)                       # For check
    if (length(temp_find) != 0) {       # skip it when nothing matches
      pos <- which(links == target_li)  # position of the page mentioning `so`
      ta <- nodes[pos]
      cat(" Source= ", so, " Target=", ta)  # For check
      cat("\n")                         # line break
      edge_list[[length(edge_list) + 1]] <- data.frame(Source = so, Target = ta)
    }
  }
}
# Bind all collected edges; keep a well-typed empty frame when none found.
if (length(edge_list) == 0) {
  re_edges <- data.frame(Source = character(0), Target = character(0))
} else {
  re_edges <- do.call(rbind, edge_list)
}
## create .csv files
# Node table for network import: id (href), label (name), category.
temp <- cbind(nodes, label)
re_nodes <- data.frame(temp)
colnames(re_nodes)[1] <- "id"

# adding category column
# One category per contiguous block of rows, in the order the groups appear
# on the Wikipedia page. rep(..., times = ...) replaces seven separate
# rep() calls.
# NOTE(review): the counts (75 + 13 + 19 + 22 + 20 + 5 + 19 = 173) are
# layout-dependent -- re-check them if the source page changes.
category <- rep(
  c("Artist and architect", "Mathematician", "Writer", "Philosopher",
    "Composers", "Dancemaster", "Explorer and navigator"),
  times = c(75, 13, 19, 22, 20, 5, 19)
)
# Fail loudly if the hard-coded counts drift out of sync with the node list,
# instead of producing a cryptic cbind() recycling error.
stopifnot(length(category) == nrow(re_nodes))
re_nodes <- cbind(re_nodes, category)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.