Skip to content

Instantly share code, notes, and snippets.

@youngjoon5
Last active July 29, 2017 18:35
Show Gist options
  • Save youngjoon5/f43a4692eb15ee816f460cfe731f06ec to your computer and use it in GitHub Desktop.
Renaissance network
##############################
# Making Renaissance network
# Young Joon Oh
# https://www.youtube.com/watch?v=FOLTJ-Tg8dg
# https://sites.google.com/site/youngjoon5/
#############################
###
# Scraping
###
# Scrape the Wikipedia "List of Renaissance figures" page and collect
# every list-item link (<li><a>) on it.
library(rvest)
renaissance_url <- "https://en.wikipedia.org/wiki/List_of_Renaissance_figures"
figures <- read_html(renaissance_url)
# NOTE(review): `list` shadows base::list(); kept because later code refers
# to it by this name.
list <- html_nodes(figures, "li a")
list
## Create Label ##
# Entries 8:182 of the scraped <li><a> nodes are the Renaissance figures;
# the earlier entries are page navigation links.
list_d <- html_text(list)
Re_list <- unique(list_d[8:182])
Re_list_d <- data.frame(label = Re_list)
# nodes (list type)
label <- Re_list
## Creating nodes ##
# NOTE(review): labels and hrefs are de-duplicated independently; if the
# two vectors ever contain different duplicates they will fall out of
# alignment -- confirm both have the same length after unique().
temp_list <- unique(html_attr(list[8:182], "href"))
nodes <- temp_list
#####
## For Web crawling
####
## Creating complete html links ##
ht <- "https://en.wikipedia.org"  # base URL prepended to scraped relative hrefs
# Build a complete Wikipedia URL from a relative href.
#
# a: character vector of relative paths, e.g. "/wiki/Raphael".
# Returns: character vector of absolute URLs (paste0 is vectorized, so a
#          whole vector of hrefs can be converted in one call).
completelink <- function(a) { # for complete html links
  paste0(ht, a)
}
## Build the complete html link for every node page.
links <- unlist(lapply(temp_list, completelink))
## create edge data through exploring connected web pages
label
nodes
links
# Check the three columns side by side in a data frame.
temp <- cbind(nodes, label, links)
temp_d <- data.frame(temp)
colnames(temp_d)[2] <- "label"
# Create Function : find_text -> Finding node data in the Target web page.
# node -> find_text -> the other artist's webpage
#
# so:    the node's relative href (e.g. "/wiki/Raphael") to search for.
# ta_li: complete URL of the target page to scan.
# Returns: unique hrefs found in the target page's paragraph links ("p a")
#          that contain `so`; empty character vector when nothing matches.
find_text <- function(so, ta_li) {
  temp1 <- read_html(ta_li)
  temp2 <- html_attr(html_nodes(temp1, "p a"), "href")
  # fixed = TRUE: `so` is a literal URL fragment, not a regex. Hrefs can
  # contain regex metacharacters (e.g. parentheses in disambiguated
  # Wikipedia titles), which would otherwise be misinterpreted.
  temp3 <- grep(so, temp2, value = TRUE, fixed = TRUE)
  unique(temp3)
}
# Loop: for every node, scan each OTHER figure's page for links back to
# it; each hit becomes one directed edge (Source -> Target).
temp_edges <- data.frame("", "")          # dummy seed row, dropped later via [-1, ]
colnames(temp_edges) <- c("Source", "Target")
for (i in seq_along(nodes)) {             # seq_along replaces fragile hard-coded 1:173
  so <- nodes[i]
  temp_link <- links[-i]                  # except the node's own link
  cat("\n")
  cat ("1st loop, i= ", i)                # For check
  for (j in seq_along(temp_link)) {       # replaces hard-coded 1:172
    target_li <- temp_link[j]
    temp_find <- find_text(so, target_li)
    cat("\n")
    cat (" j=", j)                        # For check
    if (length(temp_find) != 0) {         # skip it when nothing matches
      pos <- which(links == target_li)    # find position of the web page including "so"
      ta <- nodes[pos]
      cat(" Source= ", so, " Target=", ta) # For check
      cat("\n")                           # line break
      update <- data.frame(so, ta)
      colnames(update) <- c("Source", "Target")
      temp_edges <- rbind(temp_edges, update)
    }
  }
}
# Final tables for .csv export: edge list plus node list with categories.
re_edges <- temp_edges[-1, ]  # drop the dummy seed row used to start the loop
## create .csv files
temp <- cbind(nodes, label)
re_nodes <- data.frame(temp)
colnames(re_nodes)[1] <- "id"
# adding category column: counts follow the section order of the source list
# (75 + 13 + 19 + 22 + 20 + 5 + 19 = 173 nodes).
category <- rep(
  c("Artist and architect", "Mathematician", "Writer", "Philosopher",
    "Composers", "Dancemaster", "Explorer and navigator"),
  times = c(75, 13, 19, 22, 20, 5, 19)
)
re_nodes <- cbind(re_nodes, category)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment