Last active
July 29, 2017 18:35
-
-
Save youngjoon5/f43a4692eb15ee816f460cfe731f06ec to your computer and use it in GitHub Desktop.
Renaissance network
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#############################
# Making Renaissance network
# Young Joon Oh
# https://www.youtube.com/watch?v=FOLTJ-Tg8dg
# https://sites.google.com/site/youngjoon5/
#############################
###
# Scraping
###
library(rvest)

# Download the Wikipedia list page that enumerates Renaissance figures.
figures <- read_html("https://en.wikipedia.org/wiki/List_of_Renaissance_figures")

# All list-item anchors on the page. Named `link_nodes` rather than `list`
# so we do not mask base::list().
link_nodes <- html_nodes(figures, "li a")
link_nodes

## Create Label ##
# Entries 8:182 are the actual person links; everything else is page chrome.
# NOTE(review): these positions are layout-dependent -- re-check the slice
# if the Wikipedia page structure changes.
temp_list <- html_text(link_nodes)
list_d <- temp_list
Re_list <- list_d[8:182]
Re_list <- unique(Re_list)            # drop duplicated names
Re_list_d <- data.frame(Re_list)
colnames(Re_list_d) <- "label"

# nodes (character vector of person names, used as graph labels)
label <- Re_list

## Creating nodes ##
# The relative href of each person link serves as the node id.
temp_list <- link_nodes[8:182]
temp_list <- html_attr(temp_list, "href")
temp_list <- unique(temp_list)
nodes <- temp_list
#####
## For web crawling
####
## Creating complete html links ##
# Base URL prepended to each relative href scraped from the page.
ht <- "https://en.wikipedia.org"

# completelink: build an absolute Wikipedia URL from a relative href.
#
# a : a relative link, e.g. "/wiki/Leonardo_da_Vinci".
# Returns the absolute URL as a character string (vectorized over `a`).
completelink <- function(a) {
  # paste0() is the idiomatic form of paste(..., sep = "").
  paste0(ht, a)
}
# Turn every relative href into an absolute URL, one per node.
links <- unlist(lapply(temp_list, completelink))
## create edge data through exploring connected web pages
# Inspect the three parallel vectors (same length, same order).
print(label)
print(nodes)
print(links)

# check two columns with dataframe
temp <- cbind(nodes, label, links)
temp_d <- as.data.frame(temp)
names(temp_d)[2] <- "label"
# Create function: find_text -> finds mentions of a node in the target web page.
# node -> find_text -> the other artist's web page
# find_text: scan one target page for links that mention a source node.
#
# so    : the source node's relative href, used as a search pattern.
# ta_li : absolute URL of the target page to download and scan.
# Returns the unique hrefs found inside "p a" elements of the target page
# that match `so` (an empty character vector when there is no mention).
find_text <- function(so, ta_li) {
  page <- read_html(ta_li)
  hrefs <- html_attr(html_nodes(page, "p a"), "href")
  # value = TRUE (never the reassignable alias T) returns the matching
  # hrefs themselves rather than their indices.
  # NOTE(review): `so` is interpreted as a regex; hrefs contain "/" and
  # other metacharacters -- consider fixed = TRUE if false matches appear.
  unique(grep(so, hrefs, value = TRUE))
}
# Loop: for every node, scan every *other* node's page for a mention.
# An edge Source -> Target is recorded when the Target page links to the
# Source node's href. Edges are collected in a list and bound once at the
# end -- rbind() inside the loop grows quadratically and needed a dummy
# first row that then had to be deleted.
edge_list <- list()
for (i in seq_along(nodes)) {           # seq_along instead of hard-coded 1:173
  so <- nodes[i]
  temp_link <- links[-i]                # except the node's own link
  cat("\n")
  cat("1st loop, i= ", i)               # For check
  for (j in seq_along(temp_link)) {     # seq_along instead of hard-coded 1:172
    target_li <- temp_link[j]
    temp_find <- find_text(so, target_li)
    cat("\n")
    cat(" j=", j)                       # For check
    if (length(temp_find) != 0) {       # skip it when nothing matches
      pos <- which(links == target_li)  # position of the page mentioning `so`
      ta <- nodes[pos]
      cat(" Source= ", so, " Target=", ta)  # For check
      cat("\n")                         # line break
      edge_list[[length(edge_list) + 1]] <- data.frame(Source = so, Target = ta)
    }
  }
}
# Bind all collected edges; keep a well-typed empty frame when none found.
if (length(edge_list) == 0) {
  re_edges <- data.frame(Source = character(0), Target = character(0))
} else {
  re_edges <- do.call(rbind, edge_list)
}
## create .csv files
# Node table for network import: id (href), label (name), category.
temp <- cbind(nodes, label)
re_nodes <- data.frame(temp)
colnames(re_nodes)[1] <- "id"

# adding category column
# One category per contiguous block of rows, in the order the groups appear
# on the Wikipedia page. rep(..., times = ...) replaces seven separate
# rep() calls.
# NOTE(review): the counts (75 + 13 + 19 + 22 + 20 + 5 + 19 = 173) are
# layout-dependent -- re-check them if the source page changes.
category <- rep(
  c("Artist and architect", "Mathematician", "Writer", "Philosopher",
    "Composers", "Dancemaster", "Explorer and navigator"),
  times = c(75, 13, 19, 22, 20, 5, 19)
)
# Fail loudly if the hard-coded counts drift out of sync with the node list,
# instead of producing a cryptic cbind() recycling error.
stopifnot(length(category) == nrow(re_nodes))
re_nodes <- cbind(re_nodes, category)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.