Skip to content

Instantly share code, notes, and snippets.

@linnil1
Last active September 18, 2016 13:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linnil1/32dcbd6c38f956db59a3a148ffa3b2fe to your computer and use it in GitHub Desktop.
Save linnil1/32dcbd6c38f956db59a3a148ffa3b2fe to your computer and use it in GitHub Desktop.
New to R; using R to scrape NTU course listings.
library("httr")
library("xml2")
library("magrittr")
# Fetch one page of the NTU course-search site and parse it.
#
# query: named list of URL query parameters (default: none).
# file:  script name appended to the course-search base URL.
# Returns the response body as parsed by httr::content(), decoded as Big5.
# Stops with an error when the server answers with an HTTP error status.
webGet <- function(query = list(), file = "search_for_02_dpt.php") {
  url <- paste0("http://nol.ntu.edu.tw/nol/coursesearch/", file)
  response <- GET(url, query = query)
  # Fail loudly instead of parsing an error page as if it were course data.
  stop_for_status(response)
  content(response, encoding = "Big5")
}
if (FALSE) {
  # Disabled helper: print the options of the `dptname` <select> on the
  # default search page, each option text split on spaces.
  html <- webGet()
  dpt_options <- xml_children(xml_find_all(html, "//select[@id='dptname']"))
  dptname <- strsplit(xml_text(dpt_options, trim = TRUE), ' ')
  print(dptname)
}
# Download every course row of one department for semester 105-1.
#
# depname: department code sent as the `dptname` query parameter
#          (e.g. "1010").
# Returns a list with one element per course table row (header row
# excluded); each element is the nodeset of that row's <td> cells.
# The header cell texts and the reported course count are printed on the way.
# Stops with an error if the course count cannot be read from the page.
depGet <- function(depname) {
  page_size <- 15

  # Fetch one result page and return the node(s) matching //table[5], which
  # this script treats as the course listing. `startrec` is left out of the
  # query for the first page.
  fetch_table <- function(startrec = NULL) {
    query <- list(current_sem = "105-1", dptname = depname, yearcode = "0")
    if (!is.null(startrec)) {
      query$startrec <- startrec
    }
    xml_find_all(webGet(query), "//table[5]")
  }

  # Split a course table into per-row <td> nodesets, dropping the first row.
  row_cells <- function(course) {
    lapply(xml_find_all(course, "tr")[-1], function(tr) xml_find_all(tr, "td"))
  }

  course <- fetch_table()
  course_head <- xml_text(xml_find_all(course, "tr[1]/td"), trim = TRUE)
  print(course_head)

  count_node <- xml_find_first(course, "//font[@color='#CC0033']")
  course_num <- strtoi(xml_text(count_node, trim = TRUE))
  print(course_num)
  if (length(course_num) != 1 || is.na(course_num)) {
    stop("could not read the course count for department ", depname,
         call. = FALSE)
  }

  # seq(15, course_num, 15) raises "wrong sign in 'by'" when fewer than 15
  # courses exist, so only build follow-up offsets when there is a next page.
  # NOTE(review): the upper bound is inclusive as in the original, so a count
  # that is an exact multiple of 15 requests one extra offset -- confirm
  # whether `startrec` is 0-based before tightening it.
  offsets <- if (course_num >= page_size) {
    seq(page_size, course_num, by = page_size)
  } else {
    numeric(0)
  }

  later_pages <- lapply(offsets, function(num) row_cells(fetch_table(num)))
  c(row_cells(course), do.call(c, later_pages))
}
# Scrape all course rows for department code "1010".
data <- depGet("1010")
# Turn scraped course rows into a three-column data frame.
#
# data: list of <td> nodesets, one per course row (as returned by depGet()).
# Returns a data.frame with character columns `className` (text of cell 5),
# `department` (text of cell 2) and `classLink` (href of the first <a> in
# cell 5, NA when there is none). An empty `data` yields a zero-row data
# frame with the same columns instead of an error.
depMod <- function(data) {
  columns <- c("className", "department", "classLink")

  # matrix(NULL, ncol = 3) is an error, so handle "no rows" up front.
  if (length(data) == 0) {
    empty <- data.frame(
      className = character(0),
      department = character(0),
      classLink = character(0),
      stringsAsFactors = FALSE
    )
    return(empty)
  }

  # vapply() enforces exactly three fields per row, so a malformed row fails
  # here rather than silently shifting values between columns.
  fields <- vapply(data, function(td) {
    name_cell <- td[5]
    c(
      xml_text(name_cell),
      xml_text(td[2]),
      xml_attr(xml_find_first(name_cell, "a"), "href")
    )
  }, character(3), USE.NAMES = FALSE)

  # `fields` is 3 x n (one column per row of `data`); transpose to n x 3.
  df <- data.frame(t(fields), stringsAsFactors = FALSE)
  colnames(df) <- columns
  df
}
# Reduce the raw rows to the name / department / link table.
data <- depMod(data)
# write
# write.csv() cannot create missing directories, so make sure "data" exists;
# showWarnings = FALSE keeps this quiet when it is already there.
dir.create("data", showWarnings = FALSE)
write.csv(data, "data/test.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment