Skip to content

Instantly share code, notes, and snippets.

@schochastics
Created October 14, 2018 21:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save schochastics/a79cb0c6beee8d9758decc778d2c0ea3 to your computer and use it in GitHub Desktop.
Save schochastics/a79cb0c6beee8d9758decc778d2c0ea3 to your computer and use it in GitHub Desktop.
library(rvest)
library(tidyverse)
# helper functions
get_theo_refs_cache <- function(url){
doc <- read_html(url)
caps <- doc %>% html_nodes("caption") %>% html_text()
skippy <- (any(caps=="Hypothesis" | caps=="Hypotheses"))+0
refs <- doc %>%
html_table() %>%
.[[4+skippy]] %>%
.$Ref %>%
# word(1,sep=" ") %>%
str_remove_all("\\s[0-9]+") %>%
str_trim() %>%
unique()
descr <- doc %>%
html_table() %>%
.[[2]] %>%
pull(X1) %>%
str_squish() %>%
str_remove("Description: ") %>%
str_remove("\\..*")
return(list(refs=refs,description=descr))
}
fc <- memoise::cache_filesystem(".cache")
get_theo_refs <- memoise::memoise(get_theo_refs_cache, cache = fc)
get_descr_cache <- function(url){
doc <- tryCatch(read_html(url),error=function(e) NULL)
if(is.null(doc)){
descr <- "None"
} else{
descr <- doc %>%
html_table() %>%
.[[2]] %>%
pull(X1) %>%
str_squish() %>%
str_remove("Description: ") %>%
str_remove("\\..*")
}
descr
}
get_descr <- memoise::memoise(get_descr_cache, cache = fc)
crawl_theo <- function(theo,depth=2){
el <- matrix(0,0,2)
descriptions <- matrix(0,0,2)
base_url <- "http://us.metamath.org/mpeuni/"
url <- paste0(base_url,theo,".html")
res <- get_theo_refs(url)
links <- res$refs
descriptions <- rbind(descriptions,cbind(theo,res$description))
el <- rbind(el,cbind(rep(theo,length(links)),links))
d <- 1
while(d<depth){
print(d)
new_links <- c()
for(l in links){
url <- paste0(base_url,l,".html")
res <- tryCatch(get_theo_refs(url),error=function(e) NULL)
if(is.null(res)){
next()
}
tmp <- res$refs
descriptions <- rbind(descriptions,cbind(l,res$description))
el <- rbind(el,cbind(rep(l,length(tmp)),tmp))
new_links <- c(new_links,tmp)
}
d <- d+1
links <- new_links
print(length(links))
}
for(l in links){
url <- paste0(base_url,l,".html")
descr <- get_descr(url)
descriptions <- rbind(descriptions,c(l,descr))
}
descriptions <- descriptions[!duplicated(descriptions[,1]),]
return(list(el=el,descr=descriptions))
}
################################################################################
c("basel","ftc1", "ftc2", "demoivre", "pythagtrip", "leibpi", "ramsey", "amgm",
"binom", "pw2en","konigsberg", "lhop", "perfect","mvth", "bpos") -> theos
for(theo in theos){
res <- suppressWarnings(crawl_theo(theo,3))
g <- graph_from_edgelist(res$el)
g <- simplify(g)
V(g)$descr <- res$descr[match(V(g)$name,res$descr[,1]),2]
saveRDS(g,paste0(theo,".rds"))
Sys.sleep(runif(1,100,200))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment