Skip to content

Instantly share code, notes, and snippets.

@dubsnipe
Created May 17, 2023 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dubsnipe/45d4dff5ae4fe8e2327407c535978355 to your computer and use it in GitHub Desktop.
Save dubsnipe/45d4dff5ae4fe8e2327407c535978355 to your computer and use it in GitHub Desktop.
# Script setup ----

# All relative paths below (including the CSV written at the end of the script)
# resolve against this directory.
# NOTE(review): this is a machine-specific absolute path; the script stops here
# on any machine where the directory does not exist.
setwd("C:/Users/rudes/Documents/2023/appropedia/data dump")

# library() stops immediately when a package is missing, whereas require() only
# warns and returns FALSE, which would let the script run on and fail later
# with an unrelated "could not find function" error.
library(jsonlite)
library(data.table)
library(tidyverse)

# Query prefix for the wiki's `allpages` listing: up to 500 entries per
# request, JSON output. The title to start from is appended after `apfrom=`.
base_url <- "https://www.appropedia.org/w/api.php?action=query&list=allpages&aplimit=500&format=json&apfrom="
#' Fetch one batch of page records from the wiki's `allpages` listing.
#'
#' Appends `apfrom` to the global `base_url`, downloads the response, parses it
#' with `fromJSON()` and drops every title matching the translation-sub-page
#' pattern.
#'
#' @param apfrom Title at which the listing should start. It is percent-encoded
#'   before being appended to the URL. A value that already contains a `%xx`
#'   escape is passed through unchanged (default behaviour of
#'   `utils::URLencode()`).
#' @return A data frame with the pages of the response whose titles do not
#'   match the translation pattern. When the response contains no pages, an
#'   empty data frame with the columns `pageid`, `ns` and `title` is returned
#'   (presumably the columns the API delivers; verify against a live response).
to_df <- function(apfrom = "\"Backpack\"_Snap-fit_Clips") {
  # Percent-encode the title so characters such as `"`, `&`, `#`, `+` or
  # non-ASCII letters cannot truncate or corrupt the query string.
  encoded_from <- utils::URLencode(enc2utf8(apfrom), reserved = TRUE)
  call_url <- paste0(base_url, encoded_from)

  # warn = FALSE: a response without a trailing newline is not a problem here.
  raw_json <- paste(readLines(call_url, warn = FALSE), collapse = "")
  json_data <- fromJSON(raw_json, flatten = TRUE)
  json_pages <- json_data$query$allpages

  if (length(json_pages) == 0) {
    # The original used `break` here, which is an error inside a function that
    # is not itself within a loop. Return an empty, correctly shaped frame so
    # callers can keep filtering on `ns` and decide for themselves when to stop.
    return(data.frame(
      pageid = integer(),
      ns = integer(),
      title = character(),
      stringsAsFactors = FALSE
    ))
  }

  # Removing translations.
  # NOTE(review): the pattern matches any title ending in "/" followed by two
  # or more word characters, so it also drops ordinary sub-pages such as
  # "Foo/Bar" -- confirm whether only two-letter language codes were intended.
  json_pages %>% filter(!grepl("/\\w+{2}$", title))
}
# Crawl the page listing batch by batch and write all titles to a CSV ----

# Clear results left over from a previous run. Only objects that actually exist
# are removed, so a fresh session does not produce "object not found" warnings.
rm(list = intersect(c("data", "new_data", "old_data"), ls()))

# First batch: keep namespace 0 only, then drop the namespace column.
data <- to_df() %>%
  filter(ns == 0) %>%
  select(-ns)

if (nrow(data) == 0) {
  stop("The first request returned no pages to continue from.", call. = FALSE)
}

repeat {
  # Resume the listing at the last title collected so far. That title is
  # returned again as the first row of the next batch; the duplicate is removed
  # by unique() below.
  apfrom <- str_replace_all(tail(data$title, 1), "\\s+", "_")

  new_data <- to_df(apfrom) %>%
    filter(ns == 0) %>%
    select(-ns)

  # Stop as soon as a batch contributes no title we have not seen yet. This
  # covers the end of the listing (only the resume title comes back) as well as
  # an empty batch, and avoids the extra request the original needed in order
  # to observe two identical consecutive batches.
  if (all(new_data$title %in% data$title)) {
    break
  }

  data <- bind_rows(data, new_data)

  # Pause between requests to avoid hammering the server.
  Sys.sleep(1)
}

# Drop the rows that overlap between consecutive batches.
data <- data %>% unique()

# File name carries the current date, e.g. all_pages_2023-05-17.csv.
write.csv(data, paste0("all_pages_", Sys.Date(), ".csv"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment