@ceaksan · Created March 22, 2021 20:16
any(grepl("curl", installed.packages()))
any(grepl("XML", installed.packages()))
library(curl)
library(XML)
# Fetch an XML sitemap, parse it into a data frame and check the HTTP
# status code of every <loc> URL it contains.
getURL <- function(url) {

  # Return the last file extension of the URL (e.g. "xml" for ".../sitemap.xml")
  getExtension <- function(url) {
    ex <- strsplit(basename(url), split = "\\.")[[1]]
    ex[length(ex)]
  }

  # Only proceed for URLs that end in ".xml"
  if (!is.na(url) &&
      length(getExtension(url)) == 1 &&
      getExtension(url) == "xml") {

    cat("Please wait while fetching the XML...", "\n")
    get <- curl_fetch_memory(url)

    if (get$status_code == 200 &&
        !is.null(get$content) &&
        grepl("xml", get$type)) {

      # Parse the sitemap and turn its nodes into a data frame
      xmlData <- xmlParse(rawToChar(get$content), encoding = "UTF-8")
      # rootNodeList <- xmlToList(xmlData)
      rootNodeDF <- xmlToDataFrame(xmlData)
      rootNodeDF$status <- NA
      rootNodeDFwStatus <- rootNodeDF

      # Callbacks that collect the results of the asynchronous requests
      data <- list()
      success <- function(res) {
        # cat(res$url, res$status, "\n")
        data <<- c(data, list(res))
      }
      failure <- function(msg) {
        cat("Request failed!", msg, "\n")
      }

      cat("The URL status check has started!", "\n")

      # Queue all URLs in a single curl pool and run the requests concurrently
      getURIs <- function(URIs) {
        pool <- new_pool()
        for (i in URIs) {
          cat("> ", i, "\n")
          curl_fetch_multi(i, done = success, fail = failure, pool = pool)
        }
        multi_run(pool = pool)
      }
      getURIs(rootNodeDFwStatus$loc)

      # Write each status code back to the matching row of the data frame
      for (j in seq_along(data)) {
        w <- which(rootNodeDFwStatus$loc == data[[j]]$url)
        if (length(w) > 0) {
          rootNodeDFwStatus[w, ]$status <- data[[j]]$status_code
        }
      }

      rootNodeDFwStatus$xmlFileName <- parseURI(url)$path
      return(rootNodeDFwStatus)
    }
  }
}
getURL("https://ilkadimlarim.com/kategorilersitemap.xml")
# Fetch a sitemap (or sitemap index), resolve any sub-sitemaps it lists and
# save the combined result as .csv, .xlsx or, for any other extension, as RDS.
saveXMLFile <- function(url,
                        extension = ".csv",
                        fileName = format(Sys.time(), "%y-%m-%d_%H-%M-%S"),
                        workDir = "/Users/user/Desktop/",
                        preX = "sitemap_",
                        fileDir = "sitemaps") {

  # If the URL is a sitemap index, every <loc> points to another sitemap;
  # fetch those as well and bind them into one data frame.
  res <- getURL(url)
  pageData <- lapply(res$loc, getURL)
  pages <- do.call(rbind.data.frame, pageData)
  dataXML <- if (ncol(pages) > 2) pages else res

  # Create the output directory if needed and build the file name
  fullPath <- paste0(workDir, fileDir)
  if (!dir.exists(fullPath)) dir.create(fullPath)
  setwd(fullPath)
  file <- paste0(preX, fileName, extension)

  # Write the data in the requested format
  switch(
    extension,
    ".csv" = {
      write.csv(dataXML, file)
    },
    ".xlsx" = {
      library(xlsx)
      write.xlsx(dataXML, file, sheetName = "Links")
    },
    {
      saveRDS(dataXML, file)
    }
  )
}
siteMapURL <- "https://domain.com/sitemap.xml"
saveXMLFile(siteMapURL)
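# Possible variations, not part of the original gist: the same helper writes an
# .xlsx workbook when extension = ".xlsx" (requires the `xlsx` package) and
# falls back to saveRDS() for any other extension. The fileDir value below is
# only illustrative.
saveXMLFile(siteMapURL, extension = ".xlsx", fileDir = "sitemaps_xlsx")
saveXMLFile(siteMapURL, extension = ".rds")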