Skip to content

Instantly share code, notes, and snippets.

@ceaksan
Last active March 25, 2021 19:43
Show Gist options
  • Save ceaksan/b34d3b71091c3329555cb5ff8c43b81b to your computer and use it in GitHub Desktop.
Save ceaksan/b34d3b71091c3329555cb5ff8c43b81b to your computer and use it in GitHub Desktop.
library(Rcrawler)
library(dplyr)
library(stringr)
# Make the parallel package start cluster workers one after another.
# NOTE(review): presumably a workaround for worker start-up hanging with the
# default setup strategy on some R 4.x installations - confirm before removing.
parallel:::setDefaultClusterOptions(setup_strategy = "sequential")

# XPath expressions handed to Rcrawler(ExtractXpathPat = ...). Entry i is
# labelled by entry i of CustomLabels, so both vectors must stay the same
# length and in the same order.
# Optional extractors are kept commented out with a LEADING comma: c() fails
# with "argument N is empty" if the last active element is followed by a comma.
CustomXPaths <- c(
  "//link[@rel='canonical']/@href",
  "//meta[@name='robots']/@content"
  # , "//body/@class"
  # , "//title"
  # , "//h1"
  # , "//div[@class='entry-content']"
)

# Column names for the extracted fields (passed as PatternsNames). The
# indexability filter further down reads link_canonical and meta_robots.
CustomLabels <- c(
  "link_canonical",
  "meta_robots"
  # , "body_class"
  # , "title"
  # , "h1"
  # , "div_class"
)
# The .rds files below are written relative to this working directory.
setwd("~/Desktop")

# Crawl the site and extract the custom XPath fields from every page.
# DATA and INDEX are not assigned anywhere in this script; presumably
# Rcrawler() creates them in the global environment - confirm.
Rcrawler(
  Website = "https://domain.com/",
  ExtractXpathPat = CustomXPaths,
  PatternsNames = CustomLabels
)

# Keep a copy of the raw crawl results on disk.
saveRDS(DATA, file = "DATA.rds")
saveRDS(INDEX, file = "INDEX.rds")

# Stack the per-page extraction records row-wise, turn them into a data
# frame and attach them column-wise to the crawl index.
mergedCrawl <- cbind(
  INDEX,
  data.frame(do.call(rbind, DATA))
)
mergedCrawl[["Id"]] <- as.integer(mergedCrawl[["Id"]])
# Keep only the pages that are eligible for indexing:
#   * Canonical_Indexability - the page declares no canonical link, or its
#     canonical link is exactly the page's own URL.
#   * Indexation - the robots meta tag does not contain "noindex".
# Robots directives are case-insensitive, so spellings such as "NoIndex" must
# be caught as well; the match therefore ignores case.
# Columns are referenced directly instead of through mergedCrawl$..., so the
# expressions always operate on the rows currently flowing through the pipe.
Indexable_pages <- mergedCrawl %>%
  mutate(
    Canonical_Indexability = is.na(link_canonical) | Url == link_canonical,
    Indexation = !grepl("noindex", meta_robots, ignore.case = TRUE)
  ) %>%
  # filter() keeps rows where both flags are TRUE and drops NA.
  filter(Canonical_Indexability, Indexation)
# Build the URL list for the sitemaps: successful HTML pages only, each
# tagged with a content type derived from its URL. The first matching rule
# wins; anything unmatched is treated as a post.
#
# The patterns are applied to the Url column of the rows that survived the
# filter. Referring to Indexable_pages$Url here would use the unfiltered
# vector, whose length no longer matches once filter() has dropped rows.
#
# The "/locations" rule is labelled "Places" because the per-type subsets
# taken afterwards select Content_type == "Places".
#
# NOTE(review): in patterns such as "/list|listings/" the alternation splits
# the whole pattern ("/list" OR "listings/"), so matching is fairly loose.
# Left as is, apart from correcting the "/caegory" typo.
Sitemaps <- Indexable_pages %>%
  filter(`Http Resp` == "200" & `Content Type` == "text/html") %>%
  select(Url) %>%
  mutate(
    Content_type = case_when(
      str_detect(Url, "/category|tag/") ~ "Taxonomy",
      str_detect(Url, "/list|listings/") ~ "Listing",
      str_detect(Url, "/shop|contact/") ~ "Pages",
      str_detect(Url, "/locations") ~ "Places",
      str_detect(Url, "/product") ~ "Products",
      TRUE ~ "Posts"
    )
  ) %>%
  # Drop duplicate rows; no grouping is left attached to the result.
  distinct() %>%
  arrange(Content_type)
# Content_type is converted to a factor before the subsets are taken.
Sitemaps$Content_type <- as.factor(Sitemaps$Content_type)

# One subset per content type. The labels compared against here have to be
# exactly the values assigned to Content_type when Sitemaps is built;
# a label that is never assigned yields an empty subset.
Sitemap_taxonomy <- filter(Sitemaps, Content_type == "Taxonomy")
Sitemap_listing  <- filter(Sitemaps, Content_type == "Listing")
Sitemap_pages    <- filter(Sitemaps, Content_type == "Pages")
Sitemap_places   <- filter(Sitemaps, Content_type == "Places")
Sitemap_products <- filter(Sitemaps, Content_type == "Products")
Sitemap_posts    <- filter(Sitemaps, Content_type == "Posts")
#' Write an XML sitemap for a set of URLs
#'
#' Requests every URL once, reads the `Date` header of the response and uses
#' it as the entry's `<lastmod>`; when the request fails or the header is
#' missing or cannot be parsed, today's date is used instead. Every entry
#' gets `changefreq = "monthly"` and `priority = "0.8"`. The rendered XML is
#' written to `<fileName>.xml` in the current working directory.
#'
#' NOTE(review): the HTTP `Date` header is the time the response was
#' generated, so `<lastmod>` will usually be today's date; `Last-Modified`
#' may be the better source - confirm the intent.
#' NOTE(review): the header is parsed with `%a`/`%b`, which depend on the
#' session's LC_TIME locale; under a non-English locale parsing yields NA and
#' the fallback date is used.
#'
#' @param links Character vector (or list) of URLs to include.
#' @param fileName Output file name without the ".xml" extension. Defaults to
#'   a timestamp of the form yy-mm-dd_HH-MM-SS.
#' @return The path of the written file, invisibly.
createSitemap <- function(links = list(),
                          fileName = format(Sys.time(), "%y-%m-%d_%H-%M-%S")) {
  # Fail with a clear message if a dependency is missing; require() would
  # only return FALSE and let the function break later on.
  for (pkg in c("whisker", "httr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required by createSitemap().", call. = FALSE)
    }
  }

  cat("Please wait...", "\n")
  cat("Total Link: ", length(links), "\n")

  # {{loc}} (double braces) is entity-escaped by whisker, so URLs containing
  # "&" still produce well-formed XML. The other fields are generated here
  # and contain nothing that needs escaping.
  template <- paste(
    c(
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
      "{{#links}}",
      "<url>",
      "<loc>{{loc}}</loc>",
      "<lastmod>{{{lastmod}}}</lastmod>",
      "<changefreq>{{{changefreq}}}</changefreq>",
      "<priority>{{{priority}}}</priority>",
      "</url>",
      "{{/links}}",
      "</urlset>"
    ),
    collapse = "\n"
  )

  # Fallback <lastmod>; computed once instead of once per URL.
  today <- format(Sys.time(), "%Y-%m-%d")

  # Build the template fields for a single URL.
  map_links <- function(url) {
    # One unreachable URL should not abort the whole sitemap.
    response <- tryCatch(
      httr::GET(url),
      error = function(e) {
        warning("Could not fetch ", url, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )

    # https://www.stat.berkeley.edu/~s133/dates.html
    header_date <- response$headers$date
    lastmod <- NA_character_
    if (!is.null(header_date) && length(header_date) > 0) {
      parsed <- strptime(header_date[[1L]],
                         format = "%a, %d %b %Y %H:%M:%S", tz = "UTC")
      lastmod <- format(as.Date(parsed), "%Y-%m-%d")
    }
    if (is.na(lastmod)) {
      lastmod <- today
    }

    list(
      loc = url,
      lastmod = lastmod,
      changefreq = "monthly",
      priority = "0.8"
    )
  }

  # whisker repeats a {{#section}} once per element of an UNNAMED list.
  entries <- unname(lapply(links, map_links))

  # Render the variable that actually holds the template, and pass the
  # entries as template data (not as partials).
  sitemap_xml <- whisker::whisker.render(template, data = list(links = entries))

  out_file <- paste0(fileName, ".xml")
  cat(sitemap_xml, file = out_file)
  invisible(out_file)
}
# Write one sitemap per content type; each call produces <fileName>.xml in
# the working directory. The parameter is spelled out in full: passing
# `file =` only reaches `fileName` through partial argument matching, which
# breaks as soon as the function gains another argument starting with "file".
createSitemap(links = Sitemap_listing$Url, fileName = "listing")
createSitemap(links = Sitemap_pages$Url, fileName = "pages")
createSitemap(links = Sitemap_places$Url, fileName = "places")
createSitemap(links = Sitemap_products$Url, fileName = "products")
createSitemap(links = Sitemap_posts$Url, fileName = "posts")
createSitemap(links = Sitemap_taxonomy$Url, fileName = "taxonomy")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment