# ceaksan/R_createXMLSitemap.R (secret gist)

## R_createXMLSitemap.R
library(Rcrawler)
library(dplyr)
library(stringr)

# Make the `parallel` package start cluster workers one at a time
# (setup_strategy = "sequential"). The setter is unexported, hence `:::`.
# NOTE(review): presumably a workaround for cluster start-up hanging when
# Rcrawler() spawns its workers -- confirm it is still needed.
parallel:::setDefaultClusterOptions(setup_strategy = "sequential")

# XPath expressions evaluated on every crawled page. The order must match
# CustomLabels, which names the extracted values.
# No comma follows the last active entry: a trailing comma before `)` leaves
# an empty argument and makes c() fail with "argument 3 is empty".
CustomXPaths <- c(
  "//link[@rel='canonical']/@href",
  "//meta[@name='robots']/@content"
  # Optional extractions -- keep the leading comma when enabling one:
  # , "//body/@class"
  # , "//title"
  # , "//h1"
  # , "//div[@class='entry-content']"
)

# Column labels for the values extracted by CustomXPaths (same order).
# No comma follows the last active entry: a trailing comma before `)` leaves
# an empty argument and makes c() fail with "argument 3 is empty".
CustomLabels <- c(
  "link_canonical",
  "meta_robots"
  # Optional labels -- keep the leading comma when enabling one:
  # , "body_class"
  # , "title"
  # , "h1"
  # , "div_class"
)

# The relative paths used below (DATA.rds, INDEX.rds) resolve against the
# desktop directory.
setwd("~/Desktop")

# Crawl the site and evaluate the configured XPath patterns on each page.
# DATA and INDEX are never assigned in this script; they are expected to be
# created in the global environment by Rcrawler().
Rcrawler(
  Website = "https://domain.com/",
  ExtractXpathPat = CustomXPaths,
  PatternsNames = CustomLabels
)

# Persist the raw crawl results.
saveRDS(DATA, file = "DATA.rds")
saveRDS(INDEX, file = "INDEX.rds")

# Row-bind the elements of DATA into a data frame, attach it column-wise to
# INDEX, and store the page Id as an integer.
mergedCrawl <- local({
  extracted <- data.frame(do.call(rbind, DATA))
  merged <- cbind(INDEX, extracted)
  merged$Id <- as.integer(merged$Id)
  merged
})

# Keep only the pages that search engines are allowed to index:
#   * Canonical_Indexability: the page has no canonical link, or its canonical
#     link equals the page URL.
#   * Indexation: the robots meta tag does not contain "noindex". The match is
#     case-insensitive so that values such as "NoIndex" are caught as well.
# Columns are referenced directly instead of through `mergedCrawl$`, so the
# expressions always operate on the rows currently in the pipeline.
Indexable_pages <- mergedCrawl %>%
  mutate(
    Canonical_Indexability = Url == link_canonical | is.na(link_canonical),
    Indexation = !grepl("noindex", meta_robots, ignore.case = TRUE)
  ) %>%
  filter(Canonical_Indexability, Indexation)

# Build the sitemap URL list: indexable pages that returned HTTP 200 with an
# HTML content type, each tagged with a Content_type derived from its URL.
Sitemaps <- Indexable_pages %>%
  filter(`Http Resp` == "200" & `Content Type` == "text/html") %>%
  select(Url) %>%
  mutate(
    # Classify using the pipeline's own Url column. Indexable_pages$Url is the
    # unfiltered vector, whose length stops matching as soon as filter()
    # drops a row.
    Content_type = case_when(
      is.na(Url) ~ NA_character_,
      str_detect(Url, "/category|tag/") ~ "Taxonomy",
      str_detect(Url, "/list|listings/") ~ "Listing",
      str_detect(Url, "/shop|contact/") ~ "Pages",
      str_detect(Url, "/locations") ~ "Locations",
      str_detect(Url, "/product") ~ "Products",
      TRUE ~ "Posts"
    )
  ) %>%
  # distinct() removes duplicate rows without leaving the result grouped.
  distinct() %>%
  arrange(Content_type) %>%
  mutate(Content_type = as.factor(Content_type))

# Split the classified URLs into one data frame per sitemap file.
Sitemap_taxonomy <- Sitemaps %>% filter(Content_type == "Taxonomy")
Sitemap_listing <- Sitemaps %>% filter(Content_type == "Listing")
Sitemap_pages <- Sitemaps %>% filter(Content_type == "Pages")
# URLs containing "/locations" carry the Content_type "Locations"; no row is
# ever labelled "Places", so filtering on that value always returned nothing.
Sitemap_places <- Sitemaps %>% filter(Content_type == "Locations")
Sitemap_products <- Sitemaps %>% filter(Content_type == "Products")
Sitemap_posts <- Sitemaps %>% filter(Content_type == "Posts")

#' Write an XML sitemap for a set of URLs
#'
#' Requests every URL once to read its HTTP `Date` response header, renders a
#' sitemaps.org `<urlset>` document with whisker, and writes it to
#' `<fileName>.xml` in the working directory.
#'
#' @param links Character vector (or list) of URLs to include.
#' @param fileName Output file name without the `.xml` extension. Defaults to
#'   a timestamp of the form `yy-mm-dd_HH-MM-SS`.
#' @return `NULL`, invisibly. Called for its side effect of writing the file.
createSitemap <- function(links = list(),
                          fileName = format(Sys.time(), "%y-%m-%d_%H-%M-%S")) {

  cat("Please wait...", "\n")
  cat("Total Link: ", length(links), "\n")

  # {{loc}} (double braces) is HTML-escaped by whisker so that characters such
  # as `&` in a URL are written as entities, which XML requires. The other
  # fields are fixed or date strings and are inserted verbatim.
  template <- '<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 {{#links}}
   <url>
      <loc>{{loc}}</loc>
      <lastmod>{{{lastmod}}}</lastmod>
      <changefreq>{{{changefreq}}}</changefreq>
      <priority>{{{priority}}}</priority>
   </url>
 {{/links}}
</urlset>'

  # Build the template fields for a single URL.
  map_links <- function(url) {
    response <- httr::GET(url)
    # https://www.stat.berkeley.edu/~s133/dates.html
    # NOTE(review): this reads the response `Date` header, not `Last-Modified`.
    # Parsing `%a` / `%b` depends on the LC_TIME locale.
    parsed <- strptime(response$headers$date,
                       format = "%a, %d %b %Y %H:%M:%S", tz = "UTC")
    date <- format(as.Date(parsed), "%Y-%m-%d")

    # Fall back to today's date when the header is missing (length 0) or
    # cannot be parsed (NA).
    if (length(date) != 1L || is.na(date)) {
      date <- format(Sys.time(), "%Y-%m-%d")
    }

    list(loc = url,
         lastmod = date,
         changefreq = "monthly",
         priority = "0.8")
  }

  # whisker iterates a {{#links}} section over an unnamed list of records.
  entries <- unname(lapply(links, map_links))

  # The records are template data, not partials, and the template variable is
  # `template` (the previous `tpl` was never defined).
  xml <- whisker::whisker.render(template, data = list(links = entries))
  cat(xml, file = paste0(fileName, ".xml"))

}

# Write one sitemap file per content type. The output name is passed with the
# full argument name `fileName` rather than relying on partial matching of
# `file`.
createSitemap(links = Sitemap_listing$Url, fileName = "listing")
createSitemap(links = Sitemap_pages$Url, fileName = "pages")
createSitemap(links = Sitemap_places$Url, fileName = "places")
createSitemap(links = Sitemap_products$Url, fileName = "products")
createSitemap(links = Sitemap_posts$Url, fileName = "posts")
createSitemap(links = Sitemap_taxonomy$Url, fileName = "taxonomy")
	library(Rcrawler)
	library(dplyr)
	library(stringr)

	# Make the `parallel` package start cluster workers one at a time
	# (setup_strategy = "sequential"). The setter is unexported, hence `:::`.
	# NOTE(review): presumably a workaround for cluster start-up hanging when
	# Rcrawler() spawns its workers -- confirm it is still needed.
	parallel:::setDefaultClusterOptions(setup_strategy = "sequential")

	# XPath expressions evaluated on every crawled page. The order must match
	# CustomLabels, which names the extracted values.
	# No comma follows the last active entry: a trailing comma before `)` leaves
	# an empty argument and makes c() fail with "argument 3 is empty".
	CustomXPaths <- c(
	  "//link[@rel='canonical']/@href",
	  "//meta[@name='robots']/@content"
	  # Optional extractions -- keep the leading comma when enabling one:
	  # , "//body/@class"
	  # , "//title"
	  # , "//h1"
	  # , "//div[@class='entry-content']"
	)

	# Column labels for the values extracted by CustomXPaths (same order).
	# No comma follows the last active entry: a trailing comma before `)` leaves
	# an empty argument and makes c() fail with "argument 3 is empty".
	CustomLabels <- c(
	  "link_canonical",
	  "meta_robots"
	  # Optional labels -- keep the leading comma when enabling one:
	  # , "body_class"
	  # , "title"
	  # , "h1"
	  # , "div_class"
	)

	# The relative paths used below (DATA.rds, INDEX.rds) resolve against the
	# desktop directory.
	setwd("~/Desktop")

	# Crawl the site and evaluate the configured XPath patterns on each page.
	# DATA and INDEX are never assigned in this script; they are expected to be
	# created in the global environment by Rcrawler().
	Rcrawler(
	  Website = "https://domain.com/",
	  ExtractXpathPat = CustomXPaths,
	  PatternsNames = CustomLabels
	)

	# Persist the raw crawl results.
	saveRDS(DATA, file = "DATA.rds")
	saveRDS(INDEX, file = "INDEX.rds")

	# Row-bind the elements of DATA into a data frame, attach it column-wise to
	# INDEX, and store the page Id as an integer.
	mergedCrawl <- local({
	  extracted <- data.frame(do.call(rbind, DATA))
	  merged <- cbind(INDEX, extracted)
	  merged$Id <- as.integer(merged$Id)
	  merged
	})

	# Keep only the pages that search engines are allowed to index:
	#   * Canonical_Indexability: the page has no canonical link, or its canonical
	#     link equals the page URL.
	#   * Indexation: the robots meta tag does not contain "noindex". The match is
	#     case-insensitive so that values such as "NoIndex" are caught as well.
	# The `\|` sequences previously in this block were markdown escapes, which
	# are syntax errors in R; the plain `|` operator is used here.
	Indexable_pages <- mergedCrawl %>%
	  mutate(
	    Canonical_Indexability = Url == link_canonical | is.na(link_canonical),
	    Indexation = !grepl("noindex", meta_robots, ignore.case = TRUE)
	  ) %>%
	  filter(Canonical_Indexability, Indexation)

	# Build the sitemap URL list: indexable pages that returned HTTP 200 with an
	# HTML content type, each tagged with a Content_type derived from its URL.
	Sitemaps <- Indexable_pages %>%
	  filter(`Http Resp` == "200" & `Content Type` == "text/html") %>%
	  select(Url) %>%
	  mutate(
	    # Classify using the pipeline's own Url column. Indexable_pages$Url is the
	    # unfiltered vector, whose length stops matching as soon as filter()
	    # drops a row. The regex alternations use a plain `|`; the escaped `\|`
	    # form is an invalid escape sequence inside an R string.
	    Content_type = case_when(
	      is.na(Url) ~ NA_character_,
	      str_detect(Url, "/category|tag/") ~ "Taxonomy",
	      str_detect(Url, "/list|listings/") ~ "Listing",
	      str_detect(Url, "/shop|contact/") ~ "Pages",
	      str_detect(Url, "/locations") ~ "Locations",
	      str_detect(Url, "/product") ~ "Products",
	      TRUE ~ "Posts"
	    )
	  ) %>%
	  # distinct() removes duplicate rows without leaving the result grouped.
	  distinct() %>%
	  arrange(Content_type) %>%
	  mutate(Content_type = as.factor(Content_type))

	# Split the classified URLs into one data frame per sitemap file.
	Sitemap_taxonomy <- Sitemaps %>% filter(Content_type == "Taxonomy")
	Sitemap_listing <- Sitemaps %>% filter(Content_type == "Listing")
	Sitemap_pages <- Sitemaps %>% filter(Content_type == "Pages")
	# URLs containing "/locations" carry the Content_type "Locations"; no row is
	# ever labelled "Places", so filtering on that value always returned nothing.
	Sitemap_places <- Sitemaps %>% filter(Content_type == "Locations")
	Sitemap_products <- Sitemaps %>% filter(Content_type == "Products")
	Sitemap_posts <- Sitemaps %>% filter(Content_type == "Posts")

	#' Write an XML sitemap for a set of URLs
	#'
	#' Requests every URL once to read its HTTP `Date` response header, renders a
	#' sitemaps.org `<urlset>` document with whisker, and writes it to
	#' `<fileName>.xml` in the working directory.
	#'
	#' @param links Character vector (or list) of URLs to include.
	#' @param fileName Output file name without the `.xml` extension. Defaults to
	#'   a timestamp of the form `yy-mm-dd_HH-MM-SS`.
	#' @return `NULL`, invisibly. Called for its side effect of writing the file.
	createSitemap <- function(links = list(),
	                          fileName = format(Sys.time(), "%y-%m-%d_%H-%M-%S")) {

	  cat("Please wait...", "\n")
	  cat("Total Link: ", length(links), "\n")

	  # {{loc}} (double braces) is HTML-escaped by whisker so that characters such
	  # as `&` in a URL are written as entities, which XML requires. The other
	  # fields are fixed or date strings and are inserted verbatim.
	  template <- '<?xml version="1.0" encoding="UTF-8"?>
	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	{{#links}}
	<url>
	<loc>{{loc}}</loc>
	<lastmod>{{{lastmod}}}</lastmod>
	<changefreq>{{{changefreq}}}</changefreq>
	<priority>{{{priority}}}</priority>
	</url>
	{{/links}}
	</urlset>'

	  # Build the template fields for a single URL.
	  map_links <- function(url) {
	    response <- httr::GET(url)
	    # https://www.stat.berkeley.edu/~s133/dates.html
	    # NOTE(review): this reads the response `Date` header, not `Last-Modified`.
	    # Parsing `%a` / `%b` depends on the LC_TIME locale.
	    parsed <- strptime(response$headers$date,
	                       format = "%a, %d %b %Y %H:%M:%S", tz = "UTC")
	    date <- format(as.Date(parsed), "%Y-%m-%d")

	    # Fall back to today's date when the header is missing (length 0) or
	    # cannot be parsed (NA).
	    if (length(date) != 1L || is.na(date)) {
	      date <- format(Sys.time(), "%Y-%m-%d")
	    }

	    list(loc = url,
	         lastmod = date,
	         changefreq = "monthly",
	         priority = "0.8")
	  }

	  # whisker iterates a {{#links}} section over an unnamed list of records.
	  entries <- unname(lapply(links, map_links))

	  # The records are template data, not partials, and the template variable is
	  # `template` (the previous `tpl` was never defined).
	  xml <- whisker::whisker.render(template, data = list(links = entries))
	  cat(xml, file = paste0(fileName, ".xml"))

	}

	# Write one sitemap file per content type. The output name is passed with the
	# full argument name `fileName` rather than relying on partial matching of
	# `file`.
	createSitemap(links = Sitemap_listing$Url, fileName = "listing")
	createSitemap(links = Sitemap_pages$Url, fileName = "pages")
	createSitemap(links = Sitemap_places$Url, fileName = "places")
	createSitemap(links = Sitemap_products$Url, fileName = "products")
	createSitemap(links = Sitemap_posts$Url, fileName = "posts")
	createSitemap(links = Sitemap_taxonomy$Url, fileName = "taxonomy")