Last active
August 29, 2015 14:11
-
-
Save sheepeeh/4c0ded92207380a27c98 to your computer and use it in GitHub Desktop.
Add real last modified dates, priorities, and change frequencies to XML Sitemaps-generated sitemaps for Omeka.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-------------------------------------------------------------------------------------------------------------- | |
# This is the script I use to add real dates, change frequencies, and priorities to Omeka items, | |
# exhibits, and simple pages to sitemaps generated with the XML Sitemap tool. | |
# https://www.xml-sitemaps.com/standalone-google-sitemap-generator.html | |
# | |
# I use the following settings in order to keep the number of URLs down (only include simple pages, exhibits
# and exhibit pages, exhibit items, and collection pages).
# | |
# Exclude from sitemap extensions: | |
# divx flv zip m4a m4v rar tar bz2 tgz exe gif tif jpg png class jar mpeg mpg mp3 wav mp4 avi wmv | |
# gz mov mid ra ram css pdf xml | |
# | |
# Exclude URLs: | |
# sort_field= | |
# page= | |
# search | |
# files/show | |
# items/browse?advanced%5B0%5D%5B | |
# output | |
# ?advanced%5 | |
# ?collection= | |
# items/show | |
# | |
# Maximum depth level: | |
# 5 | |
# | |
# To get the correct dates, I create one CSV file with last modified dates from the items table | |
# (id, modified) and one CSV file with last modified dates from the exhibits and | |
# simple_pages_pages tables (slug, modified) | |
#-------------------------------------------------------------------------------------------------------------- | |
require 'csv'
require 'benchmark'
require 'rexml/document'
include REXML

# Post-processes a sitemap: for every <url> entry it sets priority and
# changefreq from the URL shape, replaces lastmod with the dates found in the
# two CSV exports, rewrites the development host to "www", and writes the
# result to a new file.

# Sitemap to read (input).
SITEMAP_PATH = "PATH TO YOUR SITEMAP"
# CSV with (id, modified) rows for items.
ITEM_DATES_PATH = 'PATH TO YOUR ITEM DATES'
# CSV with (slug, modified) rows for exhibits and simple pages.
SLUG_DATES_PATH = 'PATH TO YOUR SLUGS'
# File the updated sitemap is written to.
OUTPUT_PATH = "PATH TO YOUR DESIRED OUTPUT FILE"

# Exhibit item URL; capture group 4 is the item number.
ITEM_URL_PATTERN = /(http:\/\/YOUR_URL\/exhibits\/show\/)(.*\/)(item\/)([0-9]{1,})/
# Exhibit summary page: nothing after the exhibit slug.
EXHIBIT_SUMMARY_PATTERN = /http:\/\/YOUR_URL\/exhibits\/show\/[\w\-]*\z/
# Host fragment replaced by "www" for production.
DEV_SUBDOMAIN = "DEV SUBDOMAIN"
# Offset appended to every lastmod. Sub your local modifier for -05:00.
LOCAL_UTC_OFFSET = "-05:00"

# True when a CSV field is missing or empty.
def blank_field?(value)
  value.nil? || value.empty?
end

# Reads the item CSV into an { id => modified } hash so each URL needs a single
# hash lookup instead of a scan over every row. Rows without an id or a date
# are ignored; the first dated row for an id wins.
def load_item_dates(path)
  CSV.read(path).each_with_object({}) do |(id, modified), dates|
    next if blank_field?(id) || blank_field?(modified)
    dates[id] = modified unless dates.key?(id)
  end
end

# Sets the text of the child element +name+, creating the element when the
# node does not have one yet (a created element is appended after the
# existing children).
def set_child_text(node, name, value)
  child = node.elements[name] || node.add_element(name)
  child.text = value
end

# Returns [priority, changefreq] for a URL. The checks run in the same order
# as before: anything containing "/item", then collection pages, then exhibit
# summary pages, then everything else.
def priority_and_changefreq(loc)
  if loc.include?('/item')
    ["0.4", "yearly"]
  elsif loc.include?('/collections/show')
    ["0.4", "monthly"]
  elsif loc =~ EXHIBIT_SUMMARY_PATTERN
    ["0.8", "yearly"]
  else
    ["0.6", "monthly"]
  end
end

# Works out the final lastmod string for +loc+, or nil when no date is known.
#
# +current+ is the existing lastmod text (may be nil). It is kept unless a
# better date is found, so URLs that merely contain "/item" but are not
# exhibit items no longer have their lastmod wiped.
def resolve_lastmod(loc, current, item_dates, slug_dates)
  # Remove inaccurate GMT modifier from the existing timestamp.
  stamp = current && current.gsub("+00:00", "")

  if loc.include?('/item')
    item_id = loc[ITEM_URL_PATTERN, 4]
    item_date = item_id && item_dates[item_id]
    stamp = item_date if item_date
  end

  # Real lastmods for exhibits and simple pages. Slugs are compared as plain
  # substrings and the last matching row wins. This won't work so well if some
  # of your slugs are contained within other slugs (e.g. "about" and
  # "about-collection") -- you'll have to add some conditionals.
  slug_dates.each do |slug, modified|
    # A blank slug would match every URL, and a blank date would erase lastmod.
    next if blank_field?(slug) || blank_field?(modified)
    stamp = modified if loc.include?(slug)
  end

  # Add the local offset and use the "T" date/time separator.
  stamp && (stamp + LOCAL_UTC_OFFSET).gsub(" ", "T")
end

# Updates one <url> node in place. Nodes without a <loc> are left untouched.
def update_url_node(node, item_dates, slug_dates)
  loc_el = node.elements["loc"]
  loc = loc_el && loc_el.text
  return if loc.nil?

  priority, changefreq = priority_and_changefreq(loc)
  set_child_text(node, "priority", priority)
  set_child_text(node, "changefreq", changefreq)

  lastmod_el = node.elements["lastmod"]
  lastmod = resolve_lastmod(loc, lastmod_el && lastmod_el.text, item_dates, slug_dates)
  set_child_text(node, "lastmod", lastmod) if lastmod

  # Change URL for production.
  loc_el.text = loc.gsub(DEV_SUBDOMAIN, "www")
end

# Parse the sitemap; the block form closes the file handle once parsing is done.
xml_doc = File.open(SITEMAP_PATH) { |xml_file| Document.new(xml_file) }

# Read the CSV files.
item_dates = load_item_dates(ITEM_DATES_PATH)
slug_dates = CSV.read(SLUG_DATES_PATH)

# Gather URLs.
nodes = XPath.match(xml_doc, "//url")

# Accumulated per-node processing time, used for the STDOUT progress lines.
report_time = 0

btime = Benchmark.realtime {
  nodes.each_with_index do |node, index|
    report_time += Benchmark.realtime { update_url_node(node, item_dates, slug_dates) }

    # Report after every 100th processed node.
    processed = index + 1
    puts "Proceesed #{processed} records in #{report_time} seconds." if processed.modulo(100).zero?
  end

  # Output changed XML document to a new file.
  formatter = Formatters::Default.new
  File.open(OUTPUT_PATH, "w") do |result|
    formatter.write(xml_doc, result)
  end
}

# Say we're done.
puts "Done in #{btime} seconds"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment