Skip to content

Instantly share code, notes, and snippets.

@jweisman
Last active September 3, 2015 08:23
Show Gist options
  • Save jweisman/1ae658243a0bad01f91e to your computer and use it in GitHub Desktop.
Save jweisman/1ae658243a0bad01f91e to your computer and use it in GitHub Desktop.
Harvest OAI to AWS CloudSearch
require 'rest-client'
require 'nokogiri'
require 'aws-sdk-core'
# Harvests one page of OAI-PMH records from Alma and sends it to CloudSearch.
#
# Requests `qs` from the institution's OAI endpoint. When the response holds
# at least one record, the page is transformed with the gist's XSL stylesheet
# into a CloudSearch batch and posted to the document endpoint of `domain`.
#
# @param inst [String] institution code used in the OAI request path
# @param qs [String] query string for the OAI request, including the leading "?"
# @param domain [String] name of the CloudSearch domain to post to
# @param alma [String] Alma host prefix placed before ".alma.exlibrisgroup.com"
# @return [String] text of the response's resumptionToken element ("" when absent)
# @raise [RuntimeError] when CloudSearch does not list a domain named `domain`
def process_oai(inst, qs, domain, alma)
  oai_ns = { 'oai' => 'http://www.openarchives.org/OAI/2.0/' }
  oai_base = "https://#{alma}.alma.exlibrisgroup.com/view/oai/#{inst}/request"

  log "Calling OAI with query string #{qs}"
  oai = RestClient.get oai_base + qs
  document = Nokogiri::XML(oai)

  record_count = document.xpath('/oai:OAI-PMH/oai:ListRecords/oai:record', oai_ns).count
  log "#{record_count} records retrieved"

  if record_count > 0
    # The stylesheet is only needed when there is something to transform.
    xsl = RestClient.get 'https://gist.githubusercontent.com/jweisman/1ae658243a0bad01f91e/raw/oai-to-aws-cloudsearch.xsl'
    template = Nokogiri::XSLT(xsl)
    cs_payload = template.transform(document).to_s.strip # remove trailing spaces...

    # Get domain endpoint from AWS
    cloudsearch = Aws::CloudSearch::Client.new(region: 'us-east-1', credentials: aws_creds)
    domain_status = cloudsearch.describe_domains(domain_names: [domain])
                               .domain_status_list
                               .find { |status| status.domain_name == domain }
    if domain_status.nil?
      log "Domain of name #{domain} was not found"
      # Fail loudly: returning nil here made the caller die later with an
      # unrelated "no implicit conversion of nil into String" TypeError.
      raise "CloudSearch domain #{domain} was not found"
    end

    endpoint = "https://#{domain_status.doc_service.endpoint}/2013-01-01/documents/batch"
    response = aws_post('cloudsearch', endpoint, cs_payload, 'application/xml')
    log "Sent to CloudSearch: #{response.gsub(/\n/, ' ')}"
  end

  document.xpath('/oai:OAI-PMH/oai:ListRecords/oai:resumptionToken', oai_ns).text
end
# Prints `msg` to standard output prefixed with the current local time
# formatted as "YYYY-MM-DD HH:MM:SS".
#
# @param msg [#to_s] text to print
# @return [true] always true, so callers can chain it with `and`
def log(msg)
  stamp = Time.new.strftime("%Y-%m-%d %H:%M:%S")
  puts "#{stamp} - #{msg}"
  true
end
# Sends `body` to `url` as a POST carrying the headers produced by
# sign_request for the given service.
#
# @param service [String] AWS service name handed to sign_request
# @param url [String] full URL to post to
# @param body [String] request payload
# @param content_type [String] value handed to sign_request for the Content-Type header
# @return [Object] whatever RestClient::Request#execute returns
def aws_post(service, url, body, content_type)
  signed_headers = sign_request(service, url, 'POST', body, content_type)
  RestClient::Request.new(
    method: :post,
    url: url,
    payload: body,
    headers: signed_headers
  ).execute
end
# Empty placeholder: takes no arguments, does nothing and returns nil.
# NOTE(review): presumably meant to become a GET counterpart to aws_post;
# nothing in this file calls it.
def aws_get
end
# Builds the signed headers for a request to an AWS service.
#
# A Seahorse HTTP request describing the call is handed to
# Aws::Signers::V4 (region 'us-east-1', credentials from aws_creds) and the
# headers of the signed request are returned.
#
# This helper no longer sets Aws.config[:ssl_verify_peer] = false. Building a
# signature does not need that setting, and leaving it here changed the
# SDK-wide configuration as a hidden side effect of signing.
#
# @param service [String] AWS service name to sign for
# @param endpoint [String] full URL of the request
# @param method [String] HTTP method, e.g. 'POST'
# @param body [String] request payload
# @param content_type [String] value for the Content-Type header
# @return [Hash] headers of the signed request
def sign_request(service, endpoint, method, body, content_type)
  signer = Aws::Signers::V4.new(aws_creds, service, 'us-east-1')
  request = Seahorse::Client::Http::Request.new(
    http_method: method,
    endpoint: endpoint,
    body: body,
    headers: { 'Content-Type' => content_type }
  )
  signer.sign(request).headers.to_hash
end
# Stores `content` in S3 as a 'text/plain' object with the 'public-read' ACL.
#
# NOTE(review): Aws.config looks like SDK-wide configuration, so the first
# line likely disables TLS certificate verification for every AWS client
# created afterwards, not only this upload. Confirm it is still required.
#
# @param bucket [String] destination bucket name
# @param key [String] object key
# @param content [String] object body
# @return [Object] whatever Aws::S3::Client#put_object returns
def write_file(bucket, key, content)
  Aws.config[:ssl_verify_peer] = false
  client = Aws::S3::Client.new(credentials: aws_creds, region: 'us-east-1')
  upload = {
    bucket: bucket,
    key: key,
    body: content,
    acl: 'public-read',
    content_type: 'text/plain'
  }
  client.put_object(upload)
end
# Builds an Aws::SharedCredentials object for the given profile name.
#
# @param profile [String] profile name passed as :profile_name
# @return [Aws::SharedCredentials]
def aws_creds(profile = 'default')
  # access credentials in credential file - http://tinyurl.com/ljn7r63
  options = { profile_name: profile }
  Aws::SharedCredentials.new(options)
end
### Define variables
### <<<<<<<<<<<<<<<<
s3_bucket = 'exldev-scratch'
inst = 'TR_INTEGRATION_INST'
domain = 'catalog'
alma_inst = 'na01'
oai_set = 'discovery'
### <<<<<<<<<<<<<<<<

log "Starting..."
log "Retrieving 'from' time"
from_time = ''
# Retrieve the timestamp stored by the previous run. Any status other than
# 200 leaves from_time empty, so the request is sent without a 'from' bound.
RestClient.get("https://#{s3_bucket}.s3.amazonaws.com/oai-discovery-from-time.txt") do |response, _request, _result|
  if response.code == 200
    # Strip surrounding whitespace so a trailing newline in the stored file
    # cannot end up inside the query string.
    from_time = "&from=#{response.to_s.strip}"
    log "Retrieved from time: #{from_time}"
  end
end

# set to date
to_time = Time.new.getutc.strftime("%Y-%m-%dT%H:%M:%SZ")
log "Set 'to' time to: #{to_time}"

qs = "?verb=ListRecords&set=#{oai_set}&metadataPrefix=marc21&until=#{to_time}#{from_time}"
# Keep requesting pages until the server stops handing back a resumption token.
loop do
  resumption_token = process_oai(inst, qs, domain, alma_inst)
  # A nil token means the page was not delivered. Stop before the stored
  # 'from' time is advanced so those records are harvested again next run.
  abort "No resumption token returned; leaving the stored 'from' time unchanged" if resumption_token.nil?
  break if resumption_token.empty?

  qs = "?verb=ListRecords&resumptionToken=#{resumption_token}"
end

# write to date for next time
log "Storing 'to' time"
write_file s3_bucket, 'oai-discovery-from-time.txt', to_time
log "Complete"
<xsl:transform version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:oai="http://www.openarchives.org/OAI/2.0/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:marc="http://www.loc.gov/MARC21/slim"
    exclude-result-prefixes="oai xsi marc"
>
  <!--
    Turns an OAI-PMH ListRecords response carrying MARC21 records into a
    <batch> document: records whose header has status="deleted" become
    <delete> elements, every other record becomes an <add> element with
    title, author, subject, collection and type fields.
  -->
  <xsl:output method="xml" encoding="utf-8"/>

  <xsl:template match="/">
    <batch>
      <xsl:for-each select="oai:OAI-PMH/oai:ListRecords/oai:record">
        <!-- Document id: the part of the OAI identifier after its second colon. -->
        <xsl:variable name="id" select="substring-after(substring-after(oai:header/oai:identifier,':'),':')"/>
        <xsl:choose>
          <xsl:when test="oai:header[@status = 'deleted']">
            <delete id="{$id}"/>
          </xsl:when>
          <xsl:otherwise>
            <add id="{$id}">
              <field name="title">
                <xsl:value-of select="oai:metadata/marc:record/marc:datafield[@tag='245']/marc:subfield[@code='a']"/>
              </field>
              <field name="author">
                <xsl:value-of select="oai:metadata/marc:record/marc:datafield[@tag='100']/marc:subfield[@code='a']"/>
              </field>
              <!-- One subject field per 650 $a whose second indicator is 0. -->
              <xsl:for-each select="oai:metadata/marc:record/marc:datafield[@tag='650' and @ind2='0']/marc:subfield[@code='a']">
                <field name="subject">
                  <xsl:value-of select="."/>
                </field>
              </xsl:for-each>
              <!-- One collection field per 852, written as "$a::$b". -->
              <xsl:for-each select="oai:metadata/marc:record/marc:datafield[@tag='852']">
                <field name="collection">
                  <xsl:value-of select="marc:subfield[@code='a']"/>
                  <xsl:text>::</xsl:text>
                  <xsl:value-of select="marc:subfield[@code='b']"/>
                </field>
              </xsl:for-each>
              <!-- Leader positions 06 and 07 (XPath substring() is 1-based). -->
              <xsl:variable name="ldr6" select="substring(oai:metadata/marc:record/marc:leader, 7, 1)"/>
              <xsl:variable name="ldr7" select="substring(oai:metadata/marc:record/marc:leader, 8, 1)"/>
              <field name="type">
                <!-- logic according to Yoel -->
                <!--
                  The Book test used to check $ldr7 = 'd' twice. The second
                  check is now 'c', matching the MARC 21 Books definition
                  (Leader/07 a, c, d or m).
                  NOTE(review): the 't' clause only matches when the leader
                  has no character at position 07; confirm whether records
                  with type 't' and a full leader should also be Book.
                  Records matching no branch get an empty type field.
                -->
                <xsl:choose>
                  <xsl:when test="($ldr6 = 'a' and ($ldr7 = 'a' or $ldr7 = 'c' or $ldr7 = 'd' or $ldr7 = 'm')) or ($ldr6 = 't' and $ldr7 = '')">Book</xsl:when>
                  <xsl:when test="$ldr6 = 'a' and ($ldr7 = 'b' or $ldr7 = 'i' or $ldr7 = 's')">Journal</xsl:when>
                  <xsl:when test="$ldr6 = 'c' or $ldr6 = 'd' or $ldr6 = 'i' or $ldr6 = 'j'">Music</xsl:when>
                  <xsl:when test="$ldr6 = 'e' or $ldr6 = 'f'">Map</xsl:when>
                  <xsl:when test="$ldr6 = 'g' or $ldr6 = 'k' or $ldr6 = 'o' or $ldr6 = 'r'">Visual material</xsl:when>
                  <xsl:when test="$ldr6 = 'm'">Computer file</xsl:when>
                </xsl:choose>
              </field>
            </add>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </batch>
  </xsl:template>
</xsl:transform>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment