Last active
September 3, 2015 08:23
-
-
Save jweisman/1ae658243a0bad01f91e to your computer and use it in GitHub Desktop.
Harvest OAI to AWS CloudSearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'uri'

require 'aws-sdk-core'
require 'nokogiri'
require 'rest-client'
# Harvests one page of an OAI-PMH ListRecords response and, when the page
# contains records, transforms it with the gist's XSL stylesheet and posts the
# resulting batch to the CloudSearch document endpoint of +domain+.
#
# @param inst [String] institution code used in the OAI request path
# @param qs [String] query string (including the leading "?") appended to the OAI base URL
# @param domain [String] name of the CloudSearch domain to upload to
# @param alma [String] host prefix of the Alma instance (e.g. "na01")
# @return [String] text of the response's resumptionToken element; empty when
#   the element is absent or empty
# @raise [RuntimeError] when records were retrieved but the CloudSearch domain
#   is not found (previously this path returned nil, which made the caller fail
#   with an unrelated TypeError while building the next query string)
def process_oai(inst, qs, domain, alma)
  oai_ns = { 'oai' => 'http://www.openarchives.org/OAI/2.0/' }
  oai_base = "https://#{alma}.alma.exlibrisgroup.com/view/oai/#{inst}/request"

  log "Calling OAI with query string #{qs}"
  document = Nokogiri::XML(RestClient.get(oai_base + qs))

  record_count = document.xpath('/oai:OAI-PMH/oai:ListRecords/oai:record', oai_ns).count
  log "#{record_count} records retrieved"

  if record_count > 0
    # The stylesheet is only needed when there is something to transform.
    xsl = RestClient.get 'https://gist.githubusercontent.com/jweisman/1ae658243a0bad01f91e/raw/oai-to-aws-cloudsearch.xsl'
    template = Nokogiri::XSLT(xsl)
    cs_payload = template.transform(document).to_s.strip # remove trailing spaces...

    # Get domain endpoint from AWS
    cloudsearch = Aws::CloudSearch::Client.new(region: 'us-east-1', credentials: aws_creds)
    statuses = cloudsearch.describe_domains(domain_names: [domain]).domain_status_list
    domain_status = statuses.find { |status| status.domain_name == domain }
    if domain_status.nil?
      log "Domain of name #{domain} was not found"
      raise "CloudSearch domain '#{domain}' was not found"
    end

    endpoint = "https://#{domain_status.doc_service.endpoint}/2013-01-01/documents/batch"
    response = aws_post('cloudsearch', endpoint, cs_payload, 'application/xml')
    log "Sent to CloudSearch: #{response.gsub(/\n/, ' ')}"
  end

  document.xpath('/oai:OAI-PMH/oai:ListRecords/oai:resumptionToken', oai_ns).text
end
# Prints +msg+ to standard output prefixed with the current local time
# formatted as "YYYY-MM-DD HH:MM:SS".
#
# @param msg [#to_s] text to print
# @return [true] always true, so a call can be used inside boolean chains
def log(msg)
  puts "#{Time.now.strftime('%Y-%m-%d %H:%M:%S')} - #{msg}"
  true
end
# Sends +body+ to +url+ as an HTTP POST, using the headers produced by
# sign_request for the given AWS service.
#
# @param service [String] AWS service name passed to the signer
# @param url [String] full endpoint URL
# @param body [String] request payload
# @param content_type [String] value for the Content-Type header
# @return the result of executing the RestClient request
def aws_post(service, url, body, content_type)
  signed_headers = sign_request(service, url, 'POST', body, content_type)
  RestClient::Request.execute(
    method: :post,
    url: url,
    payload: body,
    headers: signed_headers
  )
end
# Placeholder with no implementation: takes no arguments, does nothing and
# returns nil. Nothing in this script calls it.
def aws_get
end
# Builds a Signature Version 4 signed request for +endpoint+ and returns the
# headers to send with it.
#
# Signing performs no network I/O, so this method no longer switches off TLS
# peer verification in the global Aws.config; that side effect silently
# affected every AWS client created afterwards.
#
# @param service [String] AWS service name to sign for (e.g. "cloudsearch")
# @param endpoint [String] full URL of the request
# @param method [String] HTTP method (e.g. "POST")
# @param body [String] request payload included in the signature
# @param content_type [String] value for the Content-Type header
# @return [Hash] the signed request's headers
def sign_request(service, endpoint, method, body, content_type)
  signer = Aws::Signers::V4.new(aws_creds, service, 'us-east-1')
  unsigned = Seahorse::Client::Http::Request.new(
    http_method: method,
    endpoint: endpoint,
    body: body,
    headers: { 'Content-Type' => content_type }
  )
  signer.sign(unsigned).headers.to_hash
end
# Uploads +content+ to S3 as a publicly readable text/plain object.
#
# @param bucket [String] target bucket name
# @param key [String] object key
# @param content [String] object body
# @return the response of the put_object call
def write_file(bucket, key, content)
  # FIXME(review): this disables TLS peer verification for AWS clients created
  # from here on. Kept to preserve existing behaviour; confirm whether it is
  # still required and remove it if not.
  Aws.config[:ssl_verify_peer] = false

  # Plain assignment: the former `||=` on a brand-new local memoized nothing.
  s3 = Aws::S3::Client.new(credentials: aws_creds, region: 'us-east-1')
  s3.put_object(
    acl: 'public-read',
    key: key,
    body: content,
    bucket: bucket,
    content_type: 'text/plain'
  )
end
# Builds AWS credentials from a named profile of the shared credentials file
# (see http://tinyurl.com/ljn7r63).
#
# @param profile [String] profile name to read; defaults to "default"
# @return [Aws::SharedCredentials]
def aws_creds(profile = 'default')
  Aws::SharedCredentials.new(profile_name: profile)
end
### Define variables
### <<<<<<<<<<<<<<<<
s3_bucket = 'exldev-scratch'
inst = 'TR_INTEGRATION_INST'
domain = 'catalog'
alma_inst = 'na01'
oai_set = 'discovery'
### <<<<<<<<<<<<<<<<

log "Starting..."
log "Retrieving 'from' time"
from_time = ''

# Retrieve the 'from' date stored by the previous run. Any status other than
# 200 leaves from_time empty, so the harvest is not limited by a start date.
RestClient.get("https://#{s3_bucket}.s3.amazonaws.com/oai-discovery-from-time.txt") do |response, _request, _result|
  if response.code == 200
    # Strip surrounding whitespace so a trailing newline in the stored file
    # cannot end up inside the query string.
    from_time = "&from=#{response.to_s.strip}"
    log "Retrieved from time: #{from_time}"
  end
end

# Set the 'to' date: the upper bound of this harvest and the 'from' of the next.
to_time = Time.new.getutc.strftime("%Y-%m-%dT%H:%M:%SZ")
log "Set 'to' time to: #{to_time}"

qs = "?verb=ListRecords&set=#{oai_set}&metadataPrefix=marc21&until=#{to_time}#{from_time}"

# Keep requesting pages until the server stops handing out a resumption token.
loop do
  resumption_token = process_oai(inst, qs, domain, alma_inst)
  # Abort before the 'to' time is stored, so a failed page is harvested again.
  raise 'process_oai did not return a resumption token; aborting harvest' if resumption_token.nil?
  break if resumption_token.empty?

  # Tokens are opaque server strings; escape them before use in a URL.
  qs = "?verb=ListRecords&resumptionToken=#{URI.encode_www_form_component(resumption_token)}"
end

# Write the 'to' date for next time
log "Storing 'to' time"
write_file s3_bucket, 'oai-discovery-from-time.txt', to_time
log "Complete"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<xsl:transform version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:oai="http://www.openarchives.org/OAI/2.0/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:marc="http://www.loc.gov/MARC21/slim"
    exclude-result-prefixes="oai xsi marc">

  <!--
    Turns an OAI-PMH ListRecords response carrying MARC21 (slim) records into
    a <batch> document: one <delete> per record whose header is flagged
    status="deleted", one <add> with title, author, subject, collection and
    type fields for every other record.
  -->
  <xsl:output method="xml" encoding="utf-8"/>

  <xsl:template match="/">
    <batch>
      <xsl:for-each select="oai:OAI-PMH/oai:ListRecords/oai:record">
        <!-- Document id: the part of the OAI identifier after its second colon. -->
        <xsl:variable name="id"
            select="substring-after(substring-after(oai:header/oai:identifier,':'),':')"/>
        <xsl:choose>
          <xsl:when test="oai:header[@status = 'deleted']">
            <delete id="{$id}"/>
          </xsl:when>
          <xsl:otherwise>
            <xsl:variable name="rec" select="oai:metadata/marc:record"/>
            <add id="{$id}">
              <field name="title">
                <xsl:value-of select="$rec/marc:datafield[@tag='245']/marc:subfield[@code='a']"/>
              </field>
              <field name="author">
                <xsl:value-of select="$rec/marc:datafield[@tag='100']/marc:subfield[@code='a']"/>
              </field>
              <!-- One subject field per 650 $a whose second indicator is 0. -->
              <xsl:for-each select="$rec/marc:datafield[@tag='650' and @ind2='0']/marc:subfield[@code='a']">
                <field name="subject">
                  <xsl:value-of select="."/>
                </field>
              </xsl:for-each>
              <!-- One collection field per 852, formatted as "$a::$b". -->
              <xsl:for-each select="$rec/marc:datafield[@tag='852']">
                <field name="collection">
                  <xsl:value-of select="marc:subfield[@code='a']"/>
                  <xsl:text>::</xsl:text>
                  <xsl:value-of select="marc:subfield[@code='b']"/>
                </field>
              </xsl:for-each>
              <!-- Leader positions 06 and 07 (XPath substring is 1-based). -->
              <xsl:variable name="ldr6" select="substring($rec/marc:leader, 7, 1)"/>
              <xsl:variable name="ldr7" select="substring($rec/marc:leader, 8, 1)"/>
              <field name="type">
                <!-- logic according to Yoel -->
                <xsl:choose>
                  <!--
                    The second alternative used to repeat 'd'; 'c' completes the
                    usual a/c/d/m set of Leader/07 values for books.
                    NOTE(review): $ldr7 = '' only holds when the leader has fewer
                    than 8 characters; confirm what the 't' case should match.
                  -->
                  <xsl:when test="($ldr6 = 'a' and ($ldr7 = 'a' or $ldr7 = 'c' or $ldr7 = 'd' or $ldr7 = 'm')) or ($ldr6 = 't' and $ldr7 = '')">Book</xsl:when>
                  <xsl:when test="$ldr6 = 'a' and ($ldr7 = 'b' or $ldr7 = 'i' or $ldr7 = 's')">Journal</xsl:when>
                  <xsl:when test="$ldr6 = 'c' or $ldr6 = 'd' or $ldr6 = 'i' or $ldr6 = 'j'">Music</xsl:when>
                  <xsl:when test="$ldr6 = 'e' or $ldr6 = 'f'">Map</xsl:when>
                  <xsl:when test="$ldr6 = 'g' or $ldr6 = 'k' or $ldr6 = 'o' or $ldr6 = 'r'">Visual material</xsl:when>
                  <xsl:when test="$ldr6 = 'm'">Computer file</xsl:when>
                </xsl:choose>
              </field>
            </add>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </batch>
  </xsl:template>
</xsl:transform>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment