Skip to content

Instantly share code, notes, and snippets.

@no-reply
Created August 5, 2013 19:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save no-reply/6158616 to your computer and use it in GitHub Desktop.
Save no-reply/6158616 to your computer and use it in GitHub Desktop.
Blobs of bagit-for-hydra stuff
module Hybag
  # Error type for bag imports; Hybag.ingest raises it when a bag is
  # incomplete, inconsistent or invalid.
  class BagImportError < StandardError; end
end
require 'bagit'
module Hybag
module Baggable
# Writes this object's datastreams to disk as a BagIt bag.
#
# Any bag already present for this object under +path+ is deleted first.
# Every datastream whose content is not nil becomes one file, named after the
# datastream label plus the extension from #mime_extension:
# * datastreams listed in #bag_tags are written as tag files,
# * datastreams listed in #bag_fedora_tags are written as tag files under
#   "fedora/",
# * everything else is written as a payload file.
# Both manifests are regenerated before the bag is returned.
#
# @param path [String] location handed to #bag_dir and #delete_bag
# @return [BagIt::Bag] the bag that was written
# @raise [Exception] when the object's pid is the '__DO_NOT_USE__' placeholder
def write_bag(path = '')
  # The Exception class itself is kept so callers that already rescue it
  # keep working; only the message is new.
  raise Exception, "refusing to bag an object with the placeholder pid '__DO_NOT_USE__'" if pid == '__DO_NOT_USE__'

  # delete any existing bags before making a new one
  delete_bag(path)
  target = bag_dir(path)
  FileUtils.mkdir_p(target) unless File.directory?(target)
  bag = BagIt::Bag.new(target)

  # These lookups do not depend on the datastream being visited, so they are
  # resolved once instead of once per iteration.
  tag_streams = bag_tags
  fedora_tag_streams = bag_fedora_tags.values

  #TODO: Writing to bag files is naive; reads file out and writes it.
  # Possibly there is a better way to do this.
  datastreams.each do |label, ds|
    content = ds.content
    next if content.nil?

    file_name = label + mime_extension(ds)
    if tag_streams.include?(ds)
      bag.add_tag_file(file_name) { |f| f.puts content }
    elsif fedora_tag_streams.include?(ds)
      bag.add_tag_file('fedora/' + file_name) { |f| f.puts content }
    else
      # Payload files must be byte-for-byte copies: #write (unlike #puts)
      # never appends a newline, and the encoding is re-tagged on a copy so
      # the datastream's own content string is left untouched.
      bag.add_file(file_name) { |f| f.write content.dup.force_encoding('UTF-8') }
    end
  end

  bag.tagmanifest!
  bag.manifest!
  bag
end
# just an alias for the export job's perform method against self
# Enqueues an Exporter job for this object through Resque.
#
# @param path [String] passed through unchanged as the job's second argument
# @return the value returned by Resque.enqueue
def queue_bag_export(path = '')
  job_arguments = [pid, path]
  Resque.enqueue(Exporter, *job_arguments)
end
# Removes this object's bag directory (as resolved by #bag_dir) when it exists.
#
# @param path [String] location handed to #bag_dir
# @return [nil] when there is no such directory; otherwise the result of
#   FileUtils.rm_rf
def delete_bag(path = '')
  target = bag_dir(path)
  return unless File.directory?(target)

  FileUtils.rm_rf(target)
end
private
# create a safe cross-platform bag path
# Resolves the directory holding this object's bag.
#
# A +path+ that does not already begin with "<Rails.root>/tmp/bags" is joined
# onto that directory; the pid, passed through String#safe_filename, is then
# appended as the final path component.
#
# @param path [String, Pathname] relative sub-path, or a path already under
#   the bags directory
# @return [String] the bag directory for this object
def bag_dir(path)
  # TODO: make bag directory configurable?
  bags_root = Rails.root.join("tmp/bags")
  base = if path.to_s.start_with?(bags_root.to_s)
    path
  else
    Rails.root.join("tmp/bags", path)
  end
  File.join(base, pid.safe_filename)
end
#TODO: allow selection of specific content datastreams to bag
# to ignore thumbnails and other derivatives, for example.
# return all content files for bag
# Returns the datastreams that count as bag payload: every datastream that is
# neither listed in #bag_tags nor one of the #bag_fedora_tags datastreams.
#
# #bag_fedora_tags is keyed by label, so membership is tested against its
# values (the datastream objects), the same way #write_bag does.
#
# @return [Hash] label => datastream pairs for payload datastreams
def bag_contents
  tag_streams = bag_tags
  fedora_tag_streams = bag_fedora_tags.values
  datastreams.reject do |_label, ds|
    tag_streams.include?(ds) || fedora_tag_streams.include?(ds)
  end
end
# return all non-fedora tag files
# Datastreams treated as (non-fedora) tag files: delegates to
# #metadata_streams.
#
# @return the value of #metadata_streams
def bag_tags
  metadata_streams
end
# return fedora tag files
# Selects the datastreams written under "fedora/" in the bag: any
# ActiveFedora::RelsExtDatastream, plus the datastream whose dsid is "DC".
#
# @return the matching subset of #datastreams
def bag_fedora_tags
  datastreams.select do |_label, ds|
    ds.is_a?(ActiveFedora::RelsExtDatastream) || ds.dsid == "DC"
  end
end
# Picks the file-name extension (including the leading dot) for a datastream.
#
# * ActiveFedora::NtriplesRDFDatastream instances always get ".nt".
# * Otherwise the first extension that MIME::Types lists for the datastream's
#   mimeType is used.
# * When the mimeType is nil/blank, is not known to MIME::Types, or has no
#   extensions, an empty string is returned so that file names do not end in
#   a dangling "." and the lookup never calls a method on nil.
#
# @param ds [#mimeType] the datastream being bagged
# @return [String] ".ext", or "" when no extension can be determined
def mime_extension(ds)
  return '.nt' if ds.kind_of?(ActiveFedora::NtriplesRDFDatastream)

  mime = ds.mimeType.to_s
  return '' if mime.empty?

  type = MIME::Types[mime].first
  ext = type && type.extensions.first
  ext.to_s.empty? ? '' : '.' + ext
end
end
end
require 'bagit'
require 'filemagic'
require 'rdf/ntriples'
module Hybag
# try to ingest the bag
# Builds and saves a repository object from a bag.
#
# Steps, in order:
# 1. reject the bag unless it is complete, consistent and valid;
# 2. resolve the model name and collection pids via get_relations, creating
#    any Collection that does not exist yet;
# 3. instantiate the model and add every payload file as a file datastream
#    (the basename without extension is used as both label and dsid, the
#    MIME type is sniffed with FileMagic);
# 4. record the collections on descMetadata, save, then import the bag's
#    descriptive metadata;
# 5. call item.review! only when +needs_review+ is false.
#
# @param bag [BagIt::Bag] assumed to be a BagIt::Bag; only bag_dir, bag_files,
#   complete?, consistent? and valid? are used here
# @param needs_review [Boolean] pass false to have review! called on the item
# @return the saved item
# @raise [BagImportError] when the bag fails any of the three checks
def self.ingest(bag, needs_review=true)
  raise BagImportError, "Bag is incomplete: #{bag.bag_dir}" unless bag.complete?
  raise BagImportError, "Bag is inconsistent." unless bag.consistent?
  raise BagImportError, "Bag is invalid." unless bag.valid?

  model, collections = get_relations(bag)
  collections.each do |coll_pid|
    next if Collection.exists?(coll_pid)

    coll = Collection.new(pid: coll_pid)
    coll.title = OregonDigital::IdService.noidify(coll.pid)
    coll.save
  end

  item = model.constantize.new

  # One FileMagic handle serves every file and is always released.
  magic = FileMagic.new(FileMagic::MAGIC_MIME)
  begin
    bag.bag_files.each do |file|
      label = File.basename(file, '.*')
      opts = {
        :mimeType => magic.file(file).split(';')[0],
        :label => label,
        :dsid => label
      }
      # forcing binary may not work for all content?
      # File.binread returns the bytes as a binary string and closes the file.
      item.add_file_datastream(File.binread(file), opts)
    end
  ensure
    magic.close
  end

  item.descMetadata.set = collections
  # save to generate a pid
  item.save
  import_desc_metadata(item, bag)
  #TODO: add other tag files
  item.review! unless needs_review
  item
end
# Determines which model to instantiate for a bag and which collections it
# belongs to.
#
# Sources, in priority order:
# 1. "fedora/RELS-EXT.rdf" inside the bag (parsed by item_from_rels);
# 2. "hybag.yml" inside the bag, using its "model" and "collections" keys;
# 3. defaults: model "GenericAsset" and no collections.
#
# A hybag.yml that is empty, is not a mapping, or omits a key falls back to
# the defaults for the missing parts instead of returning nil.
#
# @param bag [#bag_dir] the bag being ingested
# @return [Array] two elements: the model name and an Array of collections
def self.get_relations(bag)
  rels_path = File.join(bag.bag_dir, "fedora", "RELS-EXT.rdf")
  conf_path = File.join(bag.bag_dir, "hybag.yml")

  if File.exist?(rels_path)
    model, collections = item_from_rels(rels_path)
  elsif File.exist?(conf_path)
    # NOTE(review): YAML.load is run on a file shipped inside the bag. If bags
    # can come from untrusted sources, switch to YAML.safe_load.
    bagconf = YAML.load(File.read(conf_path))
    bagconf = {} unless bagconf.is_a?(Hash)
    model = bagconf["model"] || "GenericAsset"
    collections = Array(bagconf["collections"])
  else
    #TODO: Fall back to ingest form if no RELS/config
    model = "GenericAsset"
    collections = []
  end
  return model, collections
end
private
# Write descMetadata
# Copies the statements of the bag's "descMetadata.nt" into the item's
# descMetadata datastream and saves the item.
#
# Statements whose subject equals the graph's first subject are re-pointed at
# the item's own rdf_subject; every other statement is appended unchanged.
#
# NOTE(review): this is a `def self.` method, so a bare `private` keyword
# does not restrict it; it stays publicly callable.
#
# @param item the object being ingested (uses descMetadata and save)
# @param bag [#bag_dir] the bag being ingested
# @return the result of item.save
def self.import_desc_metadata(item, bag)
  #TODO: what if descMetadata comes in other formats?
  #TODO: if there is more than one RDF datastream, merge the graph
  nt_path = File.join(bag.bag_dir, 'descMetadata.nt')
  triples = RDF::Graph.load(nt_path)

  # This assumes that the first subject in the RDF is the bag item
  #TODO: actually figure out which subject to overwrite
  # could do this by trying to find one which is not also
  # an object or predicate
  bag_subject = triples.first_subject

  triples.each_statement do |stmt|
    metadata = item.descMetadata
    # Overwrite the subject if necessary
    subject = stmt.subject == bag_subject ? metadata.rdf_subject : stmt.subject
    metadata.append(subject, stmt.predicate, stmt.object)
  end

  item.save
end
# Extract model and collection associations from a Fedora RELS-EXT file
# Reads the model name and collection memberships out of an RDF file.
#
# * Model: the object of the first hasModel statement, with the
#   "info:fedora/afmodel:" prefix removed when present; "GenericAsset" when
#   there is no such statement. The prefix is stripped on a copy, so a value
#   without the prefix (or a frozen string) no longer raises.
# * Collections: the objects of all isMemberOf statements, as strings.
#
# @param file [String] path of the RDF file to load
# @return [Array] two elements: the model name and an Array of collection
#   identifiers
def self.item_from_rels(file)
  #TODO: Move this method somewhere it can be used by other modules?
  model_predicate = "info:fedora/fedora-system:def/model#hasModel"
  coll_predicate = "info:fedora/fedora-system:def/relations-external#isMemberOf"
  model_prefix = "info:fedora/afmodel:"

  # Materialise the statements once; both lookups below reuse them.
  statements = RDF::Graph.load(file).to_a

  model_statement = statements.find { |st| st.predicate == model_predicate }
  model = if model_statement
    model_statement.object.to_s.sub(model_prefix, '')
  else
    'GenericAsset'
  end

  collections = statements
    .select { |st| st.predicate == coll_predicate }
    .map { |st| st.object.to_s }

  [model, collections]
end
end
@tpendragon
Copy link

In OregonDigital's case I think we can strip out the majority of the collection stuff here and count on the metadata to be correct and handle all the association stuff after the model gets saved.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment