Created
December 14, 2015 23:33
-
-
Save dmolesUC/f2df001da389c477b597 to your computer and use it in GitHub Desktop.
Sample data generator for stash-wrapper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby

require 'date'
require 'mime/types'
require 'nokogiri'
require 'rexml/document'
require 'set'
require 'stash/wrapper'

ST = Stash::Wrapper
# ------------------------------------------------------------
# Generate Stash wrapper
# ------------------------------------------------------------
# Generate datacite metadata
# TODO: More realistic DataCite, with files
#
# Builds a DataCite (kernel-3 namespace) <resource> document from the given
# fields and returns its root element. Every interpolated value is
# XML-escaped first, so characters such as '&' or '<' in a field cannot
# produce a malformed document.
#
# @param doi [String] value of the DOI identifier element
# @param creators [Array<Array>] [creator name, affiliation] pairs
# @param title [String] single title
# @param publisher [String] publisher name
# @param pubyear [Integer, String] publication year
# @param subjects [Array<String>] one <subject> element is emitted per entry
# @param resource_type [String] text of the resourceType element
# @param abstract [String] description of type 'Abstract'
# @param methods [String] description of type 'Methods'
# @param usage [String] first description of type 'Other'
# @param grant_number [String] second description of type 'Other'
# @return [REXML::Element] root element of the parsed XML
def datacite(doi:, creators:, title:, publisher:, pubyear:, subjects:, resource_type:, abstract:, methods:, usage:, grant_number:)
  esc = ->(value) { value.to_s.encode(xml: :text) }

  creator_xml = creators.map do |name, affiliation|
    "<dcs:creator><dcs:creatorName>#{esc.call(name)}</dcs:creatorName><dcs:affiliation>#{esc.call(affiliation)}</dcs:affiliation></dcs:creator>"
  end.join("\n")
  subject_xml = subjects.map { |s| "<dcs:subject>#{esc.call(s)}</dcs:subject>" }.join("\n")

  xml_text = "<dcs:resource xmlns:dcs='http://datacite.org/schema/kernel-3'
      xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
      xsi:schemaLocation='http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd'>
    <dcs:identifier identifierType='DOI'>#{esc.call(doi)}</dcs:identifier>
    <dcs:creators>
      #{creator_xml}
    </dcs:creators>
    <dcs:titles>
      <dcs:title>#{esc.call(title)}</dcs:title>
    </dcs:titles>
    <dcs:publisher>#{esc.call(publisher)}</dcs:publisher>
    <dcs:publicationYear>#{esc.call(pubyear)}</dcs:publicationYear>
    <dcs:subjects>
      #{subject_xml}
    </dcs:subjects>
    <dcs:resourceType resourceTypeGeneral='Dataset'>#{esc.call(resource_type)}</dcs:resourceType>
    <dcs:descriptions>
      <dcs:description descriptionType='Abstract'>#{esc.call(abstract)}</dcs:description>
      <dcs:description descriptionType='Methods'>#{esc.call(methods)}</dcs:description>
      <dcs:description descriptionType='Other'>#{esc.call(usage)}</dcs:description>
      <dcs:description descriptionType='Other'>#{esc.call(grant_number)}</dcs:description>
    </dcs:descriptions>
  </dcs:resource>"
  REXML::Document.new(xml_text).root
end
# Builds an OAI Dublin Core (oai_dc) document from the same fields that
# `datacite` accepts and returns its root element. Creator affiliations are
# ignored. Every interpolated value is XML-escaped so that characters such as
# '&' or '<' cannot produce a malformed document.
#
# @param doi [String] emitted as <dc:identifier> with a 'doi:' prefix
# @param creators [Array<Array>] [creator name, affiliation] pairs; only names are used
# @param title [String] single title
# @param publisher [String] publisher name
# @param pubyear [Integer, String] emitted as <dc:date>
# @param subjects [Array<String>] one <dc:subject> element is emitted per entry
# @param resource_type [String] emitted as <dc:type>
# @param abstract [String] first <dc:description>
# @param methods [String] second <dc:description>
# @param usage [String] third <dc:description>
# @param grant_number [String] fourth <dc:description>
# @return [REXML::Element] root element of the parsed XML
def dublin_core(doi:, creators:, title:, publisher:, pubyear:, subjects:, resource_type:, abstract:, methods:, usage:, grant_number:)
  esc = ->(value) { value.to_s.encode(xml: :text) }

  creator_xml = creators.map { |name, _affiliation| "<dc:creator>#{esc.call(name)}</dc:creator>" }.join("\n")
  subject_xml = subjects.map { |s| "<dc:subject>#{esc.call(s)}</dc:subject>" }.join("\n")

  xml_text = "<oai_dc:dc xmlns:oai_dc='http://www.openarchives.org/OAI/2.0/oai_dc/' xmlns:dc='http://purl.org/dc/elements/1.1/' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'>
    <dc:title>#{esc.call(title)}</dc:title>
    #{creator_xml}
    #{subject_xml}
    <dc:description>#{esc.call(abstract)}</dc:description>
    <dc:description>#{esc.call(methods)}</dc:description>
    <dc:description>#{esc.call(usage)}</dc:description>
    <dc:description>#{esc.call(grant_number)}</dc:description>
    <dc:publisher>#{esc.call(publisher)}</dc:publisher>
    <dc:date>#{esc.call(pubyear)}</dc:date>
    <dc:type>#{esc.call(resource_type)}</dc:type>
    <dc:identifier>doi:#{esc.call(doi)}</dc:identifier>
  </oai_dc:dc>"
  REXML::Document.new(xml_text).root
end
# Read-only bundle of the three XML artifacts generated for one sample record:
# the serialized Stash wrapper, the DataCite resource and the OAI Dublin Core
# document.
class Sample
  attr_reader :wrapper_xml, :dcs_resource, :oai_dc

  # @param wrapper_xml the wrapper XML stored as-is
  # @param dcs_resource the DataCite element stored as-is
  # @param oai_dc the Dublin Core element stored as-is
  def initialize(wrapper_xml:, dcs_resource:, oai_dc:)
    @wrapper_xml = wrapper_xml
    @dcs_resource = dcs_resource
    @oai_dc = oai_dc
  end
end
# ------------------------------------------------------------
# Sample text

# Latin source text from which all random words, names and sentences are drawn.
@text = 'Quo usque tandem abutere, Catilina, patientia nostra? Quam diu etiam furor iste tuus nos eludet? Quem ad finem sese effrenata iactabit audacia? Nihil ne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii ceperis, quem nostrum ignorare arbitraris? O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes viri satisfacere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat, in te conferri pestem, quam tu in nos omnes iam diu machinaris. An vero vir amplissumus, Scipio, pontifex maximus, Gracchum mediocriter labefactantem statum rei publicae privatus interfecit; Catilinam orbem terrae caede atque incendiis vastare cupientem nos consules perferemus? Nam illa nimis antiqua praetereo, quod Servilius Ahala Maelium novis rebus studentem manu sua occidit. Fuit, fuit ista quondam in hac re publica virtus, ut viri fortes acrioribus suppliciis civem perniciosum quam acerbissimum hostem coercerent. Habemus senatus consultum in te, Catilina, vehemens et grave, non deest rei publicae consilium neque auctoritas huius ordinis; nos, nos, dico aperte, consules desumus. Decrevit quondam senatus, ut Opimius consul videret, ne quid res publica detrimenti caperet; nox nulla intercessit; interfectus est propter quasdam seditionum suspiciones Gracchus, clarissimo patre, avo, maioribus, occisus est cum liberis Fulvius consularis. 
Simili senatus consulto Mario et Valerio consulibus est permissa res publica; num unum diem postea Saturninum tribunum et Servilium praetorem mors ac rei publicae poena remorata est? At vero nos vicesimum iam diem patimur hebescere aciem horum auctoritatis. Habemus enim huiusce modi senatus consultum, verum inclusum in tabulis tamquam in vagina reconditum, quo ex senatus consulto confestim te interfectum esse, Catilina, convenit. Vivis, et vivis non ad deponendam, sed ad confirmandam audaciam. Cupio, patres conscripti, me esse clementem, cupio in tantis rei publicae periculis me non dissolutum videri, sed iam me ipse inertiae nequitiaeque condemno. Castra sunt in Italia contra populum Romanum in Etruriae faucibus conlocata, crescit in dies singulos hostium numerus; eorum autem castrorum imperatorem ducemque hostium intra moenia atque adeo in senatu videmus intestinam aliquam cotidie perniciem rei publicae molientem. Si te iam, Catilina, comprehendi, si interfici iussero, credo, erit verendum mihi, ne non potius hoc omnes boni serius a me quam quisquam crudelius factum esse dicat. Verum ego hoc, quod iam pridem factum esse oportuit, certa de causa nondum adducor ut faciam. Tum denique interficiere, cum iam nemo tam inprobus, tam perditus, tam tui similis inveniri poterit, qui id non iure factum esse fateatur. Quamdiu quisquam erit, qui te defendere audeat, vives, et vives ita, ut nunc vivis. multis meis et firmis praesidiis obsessus, ne commovere te contra rem publicam possis. Multorum te etiam oculi et aures non sentientem, sicut adhuc fecerunt, speculabuntur atque custodient. Etenim quid est, Catilina, quod iam amplius expectes, si neque nox tenebris obscurare coeptus nefarios nec privata domus parietibus continere voces coniurationis tuae potest, si illustrantur, si erumpunt omnia? Muta iam istam mentem, mihi crede, obliviscere caedis atque incendiorum. Teneris undique; luce sunt clariora nobis tua consilia omnia; quae iam mecum licet recognoscas. 
Meministine me ante diem XII Kalendas Novembris dicere in senatu fore in armis certo die, qui dies futurus esset ante diem VI Kal. Novembris, Manlium, audaciae satellitem atque administrum tuae? Num me fefellit, Catilina, non modo res tanta, tam atrox tamque incredibilis, verum, id quod multo magis est admirandum, dies? Dixi ego idem in senatu caedem te optumatium contulisse in ante diem V Kalendas Novembris, tum cum multi principes civitatis Roma non tam sui conservandi quam tuorum consiliorum reprimendorum causa profugerunt. Num infitiari potes te illo ipso die meis praesidiis, mea diligentia circumclusum commovere te contra rem publicam non potuisse, cum tu discessu ceterorum nostra tamen, qui remansissemus, caede te contentum esse dicebas? Quid? cum te Praeneste Kalendis ipsis Novembribus occupaturum nocturno impetu esse confideres, sensistin illam coloniam meo iussu meis praesidiis, custodiis, vigiliis esse munitam? Nihil agis, nihil moliris, nihil cogitas, quod non ego non modo audiam, sed etiam videam planeque sentiam. Recognosce tandem mecum noctem illam superiorem; iam intelleges multo me vigilare acrius ad salutem quam te ad perniciem rei publicae. Dico te priore nocte venisse inter falcarios--non agam obscure--in Laecae domum; convenisse eodem complures eiusdem amentiae scelerisque socios. Num negare audes? quid taces? Convincam, si negas. Video enim esse hic in senatu quosdam, qui tecum una fuerunt. O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! 
Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem. Reperti sunt duo equites Romani, qui te ista cura liberarent et sese illa ipsa nocte paulo ante lucem me in meo lectulo interfecturos esse pollicerentur.'

# Whitespace-separated tokens, punctuation still attached.
@words = @text.split(' ')
# Tokens stripped of everything but ASCII letters; one-letter results are dropped.
@words_plain = @words.map { |t| t.delete('^A-Za-z') }.select { |w| w.length > 1 }
# Sorted, de-duplicated, lower-cased vocabulary.
@words_unique = @words_plain.map(&:downcase).uniq.sort
# The lowercase letters 'a'..'z'.
@letters = ('a'..'z').to_a
# Returns `len` consecutive elements of `list` beginning at index `start`,
# wrapping around to the front of the list as often as needed.
#
# Implemented with modular indexing rather than recursion; an empty list
# yields an empty result instead of raising ZeroDivisionError.
#
# @param list [Array] source elements
# @param start [Integer] starting index; reduced modulo the list length
# @param len [Integer] number of elements to return
# @return [Array] exactly `len` elements (empty when `list` is empty)
def take(list, start, len)
  return [] if list.empty?
  Array.new(len) { |offset| list[(start + offset) % list.length] }
end
# Returns `len` consecutive elements of `list` (wrapping around, as done by
# `take`) starting at a randomly chosen index.
#
# @param list [Array] source elements
# @param len [Integer] number of elements to return
# @return [Array] whatever `take` returns for the random start index
def take_sublist(list, len)
  take(list, rand(list.length), len)
end
# Picks `count` distinct elements of `list` at random and returns them sorted.
#
# When `list` holds fewer than `count` distinct elements, all of them are
# returned; the previous retry-until-full loop never terminated in that case.
#
# @param list [Array] candidate elements (must be mutually comparable for sorting)
# @param count [Integer] number of distinct elements wanted; values below 1 yield []
# @return [Array] sorted array of at most `count` distinct elements
def take_random(list, count)
  list.uniq.sample([count, 0].max).sort
end
# Builds a run of `word_count` consecutive words from @words (via
# `take_sublist`), joins them with spaces, capitalizes the result (which also
# lower-cases everything after the first character) and strips any trailing
# characters that are not lowercase ASCII letters.
#
# @param word_count [Integer] number of words to take
# @return [String] text ending in a lowercase letter, or '' if nothing remains
def random_text(word_count)
  text = take_sublist(@words, word_count).join(' ').capitalize
  # One anchored substitution replaces the former character-by-character loop.
  text.sub(/[^a-z]+\z/, '')
end
# Returns random text cut off just before its first sentence-ending character
# ('.', '?', '!' or '-').
#
# When the text contains no such character the whole text is returned.
# Previously the fallback length was `text.length - 1`, which dropped the
# final character and produced nil for empty text.
#
# @param word_count [Integer] number of words requested from `random_text`
# @return [String] the leading sentence fragment (possibly empty)
def random_sentence(word_count)
  text = random_text(word_count)
  end_index = text.index(/[.?!-]/) || text.length
  text[0, end_index]
end
# Returns one randomly chosen entry of @words_unique.
#
# @return [String] a word from the unique-word list
def random_word
  random_from(@words_unique)
end
# Returns `count` independently chosen random words (repeats are possible).
#
# @param count [Integer] number of words to generate
# @return [Array<String>] the generated words
def random_words(count)
  count.times.map { random_word }
end
# Produces a random positive integer by accumulating uniform random numbers.
#
# Up to `max - 1` values are drawn from `rand`. Drawing stops early as soon as
# a value exceeds `chance`; that value is not added. The result is 1 plus the
# rounded sum of the accepted values.
#
# A `max` of 0 now returns 1 instead of raising NoMethodError on nil.
#
# @param max [Integer] one more than the maximum number of draws
# @param chance [Float] threshold; a draw above it ends the accumulation
# @return [Integer] a value of at least 1
def iterative_sum(max, chance)
  sum = 0.0
  (1...max).each do
    rval = rand
    break if rval > chance
    sum += rval
  end
  1 + sum.round
end
# Random positive integer from `iterative_sum` with a continuation threshold
# of 0.999, the highest of the three thresholds used in this script.
#
# @param max [Integer] passed through to `iterative_sum`
# @return [Integer] the value produced by `iterative_sum`
def less_than(max)
  iterative_sum(max, 0.999)
end
# Random positive integer from `iterative_sum` with a continuation threshold
# of 0.992, so accumulation tends to stop earlier than in `less_than`.
#
# @param max [Integer] passed through to `iterative_sum`
# @return [Integer] the value produced by `iterative_sum`
def much_less_than(max)
  iterative_sum(max, 0.992)
end
# Random positive integer from `iterative_sum` with a continuation threshold
# of 0.973, the lowest of the three thresholds used in this script.
#
# @param max [Integer] passed through to `iterative_sum`
# @return [Integer] the value produced by `iterative_sum`
def very_much_less_than(max)
  iterative_sum(max, 0.973)
end
# Generates a random number of random names; the number is drawn with
# `much_less_than(max)`.
#
# @param max [Integer] bound handed to `much_less_than`
# @return [Array<String>] the generated names
def random_names(max)
  much_less_than(max).times.map { random_name }
end
# Generates a name made of capitalized random words separated by spaces.
# The word count is drawn with `less_than(4)` and raised to 2 when lower.
#
# @return [String] the generated name
def random_name
  word_count = [less_than(4), 2].max
  random_words(word_count).map(&:capitalize).join(' ')
end
# Returns one randomly chosen element of `list`.
#
# Uses Array#sample, so an empty list yields nil. The previous
# `list[rand(0...list.length)]` raised TypeError there, because `rand` returns
# nil for an empty range.
#
# @param list [Array] candidate elements
# @return [Object, nil] a random element, or nil when `list` is empty
def random_from(list)
  list.sample
end
# Returns `count` elements of `list`, each chosen independently with
# `random_from` (so the same element may appear more than once).
#
# @param list [Array] candidate elements
# @param count [Integer] number of picks
# @return [Array] the picked elements
def random_list_from(list, count)
  count.times.map { random_from(list) }
end
# Pools of pre-generated values that the per-record field generators draw from.

# Only MIME types that declare at least one file extension are usable for file names.
@all_mime_types = MIME::Types.to_a.reject { |mt| mt.extensions.empty? }
# 20 picks (repeats possible) from the usable MIME types.
@mime_types = random_list_from(@all_mime_types, 20)
@authors = Array.new(1000) { random_name }
@publishers = Array.new(100) { random_words(much_less_than(10)).map(&:capitalize).join(' ') }
@affiliations = Array.new(100) { random_words(much_less_than(10)).map(&:capitalize).join(' ') }
@resource_types = Array.new(20) { random_words(less_than(3)).join(' ') }
# ------------------------------------------------------------
# Random sample fields
# Builds a DOI-shaped string for the record with the given index:
# "10.<registrant>/<3 letters><1_000_000 + index>", where the registrant is
# 20_000 plus a random amount and the letters are three consecutive entries
# of @letters.
#
# @param index [Integer] record number, added to 1_000_000 to form the suffix
# @return [String] the generated DOI
def doi(index)
  prefix = take_sublist(@letters, 3).join
  registrant = 20_000 + less_than(10_000)
  "10.#{registrant}/#{prefix}#{1_000_000 + index}"
end
# Generates a random number (drawn with `less_than(10)`) of creators, each a
# pair of a random author from @authors and a random affiliation from
# @affiliations.
#
# @return [Array<Array>] [author, affiliation] pairs
def creators
  less_than(10).times.map { [random_from(@authors), random_from(@affiliations)] }
end
# Generates a random sentence to use as a title, retrying until the sentence
# is longer than 10 characters.
#
# @return [String] a sentence with more than 10 characters
def title
  loop do
    candidate = random_sentence(much_less_than(100))
    return candidate if candidate.length > 10
  end
end
# Returns a random entry of @publishers.
#
# @return [String] a publisher name
def publisher
  random_from(@publishers)
end
# Returns a random publication year between 2000 and 2014 inclusive.
#
# @return [Integer] the year
def pub_year
  rand(2000..2014)
end
# Picks a random number (drawn with `much_less_than(20)`) of distinct keywords
# from @words_unique using `take_random`.
#
# @return [Array<String>] the chosen keywords
def subjects
  take_random(@words_unique, much_less_than(20))
end
# Returns a random entry of @resource_types.
#
# @return [String] a resource type label
def resource_type
  random_from(@resource_types)
end
# Generates random abstract text: `random_text` with a word count drawn from
# `less_than(500)`, followed by a full stop.
#
# @return [String] the abstract
def abstract
  "#{random_text(less_than(500))}."
end
# Generates random "methods" text: `random_text` with a word count drawn from
# `much_less_than(500)`, followed by a full stop.
#
# NOTE(review): a top-level `def methods` shadows Ruby's built-in
# Object#methods for the whole process; confirm nothing loaded by this script
# relies on the built-in before keeping this name.
#
# @return [String] the methods description
def methods
  "#{random_text(much_less_than(500))}."
end
# Generates random usage text: `random_text` with a word count drawn from
# `less_than(500)`, followed by a full stop.
#
# @return [String] the usage notes
def usage
  "#{random_text(less_than(500))}."
end
# Generates an upper-cased, grant-number-like string of the shape
# "<d> <L><n> <LL><number>-<n><L><n>", where the letters are leading characters
# of random words and the numbers are random.
#
# The pieces are listed in the order they appear in the result, so the random
# calls run in the same order as the original single interpolated string.
#
# @return [String] the generated grant number
def grant_number
  pieces = [
    rand(10), ' ',
    random_word[0], rand(20), ' ',
    random_word[0..1], 100_000 + less_than(1_000_000), '-',
    rand(100), random_word[0], rand(30)
  ]
  pieces.join.upcase
end
# Maps an embargo length in months to its label.
#
# @param period_months [Integer] embargo length in months
# @return [String] '1 month' for 1, '6 months' for 6, '1 year' for anything else
def period_for(period_months)
  case period_months
  when 1 then '1 month'
  when 6 then '6 months'
  else '1 year'
  end
end
# Builds an embargo of the given type that ends on `end_date` and lasts 1, 6
# or 12 months, each equally likely.
#
# The start date is `end_date` shifted back by whole calendar months
# (Date#<<), so it matches the '1 month' / '6 months' / '1 year' label. The
# previous code subtracted a fractional number of days (months * 365.24 / 12).
#
# @param embargo_type an ST::EmbargoType value, passed through unchanged
# @param end_date [Date] last day of the embargo
# @return [ST::Embargo] the generated embargo
def embargo_for_type(embargo_type, end_date)
  period_months = [1, 6, 12].sample
  start_date = end_date << period_months
  ST::Embargo.new(
    type: embargo_type,
    period: period_for(period_months),
    start_date: start_date,
    end_date: end_date
  )
end
# Builds the "no embargo" record: type NONE, period 'none', and the same date
# for both start and end.
#
# @param end_date [Date] date used for both start_date and end_date
# @return [ST::Embargo] the generated embargo
def no_embargo(end_date)
  ST::Embargo.new(
    type: ST::EmbargoType::NONE,
    period: 'none',
    start_date: end_date,
    end_date: end_date
  )
end
# Generates a random embargo ending on a random date in `year`. The day of the
# month is limited to 1..28 so the date is valid in every month.
#
# @param year [Integer] year of the embargo end date
# @return [ST::Embargo] a "none" embargo when the randomly chosen type is
#   NONE, otherwise an embargo built by `embargo_for_type`
def embargo(year)
  end_date = Date.new(year, rand(1..12), rand(1..28))
  embargo_type = random_from(ST::EmbargoType.values)
  return no_embargo(end_date) if embargo_type == ST::EmbargoType::NONE
  embargo_for_type(embargo_type, end_date)
end
# Generates a random number (drawn with `less_than(10)`) of file entries.
# Each has a random-word file name whose extension is the first extension of
# a MIME type picked from @mime_types, and a random size in bytes.
#
# @return [Array<ST::StashFile>] the generated file entries
def files
  less_than(10).times.map do
    mime_type = random_from(@mime_types)
    pathname = "#{random_word}.#{mime_type.extensions.first}"
    size_bytes = less_than(100) + less_than(1000) * much_less_than(1000)
    ST::StashFile.new(pathname: pathname, size_bytes: size_bytes, mime_type: mime_type)
  end
end
# Generates one complete sample record: a Stash wrapper (serialized with
# `save_to_xml`) that embeds a DataCite resource, plus the matching DataCite
# and Dublin Core documents built from the same descriptive fields.
#
# The descriptive fields are collected once in a hash and splatted into both
# `datacite` and `dublin_core`, so the two documents cannot drift apart.
# Random generators are called in the same order as before.
#
# @param i [Integer] record index, used to build the DOI
# @return [Sample] the wrapper XML, DataCite element and Dublin Core element
def gen_sample(i)
  st_year = pub_year
  st_doi = doi(i)
  st_identifier = ST::Identifier.new(type: ST::IdentifierType::DOI, value: st_doi)
  st_license = ST::License::CC_BY
  st_embargo = embargo(st_year)
  # The single version is dated at the start of the embargo.
  st_version = ST::Version.new(number: 1, date: st_embargo.start_date, note: random_sentence(less_than(10)))
  st_inventory = ST::Inventory.new(files: files)

  metadata = {
    doi: st_doi,
    creators: creators,
    title: title,
    publisher: publisher,
    pubyear: st_year,
    subjects: subjects,
    resource_type: resource_type,
    abstract: abstract,
    methods: methods,
    usage: usage,
    grant_number: grant_number
  }

  dcs_resource = datacite(**metadata)
  wrapper = ST::StashWrapper.new(
    identifier: st_identifier,
    version: st_version,
    license: st_license,
    embargo: st_embargo,
    inventory: st_inventory,
    descriptive_elements: [dcs_resource]
  )
  wrapper_xml = wrapper.save_to_xml
  oai_dc = dublin_core(**metadata)

  Sample.new(wrapper_xml: wrapper_xml, dcs_resource: dcs_resource, oai_dc: oai_dc)
end
# ------------------------------------------------------------
# Main
#
# With a COUNT argument: generates COUNT samples and writes three files per
# sample into the current directory (stash_wrapper-N.xml,
# stash_dcs_resource-N.xml, stash_oai_dc-N.xml), printing each absolute path.
# Without an argument: prints one sample's three documents to standard output.

formatter = REXML::Formatters::Pretty.new
formatter.compact = true

count = ARGV[0].to_i if ARGV[0]
if count
  # A non-numeric, zero or negative COUNT used to crash inside Math.log.
  abort "Usage: #{$PROGRAM_NAME} [COUNT] (COUNT must be a positive integer)" unless count > 0

  # Pad record numbers to the width of the largest one. The previous
  # `1 + Math.log(count, 10).round` over-padded for some counts (e.g. 5).
  digits = count.to_s.length
  (1..count).each do |i|
    sample = gen_sample(i)
    record_num = i.to_s.rjust(digits, '0')
    outputs = {
      'stash_wrapper' => sample.wrapper_xml,
      'stash_dcs_resource' => sample.dcs_resource,
      'stash_oai_dc' => sample.oai_dc
    }
    outputs.each do |prefix, xml|
      File.open("#{prefix}-#{record_num}.xml", 'w') do |f|
        formatter.write(xml, f)
        puts File.absolute_path(f.path)
      end
    end
  end
else
  sample = gen_sample(1)
  formatter.write(sample.wrapper_xml, $stdout)
  puts
  formatter.write(sample.dcs_resource, $stdout)
  puts
  formatter.write(sample.oai_dc, $stdout)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment