Skip to content

Instantly share code, notes, and snippets.

@dmolesUC
Created December 14, 2015 23:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dmolesUC/f2df001da389c477b597 to your computer and use it in GitHub Desktop.
Save dmolesUC/f2df001da389c477b597 to your computer and use it in GitHub Desktop.
Sample data generator for stash-wrapper
#!/usr/bin/env ruby
require 'mime/types'
require 'nokogiri'
require 'set'
require 'stash/wrapper'
ST = Stash::Wrapper
# ------------------------------------------------------------
# Generate Stash wrapper
# ------------------------------------------------------------
# Generate DataCite metadata
# TODO: Generate more realistic DataCite metadata, including files
# Builds a DataCite (kernel-3) `dcs:resource` element from the given values.
#
# Every interpolated value is XML-escaped before it is placed in the markup,
# so values containing `&`, `<` or `>` produce a well-formed document instead
# of making the REXML parser raise.
#
# creators      - enumerable of [creator name, affiliation] pairs
# subjects      - enumerable of subject strings
# abstract, methods, usage, grant_number - each emitted as a dcs:description
#
# Returns the root REXML element of the parsed document.
def datacite(doi:, creators:, title:, publisher:, pubyear:, subjects:, resource_type:, abstract:, methods:, usage:, grant_number:)
  # to_s first so non-string values (e.g. an Integer year) can be escaped too.
  esc = ->(value) { value.to_s.encode(xml: :text) }

  creator_xml = creators.map do |cn, ca|
    "<dcs:creator><dcs:creatorName>#{esc.call(cn)}</dcs:creatorName><dcs:affiliation>#{esc.call(ca)}</dcs:affiliation></dcs:creator>"
  end.join("\n")
  subject_xml = subjects.map { |s| "<dcs:subject>#{esc.call(s)}</dcs:subject>" }.join("\n")

  xml_text = [
    "<dcs:resource xmlns:dcs='http://datacite.org/schema/kernel-3'",
    "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'",
    "xsi:schemaLocation='http://datacite.org/schema/kernel-3",
    "http://schema.datacite.org/meta/kernel-3/metadata.xsd'>",
    "<dcs:identifier identifierType='DOI'>#{esc.call(doi)}</dcs:identifier>",
    '<dcs:creators>',
    creator_xml,
    '</dcs:creators>',
    '<dcs:titles>',
    "<dcs:title>#{esc.call(title)}</dcs:title>",
    '</dcs:titles>',
    "<dcs:publisher>#{esc.call(publisher)}</dcs:publisher>",
    "<dcs:publicationYear>#{esc.call(pubyear)}</dcs:publicationYear>",
    '<dcs:subjects>',
    subject_xml,
    '</dcs:subjects>',
    "<dcs:resourceType resourceTypeGeneral='Dataset'>#{esc.call(resource_type)}</dcs:resourceType>",
    '<dcs:descriptions>',
    "<dcs:description descriptionType='Abstract'>#{esc.call(abstract)}</dcs:description>",
    "<dcs:description descriptionType='Methods'>#{esc.call(methods)}</dcs:description>",
    "<dcs:description descriptionType='Other'>#{esc.call(usage)}</dcs:description>",
    "<dcs:description descriptionType='Other'>#{esc.call(grant_number)}</dcs:description>",
    '</dcs:descriptions>',
    '</dcs:resource>'
  ].join("\n")

  REXML::Document.new(xml_text).root
end
# Builds an OAI Dublin Core `oai_dc:dc` element from the given values.
#
# Every interpolated value is XML-escaped before it is placed in the markup,
# so values containing `&`, `<` or `>` produce a well-formed document instead
# of making the REXML parser raise.
#
# creators - enumerable of [creator name, affiliation] pairs; only the name
#            is emitted here
# subjects - enumerable of subject strings
# abstract, methods, usage, grant_number - each emitted as a dc:description
#
# Returns the root REXML element of the parsed document.
def dublin_core(doi:, creators:, title:, publisher:, pubyear:, subjects:, resource_type:, abstract:, methods:, usage:, grant_number:)
  # to_s first so non-string values (e.g. an Integer year) can be escaped too.
  esc = ->(value) { value.to_s.encode(xml: :text) }

  creator_xml = creators.map { |cn, _ca| "<dc:creator>#{esc.call(cn)}</dc:creator>" }.join("\n")
  subject_xml = subjects.map { |s| "<dc:subject>#{esc.call(s)}</dc:subject>" }.join("\n")

  xml_text = [
    "<oai_dc:dc xmlns:oai_dc='http://www.openarchives.org/OAI/2.0/oai_dc/' xmlns:dc='http://purl.org/dc/elements/1.1/' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd'>",
    "<dc:title>#{esc.call(title)}</dc:title>",
    creator_xml,
    subject_xml,
    "<dc:description>#{esc.call(abstract)}</dc:description>",
    "<dc:description>#{esc.call(methods)}</dc:description>",
    "<dc:description>#{esc.call(usage)}</dc:description>",
    "<dc:description>#{esc.call(grant_number)}</dc:description>",
    "<dc:publisher>#{esc.call(publisher)}</dc:publisher>",
    "<dc:date>#{esc.call(pubyear)}</dc:date>",
    "<dc:type>#{esc.call(resource_type)}</dc:type>",
    "<dc:identifier>doi:#{esc.call(doi)}</dc:identifier>",
    '</oai_dc:dc>'
  ].join("\n")

  REXML::Document.new(xml_text).root
end
# Read-only bundle of the three XML artifacts generated for one sample record:
# the Stash wrapper, the DataCite resource and the OAI Dublin Core document.
class Sample
  attr_reader :wrapper_xml, :dcs_resource, :oai_dc

  # All three values are required keyword arguments and are stored as given.
  def initialize(wrapper_xml:, dcs_resource:, oai_dc:)
    @wrapper_xml, @dcs_resource, @oai_dc = wrapper_xml, dcs_resource, oai_dc
  end
end
# ------------------------------------------------------------
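# Sample text: Latin prose (the opening of Cicero's first Catilinarian oration) used as the source of words for all generated values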
@text = 'Quo usque tandem abutere, Catilina, patientia nostra? Quam diu etiam furor iste tuus nos eludet? Quem ad finem sese effrenata iactabit audacia? Nihil ne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic munitissimus habendi senatus locus, nihil horum ora voltusque moverunt? Patere tua consilia non sentis, constrictam iam horum omnium scientia teneri coniurationem tuam non vides? Quid proxima, quid superiore nocte egeris, ubi fueris, quos convocaveris, quid consilii ceperis, quem nostrum ignorare arbitraris? O tempora, o mores! Senatus haec intellegit. Consul videt; hic tamen vivit. Vivit? immo vero etiam in senatum venit, fit publici consilii particeps, notat et designat oculis ad caedem unum quemque nostrum. Nos autem fortes viri satisfacere rei publicae videmur, si istius furorem ac tela vitemus. Ad mortem te, Catilina, duci iussu consulis iam pridem oportebat, in te conferri pestem, quam tu in nos omnes iam diu machinaris. An vero vir amplissumus, Scipio, pontifex maximus, Gracchum mediocriter labefactantem statum rei publicae privatus interfecit; Catilinam orbem terrae caede atque incendiis vastare cupientem nos consules perferemus? Nam illa nimis antiqua praetereo, quod Servilius Ahala Maelium novis rebus studentem manu sua occidit. Fuit, fuit ista quondam in hac re publica virtus, ut viri fortes acrioribus suppliciis civem perniciosum quam acerbissimum hostem coercerent. Habemus senatus consultum in te, Catilina, vehemens et grave, non deest rei publicae consilium neque auctoritas huius ordinis; nos, nos, dico aperte, consules desumus. Decrevit quondam senatus, ut Opimius consul videret, ne quid res publica detrimenti caperet; nox nulla intercessit; interfectus est propter quasdam seditionum suspiciones Gracchus, clarissimo patre, avo, maioribus, occisus est cum liberis Fulvius consularis. 
Simili senatus consulto Mario et Valerio consulibus est permissa res publica; num unum diem postea Saturninum tribunum et Servilium praetorem mors ac rei publicae poena remorata est? At vero nos vicesimum iam diem patimur hebescere aciem horum auctoritatis. Habemus enim huiusce modi senatus consultum, verum inclusum in tabulis tamquam in vagina reconditum, quo ex senatus consulto confestim te interfectum esse, Catilina, convenit. Vivis, et vivis non ad deponendam, sed ad confirmandam audaciam. Cupio, patres conscripti, me esse clementem, cupio in tantis rei publicae periculis me non dissolutum videri, sed iam me ipse inertiae nequitiaeque condemno. Castra sunt in Italia contra populum Romanum in Etruriae faucibus conlocata, crescit in dies singulos hostium numerus; eorum autem castrorum imperatorem ducemque hostium intra moenia atque adeo in senatu videmus intestinam aliquam cotidie perniciem rei publicae molientem. Si te iam, Catilina, comprehendi, si interfici iussero, credo, erit verendum mihi, ne non potius hoc omnes boni serius a me quam quisquam crudelius factum esse dicat. Verum ego hoc, quod iam pridem factum esse oportuit, certa de causa nondum adducor ut faciam. Tum denique interficiere, cum iam nemo tam inprobus, tam perditus, tam tui similis inveniri poterit, qui id non iure factum esse fateatur. Quamdiu quisquam erit, qui te defendere audeat, vives, et vives ita, ut nunc vivis. multis meis et firmis praesidiis obsessus, ne commovere te contra rem publicam possis. Multorum te etiam oculi et aures non sentientem, sicut adhuc fecerunt, speculabuntur atque custodient. Etenim quid est, Catilina, quod iam amplius expectes, si neque nox tenebris obscurare coeptus nefarios nec privata domus parietibus continere voces coniurationis tuae potest, si illustrantur, si erumpunt omnia? Muta iam istam mentem, mihi crede, obliviscere caedis atque incendiorum. Teneris undique; luce sunt clariora nobis tua consilia omnia; quae iam mecum licet recognoscas. 
Meministine me ante diem XII Kalendas Novembris dicere in senatu fore in armis certo die, qui dies futurus esset ante diem VI Kal. Novembris, Manlium, audaciae satellitem atque administrum tuae? Num me fefellit, Catilina, non modo res tanta, tam atrox tamque incredibilis, verum, id quod multo magis est admirandum, dies? Dixi ego idem in senatu caedem te optumatium contulisse in ante diem V Kalendas Novembris, tum cum multi principes civitatis Roma non tam sui conservandi quam tuorum consiliorum reprimendorum causa profugerunt. Num infitiari potes te illo ipso die meis praesidiis, mea diligentia circumclusum commovere te contra rem publicam non potuisse, cum tu discessu ceterorum nostra tamen, qui remansissemus, caede te contentum esse dicebas? Quid? cum te Praeneste Kalendis ipsis Novembribus occupaturum nocturno impetu esse confideres, sensistin illam coloniam meo iussu meis praesidiis, custodiis, vigiliis esse munitam? Nihil agis, nihil moliris, nihil cogitas, quod non ego non modo audiam, sed etiam videam planeque sentiam. Recognosce tandem mecum noctem illam superiorem; iam intelleges multo me vigilare acrius ad salutem quam te ad perniciem rei publicae. Dico te priore nocte venisse inter falcarios--non agam obscure--in Laecae domum; convenisse eodem complures eiusdem amentiae scelerisque socios. Num negare audes? quid taces? Convincam, si negas. Video enim esse hic in senatu quosdam, qui tecum una fuerunt. O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! 
Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem. Reperti sunt duo equites Romani, qui te ista cura liberarent et sese illa ipsa nocte paulo ante lucem me in meo lectulo interfecturos esse pollicerentur.'
# Word lists derived from the sample text:
#   @words        - whitespace-separated tokens, punctuation included
#   @words_plain  - tokens stripped to ASCII letters, keeping those longer than one letter
#   @words_unique - sorted, de-duplicated, lower-cased plain words
#   @letters      - the lower-case letters 'a' through 'z'
@words = @text.split(' ')
@words_plain = @words.map { |token| token.tr('^A-Za-z', '') }.reject { |word| word.length <= 1 }
@words_unique = @words_plain.map(&:downcase).uniq.sort
@letters = ('a'..'z').to_a
# Returns `len` consecutive elements of `list` beginning at index `start`,
# wrapping around to the front of the list whenever the end is reached.
# `start` is reduced modulo the list length, so any integer is accepted.
def take(list, start, len)
  size = list.length
  offset = start % size
  Array.new(len) { |i| list[(offset + i) % size] }
end
# Returns `len` consecutive elements of `list`, starting at a random index
# and delegating the actual slicing to `take`.
def take_sublist(list, len)
  take(list, rand(list.length), len)
end
# Returns up to `count` distinct elements of `list`, chosen at random and
# sorted in ascending order.
#
# Duplicates in `list` are collapsed before sampling. When `count` exceeds the
# number of distinct elements, every distinct element is returned; the
# previous retry loop never terminated in that case. A non-positive `count`
# yields an empty array.
def take_random(list, count)
  return [] if count <= 0

  list.uniq.sample(count).sort
end
# Returns `word_count` consecutive words of the sample text, joined by single
# spaces. `capitalize` upper-cases the first character and lower-cases the
# rest; trailing characters other than a-z are then removed one at a time.
def random_text(word_count)
  phrase = take_sublist(@words, word_count).join(' ').capitalize
  phrase = phrase[0...-1] while phrase =~ /[^a-z]$/
  phrase
end
# Returns random text of up to `word_count` words, cut just before the first
# sentence-breaking character (`.`, `?`, `!` or `-`).
#
# When the text contains no such character it is returned whole. The earlier
# fallback of `length - 1` dropped the final letter, and for empty text made
# `slice` return nil.
def random_sentence(word_count)
  text = random_text(word_count)
  end_index = text.index(/[.?!-]/) || text.length
  text[0, end_index]
end
# Returns one randomly chosen entry of the unique lower-case word list.
def random_word
  vocabulary = @words_unique
  random_from(vocabulary)
end
# Returns an array of `count` independently chosen random words
# (repeats are possible).
def random_words(count)
  count.times.map { random_word }
end
# Returns a random positive integer built by accumulating uniform draws.
#
# Up to `max - 1` values are drawn with `rand`. Accumulation stops at the
# first draw greater than `chance` (that draw is not added). The result is
# 1 plus the rounded sum, so for `max >= 1` it lies between 1 and `max`.
#
# A `max` of 0 or less now yields 1; previously `inject` over the empty range
# returned nil and the call failed with NoMethodError on `nil.round`.
def iterative_sum(max, chance)
  total = 0
  (1...max).each do
    draw = rand
    break if draw > chance

    total += draw
  end
  1 + total.round
end
# Random count produced by `iterative_sum(max, 0.999)`. With this threshold
# only draws above 0.999 stop the accumulation early.
def less_than(max)
  stop_threshold = 0.999
  iterative_sum(max, stop_threshold)
end
# Random count produced by `iterative_sum(max, 0.992)`. Draws above 0.992
# stop the accumulation early, so long runs are cut short more often than
# with `less_than`.
def much_less_than(max)
  stop_threshold = 0.992
  iterative_sum(max, stop_threshold)
end
# Random count produced by `iterative_sum(max, 0.973)`, the lowest of the
# three stop thresholds used by the `*less_than` helpers.
def very_much_less_than(max)
  stop_threshold = 0.973
  iterative_sum(max, stop_threshold)
end
# Returns a list of random names; the list length is drawn with
# `much_less_than(max)`.
def random_names(max)
  much_less_than(max).times.map { random_name }
end
# Returns a random name: at least two random words, each capitalized and
# joined by single spaces. The word count comes from `less_than(4)`, raised
# to 2 when it is lower.
def random_name
  word_count = [less_than(4), 2].max
  random_words(word_count).map(&:capitalize).join(' ')
end
# Returns one element of `list`, picked at a random index.
def random_from(list)
  index = rand(0...list.length)
  list[index]
end
# Returns `count` elements of `list`, each picked independently at random
# (the same element may appear more than once).
def random_list_from(list, count)
  count.times.map { random_from(list) }
end
# Pools of pre-generated random values that the per-sample generators draw from.
# Only MIME types declaring at least one file extension are kept.
@all_mime_types = MIME::Types.to_a.reject { |mt| mt.extensions.empty? }
@mime_types = random_list_from(@all_mime_types, 20)
@authors = Array.new(1000) { random_name }

# Publishers and affiliations are both capitalized random phrases.
capitalized_phrase = -> { random_words(much_less_than(10)).map(&:capitalize).join(' ') }
@publishers = Array.new(100) { capitalized_phrase.call }
@affiliations = Array.new(100) { capitalized_phrase.call }
@resource_types = Array.new(20) { random_words(less_than(3)).join(' ') }
# ------------------------------------------------------------
# Random sample fields
# Builds a DOI-shaped string "10.<registrant>/<letters><number>": the
# registrant is 20_000 plus a random amount, the letters are three consecutive
# lower-case letters, and the number is 1_000_000 + `index`.
def doi(index)
  letters = take_sublist(@letters, 3).join
  registrant_code = 20_000 + less_than(10_000)
  sequence = 1_000_000 + index
  ['10.', registrant_code, '/', letters, sequence].join
end
# Returns a random-length list of [author, affiliation] pairs drawn from the
# pre-generated author and affiliation pools.
def creators
  less_than(10).times.map do
    [random_from(@authors), random_from(@affiliations)]
  end
end
# Returns a random sentence longer than 10 characters, drawing new sentences
# until one is long enough.
def title
  candidate = random_sentence(much_less_than(100))
  candidate = random_sentence(much_less_than(100)) until candidate.length > 10
  candidate
end
# Returns a random entry from the pre-generated publisher pool.
def publisher
  pool = @publishers
  random_from(pool)
end
# Returns a random publication year from 2000 through 2014.
def pub_year
  rand(2000..2014)
end
# Returns a random selection of distinct keywords from the unique word list;
# the number of keywords is drawn with `much_less_than(20)`.
def subjects
  take_random(@words_unique, much_less_than(20))
end
# Returns a random entry from the pre-generated resource-type pool.
def resource_type
  pool = @resource_types
  random_from(pool)
end
# Returns random text terminated by a period; the word count is drawn with
# `less_than(500)`.
def abstract
  "#{random_text(less_than(500))}."
end
# Returns random text terminated by a period; the word count is drawn with
# `much_less_than(500)`.
# NOTE(review): a top-level method named `methods` has the same name as Ruby's
# built-in Object#methods — confirm nothing loaded by this script relies on it.
def methods
  "#{random_text(much_less_than(500))}."
end
# Returns random text terminated by a period; the word count is drawn with
# `less_than(500)`.
def usage
  "#{random_text(less_than(500))}."
end
# Returns an upper-cased, grant-number-like string assembled from random
# digits and the leading letters of random words.
def grant_number
  # Array elements are evaluated left to right, so the random draws happen in
  # the order they appear in the result.
  pieces = [
    rand(10), ' ', random_word[0], rand(20), ' ', random_word[0..1],
    100_000 + less_than(1_000_000), '-', rand(100), random_word[0], rand(30)
  ]
  pieces.join.upcase
end
# Maps an embargo length in months to its label: 1 => '1 month',
# 6 => '6 months', anything else => '1 year'.
def period_for(period_months)
  case period_months
  when 1 then '1 month'
  when 6 then '6 months'
  else '1 year'
  end
end
# Builds an ST::Embargo of the given type that ends on `end_date` and lasts a
# randomly chosen 1, 6 or 12 months. The start date is `end_date` minus the
# period converted to days at 365.24 days per year.
def embargo_for_type(embargo_type, end_date)
  months = [6 * rand(3), 1].max # a draw of 0 becomes a one-month period
  days = months * 365.24 / 12
  ST::Embargo.new(
    type: embargo_type,
    period: period_for(months),
    start_date: end_date - days,
    end_date: end_date
  )
end
# Builds an ST::Embargo of type NONE with period 'none', using `end_date` as
# both its start and its end date.
def no_embargo(end_date)
  ST::Embargo.new(
    type: ST::EmbargoType::NONE,
    period: 'none',
    start_date: end_date,
    end_date: end_date
  )
end
# Builds a random embargo ending on a random date in `year` (any month, day
# 1 through 28). The embargo type is picked at random from
# ST::EmbargoType.values; the NONE type gets a zero-length embargo.
def embargo(year)
  month = 1 + rand(12)
  day = 1 + rand(28)
  end_date = Date.new(year, month, day)
  embargo_type = random_from(ST::EmbargoType.values)
  return no_embargo(end_date) if embargo_type == ST::EmbargoType::NONE

  embargo_for_type(embargo_type, end_date)
end
# Returns a random-length list of ST::StashFile entries. Each file gets a
# random word as its base name, the first extension of a randomly chosen MIME
# type, and a random size in bytes.
def files
  less_than(10).times.map do
    mime_type = random_from(@mime_types)
    pathname = [random_word, mime_type.extensions[0]].join('.')
    size_bytes = less_than(100) + less_than(1000) * much_less_than(1000)
    ST::StashFile.new(pathname: pathname, size_bytes: size_bytes, mime_type: mime_type)
  end
end
# Generates one random sample record.
#
# `i` is the record index; it is passed to `doi` to build the identifier.
# The same descriptive metadata is rendered twice, as DataCite and as Dublin
# Core, and the DataCite element is also embedded in the Stash wrapper.
#
# Returns a Sample holding the wrapper XML, the DataCite element and the
# Dublin Core element.
def gen_sample(i)
  year = pub_year
  doi_value = doi(i)
  identifier = ST::Identifier.new(type: ST::IdentifierType::DOI, value: doi_value)
  license = ST::License::CC_BY
  embargo_info = embargo(year)
  version = ST::Version.new(number: 1, date: embargo_info.start_date, note: random_sentence(less_than(10)))
  inventory = ST::Inventory.new(files: files)

  # Hash values are evaluated top to bottom, so the random fields are drawn
  # in this order.
  metadata = {
    doi: doi_value,
    creators: creators,
    title: title,
    publisher: publisher,
    pubyear: year,
    subjects: subjects,
    resource_type: resource_type,
    abstract: abstract,
    methods: methods,
    usage: usage,
    grant_number: grant_number
  }

  dcs_resource = datacite(**metadata)
  wrapper = ST::StashWrapper.new(
    identifier: identifier,
    version: version,
    license: license,
    embargo: embargo_info,
    inventory: inventory,
    descriptive_elements: [dcs_resource]
  )
  wrapper_xml = wrapper.save_to_xml
  oai_dc = dublin_core(**metadata)

  Sample.new(wrapper_xml: wrapper_xml, dcs_resource: dcs_resource, oai_dc: oai_dc)
end
# ------------------------------------------------------------
# Main
# With a COUNT argument: writes COUNT samples to the current directory, three
# XML files per sample, and prints each file's absolute path.
# Without arguments: prints a single sample's three documents to stdout.
formatter = REXML::Formatters::Pretty.new
formatter.compact = true

if ARGV[0]
  count = ARGV[0].to_i
  # to_i returns 0 for non-numeric input; reject that and negative counts
  # up front instead of failing later in the padding calculation.
  abort "Usage: #{$PROGRAM_NAME} [COUNT] (COUNT must be a positive integer, got #{ARGV[0].inspect})" unless count > 0

  # Zero-pad record numbers to the width of the largest one.
  digits = count.to_s.length
  (1..count).each do |i|
    sample = gen_sample(i)
    record_num = i.to_s.rjust(digits, '0')
    outputs = {
      'stash_wrapper' => sample.wrapper_xml,
      'stash_dcs_resource' => sample.dcs_resource,
      'stash_oai_dc' => sample.oai_dc
    }
    outputs.each do |prefix, xml|
      File.open("#{prefix}-#{record_num}.xml", 'w') do |f|
        formatter.write(xml, f)
        puts File.absolute_path(f.path)
      end
    end
  end
else
  sample = gen_sample(1)
  formatter.write(sample.wrapper_xml, $stdout)
  puts
  formatter.write(sample.dcs_resource, $stdout)
  puts
  formatter.write(sample.oai_dc, $stdout)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment