Skip to content

Instantly share code, notes, and snippets.

@marrus-sh
Last active May 22, 2018 11:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marrus-sh/2ac9f5951a2fd1fe588de59d17715ae2 to your computer and use it in GitHub Desktop.
Save marrus-sh/2ac9f5951a2fd1fe588de59d17715ae2 to your computer and use it in GitHub Desktop.
BNS YAML GENERATOR
# frozen_string_literal: true
require 'yaml'
require 'rexml/document'
include REXML
# Body of a regexp character class listing the characters allowed in an
# identifier (presumably a "BNS" identifier, going by the variable name):
# ASCII digits and letters, a set of ASCII punctuation, and the Unicode
# ranges enumerated below.
#
# The heredoc is split over several lines for readability; `gsub` removes all
# whitespace so the result is one unbroken string. Backslashes are doubled so
# that a single backslash survives heredoc escape processing; the `\u{…}`,
# `\-`, `\$`, `\(` and `\)` escapes are therefore interpreted by the regexp
# engine when this string is interpolated into the patterns below.
$bns_idchars = <<~REXP.gsub(/\s+/, '')
0-9A-Za-z
!\\$&'\\(\\)*+,\\-;=?@_~
\\u{A0}-\\u{D7FF}\\u{E000}-\\u{FDCF}\\u{FDF0}-\\u{FDFD}
\\u{10000}-\\u{1FFFD}\\u{20000}-\\u{2FFFD}\\u{30000}-\\u{3FFFD}
\\u{40000}-\\u{4FFFD}\\u{50000}-\\u{5FFFD}\\u{60000}-\\u{6FFFD}
\\u{70000}-\\u{7FFFD}\\u{80000}-\\u{8FFFD}\\u{90000}-\\u{9FFFD}
\\u{A0000}-\\u{AFFFD}\\u{B0000}-\\u{BFFFD}\\u{C0000}-\\u{CFFFD}
\\u{D0000}-\\u{DFFFD}\\u{E0000}-\\u{EFFFD}\\u{F0000}-\\u{FFFFD}
\\u{100000}-\\u{10FFFD}
REXP
# Project name: a number that is either exactly three digits or four-plus
# digits with no leading zero (capture 1), one space, then one or more
# identifier characters (capture 2).
$project_rexp = /^([0-9]{3}|[1-9][0-9]{3,}) ([#{$bns_idchars}]+)$/u
# Every pattern below has the same shape: a keyword, a space, an identifier
# (capture 1) and an optional ` – “label”` suffix whose text between the
# curly quotes is capture 2. Capture 1 may always be a backtick-quoted run of
# identifier characters; the unquoted forms differ per keyword.
#
# Concept: `0` or a single capital letter A–Z.
$concept_rexp =
/^Concept (0|[A-Z]|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Volume: `0` or a Roman numeral written with the upper-case Unicode Roman
# numeral characters (Ⅰ … Ⅿ), where the final digit may be one of the
# precomposed characters Ⅱ … Ⅻ. The negative lookahead stops ⅩⅭ / ⅩⅬ from
# being followed by another ten-valued character.
$volume_rexp =
/^Volume (0|Ⅿ*(?:ⅭⅯ|ⅭⅮ|Ⅾ?Ⅽ{0,3})(?:(?:ⅩⅭ|ⅩⅬ)(?![ⅩⅪⅫ])|Ⅼ)?Ⅹ{0,2}(?:[ⅡⅢⅣⅥⅦⅧⅨⅪⅫ]|Ⅹ?(?:ⅠⅩ|ⅠⅤ|Ⅴ?Ⅰ{0,3}))|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Version: `0` or a decimal integer with no leading zero.
$version_rexp =
/^Version (0|[1-9][0-9]*|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Draft: any run of ASCII digits. Unlike Version, the run may be empty and
# may carry leading zeros.
$draft_rexp = /^Draft ([0-9]*|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Part: same grammar as Volume, but using the lower-case Unicode Roman
# numeral characters (ⅰ … ⅿ).
$part_rexp =
/^Part (0|ⅿ*(?:ⅽⅿ|ⅽⅾ|ⅾ?ⅽ{0,3})(?:(?:ⅹⅽ|ⅹⅼ)(?![ⅹⅺⅻ])|ⅼ)?ⅹ{0,2}(?:[ⅱⅲⅳⅵⅶⅷⅸⅺⅻ]|ⅹ?(?:ⅰⅹ|ⅰⅴ|ⅴ?ⅰ{0,3}))|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Converts the identifier portion of a division name into its stored form.
#
# str   - String identifier: a backtick-quoted shortname, a decimal number,
#         a capital letter, or a Roman numeral.
# level - Integer selecting how an unquoted +str+ is read: 0, 3 and 4 as a
#         decimal number, 1 as a capital letter, 2 and 5 as a Roman numeral.
#
# Returns the shortname String (backticks removed) when +str+ is
# backtick-quoted; otherwise the Integer value. Returns nil for any other
# level.
def extract_id(str, level)
  shortname = str[/^`([#{$bns_idchars}]+)`$/, 1]
  return shortname unless shortname.nil?

  case level
  when 0, 3, 4
    str.to_i
  when 1
    return 0 if str == '0'

    # Only the first codepoint is consulted: 'A' (65) → 1 … 'Z' (90) → 26.
    str.each_codepoint { |cpt| return cpt - 64 }
  when 2, 5
    return 0 if str == '0'

    bns_roman_to_i str
  end
end

# Sums a Roman numeral written with Unicode Roman numeral characters
# (upper- or lower-case); the ASCII letters V and I are accepted as well.
#
# Subtractive pairs are handled additively: the smaller numeral has already
# been added when the larger one arrives, so the larger one contributes its
# value minus twice the smaller (e.g. Ⅿ after Ⅽ adds 1000 − 200 = 800).
#
# Returns the Integer value; characters that are not numerals add nothing.
def bns_roman_to_i(str)
  total = 0
  seen_i = false
  seen_x = false
  seen_c = false
  str.each_char do |ch|
    case ch
    when 'Ⅿ', 'ⅿ'
      total += seen_c ? 800 : 1000
    when 'Ⅾ', 'ⅾ'
      total += seen_c ? 300 : 500
    when 'Ⅽ', 'ⅽ'
      total += seen_x ? 80 : 100
      seen_c = true
    when 'Ⅼ', 'ⅼ'
      total += seen_x ? 30 : 50
    when 'Ⅹ', 'ⅹ'
      total += seen_i ? 8 : 10
      seen_x = true
    when 'Ⅴ', 'ⅴ', 'V', 'v'
      total += seen_i ? 3 : 5
    when 'Ⅰ', 'ⅰ', 'I', 'i'
      total += 1
      seen_i = true
    when 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅺ', 'Ⅻ'
      # Precomposed upper-case numerals: Ⅰ is U+2160 (8544), so value = cp − 8543.
      total += ch.ord - 8543
    when 'ⅱ', 'ⅲ', 'ⅳ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅺ', 'ⅻ'
      # Precomposed lower-case numerals: ⅰ is U+2170 (8560), so value = cp − 8559.
      total += ch.ord - 8559
    end
  end
  total
end
# Pads +ida+ in place with 1s until it holds at least +level+ + 1 entries.
#
# ida   - Array of identifier components; modified in place.
# level - Integer level the array must reach.
#
# Returns +ida+ itself (left untouched when it is already long enough).
def fill(ida, level)
  shortfall = level + 1 - ida.size
  # Integer#times is a no-op for zero or negative counts.
  shortfall.times { ida.push 1 }
  ida
end
# Builds the metadata Hash for one division (concept, volume, version, draft
# or part) and, recursively, for the divisions nested inside it.
#
# path         - String path of the file or directory to inspect.
# parent_level - Integer level of the enclosing division (0 for a project);
#                only division kinds with a strictly greater level match.
# ida          - Array of the identifier components of the enclosing
#                divisions. It is not modified.
#
# Returns the Hash describing the division, or nil when the name at +path+
# (with any file extension removed) is not a division name valid at this
# depth.
def handle_div(path, parent_level, ida)
  base = File.basename path
  # Strip a file extension. The optional “…” label is skipped explicitly so
  # that a dot inside the label is not mistaken for the start of the
  # extension.
  ext = base[/\A[^“.]*(?:“.*”)?(\..+)?\z/, 1]
  name = ext.nil? ? base : base[0, base.length - ext.length]

  kinds = [
    [$concept_rexp, 1, 'bns:Concept'],
    [$volume_rexp,  2, 'bns:Volume'],
    [$version_rexp, 3, 'bns:Version'],
    [$draft_rexp,   4, 'bns:Draft'],
    [$part_rexp,    5, 'bns:Part']
  ]
  match = nil
  kind = kinds.find do |rexp, lvl, _type|
    parent_level < lvl && (match = name.match(rexp))
  end
  return if kind.nil?

  _rexp, level, type = kind
  puts "#{'>' * level} Processing #{name}…"
  identifier = extract_id match[1], level
  label = match[2]

  # Work on a copy: `fill` pads its argument in place, and the caller hands
  # the same array to every sibling division.
  ida = ida.dup
  fill ida, level

  data = {}
  if File.directory? path
    meta_file = Dir[File.join(path, '@.{yaml,yml}')].first
    begin
      data = YAML.load_file meta_file unless meta_file.nil?
    rescue StandardError => e
      # Best effort: an unreadable metadata file is reported and ignored.
      warn "Ignoring metadata in #{meta_file}: #{e.message}"
    end
    data = {} unless data.is_a? Hash
    if data['isAccessibleForFree'] && !data['text']
      text = bns_div_text path
      data['text'] = text unless text.nil?
    end
  end

  data['@type'] ||= type
  data['identifier'] ||= identifier
  data['@id'] ||= "#{ida.join ':'}:#{data['identifier']}"
  data['name'] ||= label unless label.nil?
  if level < 5
    subida = ida + [data['identifier']]
    divs = Dir[File.join(path, '*')].map { |div| handle_div div, level, subida }
    divs.compact!
  end
  data['hasPart'] ||= divs unless divs.nil? || divs.empty?
  data
end

# Reads the text of the division stored in directory +path+ from the first
# of text.txt, text.xml or text.xhtml that exists.
#
# Returns the markup String, or nil when none of the files exists or the
# chosen XML file cannot be parsed.
def bns_div_text(path)
  xhtml_ns = 'http://www.w3.org/1999/xhtml'
  txt, xml, xhtml = %w[text.txt text.xml text.xhtml].map { |f| File.join path, f }
  if File.exist? txt
    # NOTE(review): the file content is embedded verbatim; `<` or `&` in
    # text.txt will produce malformed markup. Confirm whether it should be
    # XML-escaped.
    "<pre xmlns=\"#{xhtml_ns}\">#{IO.read txt}</pre>"
  elsif File.exist? xml
    begin
      File.open(xml) { |file| REXML::Document.new(file).root.to_s }
    rescue StandardError => e
      warn "Ignoring #{xml}: #{e.message}"
      nil
    end
  elsif File.exist? xhtml
    begin
      File.open(xhtml) do |file|
        document = REXML::Document.new file
        # REXML looks namespace prefixes up by String key.
        REXML::XPath.first(document, '//html:body', { 'html' => xhtml_ns }).to_s
      end
    rescue StandardError => e
      warn "Ignoring #{xhtml}: #{e.message}"
      nil
    end
  end
end
# Builds the metadata Hash for one project directory and the divisions
# inside it.
#
# path   - String path of the candidate project directory.
# prefix - String placed before the project name in the generated `@id`.
#
# Returns the project Hash, or nil when +path+ is not a directory or its
# name does not match $project_rexp.
def handle_prj(path, prefix)
  return unless File.directory? path

  name = File.basename path
  match = name.match $project_rexp
  return if match.nil?

  puts "Processing #{name}…"
  # Each child gets its own identifier array so that one child padding the
  # array in place cannot affect the `@id` computed for its siblings.
  divs = Dir[File.join(path, '*')].map do |div|
    handle_div div, 0, [prefix, match[2]]
  end
  divs.compact!

  data = {}
  meta_file = Dir[File.join(path, '@.{yaml,yml}')].first
  begin
    data = YAML.load_file meta_file unless meta_file.nil?
  rescue StandardError => e
    # Best effort: an unreadable metadata file is reported and ignored.
    warn "Ignoring metadata in #{meta_file}: #{e.message}"
  end
  # An empty or non-mapping YAML document does not load as a Hash.
  data = {} unless data.is_a? Hash

  data['@type'] ||= 'bns:Project'
  data['@id'] ||= "#{prefix}:#{match[2]}"
  data['identifier'] ||= [match[1].to_i, match[2]].compact
  data['hasPart'] ||= divs unless divs.empty?
  data
end
# Builds the metadata Hash for the whole corpus rooted at +root+.
#
# root   - String path of the corpus directory.
# prefix - String passed to handle_prj for every entry of +root+.
#
# Returns the corpus Hash, or nil when +root+ is not a directory.
def handle_root(root, prefix)
  return unless File.directory? root

  puts "Starting…"
  # Non-bang `compact`: `compact!` returns nil when there is nothing to
  # remove, which made the former `prjs.compact!.empty?` raise.
  prjs = Dir[File.join(root, '*')].map { |prj| handle_prj prj, prefix }.compact

  data = {}
  meta_file = Dir[File.join(root, '@.{yaml,yml}')].first
  begin
    data = YAML.load_file meta_file unless meta_file.nil?
  rescue StandardError => e
    # Best effort: an unreadable metadata file is reported and ignored.
    warn "Ignoring metadata in #{meta_file}: #{e.message}"
  end
  # An empty or non-mapping YAML document does not load as a Hash.
  data = {} unless data.is_a? Hash

  data['@type'] ||= 'bns:Corpus'
  data['hasPart'] ||= prjs unless prjs.empty?
  data
end
# Script entry point: rotate any existing output to a `~` backup, then write
# the freshly generated corpus description to bns.yml.
#
# ARGV[0] is the corpus root (default '.'), ARGV[1] the id prefix (default
# 'corpus').
puts "Setting up files…"
output_path = 'bns.yml'
backup_path = 'bns.yml~'
File.delete backup_path if File.exist? backup_path
File.rename output_path, backup_path if File.exist? output_path
# The output file is created before the corpus is scanned, as before.
File.open output_path, 'w+' do |out|
  corpus = handle_root(ARGV[0] || '.', ARGV[1] || 'corpus')
  YAML.dump(corpus || {}, out, line_width: 71)
end
puts "…Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment