Last active
May 22, 2018 11:15
-
-
Save marrus-sh/2ac9f5951a2fd1fe588de59d17715ae2 to your computer and use it in GitHub Desktop.
BNS YAML GENERATOR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

require 'rexml/document'
require 'yaml'

include REXML
# Body of a regexp character class listing every character allowed in a
# backtick-quoted shortname. The heredoc is split over several lines for
# readability only: all whitespace is stripped before use, and the doubled
# backslashes leave single-backslash escapes for the regexp engine.
$bns_idchars = <<~REXP.gsub(/\s+/, '')
  0-9A-Za-z
  !\\$&'\\(\\)*+,\\-;=?@_~
  \\u{A0}-\\u{D7FF}\\u{E000}-\\u{FDCF}\\u{FDF0}-\\u{FDFD}
  \\u{10000}-\\u{1FFFD}\\u{20000}-\\u{2FFFD}\\u{30000}-\\u{3FFFD}
  \\u{40000}-\\u{4FFFD}\\u{50000}-\\u{5FFFD}\\u{60000}-\\u{6FFFD}
  \\u{70000}-\\u{7FFFD}\\u{80000}-\\u{8FFFD}\\u{90000}-\\u{9FFFD}
  \\u{A0000}-\\u{AFFFD}\\u{B0000}-\\u{BFFFD}\\u{C0000}-\\u{CFFFD}
  \\u{D0000}-\\u{DFFFD}\\u{E0000}-\\u{EFFFD}\\u{F0000}-\\u{FFFFD}
  \\u{100000}-\\u{10FFFD}
REXP

# Project directory name: a number of three digits (or four or more digits
# with no leading zero), a space, then a shortname.
# Capture 1 is the number, capture 2 the shortname.
$project_rexp = /^([0-9]{3}|[1-9][0-9]{3,}) ([#{$bns_idchars}]+)$/u

# The division patterns below share one shape: a keyword, a space, an
# identifier (capture 1) and an optional ` – “label”` suffix (capture 2).
# Every identifier may also be a backtick-quoted shortname.

# Concept: "0", one letter A-Z, or a shortname.
$concept_rexp =
  /^Concept (0|[A-Z]|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u

# Volume: "0", an upper-case Roman numeral written with the Unicode Roman
# numeral characters (precomposed forms such as Ⅻ are accepted in the units
# position), or a shortname.
$volume_rexp =
  /^Volume (0|Ⅿ*(?:ⅭⅯ|ⅭⅮ|Ⅾ?Ⅽ{0,3})(?:(?:ⅩⅭ|ⅩⅬ)(?![ⅩⅪⅫ])|Ⅼ)?Ⅹ{0,2}(?:[ⅡⅢⅣⅥⅦⅧⅨⅪⅫ]|Ⅹ?(?:ⅠⅩ|ⅠⅤ|Ⅴ?Ⅰ{0,3}))|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u

# Version: "0", a decimal number without leading zero, or a shortname.
$version_rexp =
  /^Version (0|[1-9][0-9]*|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u

# Draft: any run of decimal digits (possibly empty), or a shortname.
$draft_rexp = /^Draft ([0-9]*|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u

# Part: same grammar as Volume, using the lower-case Roman numeral characters.
$part_rexp =
  /^Part (0|ⅿ*(?:ⅽⅿ|ⅽⅾ|ⅾ?ⅽ{0,3})(?:(?:ⅹⅽ|ⅹⅼ)(?![ⅹⅺⅻ])|ⅼ)?ⅹ{0,2}(?:[ⅱⅲⅳⅵⅶⅷⅸⅺⅻ]|ⅹ?(?:ⅰⅹ|ⅰⅴ|ⅴ?ⅰ{0,3}))|`[#{$bns_idchars}]+`)(?: – “(.*)”)?$/u
# Values of the sixteen characters in each Roman-numeral run of the Unicode
# Number Forms block, in code point order: Ⅰ..Ⅻ (1..12), then Ⅼ, Ⅽ, Ⅾ, Ⅿ.
# The lower-case run (U+2170..U+217F) follows the same order.
ROMAN_DIGIT_VALUES =
  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 50, 100, 500, 1000].freeze

# Returns the value of a single Roman-numeral character, or 0 for any other
# character. ASCII V/v and I/i are still accepted because the previous
# implementation recognised them.
def roman_digit_value(char)
  return 5 if %w[V v].include? char
  return 1 if %w[I i].include? char

  offset = char.ord - 0x2160
  return 0 unless (0..31).cover? offset

  ROMAN_DIGIT_VALUES[offset % 16]
end

# Sums a Roman numeral. Ⅰ, Ⅹ and Ⅽ (either case) are subtracted when they
# directly precede a digit worth five or ten times as much; every other
# character, including the precomposed forms Ⅱ..Ⅻ, is added.
def roman_to_i(numeral)
  values = numeral.each_char.map { |char| roman_digit_value char }
  signed = values.each_with_index.map do |value, index|
    following = values[index + 1]
    subtractive = [1, 10, 100].include?(value) &&
                  [value * 5, value * 10].include?(following)
    subtractive ? -value : value
  end
  signed.sum
end

# Converts the identifier part of a division name into the value stored under
# the `identifier` key.
#
# A backtick-quoted shortname is returned as a String without its backticks.
# Anything else is interpreted according to `level`:
#
#   0, 3, 4  decimal digits, converted with String#to_i
#   1        a letter, numbered from its code point so that "A" is 1
#   2, 5     a Roman numeral made of Unicode Roman numeral characters
#
# A literal "0" yields 0 at levels 1, 2 and 5 as well.
#
# @param str [String] identifier text (capture 1 of a `$*_rexp` pattern)
# @param level [Integer] numbering scheme selector, see above
# @return [String, Integer, nil] nil when `level` is none of 0..5
def extract_id(str, level)
  shortname = str[/^`([#{$bns_idchars}]+)`$/, 1]
  return shortname unless shortname.nil?

  case level
  when 0, 3, 4
    str.to_i
  when 1
    return 0 if str == '0'
    # An empty string is handed back unchanged, as before.
    return str if str.empty?

    str.ord - 64
  when 2, 5
    return 0 if str == '0'

    roman_to_i str
  end
end
# Pads an identifier path with 1s, in place, until it holds `level + 1`
# entries. A path that is already long enough is left untouched.
#
# @param ida [Array] identifier path; modified in place when it is too short
# @param level [Integer] level of the division the path is being built for
# @return [Array] the same array object that was passed in
def fill(ida, level)
  missing = level + 1 - ida.size
  missing.positive? ? ida.fill(1, ida.size, missing) : ida
end
# Division kinds in nesting order, as [level, name pattern, `@type` value].
# Built per call because the patterns are read from globals.
def div_kinds
  [
    [1, $concept_rexp, 'bns:Concept'],
    [2, $volume_rexp, 'bns:Volume'],
    [3, $version_rexp, 'bns:Version'],
    [4, $draft_rexp, 'bns:Draft'],
    [5, $part_rexp, 'bns:Part']
  ]
end

# Returns the entry's base name with any file extension removed. The pattern
# is anchored at the end of the name, and the optional “…” group swallows a
# trailing label first so that a dot inside the label does not start an
# extension.
def div_name(path)
  base = File.basename path
  extension = base[/(?:“.*”)?(\..+)?\z/, 1]
  extension ? base.chomp(extension) : base
end

# Loads the `@.yaml` / `@.yml` file of a division directory. Returns an empty
# Hash when there is no such file, when it does not contain a mapping, or
# (after a warning) when it cannot be read or parsed.
def load_div_meta(path)
  meta = Dir[File.join(path, '@.{yaml,yml}')].first
  return {} if meta.nil?

  data = YAML.load_file meta
  data.is_a?(Hash) ? data : {}
rescue StandardError => e
  warn "Ignoring metadata for #{path}: #{e.message}"
  {}
end

# Opens and parses an XML file, yields the REXML document and closes the file.
# Returns the block's value, or nil (after a warning) when the file cannot be
# read or parsed; the division is then emitted without text.
def with_div_document(file_path)
  File.open(file_path) { |file| yield REXML::Document.new(file) }
rescue StandardError => e
  warn "Skipping #{file_path}: #{e.message}"
  nil
end

# Builds the `text` value for a division directory from the first of
# `text.txt`, `text.xml` or `text.xhtml` that exists. Returns nil when none
# exists or nothing usable could be extracted.
def div_text(path)
  plain = File.join path, 'text.txt'
  xml = File.join path, 'text.xml'
  xhtml = File.join path, 'text.xhtml'
  if File.exist? plain
    # Escape markup characters so arbitrary text stays well-formed in <pre>.
    escaped = REXML::Text.normalize(File.read(plain))
    "<pre xmlns=\"http://www.w3.org/1999/xhtml\">#{escaped}</pre>"
  elsif File.exist? xml
    with_div_document(xml) { |document| document.root.to_s }
  elsif File.exist? xhtml
    with_div_document xhtml do |document|
      # The prefix map needs a String key to resolve the `html:` prefix.
      body = REXML::XPath.first document, '//html:body',
                                'html' => 'http://www.w3.org/1999/xhtml'
      body&.to_s
    end
  end
end

# Builds the metadata Hash for one division (concept, volume, version, draft
# or part) and, recursively, for the divisions nested inside it.
#
# The entry is classified by its name: the first kind whose pattern matches
# and whose level is deeper than `parent_level` wins. Values already present
# in the directory's `@.yaml` / `@.yml` file take precedence over the derived
# `@type`, `identifier`, `@id`, `name` and `hasPart`. When the metadata sets
# `isAccessibleForFree` and has no `text`, the text is read from a `text.*`
# file in the directory.
#
# @param path [String] file or directory to inspect
# @param parent_level [Integer] level of the enclosing division (0 = project)
# @param ida [Array] identifier path of the enclosing division; not modified
# @return [Hash, nil] the division's metadata, or nil when the name is not a
#   recognised division at this depth
def handle_div(path, parent_level, ida)
  name = div_name path
  level, pattern, type = div_kinds.find do |kind_level, kind_pattern, _type|
    parent_level < kind_level && name.match?(kind_pattern)
  end
  return if level.nil?

  match = name.match pattern
  puts "#{'>' * level} Processing #{name}…"
  identifier = extract_id match[1], level
  label = match[2]

  # Pad a private copy: the caller hands the same array to every sibling, so
  # padding it in place would leak this entry's skipped levels into the
  # identifiers of the entries processed after it.
  ida = fill ida.dup, level

  if File.directory? path
    data = load_div_meta path
    if data['isAccessibleForFree'] && !data['text']
      text = div_text path
      data['text'] = text unless text.nil?
    end
  else
    data = {}
  end

  data['@type'] ||= type
  data['identifier'] ||= identifier
  data['@id'] ||= "#{ida.join ':'}:#{data['identifier']}"
  data['name'] ||= label unless label.nil?

  if level < 5
    subida = ida + [data['identifier']]
    children = Dir[File.join(path, '*')]
    divs = children.map { |div| handle_div div, level, subida }.compact
    data['hasPart'] ||= divs unless divs.empty?
  end
  data
end
# Loads the `@.yaml` / `@.yml` file of a project directory. Returns an empty
# Hash when there is no such file, when it does not contain a mapping, or
# (after a warning) when it cannot be read or parsed.
def load_prj_meta(path)
  meta = Dir[File.join(path, '@.{yaml,yml}')].first
  return {} if meta.nil?

  data = YAML.load_file meta
  data.is_a?(Hash) ? data : {}
rescue StandardError => e
  warn "Ignoring metadata for #{path}: #{e.message}"
  {}
end

# Builds the metadata Hash for one project directory.
#
# The directory name must match `$project_rexp` ("<number> <shortname>").
# Every entry inside the directory is offered to `handle_div`; the non-nil
# results become `hasPart`. Values already present in the project's
# `@.yaml` / `@.yml` file take precedence over the derived ones.
#
# @param path [String] candidate project directory
# @param prefix [String] first segment of every generated `@id`
# @return [Hash, nil] nil when `path` is not a directory or its name is not a
#   project name
def handle_prj(path, prefix)
  return unless File.directory? path

  name = File.basename path
  match = name.match $project_rexp
  return if match.nil?

  puts "Processing #{name}…"
  number, shortname = match.captures
  # Each child gets its own path array, so in-place padding done while
  # processing one child cannot affect the identifiers of its siblings.
  children = Dir[File.join(path, '*')]
  divs = children.map { |div| handle_div div, 0, [prefix, shortname] }.compact

  data = load_prj_meta path
  data['@type'] ||= 'bns:Project'
  data['@id'] ||= "#{prefix}:#{shortname}"
  data['identifier'] ||= [number.to_i, shortname]
  data['hasPart'] ||= divs unless divs.empty?
  data
end
# Loads the `@.yaml` / `@.yml` file of the corpus root. Returns an empty Hash
# when there is no such file, when it does not contain a mapping, or (after a
# warning) when it cannot be read or parsed.
def load_root_meta(root)
  meta = Dir[File.join(root, '@.{yaml,yml}')].first
  return {} if meta.nil?

  data = YAML.load_file meta
  data.is_a?(Hash) ? data : {}
rescue StandardError => e
  warn "Ignoring metadata for #{root}: #{e.message}"
  {}
end

# Builds the metadata Hash for the whole corpus.
#
# Every entry directly inside `root` is offered to `handle_prj`; the non-nil
# results become `hasPart`. Values already present in the root's
# `@.yaml` / `@.yml` file take precedence over the derived ones.
#
# @param root [String] corpus directory
# @param prefix [String] first segment of every generated `@id`
# @return [Hash, nil] nil when `root` is not a directory
def handle_root(root, prefix)
  return unless File.directory? root

  puts 'Starting…'
  entries = Dir[File.join(root, '*')]
  # `compact` rather than `compact!`: the bang form returns nil when there is
  # nothing to remove, which cannot be chained.
  prjs = entries.map { |prj| handle_prj prj, prefix }.compact

  data = load_root_meta root
  data['@type'] ||= 'bns:Corpus'
  data['hasPart'] ||= prjs unless prjs.empty?
  data
end
# Script entry point: keeps the previous output as `bns.yml~`, then scans the
# corpus and writes the result to `bns.yml` in the current directory.
#   ARGV[0] - corpus root directory (defaults to '.')
#   ARGV[1] - prefix used for generated `@id` values (defaults to 'corpus')
puts 'Setting up files…'
File.delete 'bns.yml~' if File.exist? 'bns.yml~'
File.rename 'bns.yml', 'bns.yml~' if File.exist? 'bns.yml'
# The block form closes the output file even when the scan raises.
File.open 'bns.yml', 'w+' do |file|
  bns = handle_root(ARGV[0] || '.', ARGV[1] || 'corpus')
  # An empty mapping is written when the root is not a directory.
  YAML.dump(bns || {}, file, line_width: 71)
end
puts '…Done.'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment