Snarp/superscripts_subscripts_normalized_via_nfkc.yml

## superscripts_subscripts_normalized_via_nfkc.yml
---
:subscripts:
  "₀": '0'
  "₁": '1'
  "₂": '2'
  "₃": '3'
  "₄": '4'
  "₅": '5'
  "₆": '6'
  "₇": '7'
  "₈": '8'
  "₉": '9'
  "₍": "("
  "₎": ")"
  "₊": "+"
  "₌": "="
  "₋": "−"
  ₐ: a
  ₑ: e
  ₕ: h
  ᵢ: i
  ⱼ: j
  ₖ: k
  ₗ: l
  ₘ: m
  ₙ: n
  ₒ: o
  ₚ: p
  ᵣ: r
  ₛ: s
  ₜ: t
  ᵤ: u
  ᵥ: v
  ₓ: x
  ₔ: ə
  ᵦ: β
  ᵧ: γ
  ᵨ: ρ
  ᵩ: φ
  ᵪ: χ
:superscripts:
  "⁰": '0'
  "¹": '1'
  "²": '2'
  "³": '3'
  "⁴": '4'
  "⁵": '5'
  "⁶": '6'
  "⁷": '7'
  "⁸": '8'
  "⁹": '9'
  "⁽": "("
  "⁾": ")"
  "⁺": "+"
  "⁼": "="
  ᴬ: A
  ᴮ: B
  ᴰ: D
  ᴱ: E
  ᴳ: G
  ᴴ: H
  ᴵ: I
  ᴶ: J
  ᴷ: K
  ᴸ: L
  ᴹ: M
  ᴺ: N
  ᴼ: O
  ᴾ: P
  ᴿ: R
  ᵀ: T
  ᵁ: U
  ⱽ: V
  ᵂ: W
  ª: a
  ᵃ: a
  ᵇ: b
  ᶜ: c
  ᵈ: d
  ᵉ: e
  ᶠ: f
  ᵍ: g
  ʰ: h
  ⁱ: i
  ʲ: j
  ᵏ: k
  ˡ: l
  ᵐ: m
  ⁿ: n
  º: o
  ᵒ: o
  ᵖ: p
  ʳ: r
  ˢ: s
  ᵗ: t
  ᵘ: u
  ᵛ: v
  ʷ: w
  ˣ: x
  ʸ: y
  ᶻ: z
  ᴭ: Æ
  ᶞ: ð
  ꟸ: Ħ
  ᵑ: ŋ
  ꟹ: œ
  ᴲ: Ǝ
  ᶵ: ƫ
  ᴽ: Ȣ
  ᵄ: ɐ
  ᵅ: ɑ
  ᶛ: ɒ
  ᵓ: ɔ
  ᶝ: ɕ
  ᵊ: ə
  ᵋ: ɛ
  ᵌ: ɜ
  ᶟ: ɜ
  ᶡ: ɟ
  ᶢ: ɡ
  ˠ: ɣ
  ᶣ: ɥ
  ʱ: ɦ
  ᶤ: ɨ
  ᶥ: ɩ
  ᶦ: ɪ
  ꭞ: ɫ
  ᶩ: ɭ
  ᵚ: ɯ
  ᶭ: ɰ
  ᶬ: ɱ
  ᶮ: ɲ
  ᶯ: ɳ
  ᶰ: ɴ
  ᶱ: ɵ
  ᶲ: ɸ
  ʴ: ɹ
  ʵ: ɻ
  ʶ: ʁ
  ᶳ: ʂ
  ᶴ: ʃ
  ᶶ: ʉ
  ᶷ: ʊ
  ᶹ: ʋ
  ᶺ: ʌ
  ᶼ: ʐ
  ᶽ: ʑ
  ᶾ: ʒ
  ˤ: ʕ
  ᶨ: ʝ
  ᶫ: ʟ
  ᵝ: β
  ᵞ: γ
  ᵟ: δ
  ᶿ: θ
  ᵠ: φ
  ᵡ: χ
  ᵸ: н
  ꚜ: ъ
  ꚝ: ь
  ჼ: ნ
  ᵆ: ᴂ
  ᵔ: ᴖ
  ᵕ: ᴗ
  ᶸ: ᴜ
  ᵙ: ᴝ
  ᵜ: ᴥ
  ᶧ: ᵻ
  ᶪ: ᶅ
  "⁻": "−"
  ⵯ: ⵡ
  "㆒": 一
  "㆜": 丁
  "㆔": 三
  "㆖": 上
  "㆘": 下
  "㆛": 丙
  "㆗": 中
  "㆚": 乙
  "㆓": 二
  "㆟": 人
  "㆕": 四
  "㆞": 地
  "㆝": 天
  "㆙": 甲
  ꭜ: ꜧ
  ꝰ: ꝯ
  ꭝ: ꬷ
  ꭟ: ꭒ

## superscripts_subscripts_normalized_via_nfkc_as_ruby_regex.yml
--- # As Ruby regular expressions:
:subscripts: !ruby/regexp /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓₔᵦᵧᵨᵩᵪ]+/
:superscripts: !ruby/regexp /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻᴭᶞꟸᵑꟹᴲᶵᴽᵄᵅᶛᵓᶝᵊᵋᵌᶟᶡᶢˠᶣʱᶤᶥᶦꭞᶩᵚᶭᶬᶮᶯᶰᶱᶲʴʵʶᶳᶴᶶᶷᶹᶺᶼᶽᶾˤᶨᶫᵝᵞᵟᶿᵠᵡᵸꚜꚝჼᵆᵔᵕᶸᵙᵜᶧᶪ⁻ⵯ㆒㆜㆔㆖㆘㆛㆗㆚㆓㆟㆕㆞㆝㆙ꭜꝰꭝꭟ]+/

## unicode_sub_and_superscripts_to_sub_and_sup_tags.rb
# Method for processing HTML to replace Unicode subscript and superscript
# characters with normalized characters wrapped in `<sub>` and `<sup>` tags.
# Uses regular expressions to identify sequences of sub- and superscripts.
# Examples:
#
#     "1ˢᵗ 2ⁿᵈ 3ʳᵈ" => "1<sup>st</sup> 2<sup>nd</sup> 3<sup>rd</sup>"
#     "PO₄³⁻ ion" => "PO<sub>4</sub><sup>3−</sup> ion"

# Patterns matching runs of one or more Unicode subscript (:sub) or
# superscript (:sup) characters. `process_html` iterates this hash and uses
# each key as the HTML tag name, so the keys must stay valid tag names.
# Frozen so the shared constant cannot be modified at runtime.
REGEXPS = {
  # Less-thorough regular expressions:
  sub: /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ]+/,
  sup: /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼⁻ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻ]+/,

  # # More-thorough versions:
  # sub: /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓₔᵦᵧᵨᵩᵪ]+/,
  # sup: /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻᴭᶞꟸᵑꟹᴲᶵᴽᵄᵅᶛᵓᶝᵊᵋᵌᶟᶡᶢˠᶣʱᶤᶥᶦꭞᶩᵚᶭᶬᶮᶯᶰᶱᶲʴʵʶᶳᶴᶶᶷᶹᶺᶼᶽᶾˤᶨᶫᵝᵞᵟᶿᵠᵡᵸꚜꚝჼᵆᵔᵕᶸᵙᵜᶧᶪ⁻ⵯ㆒㆜㆔㆖㆘㆛㆗㆚㆓㆟㆕㆞㆝㆙ꭜꝰꭝꭟ]+/,
}.freeze

# "Dihydrogen monoxide, or H₂O, can be very dangerous."
# => "Dihydrogen monoxide, or H<sub>2</sub>O, can be very dangerous."
# Wraps each run of characters matched by REGEXPS[:sub] in a `<sub>` element,
# with the run replaced by its NFKC-normalized form. Returns the new string.
def subscripts_to_sub_tags(html)
  pattern = REGEXPS[:sub] # /[₀₁₂₃₄₅ ...
  capture_normalize_and_wrap(html, pattern, 'sub')
end

# "He wore a white <i>gat</i>¹."
# => "He wore a white <i>gat</i><sup>1</sup>."
# Wraps each run of characters matched by REGEXPS[:sup] in a `<sup>` element,
# with the run replaced by its NFKC-normalized form. Returns the new string.
def superscripts_to_sup_tags(html)
  pattern = REGEXPS[:sup] # /[⁰¹²³⁴⁵ ...
  capture_normalize_and_wrap(html, pattern, 'sup')
end

# Replaces every match of `rx` in `html` with its NFKC-normalized text wrapped
# in `<tag>…</tag>` (e.g. "H₂O" with tag 'sub' becomes "H<sub>2</sub>O").
#
# html - String to process; a new string is returned, `html` is not mutated.
# rx   - Regexp identifying the runs of characters to normalize and wrap.
# tag  - Tag name (String or Symbol) interpolated into the generated markup.
# log  - When true (the default), each generated element is printed via `puts`.
#
# Returns the converted String.
def capture_normalize_and_wrap(html, rx, tag, log: true)
  # A single `gsub` pass avoids rebuilding the accumulated output on every
  # match and, unlike a manual match/post_match loop, always terminates even
  # when `rx` is able to match the empty string.
  html.gsub(rx) do |match|
    next match if match.empty? # nothing to wrap; leave the text untouched

    wrapped = "<#{tag}>#{match.unicode_normalize(:nfkc)}</#{tag}>"
    puts wrapped if log
    wrapped
  end
end

# "PO₄³⁻" => "PO<sub>4</sub><sup>3−</sup>"
# Applies every pattern in REGEXPS to `html` in hash order, passing each key
# as the tag name, and returns the resulting markup.
def process_html(html="PO₄³⁻")
  REGEXPS.reduce(html) do |markup, (tag, rx)|
    capture_normalize_and_wrap(markup, rx, tag)
  end
end

# Reads `in_fname`, converts its contents with `process_html`, writes the
# result to `out_fname`, and returns the converted markup.
def process_html_file(in_fname='sample_in.html', out_fname='sample_out.html')
  source = File.read(in_fname)
  converted = process_html(source)
  File.write(out_fname, converted)
  converted
end
	---
	:subscripts:
	"₀": '0'
	"₁": '1'
	"₂": '2'
	"₃": '3'
	"₄": '4'
	"₅": '5'
	"₆": '6'
	"₇": '7'
	"₈": '8'
	"₉": '9'
	"₍": "("
	"₎": ")"
	"₊": "+"
	"₌": "="
	"₋": "−"
	ₐ: a
	ₑ: e
	ₕ: h
	ᵢ: i
	ⱼ: j
	ₖ: k
	ₗ: l
	ₘ: m
	ₙ: n
	ₒ: o
	ₚ: p
	ᵣ: r
	ₛ: s
	ₜ: t
	ᵤ: u
	ᵥ: v
	ₓ: x
	ₔ: ə
	ᵦ: β
	ᵧ: γ
	ᵨ: ρ
	ᵩ: φ
	ᵪ: χ
	:superscripts:
	"⁰": '0'
	"¹": '1'
	"²": '2'
	"³": '3'
	"⁴": '4'
	"⁵": '5'
	"⁶": '6'
	"⁷": '7'
	"⁸": '8'
	"⁹": '9'
	"⁽": "("
	"⁾": ")"
	"⁺": "+"
	"⁼": "="
	ᴬ: A
	ᴮ: B
	ᴰ: D
	ᴱ: E
	ᴳ: G
	ᴴ: H
	ᴵ: I
	ᴶ: J
	ᴷ: K
	ᴸ: L
	ᴹ: M
	ᴺ: N
	ᴼ: O
	ᴾ: P
	ᴿ: R
	ᵀ: T
	ᵁ: U
	ⱽ: V
	ᵂ: W
	ª: a
	ᵃ: a
	ᵇ: b
	ᶜ: c
	ᵈ: d
	ᵉ: e
	ᶠ: f
	ᵍ: g
	ʰ: h
	ⁱ: i
	ʲ: j
	ᵏ: k
	ˡ: l
	ᵐ: m
	ⁿ: n
	º: o
	ᵒ: o
	ᵖ: p
	ʳ: r
	ˢ: s
	ᵗ: t
	ᵘ: u
	ᵛ: v
	ʷ: w
	ˣ: x
	ʸ: y
	ᶻ: z
	ᴭ: Æ
	ᶞ: ð
	ꟸ: Ħ
	ᵑ: ŋ
	ꟹ: œ
	ᴲ: Ǝ
	ᶵ: ƫ
	ᴽ: Ȣ
	ᵄ: ɐ
	ᵅ: ɑ
	ᶛ: ɒ
	ᵓ: ɔ
	ᶝ: ɕ
	ᵊ: ə
	ᵋ: ɛ
	ᵌ: ɜ
	ᶟ: ɜ
	ᶡ: ɟ
	ᶢ: ɡ
	ˠ: ɣ
	ᶣ: ɥ
	ʱ: ɦ
	ᶤ: ɨ
	ᶥ: ɩ
	ᶦ: ɪ
	ꭞ: ɫ
	ᶩ: ɭ
	ᵚ: ɯ
	ᶭ: ɰ
	ᶬ: ɱ
	ᶮ: ɲ
	ᶯ: ɳ
	ᶰ: ɴ
	ᶱ: ɵ
	ᶲ: ɸ
	ʴ: ɹ
	ʵ: ɻ
	ʶ: ʁ
	ᶳ: ʂ
	ᶴ: ʃ
	ᶶ: ʉ
	ᶷ: ʊ
	ᶹ: ʋ
	ᶺ: ʌ
	ᶼ: ʐ
	ᶽ: ʑ
	ᶾ: ʒ
	ˤ: ʕ
	ᶨ: ʝ
	ᶫ: ʟ
	ᵝ: β
	ᵞ: γ
	ᵟ: δ
	ᶿ: θ
	ᵠ: φ
	ᵡ: χ
	ᵸ: н
	ꚜ: ъ
	ꚝ: ь
	ჼ: ნ
	ᵆ: ᴂ
	ᵔ: ᴖ
	ᵕ: ᴗ
	ᶸ: ᴜ
	ᵙ: ᴝ
	ᵜ: ᴥ
	ᶧ: ᵻ
	ᶪ: ᶅ
	"⁻": "−"
	ⵯ: ⵡ
	"㆒": 一
	"㆜": 丁
	"㆔": 三
	"㆖": 上
	"㆘": 下
	"㆛": 丙
	"㆗": 中
	"㆚": 乙
	"㆓": 二
	"㆟": 人
	"㆕": 四
	"㆞": 地
	"㆝": 天
	"㆙": 甲
	ꭜ: ꜧ
	ꝰ: ꝯ
	ꭝ: ꬷ
	ꭟ: ꭒ
	--- # As Ruby regular expressions:
	:subscripts: !ruby/regexp /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓₔᵦᵧᵨᵩᵪ]+/
	:superscripts: !ruby/regexp /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻᴭᶞꟸᵑꟹᴲᶵᴽᵄᵅᶛᵓᶝᵊᵋᵌᶟᶡᶢˠᶣʱᶤᶥᶦꭞᶩᵚᶭᶬᶮᶯᶰᶱᶲʴʵʶᶳᶴᶶᶷᶹᶺᶼᶽᶾˤᶨᶫᵝᵞᵟᶿᵠᵡᵸꚜꚝჼᵆᵔᵕᶸᵙᵜᶧᶪ⁻ⵯ㆒㆜㆔㆖㆘㆛㆗㆚㆓㆟㆕㆞㆝㆙ꭜꝰꭝꭟ]+/
	# Method for processing HTML to replace Unicode subscript and superscript
	# characters with normalized characters wrapped in `<sub>` and `<sup>` tags.
	# Uses regular expressions to identify sequences of sub- and superscripts.
	# Examples:
	#
	# "1ˢᵗ 2ⁿᵈ 3ʳᵈ" => "1<sup>st</sup> 2<sup>nd</sup> 3<sup>rd</sup>"
	# "PO₄³⁻ ion" => "PO<sub>4</sub><sup>3−</sup> ion"

	# Patterns matching runs of one or more Unicode subscript (:sub) or
	# superscript (:sup) characters. `process_html` iterates this hash and uses
	# each key as the HTML tag name, so the keys must stay valid tag names.
	# Frozen so the shared constant cannot be modified at runtime.
	REGEXPS = {
	  # Less-thorough regular expressions:
	  sub: /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ]+/,
	  sup: /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼⁻ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻ]+/,

	  # # More-thorough versions:
	  # sub: /[₀₁₂₃₄₅₆₇₈₉₍₎₊₌₋ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓₔᵦᵧᵨᵩᵪ]+/,
	  # sup: /[⁰¹²³⁴⁵⁶⁷⁸⁹⁽⁾⁺⁼ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂªᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿºᵒᵖʳˢᵗᵘᵛʷˣʸᶻᴭᶞꟸᵑꟹᴲᶵᴽᵄᵅᶛᵓᶝᵊᵋᵌᶟᶡᶢˠᶣʱᶤᶥᶦꭞᶩᵚᶭᶬᶮᶯᶰᶱᶲʴʵʶᶳᶴᶶᶷᶹᶺᶼᶽᶾˤᶨᶫᵝᵞᵟᶿᵠᵡᵸꚜꚝჼᵆᵔᵕᶸᵙᵜᶧᶪ⁻ⵯ㆒㆜㆔㆖㆘㆛㆗㆚㆓㆟㆕㆞㆝㆙ꭜꝰꭝꭟ]+/,
	}.freeze

	# "Dihydrogen monoxide, or H₂O, can be very dangerous."
	# => "Dihydrogen monoxide, or H<sub>2</sub>O, can be very dangerous."
	# Wraps each run of characters matched by REGEXPS[:sub] in a `<sub>` element,
	# with the run replaced by its NFKC-normalized form. Returns the new string.
	def subscripts_to_sub_tags(html)
	  pattern = REGEXPS[:sub] # /[₀₁₂₃₄₅ ...
	  capture_normalize_and_wrap(html, pattern, 'sub')
	end

	# "He wore a white <i>gat</i>¹."
	# => "He wore a white <i>gat</i><sup>1</sup>."
	# Wraps each run of characters matched by REGEXPS[:sup] in a `<sup>` element,
	# with the run replaced by its NFKC-normalized form. Returns the new string.
	def superscripts_to_sup_tags(html)
	  pattern = REGEXPS[:sup] # /[⁰¹²³⁴⁵ ...
	  capture_normalize_and_wrap(html, pattern, 'sup')
	end

	# Replaces every match of `rx` in `html` with its NFKC-normalized text wrapped
	# in `<tag>…</tag>` (e.g. "H₂O" with tag 'sub' becomes "H<sub>2</sub>O").
	#
	# html - String to process; a new string is returned, `html` is not mutated.
	# rx   - Regexp identifying the runs of characters to normalize and wrap.
	# tag  - Tag name (String or Symbol) interpolated into the generated markup.
	# log  - When true (the default), each generated element is printed via `puts`.
	#
	# Returns the converted String.
	def capture_normalize_and_wrap(html, rx, tag, log: true)
	  # A single `gsub` pass avoids rebuilding the accumulated output on every
	  # match and, unlike a manual match/post_match loop, always terminates even
	  # when `rx` is able to match the empty string.
	  html.gsub(rx) do |match|
	    next match if match.empty? # nothing to wrap; leave the text untouched

	    wrapped = "<#{tag}>#{match.unicode_normalize(:nfkc)}</#{tag}>"
	    puts wrapped if log
	    wrapped
	  end
	end

	# "PO₄³⁻" => "PO<sub>4</sub><sup>3−</sup>"
	# Applies every pattern in REGEXPS to `html` in hash order, passing each key
	# as the tag name, and returns the resulting markup.
	def process_html(html="PO₄³⁻")
	  # Block parameters use plain pipes; the backslash-escaped `\|` form that
	  # was here is a markup artifact and does not parse as Ruby.
	  REGEXPS.reduce(html) do |markup, (tag, rx)|
	    capture_normalize_and_wrap(markup, rx, tag)
	  end
	end

	# Reads `in_fname`, converts its contents with `process_html`, writes the
	# result to `out_fname`, and returns the converted markup.
	def process_html_file(in_fname='sample_in.html', out_fname='sample_out.html')
	  source = File.read(in_fname)
	  converted = process_html(source)
	  File.write(out_fname, converted)
	  converted
	end