Skip to content

Instantly share code, notes, and snippets.

@amake
Last active March 6, 2022 09:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amake/8987426c54ad79b94619a5f3aa350837 to your computer and use it in GitHub Desktop.
Save amake/8987426c54ad79b94619a5f3aa350837 to your computer and use it in GitHub Desktop.
Unravel IDS data
# unravel.rb
#
# Expand entries in ids.txt
# (https://github.com/cjkvi/cjkvi-ids/blob/master/ids.txt) to make fully
# "unraveled" decompositions.
#
# Usage: ruby unravel.rb ids.txt
#
# Note that some unravelings are not "valid", in that they contain subcomponents
# from different regions and thus represent a decomposition not used in any
# region. These are marked with [🙅]. Other bracketed annotations follow the
# convention of ids.txt.
# Read an ids.txt-style stream into a lookup table keyed by character.
#
# Each data line is split on whitespace into a codepoint, the character, and
# zero or more decomposition fields. Lines starting with '#' are comments.
# Blank or truncated lines (no character field) are skipped so that they
# cannot create a bogus `nil` key in the table.
#
# @param io [IO] any object responding to #each_line (ARGF, File, StringIO...)
# @return [Hash] maps each character to `[codepoint, Array<Decomp>]`
def load_db(io)
  db = {}
  io.each_line do |line|
    next if line.start_with?('#')

    codepoint, char, *decomps = line.split
    # No character field means the line is blank or malformed.
    next if char.nil?

    db[char] = [codepoint, decomps.map { |d| Decomp.parse(d, represents: char) }]
  end
  db
end
# Substitute components by their own decompositions until a full pass over the
# table changes nothing, then replace every entry's decomposition list with
# the flattened list of its expanded decompositions.
#
# @param db [Hash] table as built by load_db; modified in place
# @return [Hash] the same (mutated) table
def unravel!(db)
  loop do
    # `count` evaluates its block for every decomposition (no short-circuit),
    # so each one is visited exactly once per pass.
    substitutions = db.each_value.sum do |(_codepoint, decomps)|
      decomps.count { |decomp| unravel_one!(db, decomp) }
    end
    break if substitutions.zero?
  end

  db.transform_values! do |(codepoint, decomps)|
    [codepoint, decomps.flat_map(&:expand)]
  end
end
# Perform one level of substitution on a single decomposition, in place.
#
# Every element of `decomp.chars` that is a key of `db` is replaced by that
# character's decomposition(s): the lone Decomp when there is exactly one, or
# the whole Array of alternatives otherwise. Characters whose only
# decomposition is themselves are left alone, as are characters with no
# recorded decompositions (substituting an empty list would make every
# decomposition containing them expand to nothing).
#
# @param db [Hash] table as built by load_db
# @param decomp [Decomp] decomposition whose chars are rewritten in place
# @return [Boolean] modified or not
def unravel_one!(db, decomp)
  modified = false
  decomp.chars.map! do |c|
    next c unless db.key?(c)

    sub_decomps = db[c].last
    # Nothing to substitute: treat the character as atomic.
    next c if sub_decomps.empty? || identity_decomp?(db, c)

    modified = true
    sub_decomps.length == 1 ? sub_decomps.first : sub_decomps
  end
  modified
end
# Tell whether a character's only recorded decomposition is the character
# itself, i.e. it cannot be broken down any further.
#
# @param db [Hash] table as built by load_db
# @param c [String] a character that is a key of db
# @return [Boolean]
def identity_decomp?(db, c)
  candidates = db[c].last
  return false unless candidates.length == 1

  candidates.first.chars == [c]
end
# One decomposition of a character, with optional region tags.
#
# Each element of #chars is one of:
# * a String  - a character that has not been substituted,
# * a Decomp  - a component replaced by its single decomposition,
# * an Array  - a component replaced by several alternative Decomps.
#
# A decomposition is "expanded" once no Array remains anywhere in its tree.
class Decomp
  class << self
    # Parse one decomposition field, e.g. "⿰ab[GT]".
    #
    # @param str [String] sequence optionally followed by "[TAGS]" (A-Z only)
    # @param represents [String] the character being decomposed
    # @return [Decomp]
    def parse(str, represents:)
      # The named groups become the locals `decomp` and `tags`.
      /^(?<decomp>.*?)(?:\[(?<tags>[A-Z]+)\])?$/ =~ str
      new(decomp.chars, tags&.chars, represents)
    end
  end

  # @return [Array]
  attr_reader :chars
  # @return [Array,nil]
  attr_reader :tags
  # @return [String]
  attr_reader :represents

  # @param chars [Array] kept mutable: unravel_one! rewrites it in place
  # @param tags [Array,nil] one-letter tags, or nil when untagged
  # @param represents [String]
  # @raise [RuntimeError] if tags is an empty array
  def initialize(chars, tags, represents)
    raise 'Tags cannot be empty' if tags&.empty?

    @chars = chars # Don't freeze
    @tags = tags.freeze
    @represents = represents.freeze
  end

  # Debug rendering; safe to call on decompositions that are not expanded.
  #
  # @return [String]
  def inspect
    to_s(debug: true)
  end

  # Render the decomposition as text.
  #
  # When include_tags is true and the decomposition is expanded, the effective
  # tags are appended in brackets ("[🙅]" when no tag is common to the whole
  # tree). A decomposition that is not expanded has no effective tags, so the
  # suffix is omitted instead of raising.
  #
  # @param include_tags [Boolean] append the effective-tag suffix
  # @param debug [Boolean] wrap in braces and show each node's own tags
  # @return [String]
  def to_s(include_tags = true, debug: false)
    parts = chars.map { |c| c.is_a?(Decomp) ? c.to_s(false, debug: debug) : c }
    parts << "(#{tags.join})" if tags && debug
    eff_tags = effective_tags if include_tags && expanded?
    if eff_tags
      tag_part = eff_tags.empty? ? '🙅' : eff_tags.join
      parts << (debug ? "(#{tag_part})" : "[#{tag_part}]")
    end
    s = parts.join
    debug ? "{#{s}}" : s
  end

  # @param other [Object]
  # @return [Boolean] true when other is exactly a Decomp with equal chars,
  #   tags and represented character
  def ==(other)
    other.class == Decomp &&
      chars == other.chars && tags == other.tags &&
      represents == other.represents
  end

  # @return [Boolean] whether this decomposition is "expanded", meaning it
  #   represents a single decomposition with no "branches"
  def expanded?
    chars.none? { |c| c.is_a?(Array) || (c.is_a?(Decomp) && !c.expanded?) }
  end

  # @return [Array<Decomp>] list of expanded decompositions
  def expand
    return [self] if expanded?

    # Each entry of `results` is one branch-free chars list under construction.
    results = [[]]
    chars.each do |c|
      case c
      when Array
        alternatives = c.flat_map { |alt| alt.is_a?(Decomp) ? alt.expand : alt }
        results = alternatives.flat_map { |alt| results.map { |r| r + [alt] } }
      when Decomp
        results = c.expand.flat_map { |alt| results.map { |r| r + [alt] } }
      else
        results.each { |r| r << c }
      end
    end
    results.flat_map { |r| Decomp.new(r, tags, represents).expand } # .select(&:valid?)
  end

  # Intersect the tags of this node and of every nested Decomp.
  #
  # @param acc [Hash] accumulator; `acc[:result]` carries the running result
  # @return [Array,nil] nil means untagged; empty array means no valid set of
  #   tags can apply (this decomposition is invalid)
  # @raise [RuntimeError] if this decomposition is not expanded
  def effective_tags(acc = { result: nil })
    raise 'effective_tags requires an expanded decomposition' unless expanded?

    # expanded? already covers the whole tree, so the helper need not re-check.
    narrow_tags(acc)
    acc[:result]
  end

  # @return [Boolean] whether this decomposition represents a consistent set of
  #   regional variations
  def valid?
    eff_tags = effective_tags
    eff_tags.nil? || eff_tags.any?
  end

  protected

  # Narrow `acc[:result]` to the tags shared with this node, then recurse into
  # nested Decomps. Untagged nodes leave the accumulator untouched.
  #
  # @param acc [Hash]
  # @return [void]
  def narrow_tags(acc)
    if tags
      acc[:result] ||= tags.dup
      acc[:result].select! { |t| tags.include?(t) }
    end
    chars.each { |c| c.narrow_tags(acc) if c.is_a?(Decomp) }
  end
end
# Command-line entry point: read ids.txt from the files named in ARGV (or
# stdin), unravel it, and print one tab-separated line per character:
# codepoint, character, then each expanded decomposition.
#
# Guarded so the definitions above can be loaded with `require`/`load`
# without triggering the read-and-print run.
if $PROGRAM_NAME == __FILE__
  db = load_db(ARGF)
  unravel!(db)
  db.each do |char, (codepoint, decomps)|
    # The decomposition list stays nested: Array#join flattens it.
    puts([codepoint, char, decomps.map(&:to_s)].join("\t"))
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment