Skip to content

Instantly share code, notes, and snippets.

@seki
Created September 12, 2021 22:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seki/df18d7b938b4b031da1c7fb68d0ff777 to your computer and use it in GitHub Desktop.
Save seki/df18d7b938b4b031da1c7fb68d0ff777 to your computer and use it in GitHub Desktop.
dicom charset converter; import from js
# coding: us-ascii
module DCM_CharSet
  # Raised when a Specific Character Set (0008,0005) value cannot be used:
  # either a defined term is not a key of CharactorSet, or a term that has
  # no code elements is combined with other values.
  class InvalidCharSet < RuntimeError
  end

  # https://github.com/cornerstonejs/dicomParser/issues/146
  module_function

  # Splits a raw (0008,0005) value into normalized defined terms.
  #
  # Values are separated by a backslash; each one is stripped and upcased.
  # nil or a blank value yields ['ISO_IR 6'], and an empty first value is
  # replaced by 'ISO 2022 IR 6'.
  #
  # @param dcm_00080005 [String, nil] raw attribute value
  # @return [Array<String>] defined terms, every one a key of CharactorSet
  # @raise [InvalidCharSet] when a term is not a key of CharactorSet
  def parse_charset(dcm_00080005)
    names = dcm_00080005 ? dcm_00080005.strip.split('\\').map { |name| name.strip.upcase } : []
    return ['ISO_IR 6'] if names.empty?

    names[0] = 'ISO 2022 IR 6' if names[0].empty?
    names.each do |name|
      raise InvalidCharSet, name unless CharactorSet.key?(name)
    end
    names
  end

  # Decoding state for one (0008,0005) value. Splits a byte string into
  # segments and tags each segment with the encoding of the code element
  # that is active for it.
  class Context
    # Escape sequence => element table; only set when code extensions are
    # in use (nil otherwise).
    attr_reader :allow_encoding

    # @param dcm_00080005 [String, nil] raw attribute value
    # @raise [InvalidCharSet] for unknown terms, or when a term without
    #   code elements appears in a multi-valued attribute
    def initialize(dcm_00080005)
      @charset_names = DCM_CharSet.parse_charset(dcm_00080005)
      definitions = @charset_names.map { CharactorSet[_1] }
      if definitions.size == 1 && definitions.first[:extension].nil?
        @wo_extensions = true
        @encoding = definitions.first[:encoding]
      else
        @default_encoding = definitions.first
        @wo_extensions = false
        @allow_encoding = {}
        @charset_names.zip(definitions) do |name, definition|
          elements = definition[:elements]
          # Entries without :elements carry no escape sequences, so they
          # cannot take part in code switching.
          raise InvalidCharSet, "#{name} cannot be combined with code extensions" unless elements

          elements.each { |element| @allow_encoding[element.escape_sequence] = element }
        end
        # Alternatives, in order: a known escape sequence, a run of 7-bit
        # bytes other than ESC (033), a run of 8-bit bytes.
        patterns = @allow_encoding.keys.map { Regexp.escape(_1) } +
                   ['[\000-\032\034-\177]+', '[\200-\377]+']
        # Regexp::NOENCODING is the supported spelling of the old 'N' third
        # argument, which Ruby 3.3 no longer accepts.
        @reg = Regexp.new(patterns.join('|'), Regexp::NOENCODING)
      end
    end

    # @return [Boolean] true when a single character set without code
    #   extensions was given
    def without_extensions?
      @wo_extensions
    end

    # Splits str into escape sequences, 7-bit runs and 8-bit runs.
    # Without extensions the input is returned unsplit as [str].
    #
    # @param str [String] raw bytes
    # @return [Array<String>] segments in input order
    def scan(str)
      return [str] if without_extensions?

      # The pattern is byte oriented, so match against a binary copy; this
      # also keeps non-binary input with high bytes from raising.
      str.b.scan(@reg)
    end

    # Tags every segment of str with the encoding of its active element.
    # Escape sequences switch the active G0/G1 element and are not added
    # to the result by this method.
    #
    # NOTE(review): when no element is designated for the slot a segment
    # needs (for example 8-bit bytes with only a G0 element known), the
    # lookup is nil and NoMethodError is raised - confirm desired handling.
    #
    # @param str [String] raw bytes; not modified
    # @return [Array<String>] encoded segments in input order
    def convert(str)
      return convert_wo_extensions(str) if without_extensions?

      designated = @default_encoding[:elements].to_h { [_1.code_element, _1] }
      result = []
      scan(str).each do |seg|
        if seg.start_with?("\e")
          switched = @allow_encoding[seg]
          designated[switched.code_element] = switched
        else
          slot = seg.getbyte(0) < 0x80 ? 'G0' : 'G1'
          result << designated[slot].encode(seg)
        end
      end
      result
    end

    # @param str [String] raw bytes; not modified
    # @return [Array<String>] one-element array holding a copy of str
    #   tagged with the single configured encoding
    def convert_wo_extensions(str)
      # Work on a copy so the caller's string keeps its encoding and frozen
      # input is accepted.
      [str.dup.force_encoding(@encoding)]
    end
  end
end
module DCM_CharSet
  # One code element of a character set: the slot it occupies, the escape
  # sequence that designates it, and the Ruby encoding name used to tag
  # bytes that belong to it.
  class Element
    attr_reader :escape_sequence, :encoding, :code_element

    # @param code_element [String] slot name ('G0' or 'G1' in the table below)
    # @param escape_sequence [Array<Integer>] bytes of the escape sequence;
    #   packed into a binary String
    # @param encoding [String] name accepted by String#force_encoding
    # @param bytes_per_code_point [Integer] stored as given; not read by
    #   anything visible in this module
    def initialize(code_element, escape_sequence, encoding, bytes_per_code_point)
      @code_element = code_element
      @escape_sequence = escape_sequence.pack('C*')
      @encoding = encoding
      @bytes_per_code_point = bytes_per_code_point
    end

    # Any predicate (a method whose name ends in '?') that is not defined
    # answers false; everything else raises NoMethodError as usual.
    def method_missing(name, *arg)
      return false if name.to_s.end_with?('?')

      super
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(name, include_private = false)
      name.to_s.end_with?('?') || super
    end

    # @param str [String] raw bytes; not modified
    # @return [String] copy of str tagged with this element's encoding
    def encode(str)
      str.dup.force_encoding(@encoding)
    end
  end

  # Mixin for elements whose encode must put the escape sequence back in
  # front of the bytes before tagging them with the encoding.
  module E_dummy_encoding
    # @param str [String] raw bytes
    # @return [String] escape sequence + str, tagged with the encoding
    def encode(str)
      (@escape_sequence + str).force_encoding(@encoding)
    end
  end

  # Marker mixin; not applied to any element in this module.
  module E_isJISX0212
    def is_JISX0212?
      true
    end
  end

  # Shared G0 element designated by ESC ( B.
  AsciiElement = Element.new('G0', [0x1B, 0x28, 0x42], 'ascii', 1)
  def AsciiElement.ascii?
    true
  end

  # Defined term => definition. Entries with :encoding describe a single
  # encoding for the whole value; entries with :extension list the code
  # elements (and their escape sequences) the term makes available.
  CharactorSet = {
    # single-byte w/o extensions
    'ISO_IR 6' => {
      encoding: 'ascii'
    },
    'ISO_IR 100' => {
      encoding: 'windows-1252'
    },
    'ISO_IR 101' => {
      encoding: 'iso-8859-2'
    },
    'ISO_IR 109' => {
      encoding: 'iso-8859-3'
    },
    'ISO_IR 110' => {
      encoding: 'iso-8859-4'
    },
    'ISO_IR 144' => {
      encoding: 'iso-8859-5'
    },
    'ISO_IR 127' => {
      encoding: 'iso-8859-6'
    },
    'ISO_IR 126' => {
      encoding: 'iso-8859-7'
    },
    'ISO_IR 138' => {
      encoding: 'iso-8859-8'
    },
    # Latin alphabet No. 5
    'ISO_IR 148' => {
      encoding: 'windows-1254' # FIXME
    },
    # FIXME: Shift_JIS is used as a stand-in here. Ruby has no encoding
    # named 'shift-jis'; the registered name is 'Shift_JIS'.
    'ISO_IR 13' => {
      encoding: 'Shift_JIS'
    },
    'ISO_IR 166' => {
      encoding: 'tis-620'
    },
    # single-byte with extensions
    'ISO 2022 IR 6' => {
      extension: true,
      elements: [AsciiElement]
    },
    'ISO 2022 IR 100' => {
      extension: true,
      elements: [
        AsciiElement,
        # Ruby's name is 'ISO-8859-1'; the underscore spelling is unknown.
        Element.new('G1', [0x1B, 0x2D, 0x41], 'iso-8859-1', 1)
      ]
    },
    'ISO 2022 IR 101' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x42], 'iso-8859-2', 1)
      ]
    },
    'ISO 2022 IR 109' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x43], 'iso-8859-3', 1)
      ]
    },
    'ISO 2022 IR 110' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x44], 'iso-8859-4', 1)
      ]
    },
    'ISO 2022 IR 144' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x4C], 'iso-8859-5', 1)
      ]
    },
    'ISO 2022 IR 127' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x47], 'iso-8859-6', 1)
      ]
    },
    'ISO 2022 IR 126' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x46], 'iso-8859-7', 1)
      ]
    },
    'ISO 2022 IR 138' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x48], 'iso-8859-8', 1)
      ]
    },
    'ISO 2022 IR 148' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x4D], 'iso-8859-9', 1)
      ]
    },
    # Japanese
    'ISO 2022 IR 13' => {
      extension: true,
      elements: [
        Element.new('G0', [0x1B, 0x28, 0x4A], 'cp50221', 1),
        # G1 is designated by ESC ) I (0x29), not ESC - I (0x2D).
        Element.new('G1', [0x1B, 0x29, 0x49], 'cp50221', 1)
      ]
    },
    'ISO 2022 IR 166' => {
      extension: true,
      elements: [
        AsciiElement,
        Element.new('G1', [0x1B, 0x2D, 0x54], 'tis-620', 1)
      ]
    },
    # Multi-byte with extensions
    'ISO 2022 IR 87' => {
      extension: true,
      multi_byte: true,
      elements: [
        Element.new('G0', [0x1B, 0x24, 0x42],
                    'iso-2022-jp', 2).extend(E_dummy_encoding)
      ]
    },
    'ISO 2022 IR 159' => {
      extension: true,
      multi_byte: true,
      elements: [
        # Ruby's name is 'ISO-2022-JP-2'; the underscore spelling is unknown.
        Element.new('G0', [0x1B, 0x24, 0x28, 0x44],
                    'ISO-2022-JP-2', 2).extend(E_dummy_encoding)
      ]
    },
    'ISO 2022 IR 149' => {
      extension: true,
      multi_byte: true,
      elements: [
        Element.new('G1', [0x1B, 0x24, 0x29, 0x43], 'euc-kr', 2)
      ]
    },
    'ISO 2022 IR 58' => {
      extension: true,
      multi_byte: true,
      elements: [
        Element.new('G1', [0x1B, 0x24, 0x29, 0x41], 'gb18030', 2)
      ]
    },
    # Multi-byte without extensions
    'ISO_IR 192' => {
      encoding: 'utf-8',
      multi_byte: true
    },
    'GB18030' => {
      encoding: 'GB18030',
      multi_byte: true
    },
    'GBK' => {
      encoding: 'gbk',
      multi_byte: true
    }
  }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment