Skip to content

Instantly share code, notes, and snippets.

@tonytonyjan
Last active October 18, 2020 07:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tonytonyjan/58bcc97fdb1940391eb01b4e4fa1ef2c to your computer and use it in GitHub Desktop.
Save tonytonyjan/58bcc97fdb1940391eb01b4e4fa1ef2c to your computer and use it in GitHub Desktop.
Ruby implementation of RFC 2047 (MIME encoded-words in message headers)
# Copyright (c) 2020 Jian Weihang <tonytonyjan@gmail.com>
# frozen_string_literal: true
# Encoder and decoder for RFC 2047 "encoded-words"
# (<tt>=?charset?encoding?encoded-text?=</tt>).
module Rfc2047
  # Characters accepted in the charset part of an encoded-word.
  TOKEN = /[\041\043-\047\052\053\055\060-\071\101-\132\134\136\137\141-\176]+/.freeze
  # Payload characters: printable US-ASCII except SPACE and "?".
  ENCODED_TEXT = /[\041-\076\100-\176]*/.freeze
  # One complete encoded-word, with named captures for its three parts.
  ENCODED_WORD = /=\?(?<charset>#{TOKEN})\?(?<encoding>[QBqb])\?(?<encoded_text>#{ENCODED_TEXT})\?=/.freeze
  # One or more encoded-words separated only by optional whitespace.
  ENCODED_WORD_SEQUENCE = /#{ENCODED_WORD}(?:\s*#{ENCODED_WORD})*/.freeze
  # Any byte that may not appear literally in a Q-encoded payload. Only the
  # conservative set allowed inside a "phrase" (letters, digits, ! * + - /)
  # is kept, so the result is usable in every header position.
  Q_UNSAFE_BYTE = %r{[^A-Za-z0-9!*+\-/]}.freeze
  private_constant :Q_UNSAFE_BYTE

  class << self
    # Replaces every encoded-word in +input+ with its decoded UTF-8 text.
    # Whitespace between adjacent encoded-words is dropped; everything that is
    # not an encoded-word is left untouched.
    #
    # example
    #
    # Rfc2047.decode_value '=?UTF-8?B?5Yu/5Lul5oOh5bCP6ICM54K65LmL77yM5Yu/5Lul5ZaE5bCP6ICM5LiN54K6?= =?UTF-8?B?44CC?='
    # # => "勿以惡小而為之,勿以善小而不為。"
    #
    # @param input [String] raw header value
    # @return [String] +input+ itself when it contains no encoded-word,
    #   otherwise a new string with the encoded-words decoded
    # @raise [ArgumentError] if a charset label is unknown to Ruby
    # @raise [EncodingError] if the decoded bytes cannot be converted to UTF-8
    def decode_value(input)
      return input unless input.match?(ENCODED_WORD)

      input.gsub(ENCODED_WORD_SEQUENCE) do |sequence|
        words = []
        sequence.scan(ENCODED_WORD) { words << decode(Regexp.last_match(0)) }
        # Empty words carry no bytes; dropping them keeps neighbouring words
        # with the same charset in a single run.
        words.reject(&:empty?)
             .chunk_while { |left, right| left.encoding == right.encoding }
             .map { |run| run_to_utf8(run) }
             .join
      end
    end

    # Wraps +input+ in a single encoded-word labelled with +input.encoding+.
    #
    # example:
    #
    # Rfc2047.encode('己所不欲,勿施於人。')
    # # => "=?UTF-8?B?5bex5omA5LiN5qyy77yM5Yu/5pa95pa85Lq644CC?="
    #
    # @param input [String] text to encode
    # @param encoding [Symbol] +:B+ (Base64) or +:Q+ (quoted-printable-like)
    # @return [String] the encoded-word
    # @raise [RuntimeError] if +encoding+ is neither +:B+ nor +:Q+
    def encode(input, encoding: :B)
      payload =
        case encoding
        when :B then [input].pack('m0')
        when :Q then q_encode(input)
        else raise ":encoding should be either :B or :Q, got #{encoding}"
        end
      "=?#{input.encoding}?#{encoding}?#{payload}?="
    end

    # Decodes the first encoded-word found in +input+. The result is tagged
    # with the word's charset but NOT transcoded; use +decode_value+ to get
    # UTF-8.
    #
    # example:
    #
    # Rfc2047.decode '=?UTF-8?B?5bex5omA5LiN5qyy77yM5Yu/5pa95pa85Lq644CC?='
    # # => "己所不欲,勿施於人。"
    #
    # @param input [String] text containing an encoded-word
    # @return [String] decoded bytes, force-encoded to the declared charset
    # @raise [ArgumentError] if +input+ contains no encoded-word or the
    #   charset is unknown to Ruby
    def decode(input)
      match_data = ENCODED_WORD.match(input)
      raise ArgumentError, "not an encoded-word: #{input.inspect}" if match_data.nil?

      # RFC 2231 permits "charset*language"; the language tag is irrelevant
      # for decoding and would make the charset lookup fail.
      charset = match_data[:charset].partition('*').first
      payload = match_data[:encoded_text]
      decoded =
        case match_data[:encoding]
        # "_" stands for SPACE in Q encoding; the rest is quoted-printable.
        when 'Q', 'q' then payload.gsub('_', '=20').unpack1('M')
        when 'B', 'b' then payload.unpack1('m')
        end
      found_encoding = find_encoding(charset)
      # Charsets that resolve to binary ("8bit") are treated as UTF-8.
      found_encoding = Encoding::UTF_8 if found_encoding == Encoding::ASCII_8BIT
      decoded.force_encoding(found_encoding)
    end

    private

    # Maps a charset label to a Ruby Encoding, translating labels that Ruby
    # does not know (or that are commonly mislabelled) first.
    def find_encoding(charset)
      case charset.downcase
      when 'utf-16' then Encoding::UTF_16BE
      when 'utf-32' then Encoding::UTF_32BE
      when 'ks_c_5601-1987' then Encoding::CP949
      when 'shift-jis' then Encoding::Shift_JIS
      when 'gb2312' then Encoding::GB18030
      when 'ms950' then Encoding::CP950
      when '8bit' then Encoding::ASCII_8BIT
      when 'latin2' then Encoding::ISO_8859_2
      else Encoding.find(charset)
      end
    end

    # Q-encodes the bytes of +input+: SPACE becomes "_", every other unsafe
    # byte becomes "=XX" (uppercase hex). Unlike Array#pack('M') this never
    # emits soft line breaks or raw "?", "_", "=" or SPACE.
    def q_encode(input)
      encoded = input.b.gsub(Q_UNSAFE_BYTE) do |byte|
        byte == ' ' ? '_' : format('=%02X', byte.getbyte(0))
      end
      encoded.force_encoding(Encoding::US_ASCII)
    end

    # Converts a run of decoded words that share one charset to UTF-8. The
    # raw bytes are joined before transcoding because senders sometimes split
    # a multibyte or escape sequence across adjacent encoded-words.
    def run_to_utf8(run)
      charset = run.first.encoding
      bytes = run.each_with_object(String.new) { |word, buffer| buffer << word.b }
      if charset == Encoding::UTF_7
        # Ruby cannot transcode from UTF-7, so Net::IMAP is loaded lazily.
        # NOTE(review): Net::IMAP.decode_utf7 implements IMAP's *modified*
        # UTF-7, not RFC 2152 UTF-7 - confirm this is the intended behavior.
        require 'net/imap'
        Net::IMAP.decode_utf7(bytes).force_encoding(Encoding::UTF_8)
      else
        bytes.force_encoding(charset).encode(Encoding::UTF_8)
      end
    end
  end
end
# Copyright (c) 2020 Jian Weihang <tonytonyjan@gmail.com>
# frozen_string_literal: true
require 'minitest/autorun'
require 'rfc_2047'
# Common base class for the suites below; adds a decode_value assertion.
class Test < Minitest::Test
  # Passes when Rfc2047.decode_value(actual) equals +expected+.
  def assert_decode_value(expected, actual)
    decoded = Rfc2047.decode_value(actual)
    assert_equal(expected, decoded)
  end
end
# Exercises Rfc2047.encode and Rfc2047.decode directly, and
# Rfc2047.decode_value through assert_decode_value in the nested suites.
class Rfc2047Test < Test
# Default (:B) encoding of a UTF-8 string.
def test_encode
assert_equal '=?UTF-8?B?5ris6Kmm?=', Rfc2047.encode('測試')
end
# Single encoded-words, one Q-encoded (lowercase "q") and one B-encoded.
def test_decode
assert_equal 'this is some text', Rfc2047.decode('=?iso-8859-1?q?this=20is=20some=20text?=')
assert_equal '測試', Rfc2047.decode('=?UTF-8?B?5ris6Kmm?=')
end
# decode_value fixtures, mostly B (Base64) encoded; the iso-2022-jp "Q" and
# shift-jis "Q" cases below also live in this class.
class Base64 < Test
def test_it_should_decode_an_encoded_string
assert_decode_value(
'This is あ string',
'=?UTF-8?B?VGhpcyBpcyDjgYIgc3RyaW5n?='
)
end
# Four space-separated encoded-words that must decode to one string.
def test_it_should_decode_a_long_encoded_string
assert_decode_value(
'This is あ really long string This is あ really long string This is あ really long string This is あ really long string This is あ really long string',
'=?UTF-8?B?VGhpcyBpcyDjgYIgcmVhbGx5IGxvbmcgc3RyaW5nIFRoaXMgaXMg44GCIHJl?= =?UTF-8?B?YWxseSBsb25nIHN0cmluZyBUaGlzIGlzIOOBgiByZWFsbHkgbG9uZyBzdHJp?= =?UTF-8?B?bmcgVGhpcyBpcyDjgYIgcmVhbGx5IGxvbmcgc3RyaW5nIFRoaXMgaXMg44GC?= =?UTF-8?B?IHJlYWxseSBsb25nIHN0cmluZw==?='
)
end
def test_it_should_decode_utf_16_encoded_string
assert_decode_value(
'あいうえお',
'=?UTF-16?B?MEIwRDBGMEgwSg==?='
)
end
def test_it_should_decode_utf_32_encoded_string
assert_decode_value(
'あいうえお',
'=?UTF-32?B?AAAwQgAAMEQAADBGAAAwSAAAMEo=?='
)
end
# Q-encoded iso-2022-jp split over four folded words. The first word ends
# with "=1B=24" and the second starts with "B", so the words' bytes have to
# be joined before they are transcoded.
def test_it_should_decoded
assert_decode_value(
'案件情報[-01 大手資産運用会社 - 資産運用にかかるDWHの二次開発業務]',
"=?iso-2022-jp?Q?=1B=24B0F7o=3EpJs=1B=28B=5B=2D01_=1B=24?=\n =?iso-2022-jp?Q?BBg=3Cj=3Bq=3B=3A1=3FMQ2q=3CR=1B=28B_=2D_=1B=24B=3B?=\n =?iso-2022-jp?Q?q=3B=3A1=3FMQ=24K=24=2B=24=2B=24k=1B=28BDWH=1B=24B=24?=\n =?iso-2022-jp?Q?NFs=3C=213=2BH=2F6HL3=1B=28B=5D?="
)
end
# Input that merely contains "=?" must come back unchanged.
def test_it_should_decode_a_string_that_looks_similar_to_an_encoded_string
assert_decode_value('1+1=?', '1+1=?')
end
# Words separated by "\n " (folded header line): the separator is dropped.
def test_it_should_parse_adjacent_encoded_words_separated_by_linear_white_space
assert_decode_value(
'новый сотрудник — дорофеев',
"=?utf-8?B?0L3QvtCy0YvQuSDRgdC+0YLRgNGD0LTQvdC40Log4oCUINC00L7RgNC+0YQ=?=\n =?utf-8?B?0LXQtdCy?="
)
end
# Words with nothing at all between them.
def test_it_should_parse_adjacent_words_with_no_space
assert_decode_value(
'новый сотрудник — дорофеев',
'=?utf-8?B?0L3QvtCy0YvQuSDRgdC+0YLRgNGD0LTQvdC40Log4oCUINC00L7RgNC+0YQ=?==?utf-8?B?0LXQtdCy?='
)
end
# Several word sequences interleaved with literal text ("Re:[", "] ",
# "#1056273\n "). Whitespace is only removed between two encoded-words, so
# the "\n " that follows the literal "#1056273" stays in the expected value.
def test_it_should_collapse_adjacent_words_with_multiple_encodings_on_one_line_seperated_by_non_spaces
assert_decode_value(
"Re:[グルーポン・ジャパン株式会社] 返信:【グルーポン】お問い合わせの件について(リクエスト#1056273\n )",
"Re:[=?iso-2022-jp?B?GyRCJTAlayE8JV0lcyEmJTglYyVRJXMzdDwwMnEbKEI=?=\n =?iso-2022-jp?B?GyRCPFIbKEI=?=] =?iso-2022-jp?B?GyRCSlY/LiEnGyhC?=\n =?iso-2022-jp?B?GyRCIVolMCVrITwlXSVzIVskKkxkJCQ5ZyRvJDsbKEI=?=\n =?iso-2022-jp?B?GyRCJE43byRLJEQkJCRGIUolaiUvJSglOSVIGyhC?=#1056273\n =?iso-2022-jp?B?GyRCIUsbKEI=?="
)
end
# An encoded-word with an empty payload decodes to the empty string.
def test_it_should_decode_a_blank_string
assert_decode_value('', '=?utf-8?B??=')
end
# Trailing literal text (" <a@b.org>") is preserved after the decoded word.
def test_it_should_decode_ks_c_5601_1987_encoded_string
assert_decode_value(
'김 현진 <a@b.org>',
'=?ks_c_5601-1987?B?seggx/bB+A==?= <a@b.org>'
)
end
# Q-encoded despite the enclosing class name; "{" appears literally in the
# payload.
def test_it_should_decode_shift_jis_encoded_string
assert_decode_value('日本語', '=?shift-jis?Q?=93=FA=96{=8C=EA?=')
end
def test_it_should_decode_gb18030_encoded_string_misidentified_as_gb2312
assert_decode_value('開', '=?GB2312?B?6V8=?=')
end
# NOTE(review): the payload looks like UTF-8 bytes carrying a "utf-7" label
# rather than real UTF-7 text - confirm what this fixture is meant to cover.
def test_it_should_decode_a_utf_7_encoded_unstructured_field
assert_decode_value(
'勿以惡小而為之,勿以善小而不為。',
'=?utf-7?B?5Yu/5Lul5oOh5bCP6ICM54K65LmL77yM5Yu/5Lul5ZaE5bCP6ICM5LiN54K6?= =?utf-7?B?44CC?='
)
end
end
# decode_value fixtures that use the Q encoding.
class QuotedPrintable < Test
# "_" in the payload decodes to a space.
def test_it_should_decode_an_encoded_string
assert_decode_value(
'This is あ string',
'=?UTF-8?Q?This_is_=E3=81=82_string?='
)
end
# "=5F" must decode to a literal underscore, not a space. The expected
# string also contains an invisible U+00AD (encoded as "=C2=AD") between
# "This " and " and_that".
def test_it_should_decode_q_encoded_5F_as_underscore
assert_decode_value(
'This ­ and_that',
'=?UTF-8?Q?This_=C2=AD_and=5Fthat?='
)
end
def test_it_should_decode_a_blank_string
assert_decode_value('', '=?utf-8?Q??=')
end
# The "8bit" label: the decoded bytes are expected to be read as UTF-8.
def test_it_should_decode_8bit_encoded_string
assert_decode_value("ALPH\xC3\x89E", '=?8bit?Q?ALPH=C3=89E?=')
end
end
# A B-encoded word followed by a Q-encoded word in the same sequence.
class Mixed < Test
def test_it_should_decode_an_encoded_string2
assert_decode_value(
'This is あ string This was あ string',
'=?UTF-8?B?VGhpcyBpcyDjgYIgc3RyaW5n?= =?UTF-8?Q?_This_was_=E3=81=82_string?='
)
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment