Skip to content

Instantly share code, notes, and snippets.

@jyuch
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jyuch/112e886d3198a107c42d to your computer and use it in GitHub Desktop.
Save jyuch/112e886d3198a107c42d to your computer and use it in GitHub Desktop.
Pythonでもオレオレ文字コードを実装したい
""" 'Jyuch-Kana' Codec
Written by jyuch (http://jyuch.hatenablog.com/)
"""
import codecs
from itertools import count
def encode(input, errors='strict'):
    """Encode text into Jyuch-Kana bytes.

    Every character of *input* is translated to one byte value by
    ``convert_unicode_to_hex``.

    Args:
        input: The text to encode.
        errors: Accepted for codec API compatibility.  It is not consulted:
            any character without a mapping is always reported as an error.

    Returns:
        A ``(encoded_bytes, number_of_characters_consumed)`` tuple.

    Raises:
        UnicodeEncodeError: If a character has no Jyuch-Kana mapping.  This
            is a subclass of ``UnicodeError`` and carries the offending
            position in its ``start``/``end`` attributes.
    """
    buffer = []
    for position, char in enumerate(input):
        try:
            buffer.append(convert_unicode_to_hex(char))
        except ValueError as err:
            # Use the dedicated exception type so callers get the codec name,
            # the source text and the failing index in structured form.
            raise UnicodeEncodeError(
                'jyuch-kana',
                input,
                position,
                position + 1,
                'character has no Jyuch-Kana mapping',
            ) from err
    return bytes(buffer), len(input)
def convert_unicode_to_hex(unicode):
    """Return the Jyuch-Kana code (an int) for one character.

    Args:
        unicode: The character to translate.

    Returns:
        The integer code assigned to the character.

    Raises:
        ValueError: If the character is not part of the code table.
    """
    # Characters that each have one fixed code.
    singles = {
        chr(0x00): 0x00,
        chr(0x0A): 0x01,
        chr(0x0D): 0x02,
        chr(0x20): 0x10,
        chr(0x3000): 0x50,
    }
    if unicode in singles:
        return singles[unicode]

    # Contiguous runs: (first character, last character, value subtracted
    # from the code point to obtain the Jyuch-Kana code).
    runs = (
        (chr(0x41), chr(0x5A), 0x30),        # 'A'..'Z'
        (chr(0x61), chr(0x7A), 0x36),        # 'a'..'z'
        (chr(0x3041), chr(0x3093), 0x2FF0),  # U+3041..U+3093
    )
    for first, last, shift in runs:
        if first <= unicode <= last:
            return ord(unicode) - shift

    raise ValueError()
def decode(input, errors='strict'):
    """Decode Jyuch-Kana bytes into text.

    Every byte value of *input* is translated to one character by
    ``convert_hex_to_unicode``.

    Args:
        input: The bytes-like data to decode.
        errors: Accepted for codec API compatibility.  It is not consulted:
            any byte without a mapping is always reported as an error.

    Returns:
        A ``(decoded_text, number_of_bytes_consumed)`` tuple.

    Raises:
        UnicodeDecodeError: If a byte has no Jyuch-Kana mapping.  This is a
            subclass of ``UnicodeError`` and carries the offending position
            in its ``start``/``end`` attributes.
    """
    buffer = []
    for position, value in enumerate(input):
        try:
            buffer.append(convert_hex_to_unicode(value))
        except ValueError as err:
            # Use the dedicated exception type so callers get the codec name,
            # the source bytes and the failing index in structured form.
            raise UnicodeDecodeError(
                'jyuch-kana',
                bytes(input),
                position,
                position + 1,
                'byte has no Jyuch-Kana mapping',
            ) from err
    return ''.join(buffer), len(input)
def convert_hex_to_unicode(hex):
    """Return the character for one Jyuch-Kana code.

    Args:
        hex: The integer code to translate.

    Returns:
        The one-character string assigned to the code.

    Raises:
        ValueError: If the code is not part of the code table.
    """
    # Codes that each stand for one fixed character.
    singles = {
        0x00: chr(0x00),
        0x01: chr(0x0A),
        0x02: chr(0x0D),
        0x10: chr(0x20),
        0x50: chr(0x3000),
    }
    if hex in singles:
        return singles[hex]

    # Contiguous runs: (first code, last code, value added to the code to
    # obtain the Unicode code point).
    runs = (
        (0x11, 0x2A, 0x30),    # -> 'A'..'Z'
        (0x2B, 0x44, 0x36),    # -> 'a'..'z'
        (0x51, 0xA3, 0x2FF0),  # -> U+3041..U+3093
    )
    for low, high, shift in runs:
        if low <= hex <= high:
            return chr(hex + shift)

    raise ValueError()
class StreamWriter(codecs.StreamWriter):
    """Stream writer that encodes written text with the module-level ``encode``."""

    def encode(self, input, errors='strict'):
        """Encode *input* by delegating to the module-level ``encode``.

        ``errors`` defaults to ``'strict'`` so the method can also be called
        with the text alone, as the ``codecs.Codec.encode`` signature allows.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        return encode(input, errors)
class StreamReader(codecs.StreamReader):
    """Stream reader that decodes read bytes with the module-level ``decode``."""

    def decode(self, input, errors='strict'):
        """Decode *input* by delegating to the module-level ``decode``.

        ``errors`` defaults to ``'strict'`` so the method can also be called
        with the data alone, as the ``codecs.Codec.decode`` signature allows.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        return decode(input, errors)
def getregentry():
    """Build the ``codecs.CodecInfo`` entry describing the 'jyuch-kana' codec.

    Returns:
        A ``codecs.CodecInfo`` wiring up the module-level ``encode`` and
        ``decode`` functions and the stream reader/writer classes.
    """
    return codecs.CodecInfo(
        name='jyuch-kana',
        # The stateless encoder/decoder are invoked as ``f(input[, errors])``
        # with no instance argument, so the plain functions are passed
        # directly rather than wrappers expecting a leading ``self``.
        encode=encode,
        decode=decode,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
def search_function(codec_name):
    """Codec search function for the 'jyuch-kana' codec.

    Args:
        codec_name: The encoding name handed over by the codec registry.

    Returns:
        The codec's ``CodecInfo`` when the name refers to this codec,
        otherwise ``None`` so the registry continues with other search
        functions.
    """
    # The registry normalises names before calling search functions, and
    # newer Python versions turn hyphens into underscores ('jyuch_kana').
    # Compare on a canonical spelling so both forms are recognised.
    if codec_name.lower().replace('-', '_') == 'jyuch_kana':
        return getregentry()
    return None
def registry():
    """Register this module's ``search_function`` with the codec registry."""
    hook = search_function
    codecs.register(hook)
# Runs at module top level, so the search function is registered as soon as
# this module is executed or imported.
registry()
import jyuch_kana
import codecs
def main():
    """Round-trip demo for the 'jyuch-kana' codec.

    Reads ``text.txt`` line by line, writes every stripped line followed by a
    fixed suffix to ``text2.txt``, then reads ``text2.txt`` back and prints
    each stripped line.  Both files are handled with the 'jyuch-kana'
    encoding, which this script makes available via ``import jyuch_kana``.
    """
    # Open both files inside the ``with`` statement so the reader is closed
    # even when opening the writer fails.
    with codecs.open('text.txt', encoding='jyuch-kana') as fin, \
            codecs.open('text2.txt', 'w', encoding='jyuch-kana') as fout:
        for it in fin:
            fout.write(it.strip() + ' せかい\n')

    with codecs.open('text2.txt', encoding='jyuch-kana') as f:
        for it in f:
            print(it.strip())


if __name__ == '__main__':
    main()
import unittest
from jyuch_kana import convert_hex_to_unicode
class ConvertHexToUnicode(unittest.TestCase):
    """Checks ``convert_hex_to_unicode`` at the edges of each mapped range."""

    def test_convert_null(self):
        self.assertEqual(convert_hex_to_unicode(0x00), chr(0x00))

    def test_convert_lf(self):
        self.assertEqual(convert_hex_to_unicode(0x01), chr(0x0A))

    def test_convert_cr(self):
        self.assertEqual(convert_hex_to_unicode(0x02), chr(0x0D))

    def test_convert_space(self):
        self.assertEqual(convert_hex_to_unicode(0x10), chr(0x20))

    def test_convert_latin_capital_letter_A(self):
        self.assertEqual(convert_hex_to_unicode(0x11), 'A')

    def test_convert_latin_capital_letter_Z(self):
        self.assertEqual(convert_hex_to_unicode(0x2A), 'Z')

    def test_convert_latin_small_letter_a(self):
        self.assertEqual(convert_hex_to_unicode(0x2B), 'a')

    def test_convert_latin_small_letter_z(self):
        self.assertEqual(convert_hex_to_unicode(0x44), 'z')

    def test_convert_ideographic_space(self):
        # Spelled as an escape: a literal U+3000 looks exactly like an ASCII
        # space and is easily replaced by one when the file is edited.
        self.assertEqual(convert_hex_to_unicode(0x50), '\u3000')

    def test_convert_hiragana_letter_small_a(self):
        self.assertEqual(convert_hex_to_unicode(0x51), 'ぁ')

    def test_convert_hiragana_letter_n(self):
        self.assertEqual(convert_hex_to_unicode(0xA3), 'ん')

    def test_raise_ValueError_unknown_code(self):
        # 0xA4 is the first code past the last mapped one.
        self.assertRaises(ValueError, convert_hex_to_unicode, 0xA4)


if __name__ == '__main__':
    unittest.main()
import unittest
from jyuch_kana import convert_unicode_to_hex
class ConvertUnicodeToHex(unittest.TestCase):
    """Checks ``convert_unicode_to_hex`` at the edges of each mapped range."""

    def test_convert_null(self):
        self.assertEqual(convert_unicode_to_hex(chr(0x00)), 0x00)

    def test_convert_lf(self):
        self.assertEqual(convert_unicode_to_hex(chr(0x0A)), 0x01)

    def test_convert_cr(self):
        self.assertEqual(convert_unicode_to_hex(chr(0x0D)), 0x02)

    def test_convert_space(self):
        self.assertEqual(convert_unicode_to_hex(chr(0x20)), 0x10)

    def test_convert_latin_capital_letter_A(self):
        self.assertEqual(convert_unicode_to_hex('A'), 0x11)

    def test_convert_latin_capital_letter_Z(self):
        self.assertEqual(convert_unicode_to_hex('Z'), 0x2A)

    def test_convert_latin_small_letter_a(self):
        self.assertEqual(convert_unicode_to_hex('a'), 0x2B)

    def test_convert_latin_small_letter_z(self):
        self.assertEqual(convert_unicode_to_hex('z'), 0x44)

    def test_convert_ideographic_space(self):
        # Spelled as an escape: a literal U+3000 looks exactly like an ASCII
        # space and is easily replaced by one when the file is edited.
        self.assertEqual(convert_unicode_to_hex('\u3000'), 0x50)

    def test_convert_hiragana_letter_small_a(self):
        self.assertEqual(convert_unicode_to_hex('ぁ'), 0x51)

    def test_convert_hiragana_letter_n(self):
        self.assertEqual(convert_unicode_to_hex('ん'), 0xA3)

    def test_raise_ValueError_unknown_code(self):
        # A kanji lies outside every mapped range.
        self.assertRaises(ValueError, convert_unicode_to_hex, '亜')


if __name__ == '__main__':
    unittest.main()
import unittest
from jyuch_kana import decode
class Decode(unittest.TestCase):
    """Checks ``decode`` on whole byte strings, including the consumed length."""

    def test_decode_null_lf_cr(self):
        self.assertEqual(
            decode(bytes.fromhex('00 01 02')),
            (''.join([chr(0x00), chr(0x0A), chr(0x0D)]), 3)
        )

    def test_decode_Hello_World_cr_lf(self):
        self.assertEqual(
            decode(bytes.fromhex('18 2F 36 36 39 10 27 39 3C 36 2E 02 01')),
            ('Hello World\r\n', 13)
        )

    def test_decode_hiragana_chars(self):
        # Byte 0x50 is the ideographic space; it is spelled as an escape
        # because a literal U+3000 looks exactly like an ASCII space.
        self.assertEqual(
            decode(bytes.fromhex('63 A3 7B 71 7F 50 6B 5B 54')),
            ('こんにちは\u3000せかい', 9)
        )

    def test_decode(self):
        self.assertEqual(
            decode(bytes.fromhex('00 01 02 10 11 2A 2B 44 50 51 A3')),
            (''.join([chr(0x00), chr(0x0A), chr(0x0D), chr(0x20), 'AZaz', chr(0x3000), 'ぁん']), 11)
        )

    def test_decode_void_bytes(self):
        self.assertEqual(
            decode(b''),
            ('', 0)
        )

    def test_raise_UnicodeError(self):
        # The trailing 0xA4 has no mapping.
        self.assertRaises(UnicodeError, decode, bytes.fromhex('63 A3 7B 71 7F 50 6B 5B 54 A4'))


if __name__ == '__main__':
    unittest.main()
import unittest
from jyuch_kana import encode
class Encode(unittest.TestCase):
    """Checks ``encode`` on whole strings, including the consumed length."""

    def test_encode_null_lf_cr(self):
        self.assertEqual(
            encode(''.join([chr(0x00), chr(0x0A), chr(0x0D)])),
            (bytes.fromhex('00 01 02'), 3)
        )

    def test_encode_Hello_World_cr_lf(self):
        self.assertEqual(
            encode('Hello World\r\n'),
            (bytes.fromhex('18 2F 36 36 39 10 27 39 3C 36 2E 02 01'), 13)
        )

    def test_encode_hiragana_chars(self):
        # The separator is the ideographic space (expected byte 0x50); it is
        # spelled as an escape because a literal U+3000 looks exactly like an
        # ASCII space, which would encode to 0x10 instead.
        self.assertEqual(
            encode('こんにちは\u3000せかい'),
            (bytes.fromhex('63 A3 7B 71 7F 50 6B 5B 54'), 9)
        )

    def test_encode(self):
        self.assertEqual(
            encode(''.join([chr(0x00), chr(0x0A), chr(0x0D), chr(0x20), 'AZaz', chr(0x3000), 'ぁん'])),
            (bytes.fromhex('00 01 02 10 11 2A 2B 44 50 51 A3'), 11)
        )

    def test_encode_void_str(self):
        self.assertEqual(
            encode(''),
            (b'', 0)
        )

    def test_raise_UnicodeEncodeError(self):
        # The kanji in the input have no mapping.
        self.assertRaises(UnicodeError, encode, 'こんにちは世界')


if __name__ == '__main__':
    unittest.main()
def main():
    """Write a small sample file, ``text.txt``, containing raw Jyuch-Kana bytes.

    The payload is two runs of nine codes separated by the code 0x01.
    """
    first_line = [0x63, 0xA3, 0x7B, 0x71, 0x7F, 0x50, 0x6B, 0x5B, 0x54]
    separator = [0x01]
    second_line = [0x65, 0x98, 0x56, 0x7A, 0x99, 0x50, 0x6B, 0x5B, 0x54]
    payload = bytearray(first_line + separator + second_line)

    # Binary mode: the values are written as-is, with no text encoding step.
    with open("text.txt", "wb") as out:
        out.write(payload)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment