Last active
August 29, 2015 14:13
-
-
Save jyuch/112e886d3198a107c42d to your computer and use it in GitHub Desktop.
Pythonでもオレオレ文字コードを実装したい
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" 'Jyuch-Kana' Codec | |
Written by jyuch (http://jyuch.hatenablog.com/) | |
""" | |
import codecs | |
from itertools import count | |
def encode(input, errors='strict'):
    """Encode text into 'jyuch-kana' bytes.

    Every character of ``input`` is mapped to exactly one byte by
    ``convert_unicode_to_hex``.

    Args:
        input: The string to encode.
        errors: Accepted for compatibility with the codecs calling
            convention.  It is not consulted: an unmappable character
            always raises, whatever scheme is requested.

    Returns:
        A ``(encoded_bytes, consumed_length)`` tuple, where the length is
        ``len(input)``.

    Raises:
        UnicodeError: If a character has no 'jyuch-kana' mapping.  The
            message carries the zero-based index of that character and the
            underlying ``ValueError`` is attached as the cause.
    """
    buffer = []
    for i, it in enumerate(input):
        try:
            buffer.append(convert_unicode_to_hex(it))
        except ValueError as err:
            raise UnicodeError(
                "'jyuch-kana' codec can't encode characters in position {0}".format(i)
            ) from err
    return bytes(buffer), len(input)
# Characters that map to one fixed 'jyuch-kana' byte each.
_JYUCH_KANA_ENCODE_SINGLES = {
    '\x00': 0x00,    # NUL
    '\n': 0x01,      # line feed
    '\r': 0x02,      # carriage return
    ' ': 0x10,       # ASCII space
    '\u3000': 0x50,  # ideographic space
}


def convert_unicode_to_hex(unicode):
    """Return the 'jyuch-kana' byte value for a single character.

    Mapping:
        * NUL, LF, CR, space, ideographic space -> 0x00, 0x01, 0x02, 0x10, 0x50
        * 'A'..'Z' -> 0x11..0x2A
        * 'a'..'z' -> 0x2B..0x44
        * U+3041..U+3093 (hiragana small a .. n) -> 0x51..0xA3

    Args:
        unicode: The character to convert.

    Returns:
        The byte value as an int.

    Raises:
        ValueError: If the character is outside the supported repertoire.
    """
    if unicode in _JYUCH_KANA_ENCODE_SINGLES:
        return _JYUCH_KANA_ENCODE_SINGLES[unicode]

    # Contiguous ranges are shifted by a constant offset.
    if 'A' <= unicode <= 'Z':
        return ord(unicode) - 0x30
    if 'a' <= unicode <= 'z':
        return ord(unicode) - 0x36
    if '\u3041' <= unicode <= '\u3093':
        return ord(unicode) - 0x2FF0

    raise ValueError()
def decode(input, errors='strict'):
    """Decode 'jyuch-kana' bytes into text.

    Every byte of ``input`` is mapped to exactly one character by
    ``convert_hex_to_unicode``.

    Args:
        input: A bytes-like object; iterating it must yield ints.
        errors: Accepted for compatibility with the codecs calling
            convention.  It is not consulted: an unmappable byte always
            raises, whatever scheme is requested.

    Returns:
        A ``(decoded_text, consumed_length)`` tuple, where the length is
        ``len(input)``.

    Raises:
        UnicodeError: If a byte has no 'jyuch-kana' mapping.  The message
            carries the zero-based index of that byte and the underlying
            ``ValueError`` is attached as the cause.
    """
    buffer = []
    for i, it in enumerate(input):
        try:
            buffer.append(convert_hex_to_unicode(it))
        except ValueError as err:
            raise UnicodeError(
                "'jyuch-kana' codec can't decode characters in position {0}".format(i)
            ) from err
    return ''.join(buffer), len(input)
# Byte values that map to one fixed character each.
_JYUCH_KANA_DECODE_SINGLES = {
    0x00: '\x00',    # NUL
    0x01: '\n',      # line feed
    0x02: '\r',      # carriage return
    0x10: ' ',       # ASCII space
    0x50: '\u3000',  # ideographic space
}


def convert_hex_to_unicode(hex):
    """Return the character for a single 'jyuch-kana' byte value.

    Mapping:
        * 0x00, 0x01, 0x02, 0x10, 0x50 -> NUL, LF, CR, space, ideographic space
        * 0x11..0x2A -> 'A'..'Z'
        * 0x2B..0x44 -> 'a'..'z'
        * 0x51..0xA3 -> U+3041..U+3093 (hiragana small a .. n)

    Args:
        hex: The byte value (an int) to convert.

    Returns:
        The corresponding one-character string.

    Raises:
        ValueError: If the value is not assigned in the encoding.
    """
    if hex in _JYUCH_KANA_DECODE_SINGLES:
        return _JYUCH_KANA_DECODE_SINGLES[hex]

    # Contiguous ranges are shifted by a constant offset.
    if 0x11 <= hex <= 0x2A:
        return chr(hex + 0x30)
    if 0x2B <= hex <= 0x44:
        return chr(hex + 0x36)
    if 0x51 <= hex <= 0xA3:
        return chr(hex + 0x2FF0)

    raise ValueError()
class StreamWriter(codecs.StreamWriter):
    """Stream writer that encodes written text with the 'jyuch-kana' codec."""

    def encode(self, input, errors='strict'):
        """Encode ``input`` by delegating to the module-level ``encode``.

        Inside the method body the name ``encode`` resolves to the
        module-level function, not to this method, because class scope is
        not searched from within a function.

        Args:
            input: The string to encode.
            errors: Error handling scheme, forwarded unchanged.

        Returns:
            The ``(bytes, consumed_length)`` tuple from the module-level
            ``encode``.
        """
        return encode(input, errors)
class StreamReader(codecs.StreamReader):
    """Stream reader that decodes incoming bytes with the 'jyuch-kana' codec."""

    def decode(self, input, errors='strict'):
        """Decode ``input`` by delegating to the module-level ``decode``.

        Inside the method body the name ``decode`` resolves to the
        module-level function, not to this method, because class scope is
        not searched from within a function.

        Args:
            input: The bytes-like object to decode.
            errors: Error handling scheme, forwarded unchanged.

        Returns:
            The ``(text, consumed_length)`` tuple from the module-level
            ``decode``.
        """
        return decode(input, errors)
def getregentry():
    """Build the ``codecs.CodecInfo`` entry describing the 'jyuch-kana' codec.

    The stateless ``encode`` / ``decode`` slots are invoked by the codec
    machinery as plain functions, ``func(input[, errors])``, with no
    ``self`` argument.  The module-level functions are therefore passed
    directly; a wrapper taking an extra leading parameter would fail with
    ``TypeError`` on ``str.encode`` / ``bytes.decode`` / ``codecs.encode``.

    Returns:
        A ``codecs.CodecInfo`` wired to this module's functions and stream
        classes.
    """
    return codecs.CodecInfo(
        name='jyuch-kana',
        encode=encode,
        decode=decode,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
def search_function(codec_name):
    """Codec search function for ``codecs.register``.

    ``codecs.lookup`` normalizes the requested name before calling search
    functions.  Since Python 3.9 that normalization turns hyphens into
    underscores, so the lookup name arrives as ``'jyuch_kana'``; older
    versions pass ``'jyuch-kana'`` through.  Both spellings are accepted.

    Args:
        codec_name: The normalized encoding name being looked up.

    Returns:
        The codec's ``CodecInfo`` when the name matches, otherwise ``None``
        so that the lookup continues with other search functions.
    """
    if codec_name.replace('-', '_') == 'jyuch_kana':
        return getregentry()
    return None
def registry():
    """Register this module's search function with the ``codecs`` registry."""
    codecs.register(search_function)


# Run at import time: importing the module is what registers the search function.
registry()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import jyuch_kana | |
import codecs | |
def main():
    """Round-trip a 'jyuch-kana' file and print the result.

    Reads ``text.txt`` line by line, writes each stripped line followed by
    an ideographic space, the word 'せかい' and a line feed to
    ``text2.txt``, then reads ``text2.txt`` back and prints every stripped
    line.
    """
    # Open both files in one `with` so the reader is closed even if opening
    # the writer fails.
    with codecs.open('text.txt', encoding='jyuch-kana') as fin, \
            codecs.open('text2.txt', 'w', encoding='jyuch-kana') as fout:
        for it in fin:
            # '\u3000' is the ideographic space, spelled as an escape so it
            # cannot be mistaken for an ASCII space.
            fout.write(it.strip() + '\u3000せかい\n')

    with codecs.open('text2.txt', encoding='jyuch-kana') as f:
        for it in f:
            print(it.strip())


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from jyuch_kana import convert_hex_to_unicode | |
class ConvertHexToUnicode(unittest.TestCase):
    """Checks the byte -> character mapping of ``convert_hex_to_unicode``."""

    def _assert_decodes(self, code, expected):
        """Assert that byte value ``code`` converts to ``expected``."""
        self.assertEqual(convert_hex_to_unicode(code), expected)

    def test_convert_null(self):
        self._assert_decodes(0x00, '\x00')

    def test_convert_lf(self):
        self._assert_decodes(0x01, '\n')

    def test_convert_cr(self):
        self._assert_decodes(0x02, '\r')

    def test_convert_space(self):
        self._assert_decodes(0x10, ' ')

    def test_convert_latin_capital_letter_A(self):
        self._assert_decodes(0x11, 'A')

    def test_convert_latin_capital_letter_Z(self):
        self._assert_decodes(0x2A, 'Z')

    def test_convert_latin_small_letter_a(self):
        self._assert_decodes(0x2B, 'a')

    def test_convert_latin_small_letter_z(self):
        self._assert_decodes(0x44, 'z')

    def test_convert_ideographic_space(self):
        # U+3000 IDEOGRAPHIC SPACE, spelled as an escape so it stays visible.
        self._assert_decodes(0x50, '\u3000')

    def test_convert_hiragana_letter_small_a(self):
        self._assert_decodes(0x51, 'ぁ')

    def test_convert_hiragana_letter_n(self):
        self._assert_decodes(0xA3, 'ん')

    def test_raise_ValueError_unknown_code(self):
        # 0xA4 is the first value past the hiragana range.
        with self.assertRaises(ValueError):
            convert_hex_to_unicode(0xA4)


if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from jyuch_kana import convert_unicode_to_hex | |
class ConvertUnicodeToHex(unittest.TestCase):
    """Checks the character -> byte mapping of ``convert_unicode_to_hex``."""

    def _assert_encodes(self, char, expected):
        """Assert that character ``char`` converts to byte value ``expected``."""
        self.assertEqual(convert_unicode_to_hex(char), expected)

    def test_convert_null(self):
        self._assert_encodes('\x00', 0x00)

    def test_convert_lf(self):
        self._assert_encodes('\n', 0x01)

    def test_convert_cr(self):
        self._assert_encodes('\r', 0x02)

    def test_convert_space(self):
        self._assert_encodes(' ', 0x10)

    def test_convert_latin_capital_letter_A(self):
        self._assert_encodes('A', 0x11)

    def test_convert_latin_capital_letter_Z(self):
        self._assert_encodes('Z', 0x2A)

    def test_convert_latin_small_letter_a(self):
        self._assert_encodes('a', 0x2B)

    def test_convert_latin_small_letter_z(self):
        self._assert_encodes('z', 0x44)

    def test_convert_ideographic_space(self):
        # U+3000 IDEOGRAPHIC SPACE, spelled as an escape so it stays visible.
        self._assert_encodes('\u3000', 0x50)

    def test_convert_hiragana_letter_small_a(self):
        self._assert_encodes('ぁ', 0x51)

    def test_convert_hiragana_letter_n(self):
        self._assert_encodes('ん', 0xA3)

    def test_raise_ValueError_unknown_code(self):
        # A kanji is outside the supported repertoire.
        with self.assertRaises(ValueError):
            convert_unicode_to_hex('亜')


if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from jyuch_kana import decode | |
class Decode(unittest.TestCase):
    """Tests for the module-level ``decode`` function.

    The test methods are named after the function they exercise
    (``decode``), not after its ``encode`` counterpart.
    """

    def test_decode_null_lf_cr(self):
        self.assertEqual(
            decode(bytes.fromhex('00 01 02')),
            ('\x00\n\r', 3)
        )

    def test_decode_Hello_World_cr_lf(self):
        self.assertEqual(
            decode(bytes.fromhex('18 2F 36 36 39 10 27 39 3C 36 2E 02 01')),
            ('Hello World\r\n', 13)
        )

    def test_decode_hiragana_chars(self):
        # Byte 0x50 is the ideographic space (U+3000), spelled as an escape
        # so it cannot be confused with an ASCII space.
        self.assertEqual(
            decode(bytes.fromhex('63 A3 7B 71 7F 50 6B 5B 54')),
            ('こんにちは\u3000せかい', 9)
        )

    def test_decode(self):
        # One sample from each end of every mapped range.
        self.assertEqual(
            decode(bytes.fromhex('00 01 02 10 11 2A 2B 44 50 51 A3')),
            ('\x00\n\r AZaz\u3000ぁん', 11)
        )

    def test_decode_void_bytes(self):
        self.assertEqual(
            decode(b''),
            ('', 0)
        )

    def test_raise_UnicodeError(self):
        # The trailing 0xA4 is not assigned in the encoding.
        self.assertRaises(
            UnicodeError,
            decode,
            bytes.fromhex('63 A3 7B 71 7F 50 6B 5B 54 A4'),
        )


if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from jyuch_kana import encode | |
class Encode(unittest.TestCase):
    """Tests for the module-level ``encode`` function."""

    def _assert_encodes(self, text, hex_bytes):
        """Assert ``text`` encodes to ``hex_bytes`` and is fully consumed."""
        expected = (bytes.fromhex(hex_bytes), len(text))
        self.assertEqual(encode(text), expected)

    def test_encode_null_lf_cr(self):
        self._assert_encodes('\x00\n\r', '00 01 02')

    def test_encode_Hello_World_cr_lf(self):
        self._assert_encodes(
            'Hello World\r\n',
            '18 2F 36 36 39 10 27 39 3C 36 2E 02 01',
        )

    def test_encode_hiragana_chars(self):
        # '\u3000' is the ideographic space; it encodes to byte 0x50.
        self._assert_encodes(
            'こんにちは\u3000せかい',
            '63 A3 7B 71 7F 50 6B 5B 54',
        )

    def test_encode(self):
        # One sample from each end of every mapped range.
        self._assert_encodes(
            '\x00\n\r AZaz\u3000ぁん',
            '00 01 02 10 11 2A 2B 44 50 51 A3',
        )

    def test_encode_void_str(self):
        self._assert_encodes('', '')

    def test_raise_UnicodeEncodeError(self):
        # The kanji '世界' are outside the supported repertoire.
        with self.assertRaises(UnicodeError):
            encode('こんにちは世界')


if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main():
    """Write a fixed two-line binary sample to ``text.txt``.

    The payload consists of raw byte values (not ASCII/UTF-8 text): two
    nine-byte lines separated by a single 0x01 byte.
    """
    first_line = [0x63, 0xA3, 0x7B, 0x71, 0x7F, 0x50, 0x6B, 0x5B, 0x54]
    separator = [0x01]
    second_line = [0x65, 0x98, 0x56, 0x7A, 0x99, 0x50, 0x6B, 0x5B, 0x54]
    payload = bytearray(first_line + separator + second_line)

    with open("text.txt", "wb") as f:
        f.write(payload)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment