Last active
December 21, 2023 06:38
-
-
Save hyunjun/dea65972f3f723c0ad77 to your computer and use it in GitHub Desktop.
Python encoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
2진 파일을 읽으면 | |
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xd4' | |
와 같은 bytes 값이 나오는데, 이것을 str()로 문자열로 바꾸어서 저장하면 | |
"b'\\x89PNG\\r\\n\\x1a\\n\\......" 이런 식의 문자열로 저장됨. | |
이 문자열을 다시 위의 bytes로 바꾸는 방법? | |
''' | |
>>> s | |
"b'\\x89PNG\\r\\n'" | |
>>> s[2:-1] | |
'\\x89PNG\\r\\n' | |
>>> bytes(bytes(s[2:-1], 'latin-1').decode('unicode-escape'), 'latin-1') | |
b'\x89PNG\r\n' | |
>>> | |
# bytes(<str>, '<encoding>') # str -> bytes | |
# <bytes>.decode('unicode-escape') # bytes -> str | |
>>> import ast | |
>>> ast.literal_eval(s) | |
b'\x89PNG\r\n' | |
# https://docs.python.org/3/library/binascii.html | |
>>> import binascii | |
>>> s | |
"b'\\x89PNG\\r\\n'" | |
>>> b | |
b'\x89PNG\r\n' | |
>>> binascii.b2a_hex(b) | |
b'89504e470d0a' | |
>>> binascii.a2b_hex(b'89504e470d0a') | |
b'\x89PNG\r\n' | |
>>> hex(ord('\n')) | |
'0xa' | |
# https://docs.python.org/3/library/base64.html | |
>>> import base64 | |
>>> base64.b64encode(b) | |
b'iVBORw0K' | |
>>> base64.b64decode(b'iVBORw0K') | |
b'\x89PNG\r\n' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import json | |
>>> with open('test.json', 'w') as f: | |
... f.write(json.dumps({'key': 'value', '키': '값'}) + '\n') | |
... | |
# $ file test.json | |
# test.json: ASCII text | |
# $ cat test.json | |
# {"\ud0a4": "\uac12", "key": "value"} | |
# http://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence | |
>>> with open('test.json', 'w') as f: | |
... f.write(json.dumps({'key': 'value', unicode('키', 'utf8'): unicode('값', 'utf8')}, ensure_ascii=False).encode('utf8') + '\n') | |
... | |
# $ file test.json | |
# test.json: UTF-8 Unicode text | |
# $ cat test.json | |
# {"키": "값", "key": "value"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from pyspark | |
# write to hbase | |
# just write unicode string | |
# read from hbase | |
[unicode string].decode('string_escape') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> s = '가' | |
>>> ord(unicode(s, 'utf8')) | |
44032 | |
>>> u = unicode(str(hex(44032)).replace('0x', '\\u')).decode('unicode-escape') | |
>>> print u | |
가 | |
>>> type(u) | |
<type 'unicode'> | |
>>> u.encode('utf8') == s | |
True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# string type: unicode | |
# content: utf8 encoded escaped string | |
# case: put utf8 string into hbase using pyspark | |
>>> s = u'\\xEC\\x95\\x84\\xEC\\x8B\\x9C\\xEC\\x95\\x84' | |
>>> s.decode('unicode-escape') | |
u'\xec\x95\x84\xec\x8b\x9c\xec\x95\x84' | |
# ref | |
# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python | |
# http://stackoverflow.com/questions/11375684/python-how-to-convert-utf-8-code-string-back-to-string | |
>>> ss = s.decode('unicode-escape') | |
>>> s_unicode = bytearray(ss.encode('latin-1')).decode('utf8') | |
>>> print type(s_unicode), s_unicode | |
<type 'unicode'> 아시아 | |
>>> s_utf8 = bytearray(ss.encode('latin-1')).decode('utf8').encode('utf8') | |
>>> print type(s_utf8), s_utf8 | |
<type 'str'> 아시아 | |
>>> s_utf8 = s.decode('string_escape') | |
>>> print type(s_utf8), s_utf8 | |
<type 'str'> 아시아 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# string type: unicode | |
# content: utf8 encoded string | |
>>> s = u'\xEC\x95\x84\xEC\x8B\x9C\xEC\x95\x84' | |
>>> s_unicode = bytearray(s.encode('latin-1')).decode('utf8') | |
# [string].encode('latin-1'): 각 문자의 코드 포인트 값(0~255)을 수정하지 않고 그대로 1바이트씩 대응시켜 바이트 문자열로 변환 | |
>>> print type(s_unicode), s_unicode | |
<type 'unicode'> 아시아 | |
>>> s_utf8 = bytearray(s.encode('latin-1')).decode('utf8').encode('utf8') | |
>>> print type(s_utf8), s_utf8 | |
<type 'str'> 아시아 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# string type: utf8 | |
# content: unicode encoded escaped string | |
>>> s = '\\uc544\\uc2dc\\uc544' | |
>>> unicode('\\uc544\\uc2dc\\uc544') | |
u'\\uc544\\uc2dc\\uc544' | |
>>> s_unicode = unicode('\\uc544\\uc2dc\\uc544').decode('unicode-escape') | |
>>> print type(s_unicode), s_unicode | |
<type 'unicode'> 아시아 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# string type: utf8 | |
# content: unicode encoded string | |
>>> s = '\uc544\uc2dc\uc544' | |
>>> bytearray(s.encode('utf8')) | |
bytearray(b'\\uc544\\uc2dc\\uc544') | |
>>> unicode(bytearray(s.encode('utf8'))) | |
u'\\uc544\\uc2dc\\uc544' | |
>>> s_unicode = unicode(bytearray(s.encode('utf8'))).decode('unicode-escape') | |
>>> print type(s_unicode), s_unicode | |
<type 'unicode'> 아시아 | |
❯ python3 | |
Python 3.11.6 (main, Oct 2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)] on darwin | |
Type "help", "copyright", "credits" or "license" for more information. | |
>>> s = '\uc774\ub0a0' | |
>>> s.encode('utf8').decode('utf8') | |
'이날' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python2가 terminal에 없어서 paiz.io에서 test
sys.setdefaultencoding 은 "UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)" error 해결을 위해 사용