hyunjun/bytearray_string_back_to_bytearray.py Secret

## bytearray_string_back_to_bytearray.py
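'''
Reading a binary file yields bytes such as
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xd4'
If those bytes are converted to a string and saved, what gets stored is
"b'\\x89PNG\\r\\n\\x1a\\n\\......", i.e. the textual repr of the bytes.
How can that string be converted back into the original bytearray (bytes) above?
'''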
>>> s
"b'\\x89PNG\\r\\n'"
>>> s[2:-1]
'\\x89PNG\\r\\n'
>>> bytes(bytes(s[2:-1], 'latin-1').decode('unicode-escape'), 'latin-1')
b'\x89PNG\r\n'
>>>
# bytes(<str>, '<encoding>') # str -> bytes
# <bytes>.decode('unicode-escape') # bytes -> str

>>> import ast
>>> ast.literal_eval(s)
b'\x89PNG\r\n'

# https://docs.python.org/3/library/binascii.html
>>> import binascii
>>> s
"b'\\x89PNG\\r\\n'"
>>> b
b'\x89PNG\r\n'
>>> binascii.b2a_hex(b)
b'89504e470d0a'
>>> binascii.a2b_hex(b'89504e470d0a')
b'\x89PNG\r\n'
>>> hex(ord('\n'))
'0xa'

# https://docs.python.org/3/library/base64.html
>>> import base64
>>> base64.b64encode(b)
b'iVBORw0K'
>>> base64.b64decode(b'iVBORw0K')
b'\x89PNG\r\n'

## json_to_utf8_file.py
>>> import json

>>> with open('test.json', 'w') as f:
...     f.write(json.dumps({'key': 'value', '키': '값'}) + '\n')
...
# $ file test.json
# test.json: ASCII text
# $ cat test.json
# {"\ud0a4": "\uac12", "key": "value"}

# http://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence
>>> with open('test.json', 'w') as f:
...     f.write(json.dumps({'key': 'value', unicode('키', 'utf8'): unicode('값', 'utf8')}, ensure_ascii=False).encode('utf8') + '\n')
...
# $ file test.json
# test.json: UTF-8 Unicode text
# $ cat test.json
# {"키": "값", "key": "value"}

## pyspark_hbase.py
# from pyspark

# write to hbase
# just write unicode string

# read from hbase
[unicode string].decode('string_escape')

## str_to_hex_to_unicode.py
>>> s = '가'
>>> ord(unicode(s, 'utf8'))
44032
>>> u = unicode(str(hex(44032)).replace('0x', '\\u')).decode('unicode-escape')
>>> print u
가
>>> type(u)
<type 'unicode'>
>>> u.encode('utf8') == s
True

## unicode_has_escaped_utf8.py
# string type: unicode
# content: utf8 encoded escaped string
# case: put utf8 string into hbase using pyspark
>>> s = u'\\xEC\\x95\\x84\\xEC\\x8B\\x9C\\xEC\\x95\\x84'

>>> s.decode('unicode-escape')
# ref
# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
# http://stackoverflow.com/questions/11375684/python-how-to-convert-utf-8-code-string-back-to-string
u'\xec\x95\x84\xec\x8b\x9c\xec\x95\x84'
>>> ss = s.decode('unicode-escape')
>>> s_unicode = bytearray(ss.encode('latin-1')).decode('utf8')
>>> print type(s_unicode), s_unicode
<type 'unicode'> 아시아
>>> s_utf8 = bytearray(ss.encode('latin-1')).decode('utf8').encode('utf8')
>>> print type(s_utf8), s_utf8
<type 'str'> 아시아

>>> s_utf8 = s.decode('string_escape')
>>> print type(s_utf8), s_utf8
<type 'str'> 아시아

## unicode_has_utf8.py
# string type: unicode
# content: utf8 encoded string
>>> s = u'\xEC\x95\x84\xEC\x8B\x9C\xEC\x95\x84'
>>> s_unicode = bytearray(s.encode('latin-1')).decode('utf8')
# [string].encode('latin-1') leaves the character values untouched: each code point in
# U+0000-U+00FF becomes the single byte with the same value, recovering the raw 8-bit bytes
>>> print type(s_unicode), s_unicode
<type 'unicode'> 아시아
>>> s_utf8 = bytearray(s.encode('latin-1')).decode('utf8').encode('utf8')
>>> print type(s_utf8), s_utf8
<type 'str'> 아시아

## utf8_has_escaped_unicode.py
# string type: utf8
# content: unicode encoded escaped string
>>> s = '\\uc544\\uc2dc\\uc544'
>>> unicode('\\uc544\\uc2dc\\uc544')
u'\\uc544\\uc2dc\\uc544'
>>> s_unicode = unicode('\\uc544\\uc2dc\\uc544').decode('unicode-escape')
>>> print type(s_unicode), s_unicode
<type 'unicode'> 아시아

## utf8_has_unicode.py
# string type: utf8
# content: unicode encoded string
>>> s = '\uc544\uc2dc\uc544'
>>> bytearray(s.encode('utf8'))
bytearray(b'\\uc544\\uc2dc\\uc544')
>>> unicode(bytearray(s.encode('utf8')))
u'\\uc544\\uc2dc\\uc544'
>>> s_unicode = unicode(bytearray(s.encode('utf8'))).decode('unicode-escape')
>>> print type(s_unicode), s_unicode
<type 'unicode'> 아시아

❯ python3
Python 3.11.6 (main, Oct  2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> s = '\uc774\ub0a0'
>>> s.encode('utf8').decode('utf8')
'이날'
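	'''
	Reading a binary file yields bytes such as
	b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xd4'
	If those bytes are converted to a string and saved, what gets stored is
	"b'\\x89PNG\\r\\n\\x1a\\n\\......", i.e. the textual repr of the bytes.
	How can that string be converted back into the original bytearray (bytes) above?
	'''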
	>>> s
	"b'\\x89PNG\\r\\n'"
	>>> s[2:-1]
	'\\x89PNG\\r\\n'
	>>> bytes(bytes(s[2:-1], 'latin-1').decode('unicode-escape'), 'latin-1')
	b'\x89PNG\r\n'
	>>>
	# bytes(<str>, '<encoding>') # str -> bytes
	# <bytes>.decode('unicode-escape') # bytes -> str

	>>> import ast
	>>> ast.literal_eval(s)
	b'\x89PNG\r\n'

	# https://docs.python.org/3/library/binascii.html
	>>> import binascii
	>>> s
	"b'\\x89PNG\\r\\n'"
	>>> b
	b'\x89PNG\r\n'
	>>> binascii.b2a_hex(b)
	b'89504e470d0a'
	>>> binascii.a2b_hex(b'89504e470d0a')
	b'\x89PNG\r\n'
	>>> hex(ord('\n'))
	'0xa'

	# https://docs.python.org/3/library/base64.html
	>>> import base64
	>>> base64.b64encode(b)
	b'iVBORw0K'
	>>> base64.b64decode(b'iVBORw0K')
	b'\x89PNG\r\n'
	>>> import json

	>>> with open('test.json', 'w') as f:
	...     f.write(json.dumps({'key': 'value', '키': '값'}) + '\n')
	...
	# $ file test.json
	# test.json: ASCII text
	# $ cat test.json
	# {"\ud0a4": "\uac12", "key": "value"}

	# http://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence
	>>> with open('test.json', 'w') as f:
	...     f.write(json.dumps({'key': 'value', unicode('키', 'utf8'): unicode('값', 'utf8')}, ensure_ascii=False).encode('utf8') + '\n')
	...
	# $ file test.json
	# test.json: UTF-8 Unicode text
	# $ cat test.json
	# {"키": "값", "key": "value"}
	# from pyspark

	# write to hbase
	# just write unicode string

	# read from hbase
	[unicode string].decode('string_escape')
	>>> s = '가'
	>>> ord(unicode(s, 'utf8'))
	44032
	>>> u = unicode(str(hex(44032)).replace('0x', '\\u')).decode('unicode-escape')
	>>> print u
	가
	>>> type(u)
	<type 'unicode'>
	>>> u.encode('utf8') == s
	True
	# string type: unicode
	# content: utf8 encoded escaped string
	# case: put utf8 string into hbase using pyspark
	>>> s = u'\\xEC\\x95\\x84\\xEC\\x8B\\x9C\\xEC\\x95\\x84'

	>>> s.decode('unicode-escape')
	# ref
	# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
	# http://stackoverflow.com/questions/11375684/python-how-to-convert-utf-8-code-string-back-to-string
	u'\xec\x95\x84\xec\x8b\x9c\xec\x95\x84'
	>>> ss = s.decode('unicode-escape')
	>>> s_unicode = bytearray(ss.encode('latin-1')).decode('utf8')
	>>> print type(s_unicode), s_unicode
	<type 'unicode'> 아시아
	>>> s_utf8 = bytearray(ss.encode('latin-1')).decode('utf8').encode('utf8')
	>>> print type(s_utf8), s_utf8
	<type 'str'> 아시아

	>>> s_utf8 = s.decode('string_escape')
	>>> print type(s_utf8), s_utf8
	<type 'str'> 아시아
	# string type: unicode
	# content: utf8 encoded string
	>>> s = u'\xEC\x95\x84\xEC\x8B\x9C\xEC\x95\x84'
	>>> s_unicode = bytearray(s.encode('latin-1')).decode('utf8')
	# [string].encode('latin-1') leaves the character values untouched: each code point in
	# U+0000-U+00FF becomes the single byte with the same value, recovering the raw 8-bit bytes
	>>> print type(s_unicode), s_unicode
	<type 'unicode'> 아시아
	>>> s_utf8 = bytearray(s.encode('latin-1')).decode('utf8').encode('utf8')
	>>> print type(s_utf8), s_utf8
	<type 'str'> 아시아
	# string type: utf8
	# content: unicode encoded escaped string
	>>> s = '\\uc544\\uc2dc\\uc544'
	>>> unicode('\\uc544\\uc2dc\\uc544')
	u'\\uc544\\uc2dc\\uc544'
	>>> s_unicode = unicode('\\uc544\\uc2dc\\uc544').decode('unicode-escape')
	>>> print type(s_unicode), s_unicode
	<type 'unicode'> 아시아
	# string type: utf8
	# content: unicode encoded string
	>>> s = '\uc544\uc2dc\uc544'
	>>> bytearray(s.encode('utf8'))
	bytearray(b'\\uc544\\uc2dc\\uc544')
	>>> unicode(bytearray(s.encode('utf8')))
	u'\\uc544\\uc2dc\\uc544'
	>>> s_unicode = unicode(bytearray(s.encode('utf8'))).decode('unicode-escape')
	>>> print type(s_unicode), s_unicode
	<type 'unicode'> 아시아

	❯ python3
	Python 3.11.6 (main, Oct 2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)] on darwin
	Type "help", "copyright", "credits" or "license" for more information.
	>>> s = '\uc774\ub0a0'
	>>> s.encode('utf8').decode('utf8')
	'이날'