Skip to content

Instantly share code, notes, and snippets.

@ychennay
Last active November 27, 2019 01:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ychennay/044b5a16dce81cf29862f4ab638fbb1e to your computer and use it in GitHub Desktop.
Save ychennay/044b5a16dce81cf29862f4ab638fbb1e to your computer and use it in GitHub Desktop.
encodings_demonstration.py
import re
from typing import List
def get_binary_for_char(char: str, encoding="utf-8") -> str:
    """
    Encode a single character and return its bytes as tab-separated binary.

    The character is encoded with ``encoding``; every resulting byte is
    rendered as an 8-bit, zero-padded binary string, and the bytes are joined
    with tab characters so that the byte boundaries of variable-length
    encodings stay visible. A summary line (character, code point, hex code,
    encoding name and binary form) is printed as a side effect.

    Args:
        char: Exactly one character; ``ord`` rejects anything else.
        encoding: Name of a codec accepted by ``str.encode``.

    Returns:
        The binary representation of the encoded bytes, one 8-bit group per
        byte, separated by tabs.

    Raises:
        TypeError: If ``char`` is not a string of length one.
        UnicodeEncodeError: If the codec cannot represent ``char``.
        LookupError: If ``encoding`` is not a known codec name.
    """
    encoded: bytes = char.encode(encoding)
    hex_code = encoded.hex()
    code_point = hex(ord(char))[2:].upper()
    # Format byte by byte rather than converting the whole hex string to one
    # integer: a single ``:08b`` pads to 8 bits in total, so leading zero bits
    # of a multi-byte value (e.g. UTF-16 "a" -> 00 61) would be dropped and the
    # remaining bits would be split on the wrong byte boundaries.
    byte_list: List[str] = [f"{byte:08b}" for byte in encoded]
    formatted_binary: str = "\t".join(byte_list)  # tab between byte marks
    print(f"{char} (U+{code_point.zfill(4)}, hex:{hex_code}){encoding}: {formatted_binary}")
    return formatted_binary
def get_binary(text: str, encoding="utf-8"):
    """
    Return the binary form of every character in ``text``.

    Each character is passed to ``get_binary_for_char`` (which also prints a
    summary line for it) using the given ``encoding``; the per-character
    results are joined with tab characters. An empty ``text`` yields an empty
    string.
    """
    per_char_binaries = []
    for symbol in text:
        per_char_binaries.append(get_binary_for_char(symbol, encoding))
    return "\t".join(per_char_binaries)
if __name__ == "__main__":
    # ASCII characters: MacRoman and Windows-1252 agree on the first 128
    # code points, so these two encodings produce the same byte.
    get_binary("a", "Windows-1252")
    get_binary("a", "MacRoman")
    print("\n")

    # Outside the ASCII range the two code pages differ, so the bytes no
    # longer match -- hence the garbled characters in Excel on a Mac.
    get_binary("á", "Windows-1252")
    get_binary("á", "MacRoman")
    print("\n")

    # Single-byte code page versus UTF-8 (variable length: 1, 2 and 3 bytes).
    get_binary("a", "Windows-1252")
    get_binary("a", "utf-8")
    get_binary("á", "utf-8")
    get_binary("한", "utf-8")
    # NOTE: get_binary("한", "Windows-1252") raises an error, because the
    # codec has no mapping for this code point.

    # The byte for "ñ" in Windows-1252 is the same byte as "Ò" in MacRoman,
    # which is why "ñ" shows up as "Ò" when the file is read as MacRoman.
    get_binary("ñ", "Windows-1252")
    get_binary("ñ", "utf-8")
    get_binary("Ò", "MacRoman")
    print("\n")

    # Identical bytes again, which is why "í" appears as "Ì".
    get_binary("í", "Windows-1252")
    get_binary("Ì", "MacRoman")
    print("\n")

    # Same effect for "é" (Windows-1252) and "È" (MacRoman).
    get_binary("é", "Windows-1252")
    get_binary("È", "MacRoman")
'''
System output of above code:
a (U+0061, hex:61)Windows-1252: 01100001
a (U+0061, hex:61)MacRoman: 01100001
á (U+00E1, hex:e1)Windows-1252: 11100001
á (U+00E1, hex:87)MacRoman: 10000111
a (U+0061, hex:61)Windows-1252: 01100001
a (U+0061, hex:61)utf-8: 01100001
á (U+00E1, hex:c3a1)utf-8: 11000011 10100001
한 (U+D55C, hex:ed959c)utf-8: 11101101 10010101 10011100
ñ (U+00F1, hex:f1)Windows-1252: 11110001
ñ (U+00F1, hex:c3b1)utf-8: 11000011 10110001
Ò (U+00D2, hex:f1)MacRoman: 11110001
í (U+00ED, hex:ed)Windows-1252: 11101101
Ì (U+00CC, hex:ed)MacRoman: 11101101
é (U+00E9, hex:e9)Windows-1252: 11101001
È (U+00C8, hex:e9)MacRoman: 11101001
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment