Last active
November 27, 2019 01:29
-
-
Save ychennay/044b5a16dce81cf29862f4ab638fbb1e to your computer and use it in GitHub Desktop.
encodings_demonstration.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from typing import List | |
def get_binary_for_char(char: str, encoding="utf-8") -> str:
    """
    Return the binary form of a single character under the given encoding.

    The character is encoded to bytes and every byte is rendered as an 8-bit,
    zero-padded binary string. The bytes are joined with tab characters so the
    byte boundaries of variable-length encodings stay visible. As a side
    effect, one summary line (character, Unicode code point, encoded hex,
    encoding name and binary form) is printed to stdout.

    Args:
        char: Exactly one character; ``ord`` raises ``TypeError`` for an
            empty or multi-character string.
        encoding: Name of the codec passed to ``str.encode``.

    Returns:
        The tab-separated 8-bit groups, one group per encoded byte.

    Raises:
        UnicodeEncodeError: If the codec cannot represent ``char``.
        LookupError: If ``encoding`` is not a known codec name.
    """
    encoded: bytes = char.encode(encoding)
    hex_code = encoded.hex()
    code_point = hex(ord(char))[2:].upper()
    # Format byte by byte. Converting the whole hex string to one integer
    # drops leading zero bits, which loses or misaligns bytes whenever the
    # first encoded byte is below 0x80 in a multi-byte encoding (e.g. UTF-16).
    byte_list: List[str] = [f"{byte:08b}" for byte in encoded]
    formatted_binary: str = "\t".join(byte_list)  # tab between byte marks for variable length encodings
    print(f"{char} (U+{code_point.zfill(4)}, hex:{hex_code}){encoding}: {formatted_binary}")
    return formatted_binary
def get_binary(text: str, encoding="utf-8"):
    """
    Return the binary form of every character in ``text``, joined by tabs.

    Each character is passed in order to ``get_binary_for_char`` with the
    same ``encoding``; the per-character results are then concatenated with
    a tab character between them. An empty ``text`` yields an empty string.
    """
    per_char_binaries = []
    for symbol in text:
        per_char_binaries.append(get_binary_for_char(symbol, encoding))
    return "\t".join(per_char_binaries)
if __name__ == "__main__":
    # Demonstration: print the byte-level representation of a few characters
    # under different encodings to show where they agree and where they differ.

    # Plain ASCII: these match, since MacRoman and Windows-1252 agree on the
    # first 128 code points.
    get_binary("a", "Windows-1252")
    get_binary("a", "MacRoman")
    print("\n")

    # Accented character: these do not match, hence the garbled characters
    # when a Windows-1252 file is opened in Excel on a Mac.
    get_binary("á", "Windows-1252")
    get_binary("á", "MacRoman")
    print("\n")

    # Single-byte encoding versus variable-length UTF-8.
    get_binary("a", "Windows-1252")
    get_binary("a", "utf-8")
    get_binary("á", "utf-8")
    get_binary("한", "utf-8")
    # get_binary("한", "Windows-1252") would raise an error: the codec has no
    # mapping for this particular code point.
    get_binary("ñ", "Windows-1252")
    get_binary("ñ", "utf-8")
    # Ò in MacRoman and ñ in Windows-1252 share the same byte, which is why
    # ñ shows up as Ò.
    get_binary("Ò", "MacRoman")
    print("\n")

    # Identical bytes, which is why í appears as Ì.
    get_binary("í", "Windows-1252")
    get_binary("Ì", "MacRoman")
    print("\n")

    get_binary("é", "Windows-1252")
    get_binary("È", "MacRoman")
'''
System output of above code:
a (U+0061, hex:61)Windows-1252: 01100001
a (U+0061, hex:61)MacRoman: 01100001
á (U+00E1, hex:e1)Windows-1252: 11100001
á (U+00E1, hex:87)MacRoman: 10000111
a (U+0061, hex:61)Windows-1252: 01100001
a (U+0061, hex:61)utf-8: 01100001
á (U+00E1, hex:c3a1)utf-8: 11000011 10100001
한 (U+D55C, hex:ed959c)utf-8: 11101101 10010101 10011100
ñ (U+00F1, hex:f1)Windows-1252: 11110001
ñ (U+00F1, hex:c3b1)utf-8: 11000011 10110001
Ò (U+00D2, hex:f1)MacRoman: 11110001
í (U+00ED, hex:ed)Windows-1252: 11101101
Ì (U+00CC, hex:ed)MacRoman: 11101101
é (U+00E9, hex:e9)Windows-1252: 11101001
È (U+00C8, hex:e9)MacRoman: 11101001
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment