Last active
May 4, 2023 15:19
-
-
Save thevickypedia/e6ef4de2735168f0134a1fbc50a0cfb8 to your computer and use it in GitHub Desktop.
Tokenize a string using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import binascii
import string
# Escape prefix: a backslash followed by "u00", assembled piecewise so the
# source never spells the escape out literally:
#   b64decode(b'XA==')  -> a single backslash
#   ascii_letters[20]   -> "u"
#   digits[:1] * 2      -> "00"
# Prepending it to a two-digit hex value forms a complete four-digit
# Unicode escape sequence.
UNICODE_PREFIX = "".join((
    base64.b64decode(b'XA==').decode(encoding="ascii"),
    string.ascii_letters[20],
    string.digits[:1] * 2,
))

# Sample text that the round-trip checks later in this script operate on.
input_text = 'hello world'
def hex_encode(text):
    r"""Encode *text* as a string of Unicode escape sequences.

    Every character is written as ``\uXXXX`` (four lowercase hex digits);
    characters above U+FFFF are written as ``\UXXXXXXXX``. For ASCII input
    the result is identical to hex-dumping the UTF-8 bytes behind a
    ``\u00`` prefix. Escaping code points instead of raw UTF-8 bytes is
    what lets non-ASCII text survive ``unicode_escape`` decoding, which
    would otherwise turn each byte into a separate character.

    Args:
        text: The string to encode.

    Returns:
        The escaped, pure-ASCII string. An empty *text* yields an empty
        string rather than a dangling, undecodable escape prefix.
    """
    return "".join(
        f"\\u{ord(char):04x}" if ord(char) <= 0xFFFF else f"\\U{ord(char):08x}"
        for char in text
    )
def hex_decode(text):
    r"""Decode ``\uXXXX``-style escape sequences in *text* back to characters.

    The ``unicode_escape`` codec interprets every raw byte as a Latin-1
    character, so the input is first encoded as Latin-1 with the
    ``backslashreplace`` handler: characters up to U+00FF map to the single
    byte that decodes back to themselves, and anything higher is rewritten
    as an escape sequence. Encoding as UTF-8 instead would turn literal
    non-ASCII characters into mojibake.

    Args:
        text: A string containing escape sequences, optionally mixed with
            literal characters.

    Returns:
        The decoded string.

    Raises:
        UnicodeDecodeError: If *text* contains a malformed escape sequence.
    """
    escaped_bytes = text.encode("latin-1", errors="backslashreplace")
    return escaped_bytes.decode("unicode_escape")
def base_encode(text, urlsafe: bool = False):
    """Base64-encode *text* and return the result as a string.

    Args:
        text: The string to encode; it is converted to UTF-8 bytes first.
        urlsafe: When true, use the URL- and filename-safe alphabet, which
            substitutes ``-`` for ``+`` and ``_`` for ``/``. Mapping both
            characters to the same replacement would make the output
            ambiguous and impossible to decode.

    Returns:
        The Base64 text, including any ``=`` padding.
    """
    encoder = base64.urlsafe_b64encode if urlsafe else base64.b64encode
    return encoder(text.encode()).decode()
def base_decode(text):
    """Decode Base64 *text* and return the UTF-8 string it represents.

    Both the standard alphabet (``+``/``/``) and the URL-safe alphabet
    (``-``/``_``) are accepted. URL-safe characters are mapped to their
    standard counterparts before decoding, because the non-validating
    decoder would otherwise silently discard them.

    Args:
        text: Base64 data as an ASCII string or a bytes-like object.

    Returns:
        The decoded string.

    Raises:
        binascii.Error: If the input is incorrectly padded.
        ValueError: If a string input contains non-ASCII characters.
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    if isinstance(text, str):
        standard = text.translate(str.maketrans("-_", "+/"))
    else:
        standard = bytes(text).translate(bytes.maketrans(b"-_", b"+/"))
    return base64.b64decode(standard).decode()
def int_encode(text: str) -> int:
    """Pack the UTF-8 bytes of *text* into a single non-negative integer.

    The bytes are read in little-endian order, so the first byte of the
    text is the least significant byte of the result.

    Note:
        Trailing NUL characters are the most significant (zero) bytes of
        the number and therefore leave no trace in it; text ending in
        ``"\\x00"`` encodes to the same integer as the text without it.

    Args:
        text: The string to encode.

    Returns:
        The integer representation; ``0`` for an empty string.
    """
    utf8_bytes = text.encode('utf-8')
    return int.from_bytes(utf8_bytes, byteorder='little')
def int_decode(text: int) -> str:
    """Unpack an integer into little-endian bytes and decode them as UTF-8.

    Args:
        text: The non-negative integer to decode. Despite its name, this
            parameter is a number, not a string.

    Returns:
        The decoded string; an empty string when *text* is ``0``.

    Raises:
        OverflowError: If *text* is negative.
        UnicodeDecodeError: If the unpacked bytes are not valid UTF-8.
    """
    # Smallest whole number of bytes able to hold every significant bit.
    byte_count = (text.bit_length() + 7) // 8
    return text.to_bytes(byte_count, byteorder='little').decode('utf-8')
# Round-trip the sample text through each encoder/decoder pair, printing the
# encoded form and checking that decoding restores the original text.

# Unicode-escape ("hex") encoding.
encoded = hex_encode(input_text)
print(f"Hex encoded: {encoded}")
decoded = hex_decode(encoded)
assert decoded == input_text

# Base64 encoding (standard alphabet).
encoded = base_encode(input_text)
print(f"Base64 encoded: {encoded}")
decoded = base_decode(encoded)
assert decoded == input_text

# Integer encoding.
encoded = int_encode(input_text)
print(f"Int encoded: {encoded}")
decoded = int_decode(encoded)
assert decoded == input_text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment