Skip to content

Instantly share code, notes, and snippets.

@JordanReiter
Created April 6, 2018 21:24
Show Gist options
  • Save JordanReiter/f99105fa73c6fd82729ed4406e0e39c0 to your computer and use it in GitHub Desktop.
Save JordanReiter/f99105fa73c6fd82729ed4406e0e39c0 to your computer and use it in GitHub Desktop.
This function can be used to process text that may or may not be base64 encoded
# This function can be used to process text that may or may not be base64 encoded
# In general, if you use base64 decode on normal text, you end up with random bytes
# that don't resemble a standard encoding. This function makes use of the chardet
# library to recognize decoded text that matches an expected encoding.
# Note: detection tends to be unreliable for very short inputs.
import binascii
import base64
import chardet
def clean_base64(input):
    """Return the text carried by base64-encoded *input*, else *input* itself.

    The value is treated as base64 only if it validates strictly (ignoring
    CR/LF line breaks) AND chardet recognises the decoded bytes as text with
    a confidence above 0.5. In every other case the original value is handed
    back untouched, with its original type.

    Args:
        input: ``str`` or bytes-like value that may or may not be base64.
            Any other type (``None``, numbers, ...) is returned unchanged.
            The name shadows the builtin but is kept for keyword callers.

    Returns:
        The decoded ``str`` when the value looks like base64-encoded text,
        otherwise the original *input* object.
    """
    if isinstance(input, str):
        try:
            raw = input.encode()
        except UnicodeError:
            # Text that cannot even be encoded (e.g. lone surrogates) is
            # certainly not base64.
            return input
    elif isinstance(input, (bytes, bytearray)):
        raw = input
    else:
        # Not text at all: nothing to decode, and calling .replace() on it
        # would raise AttributeError.
        return input

    # b64decode normally skips non-alphabet characters; validate=True makes
    # it reject them instead. Line breaks are legitimate in base64 payloads,
    # so strip those before validating.
    raw = raw.replace(b'\r', b'').replace(b'\n', b'')
    try:
        decoded = base64.b64decode(raw, validate=True)
        detected = chardet.detect(decoded)
        if detected['encoding'] and detected['confidence'] > 0.5:
            try:
                # Try UTF-8 first; chardet often wrongly guesses windows-1252.
                return decoded.decode()
            except UnicodeError:
                return decoded.decode(detected['encoding'])
    except (binascii.Error, UnicodeError, ValueError, LookupError):
        # Not valid base64, or the decoded bytes cannot be turned into text
        # (LookupError: chardet named an encoding Python has no codec for).
        pass
    return input
# Self-checks executed when the module is loaded.
# Each entry pairs a value handed to clean_base64 with the result it must give.
tests = [
    # Ordinary text made up solely of base64-alphabet characters.
    ("Tweebuffelsmeteenskootmorsdoodgeskietfontein", "Tweebuffelsmeteenskootmorsdoodgeskietfontein"),
    # Plain sentence, then its base64 form as bytes and as str.
    ("Hello, how are you?", "Hello, how are you?"),
    (b'SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    ('SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    # Base64 form first, plain form afterwards.
    (b'SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ('SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ("Hi there, friend", "Hi there, friend"),
    # A single short word, plain and encoded.
    ("Hello", "Hello"),
    (b'SGVsbG8=', "Hello"),
    ('SGVsbG8=', "Hello"),
]

for test, expected in tests:
    assert clean_base64(test) == expected

# Very short strings are the known weak spot of the detection; these two
# checks expect the short inputs to come back unchanged.
assert clean_base64('yoyo') == 'yoyo'
assert clean_base64('yoyoyoyo') == 'yoyoyoyo'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment