Created
April 22, 2013 21:07
-
-
Save agriffis/5438575 to your computer and use it in GitHub Desktop.
Patch Python's sys.stdout and sys.stderr to honor locale variables regardless of tty, and fall back to UTF-8 instead of ASCII in the absence of more information.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Patch Python's sys.stdout and sys.stderr to encode to UTF-8 | |
The short story is that sys.stdout.encoding is set based on the environment | |
variable LC_CTYPE (or LANG/LC_ALL) but only when stdout.isatty(). | |
Otherwise those variables are ignored and stdout has no encoding, and it | |
CANNOT be set. | |
The long story is here (especially in the comments): | |
http://drj11.wordpress.com/2007/05/14/python-how-is-sysstdoutencoding-chosen/ | |
""" | |
from codecs import StreamWriter, lookup | |
import os | |
def _get_codec(name):
    """
    Return a CodecInfo obtained from codecs.lookup().

    If `name` is given it is looked up directly, and LookupError propagates
    when the encoding is unknown.

    If `name` is empty or None, the encoding is derived from the locale
    environment variables LC_ALL, LC_CTYPE and LANG (the first non-empty
    one wins).  When none of them yields a usable codec, UTF-8 is returned
    rather than ASCII.
    """
    if name:
        # An explicitly requested encoding must exist; do not mask errors.
        return lookup(name)

    # This is the proper priority order for these environment variables.
    locale_name = (
        os.environ.get('LC_ALL')
        or os.environ.get('LC_CTYPE')
        or os.environ.get('LANG')
        or ''
    )

    # Locale values have the form language[_territory][.codeset][@modifier].
    # Drop the modifier first, otherwise "ca_ES.ISO-8859-15@euro" produces
    # the codeset "ISO-8859-15@euro", which codecs.lookup() does not know.
    locale_name = locale_name.split('@', 1)[0]
    if '.' in locale_name:
        locale_name = locale_name.rsplit('.', 1)[-1]  # en_US.utf8 -> utf8

    if locale_name:
        try:
            return lookup(locale_name)
        except Exception:
            # Deliberately best-effort: an unusable locale value (e.g. "C"
            # or "en_US" without a codeset) must not break output setup.
            pass

    # No name was passed in and there was no usable environment variable.
    return lookup('utf8')
class StdStreamWriter(StreamWriter):
    """
    StreamWriter that accepts either text or encoded byte strings; for
    the latter case this class assumes the input is already encoded
    properly, rather than trying to decode and re-encode.

    The constructor takes the same arguments as codecs.StreamWriter plus an
    optional `encoding` keyword.  When `encoding` is omitted, the codec is
    chosen by _get_codec() from the locale environment variables.
    """

    # `unicode` only exists on Python 2; on Python 3 the text type is `str`.
    # Resolving it once here keeps encode() from raising NameError there.
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    def __init__(self, *args, **kwargs):
        # `encoding` is our own keyword; remove it before delegating because
        # StreamWriter.__init__ does not accept it.
        self.codec = _get_codec(kwargs.pop('encoding', None))
        self.encoding = self.codec.name
        self._encode = self.codec.encode
        StreamWriter.__init__(self, *args, **kwargs)

    def encode(self, input, *args, **kwargs):
        """
        Encode `input` and return an (output, length consumed) tuple.

        Text is encoded with the configured codec.  Byte strings are
        returned unchanged.  Any other object is converted with str()
        first: on Python 2 that yields bytes (passed through), on Python 3
        it yields text (encoded).
        """
        if not isinstance(input, self._text_type):
            if not isinstance(input, bytes):
                input = str(input)
            if isinstance(input, bytes):
                # Should be already encoded; hand it back untouched.
                return input, len(input)
        return self._encode(input, *args, **kwargs)
def monkey():
    """
    Replace sys.stdout and sys.stderr with StdStreamWriter wrappers so
    that unicode objects written to them are encoded properly.

    NOTE(review): a reader of this snippet reports that replacing
    sys.stdout breaks readline line editing in pdb and ipython -- confirm
    before calling this in an interactive session.
    """
    import sys

    # stdout is wrapped before stderr, one after the other.
    for stream_name in ('stdout', 'stderr'):
        wrapped = StdStreamWriter(getattr(sys, stream_name))
        setattr(sys, stream_name, wrapped)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This breaks readline capabilities in `pdb` or `ipython`. Arrow keys will generate garbage such as `^[[D` as soon as `sys.stdout` is monkeypatched.