Skip to content

Instantly share code, notes, and snippets.

@LeslieK
Last active August 29, 2015 13:58
Show Gist options
  • Save LeslieK/10013426 to your computer and use it in GitHub Desktop.
Save LeslieK/10013426 to your computer and use it in GitHub Desktop.
Python 3.3: Windows Console Behavior with Unicode
{
"metadata": {
"name": "Python3.3WindowsConsoleUnicode"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": "Python 3.3 Windows console behavior: Issue Summary"
},
{
"cell_type": "code",
"collapsed": false,
"input": "import sys\nsys.platform",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": "'win32'"
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": "sys.version",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": "'3.3.3 |Anaconda 1.8.0 (64-bit)| (default, Dec 13 2013, 10:31:18) [MSC v.1600 64 bit (AMD64)]'"
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "Python 3.3 Windows console behavior explained: should be included in python documentation\n\nWindows console uses 'cp437' to decode bytes -- This cannot be overridden. PYTHONIOENCODING does not override this.\nPYTHONIOENCODING is set to UTF-8\nIn [86]: sys.getdefaultencoding(), locale.getpreferredencoding(), sys.stdout.encoding, os.device_encoding(1)\nOut[86]: ('utf-8', 'cp1252', 'UTF-8', 'cp437')\n\nos.device_encoding(0), os.device_encoding(1), os.device_encoding(2) => None, 'cp437', 'cp437' # 0: stdin 1: stdout 2: stderr\n\nBEHAVIOR\nThe print encodes to bytes using 'utf-8' correctly\nconsole decodes bytes to glyph using 'cp437' => wrong glyph\n\nEXAMPLE\nIn [75]: shalom = '\\u05dd\\u05dc\\u05e9'\n\nIn [76]: shalom.encode()\nOut[76]: b'\\xd7\\x9d\\xd7\\x9c\\xd7\\xa9'\n\nIn [77]: shalom.encode().decode()\nOut[77]: '???'\n\nIn [78]: shalom.encode().decode('cp437')\nOut[78]: '\u256b\u00a5\u256b\u00a3\u256b\u2310'\n\nIn [79]: print(shalom)\n\u256b\u00a5\u256b\u00a3\u256b\u2310\n\nIn [80]: shalom\nOut[80]: '???'",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "print('\\u05dd\\u05dc\\u05e9') # works in ipython notebook",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "\u05dd\u05dc\u05e9\n"
}
],
"prompt_number": 32
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": "Another Example with More Details"
},
{
"cell_type": "code",
"collapsed": false,
"input": "import locale\nlocale.getpreferredencoding() # value returned is platform dependent",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": "'cp1252'"
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": "# windows uses **console** encodings instead of locale encoding for the console\n# windows default console encodings\nsys.stdout.encoding\nsys.stdin.encoding\nsys.stderr.encoding\n'cp437'",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [1]: print('\\u00A9')\n---------------------------------------------------------------------------\nUnicodeEncodeError Traceback (most recent call last)\n<ipython-input-1-427f3468b115> in <module>()\n----> 1 print('\\u00A9')\n\nC:\\Anaconda\\envs\\py33\\lib\\encodings\\cp437.py in encode(self, input, final)\n 17 class IncrementalEncoder(codecs.IncrementalEncoder):\n 18 def encode(self, input, final=False):\n---> 19 return codecs.charmap_encode(input,self.errors,encoding_map)[0]\n 20\n 21 class IncrementalDecoder(codecs.IncrementalDecoder):\n\nUnicodeEncodeError: 'charmap' codec can't encode character '\\xa9' in position 0: character maps to <undefined>\n\nIn [2]:",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "# to override IO encodings (on all platforms) set ENVIRONMENT variable PYTHONIOENCODING\n# after setting system ENVIRONMENT variable: PYTHONIOENCODING UTF-8\n\nprint(\"sys.stdout.encoding\", sys.stdout.encoding) \nprint(\"sys.stdin.encoding \", sys.stdin.encoding)\nprint(\"sys.stderr.encoding\", sys.stdout.encoding)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "sys.stdout.encoding UTF-8\nsys.stdin.encoding "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " UTF-8\nsys.stderr.encoding"
},
{
"output_type": "stream",
"stream": "stdout",
"text": " UTF-8\n"
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": "# But I think python uses these device_encoding settings, not the ones that were overridden\n\nos.device_encoding(0), os.device_encoding(1), os.device_encoding(2) # 0: stdin, 1: stdout, 2: stderr",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": "(None, 'cp437', 'cp437')"
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": "# This time ipython console doesn't crash but it prints out gibberish\n\nIn [11]: print('\\u00A9') \n\u252c\u2310\n\nIn [12]:\n\n# print encodes str to bytes using sys default ('utf-8') => b'\\xc2\\xa9' \n# then it decodes bytes with 'cp437' to glyph\n# calling 'cp437' is a bug",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "print('\\u00A9') # works correctly in ipython notebook",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "\u00a9\n"
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": "'\\u00A9' # repr works",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 33,
"text": "'\u00a9'"
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": "copyutf8.decode? # this shows that encoding='utf-8' is the default encoding",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": "copyutf8.decode() # again, this displays correctly",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 31,
"text": "'\u00a9'"
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": "copyutf8 = '\\u00a9'.encode('utf-8')\ncopyutf8 # displays correctly",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": "b'\\xc2\\xa9'"
}
],
"prompt_number": 26
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment