Skip to content

Instantly share code, notes, and snippets.

@micktwomey
Created October 16, 2012 11:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save micktwomey/3898674 to your computer and use it in GitHub Desktop.
Save micktwomey/3898674 to your computer and use it in GitHub Desktop.
WSGI middleware to replace non-BMP characters in JSON with the Unicode replacement character (U+FFFD). Works around poor UTF-8 support in certain databases.
"""WSGI middleware and test code to munge utf-8
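To work around narrow Python builds, replace non-BMP characters by operating
on the escaped (unicode_escape) representation of the string rather than on
the characters themselves:
re.sub(r'\\U[0-9a-f]{8}', '\\ufffd', s.encode("unicode_escape")).decode("unicode_escape")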
"""
import json
import logging
import re
from wsgiref.simple_server import make_server
import webob
def hello_world_app(environ, start_response):
    """Echo WSGI application used to exercise the middleware.

    Builds an empty ``webob.Response`` and, when the incoming request reports
    a truthy ``content_length``, copies the request body into it unchanged.

    Args:
        environ: The WSGI environment for the request.
        start_response: The WSGI ``start_response`` callable.

    Returns:
        Whatever calling the ``webob.Response`` as a WSGI app returns.
    """
    incoming = webob.Request(environ)
    outgoing = webob.Response()
    declared_length = incoming.content_length
    if declared_length:
        # Only touch the body when the client actually sent one.
        outgoing.body = incoming.body
    return outgoing(environ, start_response)
# Cheap screen applied to the raw request bytes before any parsing: a UTF-8
# lead byte of a 4-byte sequence (0xF0-0xF4), or a JSON ``\uD8xx``-``\uDBxx``
# escape (a high surrogate).  False positives only cost one JSON parse.
_NON_BMP_HINT = re.compile(b'[\xf0-\xf4]|\\\\u[dD][89abAB]')

# Applied to json.dumps() output, which is pure ASCII and spells every
# non-BMP character as an escaped surrogate pair.  The first alternative
# consumes escaped backslashes so that the text ``\\ud83d`` (a literal
# backslash followed by "ud83d") is never mistaken for a real escape.
_ESCAPED_SURROGATE_PAIR = re.compile(
    r'(\\\\)'
    r'|\\u[dD][89abAB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}'
)


def _munge_non_bmp_json(raw_body):
    """Return the JSON body with non-BMP characters replaced, or ``None``.

    Args:
        raw_body: The request body as a byte string holding UTF-8 JSON.

    Returns:
        An ASCII-only byte string containing the re-serialised JSON with each
        non-BMP character replaced by a ``\\ufffd`` escape, or ``None`` when
        the body holds no such character and should be left untouched.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        ValueError: If the body is not valid JSON.
    """
    if not _NON_BMP_HINT.search(raw_body):
        return None
    # Round-tripping through the json module gives a canonical, ASCII-only
    # form regardless of whether the client sent raw UTF-8 or \u escapes.
    canonical = json.dumps(json.loads(raw_body.decode("utf-8")))
    munged = _ESCAPED_SURROGATE_PAIR.sub(
        # group(1) is an escaped backslash: hand it back unchanged.
        lambda match: match.group(1) or "\\ufffd",
        canonical,
    )
    if munged == canonical:
        return None
    # The text is pure ASCII; encode so the body is a byte string.
    return munged.encode("ascii")


def utf8_non_basic_plane_munger(app):
    """Wrap *app* in middleware that strips non-BMP characters from JSON.

    For requests whose content type is ``application/json`` and whose
    ``content_length`` is truthy, the body is parsed and, if it contains
    characters outside the Basic Multilingual Plane, re-serialised with each
    such character replaced by U+FFFD.  Any failure while doing so is logged
    and the original request is forwarded unchanged (best effort).

    Args:
        app: The WSGI application to wrap.

    Returns:
        A WSGI callable ``(environ, start_response)`` that forwards the
        (possibly rewritten) request to *app*.
    """
    logger = logging.getLogger("utf8_non_basic_plane_munger")

    def utf8_non_basic_plane_munger_middleware(environ, start_response):
        request = webob.Request(environ)
        try:
            if (request.content_type == "application/json") and request.content_length:
                logger.info(
                    "Spotted JSON request, will attempt to re-encode, "
                    "removing non BMP (0x0000 -> 0xffff) characters.")
                # Work on a copy so the original request survives any failure.
                new_request = request.copy()
                munged_body = _munge_non_bmp_json(new_request.body)
                if munged_body is None:
                    logger.info("Didn't spot extended characters in request, leaving alone")
                else:
                    logger.info("Spotted extended characters in request, munging")
                    logger.debug(
                        "Re-dumped body into %r (extended characters should "
                        "be replaced with \\ufffd)", munged_body)
                    new_request.body = munged_body
                    request = new_request
        except Exception:
            # Deliberately best effort, but let SystemExit/KeyboardInterrupt
            # propagate instead of swallowing them.
            logger.exception("Problem munging characters in request, leaving alone")
        response = request.get_response(app)
        return response(environ, start_response)

    return utf8_non_basic_plane_munger_middleware
if __name__ == '__main__':
    # Demo entry point: serve the echo app, wrapped in the munging
    # middleware, on port 8000 with debug logging enabled.
    logging.basicConfig(level=logging.DEBUG)
    app = utf8_non_basic_plane_munger(hello_world_app)
    httpd = make_server('', 8000, app)
    # Parenthesised single-argument form: prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("Serving on port 8000...")
    # Serve until process is killed
    httpd.serve_forever()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment