Skip to content

Instantly share code, notes, and snippets.

@bitsgalore
Last active December 17, 2015 16:48
Show Gist options
  • Save bitsgalore/f65bcfc7a470b9fe9b90 to your computer and use it in GitHub Desktop.
Save bitsgalore/f65bcfc7a470b9fe9b90 to your computer and use it in GitHub Desktop.
Testing of surrogate pair handling in Python 2.x / Python 3.x
import os
import sys
import codecs
import xml.etree.ElementTree as ET
from xml.dom import minidom
# Test case: put a file name that contains UTF-8 encoded surrogate code
# points into an XML element and write the pretty-printed document to stdout.

# Set encoding of the terminal to UTF-8. The codecs writer emits bytes, so
# on Python 3 it has to wrap the binary buffer underneath sys.stdout.
if sys.version_info[0] >= 3:
    out = codecs.getwriter("UTF-8")(sys.stdout.buffer)
else:
    out = codecs.getwriter("UTF-8")(sys.stdout)

# Initialise Element object
root = ET.Element('test')

# File name as byte object. Every fragment carries the b prefix, because
# Python 3 rejects implicit concatenation of bytes and str literals.
fileNameAsBytes = (
    b'\xF3\x92\x80\x9B\xF2\xB6\xBD\x87\xF2\xA6\xA0\xA1\xED\xAF'
    b'\x91\xF2\x94\xBD\xA0\x0A\xF3\x9E\xA9\x9A\xF1\xA0\xA4\xA8\xF1\x90\xA4\x9E'
    b'\xED\xAB\x80\xF0\xB5\x8C\xB9\x0A\xED\xBF\xBF\xF1\xA0\x8A\xBE\xF0\x9C\xB0'
    b'\xB8\xF4\x89\xB5\xA2\xF3\xA6\xB2\x80\x0A'
)

# to Unicode object
if sys.version_info[0] >= 3:
    # The strict Python 3 UTF-8 codec refuses encoded surrogates (for example
    # ED AF 91); 'surrogatepass' lets them through as lone surrogates.
    fileName = fileNameAsBytes.decode('utf-8', 'surrogatepass')
else:
    # Same result as unicode(fileNameAsBytes, 'utf-8')
    fileName = fileNameAsBytes.decode('utf-8')

# Add subelement for this file
fileElt = ET.SubElement(root, "fileName")

# Add fileName to subelement
# NOTE(review): the name is stored unsanitised on purpose; whether the
# serialise/parse round trip below copes with the surrogates appears to be
# exactly what this script is meant to exercise - confirm.
fileElt.text = fileName

# Convert Element object to string
xmlOut = ET.tostring(root, 'UTF-8', 'xml')

# Prettify
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ')

# Write to stdout
out.write(xmlPretty)
import os
import sys
import codecs
import re
import xml.etree.ElementTree as ET
from xml.dom import minidom
# List the files in a directory and write their names to stdout as
# pretty-printed XML, after dealing with surrogates that cannot be encoded
# as UTF-8. Parses on Python 2 and on Python 3.3+ (u'' literals, PEP 414);
# the original ur'' literals were a SyntaxError on every Python 3 release.

_PY2 = sys.version_info[0] == 2

# Set encoding of the terminal to UTF-8
if _PY2:
    # Python 2.x
    out = codecs.getwriter("UTF-8")(sys.stdout)
else:
    # Python 3.x: the codecs writer emits bytes, so wrap the binary buffer
    out = codecs.getwriter("UTF-8")(sys.stdout.buffer)

# Initialise Element object
root = ET.Element('test')

# Directory
myDir = "/home/johan/randomgit"

if _PY2:
    # Matches surrogates that are not part of a valid pair. The pattern holds
    # no backslashes other than the \uXXXX escapes, so a plain u'' literal
    # yields the same string the former ur'' literal did on Python 2.
    # It is compiled without a try/except: the old fallback (lone = "")
    # only postponed the failure to an AttributeError on lone.sub().
    lone = re.compile(
        u"""(?x)                    # verbose expression (allows comments)
        (                           # begin group
        [\ud800-\udbff]             # match leading surrogate
        (?![\udc00-\udfff])         # but only if not followed by trailing surrogate
        )                           # end group
        |                           # OR
        (                           # begin group
        (?<![\ud800-\udbff])        # if not preceded by leading surrogate
        [\udc00-\udfff]             # match trailing surrogate
        )                           # end group
        """)

    # Unicode directory name, so that os.listdir returns unicode names.
    # Same result as unicode(myDir, 'utf-8').
    myDir = myDir.decode('utf-8')

# Iterate over files in directory
for entry in os.listdir(myDir):
    fileName = os.path.basename(entry)

    if _PY2:
        # Source: http://stackoverflow.com/a/18674109/1209004
        # Replace every lone surrogate by U+FFFD, then round-trip via UTF-8
        tmp = lone.sub(u'\ufffd', fileName).encode('utf-8')
        fileName = tmp.decode('utf-8')
    else:
        # Source: http://stackoverflow.com/q/19649463/1209004
        try:
            fileName.encode('utf-8')
        except UnicodeEncodeError:
            # Strip what cannot be encoded: turn the escaped surrogates back
            # into their raw bytes, then drop every byte sequence that is
            # not valid UTF-8
            tmp = fileName.encode('utf-8', 'surrogateescape')
            fileName = tmp.decode('utf-8', 'ignore')

    # Add subelement for this file
    fileElt = ET.SubElement(root, "fileName")

    # Add fileName to subelement
    fileElt.text = fileName

# Convert Element object to string
xmlOut = ET.tostring(root, 'UTF-8', 'xml')

# Prettify
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ')

# Write to stdout
out.write(xmlPretty)
import os
import sys
import codecs
import xml.etree.ElementTree as ET
from xml.dom import minidom
# List the files in a directory and write their names to stdout as
# pretty-printed XML, stripping characters that cannot be encoded as UTF-8.
# NOTE(review): according to the original author this approach works for
# Python 3.x, but not for Python 2.x.

_PY2 = sys.version_info[0] == 2

# Set encoding of the terminal to UTF-8. A single if/else guarantees that
# 'out' is always bound.
if _PY2:
    # Python 2.x
    out = codecs.getwriter("UTF-8")(sys.stdout)
else:
    # Python 3.x: the codecs writer emits bytes, so wrap the binary buffer
    out = codecs.getwriter("UTF-8")(sys.stdout.buffer)

# Initialise Element object
root = ET.Element('test')

# Directory
myDir = "/home/johan/randomgit"

if _PY2:
    # Unicode directory name, so that os.listdir returns unicode names.
    # Same result as unicode(myDir, 'utf-8').
    myDir = myDir.decode('utf-8')

# Iterate over files in directory
for entry in os.listdir(myDir):
    fileName = os.path.basename(entry)

    # This works for Python 3.x, but not for 2.x!
    try:
        fileName.encode('utf-8')
    except UnicodeEncodeError:
        # Strip what cannot be encoded: turn the escaped surrogates back
        # into their raw bytes, then drop every byte sequence that is not
        # valid UTF-8
        tmp = fileName.encode('utf-8', 'surrogateescape')
        fileName = tmp.decode('utf-8', 'ignore')

    # Add subelement for this file
    fileElt = ET.SubElement(root, "fileName")

    # Add fileName to subelement
    fileElt.text = fileName

# Convert Element object to string
xmlOut = ET.tostring(root, 'UTF-8', 'xml')

# Prettify
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ')

# Write to stdout
out.write(xmlPretty)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment