Last active
December 17, 2015 16:48
-
-
Save bitsgalore/f65bcfc7a470b9fe9b90 to your computer and use it in GitHub Desktop.
Testing of surrogate pair handling in Python 2.x / Python 3.x
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import codecs | |
import xml.etree.ElementTree as ET | |
from xml.dom import minidom | |
# Set encoding of the terminal to UTF-8 | |
out = codecs.getwriter("UTF-8")(sys.stdout) | |
# Initialise Element object | |
root = ET.Element('test') | |
# File name as byte object | |
fileNameAsBytes = b'\xF3\x92\x80\x9B\xF2\xB6\xBD\x87\xF2\xA6\xA0\xA1\xED\xAF' \ | |
'\x91\xF2\x94\xBD\xA0\x0A\xF3\x9E\xA9\x9A\xF1\xA0\xA4\xA8\xF1\x90\xA4\x9E' \ | |
'\xED\xAB\x80\xF0\xB5\x8C\xB9\x0A\xED\xBF\xBF\xF1\xA0\x8A\xBE\xF0\x9C\xB0' \ | |
'\xB8\xF4\x89\xB5\xA2\xF3\xA6\xB2\x80\x0A' | |
# to Unicode object | |
fileName = unicode(fileNameAsBytes, 'utf-8') | |
# Add subelement for this file | |
fileElt = ET.SubElement(root, "fileName") | |
# Add fileName to subelement | |
fileElt.text = fileName | |
# Convert Element object to string | |
xmlOut = ET.tostring(root, 'UTF-8', 'xml') | |
# Prettify | |
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ') | |
# Write to stdout | |
out.write(xmlPretty) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import codecs | |
import re | |
import xml.etree.ElementTree as ET | |
from xml.dom import minidom | |
# This works for Python 2.x, but give SyntaxError for Python 3.x! | |
# Set encoding of the terminal to UTF-8 | |
if sys.version.startswith("2"): | |
# Python 2.x | |
out = codecs.getwriter("UTF-8")(sys.stdout) | |
if sys.version.startswith("3"): | |
# Python 3.x | |
out = codecs.getwriter("UTF-8")(sys.stdout.buffer) | |
# Initialise Element object | |
root = ET.Element('test') | |
# Directory | |
myDir = "/home/johan/randomgit" | |
if sys.version.startswith("2"): | |
# Works for Py 2.x, but throws SyntaxError in Py 3! | |
try: | |
lone = re.compile( | |
ur"""(?x) # verbose expression (allows comments) | |
( # begin group | |
[\ud800-\udbff] # match leading surrogate | |
(?![\udc00-\udfff]) # but only if not followed by trailing surrogate | |
) # end group | |
| # OR | |
( # begin group | |
(?<![\ud800-\udbff]) # if not preceded by leading surrogate | |
[\udc00-\udfff] # match trailing surrogate | |
) # end group | |
""") | |
except: | |
lone = "" | |
if sys.version.startswith("2"): | |
myDir = unicode(myDir, 'utf-8') | |
# Iterate over files in directory | |
for file in os.listdir(myDir): | |
fileName = os.path.basename(file) | |
# This works for Python 3.x, but not for 2.x! | |
# Source: http://stackoverflow.com/q/19649463/1209004 | |
if sys.version.startswith("3"): | |
try: | |
fileName.encode('utf-8') | |
except UnicodeEncodeError: | |
# Strip away surrogate pairs | |
tmp = fileName.encode('utf-8', 'surrogateescape') | |
fileName = tmp.decode('utf-8', 'ignore') | |
if sys.version.startswith("2"): | |
# Source: http://stackoverflow.com/a/18674109/1209004 | |
tmp = lone.sub(ur'\ufffd',fileName).encode('utf-8') | |
fileName = tmp.decode('utf-8') | |
# Add subelement for this file | |
fileElt = ET.SubElement(root, "fileName") | |
# Add fileName to subelement | |
fileElt.text = fileName | |
# Convert Element object to string | |
xmlOut = ET.tostring(root, 'UTF-8', 'xml') | |
# Prettify | |
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ') | |
# Write to stdout | |
out.write(xmlPretty) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import codecs | |
import xml.etree.ElementTree as ET | |
from xml.dom import minidom | |
# This works for Python 3.x, but not for Python 2.x! | |
# Set encoding of the terminal to UTF-8 | |
if sys.version.startswith("2"): | |
# Python 2.x | |
out = codecs.getwriter("UTF-8")(sys.stdout) | |
if sys.version.startswith("3"): | |
# Python 3.x | |
out = codecs.getwriter("UTF-8")(sys.stdout.buffer) | |
# Initialise Element object | |
root = ET.Element('test') | |
# Directory | |
myDir = "/home/johan/randomgit" | |
if sys.version.startswith("2"): | |
myDir = unicode(myDir, 'utf-8') | |
# Iterate over files in directory | |
for file in os.listdir(myDir): | |
fileName = os.path.basename(file) | |
# This works for Python 3.x, but not for 2.x! | |
try: | |
fileName.encode('utf-8') | |
except UnicodeEncodeError: | |
# Strip away surrogate pairs | |
tmp = fileName.encode('utf-8', 'surrogateescape') | |
fileName = tmp.decode('utf-8', 'ignore') | |
# Add subelement for this file | |
fileElt = ET.SubElement(root, "fileName") | |
# Add fileName to subelement | |
fileElt.text = fileName | |
# Convert Element object to string | |
xmlOut = ET.tostring(root, 'UTF-8', 'xml') | |
# Prettify | |
xmlPretty = minidom.parseString(xmlOut).toprettyxml(' ') | |
# Write to stdout | |
out.write(xmlPretty) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment