Skip to content

Instantly share code, notes, and snippets.

@simonw
Created March 12, 2010 10:01
Show Gist options
  • Save simonw/330197 to your computer and use it in GitHub Desktop.
import re
# Explicit directional formatting characters.  Each of these opens an
# embedding or override that is closed by a later POP DIRECTIONAL FORMATTING.
explicits = (
    u'\u202a',  # LEFT-TO-RIGHT EMBEDDING
    u'\u202b',  # RIGHT-TO-LEFT EMBEDDING
    u'\u202d',  # LEFT-TO-RIGHT OVERRIDE
    u'\u202e',  # RIGHT-TO-LEFT OVERRIDE
)

# The character that closes the most recent embedding/override.
pdf = u'\u202c'  # POP DIRECTIONAL FORMATTING

# Matches any single opener or the pop character.
regex = re.compile(u'|'.join(explicits + (pdf,)))
def bidiclean(data):
    """
    Return data with its Unicode bidi formatting characters balanced.

    A POP DIRECTIONAL FORMATTING character that has no opener left to close
    is removed, and one pop is appended to the end of the result for every
    embedding/override that is still open.  Background is in Cal Henderson's
    http://www.iamcal.com/understanding-bidirectional-text/
    """
    pieces = []   # output fragments, joined once at the end
    depth = 0     # openers seen so far that have not been popped
    cursor = 0    # index just past the previously handled match
    for match in regex.finditer(data):
        # Copy the ordinary text between the previous match and this one.
        pieces.append(data[cursor:match.start()])
        cursor = match.end()
        char = match.group(0)
        if char != pdf:
            depth += 1      # an opener: one more level to close
        elif depth:
            depth -= 1      # a pop that closes an open level
        else:
            continue        # a pop with nothing to close: drop it
        pieces.append(char)
    pieces.append(data[cursor:])
    return ''.join(pieces) + (pdf * depth)
import unittest
import bidiclean
# (input, expected output) pairs fed to bidiclean().
bidi_tests = (
    # No formatting characters at all: expected output equals the input.
    (
        u'Normal string',
        u'Normal string',
    ),
    # Each unclosed opener gets a pop appended at the end.
    (
        u'One explicit \u202a',
        u'One explicit \u202a\u202c',
    ),
    (
        u'Two explicits \u202a\u202b',
        u'Two explicits \u202a\u202b\u202c\u202c',
    ),
    (
        u'Three explicits \u202b\u202d\u202e',
        u'Three explicits \u202b\u202d\u202e\u202c\u202c\u202c',
    ),
    # A pop with no opener before it is removed.
    (
        u'Rogue pdf \u202c',
        u'Rogue pdf ',
    ),
    # An already balanced opener/pop pair is left alone.
    (
        u'Valid pdf \u202b\u202c',
        u'Valid pdf \u202b\u202c',
    ),
    # Only the surplus pop is removed.
    (
        u'One valid pdf, one rogue \u202b\u202c\u202c',
        u'One valid pdf, one rogue \u202b\u202c',
    ),
)
class BidiCleanTest(unittest.TestCase):
    """Test case for bidiclean; its test methods are attached after the class is created."""
def _make_bidi_test(source, expected):
    """Return a test method asserting that bidiclean(source) equals expected.

    The method is built inside this factory so that each generated test keeps
    its own ``source`` and ``expected`` values.  A function defined directly
    in the loop body would look the loop variables up only when the test
    runs, after the loop has finished, so every generated test would check
    the last case and the others would never be exercised.
    """
    def test(self):
        actual = bidiclean.bidiclean(source)
        self.assertEqual(expected, actual)
    return test


# Attach one method per entry of bidi_tests, named test_0, test_1, ...
for index, (source, expected) in enumerate(bidi_tests):
    case_name = 'test_%d' % index
    generated = _make_bidi_test(source, expected)
    generated.__name__ = case_name
    setattr(BidiCleanTest, case_name, generated)
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment