Skip to content

Instantly share code, notes, and snippets.

@pnasrat
Forked from simonw/bidiclean.py
Created July 31, 2010 09:18
Show Gist options
  • Save pnasrat/501976 to your computer and use it in GitHub Desktop.
Save pnasrat/501976 to your computer and use it in GitHub Desktop.
import re
# Explicit directional formatting characters. bidiclean() counts each of
# these as an opener that needs a matching POP DIRECTIONAL FORMATTING.
explicits = (
    u'\u202a',  # LEFT-TO-RIGHT EMBEDDING
    u'\u202b',  # RIGHT-TO-LEFT EMBEDDING
    u'\u202d',  # LEFT-TO-RIGHT OVERRIDE
    u'\u202e',  # RIGHT-TO-LEFT OVERRIDE
)

# The character that closes the most recently opened explicit code.
pdf = u'\u202c'  # POP DIRECTIONAL FORMATTING

# Alternation matching any single explicit code or the pdf character.
regex = re.compile('|'.join(explicits) + '|' + pdf)
def bidiclean(data):
    """
    Return ``data`` with its explicit bidi formatting characters balanced.

    Follows the approach described by Cal Henderson in
    http://www.iamcal.com/understanding-bidirectional-text/

    Every embedding/override character is kept and counted as open. A
    POP DIRECTIONAL FORMATTING character is kept only while something is
    open; otherwise it is dropped. One POP DIRECTIONAL FORMATTING is
    appended at the end for each code that is still open.
    """
    pieces = []
    depth = 0   # explicit codes seen that have not been popped yet
    cursor = 0  # index in data just past the previous match

    for match in regex.finditer(data):
        # Copy through the untouched text between the previous match and
        # this one.
        pieces.append(data[cursor:match.start()])
        cursor = match.end()

        char = match.group(0)
        if char != pdf:
            depth += 1
        elif depth:
            depth -= 1
        else:
            # Nothing is open, so this pdf is unbalanced: drop it.
            continue
        pieces.append(char)

    pieces.append(data[cursor:])
    return ''.join(pieces) + pdf * depth
import unittest
import bidiclean
# (input, expected output) pairs for bidiclean().
bidi_tests = (
    # No bidi characters: the text is unchanged.
    (
        u'Normal string',
        u'Normal string',
    ),
    # Each unclosed explicit code gets one pdf appended at the end.
    (
        u'One explicit \u202a',
        u'One explicit \u202a\u202c',
    ),
    (
        u'Two explicits \u202a\u202b',
        u'Two explicits \u202a\u202b\u202c\u202c',
    ),
    (
        u'Three explicits \u202b\u202d\u202e',
        u'Three explicits \u202b\u202d\u202e\u202c\u202c\u202c',
    ),
    # A pdf with nothing open is removed.
    (
        u'Rogue pdf \u202c',
        u'Rogue pdf ',
    ),
    # A pdf that closes an open code is kept.
    (
        u'Valid pdf \u202b\u202c',
        u'Valid pdf \u202b\u202c',
    ),
    # Only the surplus pdf is removed.
    (
        u'One valid pdf, one rogue \u202b\u202c\u202c',
        u'One valid pdf, one rogue \u202b\u202c',
    ),
)
class BidiCleanTest(unittest.TestCase):
    """Holds one generated ``test_<n>`` method per entry in ``bidi_tests``."""


def _make_bidi_test(text, expected):
    """Return a test method that checks bidiclean(text) against expected.

    Going through a factory gives each generated method its own ``text``
    and ``expected``. A function defined directly in the module-level loop
    would read the loop variables when the test runs, after the loop has
    finished, so every test would check only the last pair.
    """
    def test(self):
        actual = bidiclean.bidiclean(text)
        self.assertEqual(expected, actual)
    return test


for _index, (_text, _expected) in enumerate(bidi_tests):
    _name = 'test_%d' % _index
    _method = _make_bidi_test(_text, _expected)
    _method.__name__ = _name
    setattr(BidiCleanTest, _name, _method)


if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment