-
-
Save pnasrat/501976 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# POP DIRECTIONAL FORMATTING: the character that closes one explicit below.
pdf = u'\u202c'

# Explicit directional formatting characters; each one needs a matching pdf.
explicits = (
    u'\u202a',  # LEFT-TO-RIGHT EMBEDDING
    u'\u202b',  # RIGHT-TO-LEFT EMBEDDING
    u'\u202d',  # LEFT-TO-RIGHT OVERRIDE
    u'\u202e',  # RIGHT-TO-LEFT OVERRIDE
)

# Alternation matching exactly one explicit or one pdf character.
_bidi_controls = explicits + (pdf,)
regex = re.compile('|'.join(_bidi_controls))
def bidiclean(data):
    """
    Return ``data`` with its Unicode bidi control characters balanced.

    The explicit embedding/override characters and the POP DIRECTIONAL
    FORMATTING (pdf) characters are visited in order.  A pdf that has no
    open explicit before it is removed, and one pdf is appended for every
    explicit that is still open at the end of the string.  All other text
    is passed through unchanged.  The approach is the one described by
    Cal Henderson in http://www.iamcal.com/understanding-bidirectional-text/
    """
    kept = []        # slices of data that survive, in original order
    unclosed = 0     # explicits seen so far that have not been popped
    resume = 0       # index just past the most recently dropped pdf
    for match in regex.finditer(data):
        if match.group(0) != pdf:
            # An explicit opens a level that will need a pdf later.
            unclosed += 1
        elif unclosed:
            # This pdf closes a currently open explicit; keep it.
            unclosed -= 1
        else:
            # Rogue pdf with nothing to close: cut it out of the output.
            kept.append(data[resume:match.start()])
            resume = match.end()
    kept.append(data[resume:])
    # Close whatever is still open so the text cannot affect what follows.
    return ''.join(kept) + (pdf * unclosed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import bidiclean | |
# (input, expected output) pairs for bidiclean().
bidi_tests = (
    # No bidi control characters: returned unchanged.
    (u'Normal string', u'Normal string'),
    # Unclosed explicits: one pdf appended per explicit.
    (u'One explicit \u202a', u'One explicit \u202a\u202c'),
    (u'Two explicits \u202a\u202b', u'Two explicits \u202a\u202b\u202c\u202c'),
    (u'Three explicits \u202b\u202d\u202e', u'Three explicits \u202b\u202d\u202e\u202c\u202c\u202c'),
    # A pdf with nothing to close is removed.
    (u'Rogue pdf \u202c', u'Rogue pdf '),
    # An already balanced pair is left alone.
    (u'Valid pdf \u202b\u202c', u'Valid pdf \u202b\u202c'),
    # Only the surplus pdf is removed.
    (u'One valid pdf, one rogue \u202b\u202c\u202c', u'One valid pdf, one rogue \u202b\u202c'),
)
class BidiCleanTest(unittest.TestCase):
    """Holds the test_<n> methods generated from ``bidi_tests``."""


def _make_bidi_test(name, source, expected):
    """Build a test method asserting bidiclean.bidiclean(source) == expected.

    A factory is used so that every generated method closes over its own
    ``source``/``expected`` pair.  Defining the method directly in the loop
    body makes it read the loop variables when the test *runs*, which is
    after the loop has finished, so every method would check only the last
    entry of ``bidi_tests``.
    """
    def test(self):
        actual = bidiclean.bidiclean(source)
        self.assertEqual(expected, actual)
    test.__name__ = name
    return test


# Attach one method per (input, expected) pair, named test_0, test_1, ...
for index, (source, expected) in enumerate(bidi_tests):
    name = 'test_%d' % index
    setattr(BidiCleanTest, name, _make_bidi_test(name, source, expected))

if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment