# pnasrat/bidiclean.py

## bidiclean.py
import re

# Characters that open an explicit embedding or override level; every one of
# them has to be closed again by a pdf.
explicits = (
    u'\N{LEFT-TO-RIGHT EMBEDDING}',  # U+202A
    u'\N{RIGHT-TO-LEFT EMBEDDING}',  # U+202B
    u'\N{LEFT-TO-RIGHT OVERRIDE}',   # U+202D
    u'\N{RIGHT-TO-LEFT OVERRIDE}',   # U+202E
)
pdf = u'\N{POP DIRECTIONAL FORMATTING}'  # U+202C, closes the innermost level

# Matches any single opener or the pdf.
regex = re.compile('|'.join(explicits + (pdf,)))


def bidiclean(data):
    """
    Return `data` with its Unicode bidi formatting characters balanced.

    A pdf that has no open embedding/override to close is removed, and one
    pdf is appended for every embedding/override still open at the end of
    the string.  The approach is the one described by Cal Henderson in
    http://www.iamcal.com/understanding-bidirectional-text/
    """
    kept = []    # output fragments, joined once at the end
    depth = 0    # number of embeddings/overrides currently open
    cursor = 0   # index in data just past the previous match
    for match in regex.finditer(data):
        # Copy the ordinary text between the previous match and this one.
        kept.append(data[cursor:match.start()])
        cursor = match.end()
        mark = match.group(0)
        if mark != pdf:
            depth += 1
        elif depth:
            depth -= 1
        else:
            continue  # unbalanced pdf: leave it out of the result
        kept.append(mark)
    kept.append(data[cursor:])
    kept.append(pdf * depth)  # close whatever is still open
    return ''.join(kept)

## bidiclean_tests.py
import unittest
import bidiclean

# (input, expected output) pairs for bidiclean().
bidi_tests = (
    # No bidi characters at all: returned unchanged.
    (u'Normal string',
     u'Normal string'),
    # Every unclosed opener gets one pdf appended at the end.
    (u'One explicit \u202a',
     u'One explicit \u202a\u202c'),
    (u'Two explicits \u202a\u202b',
     u'Two explicits \u202a\u202b\u202c\u202c'),
    (u'Three explicits \u202b\u202d\u202e',
     u'Three explicits \u202b\u202d\u202e\u202c\u202c\u202c'),
    # A pdf with nothing to close is removed.
    (u'Rogue pdf \u202c',
     u'Rogue pdf '),
    # An already balanced pair is left alone.
    (u'Valid pdf \u202b\u202c',
     u'Valid pdf \u202b\u202c'),
    # Only the surplus pdf is removed.
    (u'One valid pdf, one rogue \u202b\u202c\u202c',
     u'One valid pdf, one rogue \u202b\u202c'),
)

class BidiCleanTest(unittest.TestCase):
    """Test case whose test methods are attached after the class is created."""

def _make_test(text, expected):
    """Build a test method asserting that bidiclean(text) equals expected.

    Each generated test needs its own (text, expected) pair.  A method
    defined directly in the loop body would read the loop variables only
    when the test runs, i.e. after the loop has finished, so every generated
    test would check nothing but the last entry of bidi_tests.  Passing the
    pair through this factory binds it per test.
    """
    def test(self):
        actual = bidiclean.bidiclean(text)
        self.assertEqual(expected, actual)
    return test


# Attach one numbered test method (test_0, test_1, ...) per bidi_tests entry.
for i, (text, expected) in enumerate(bidi_tests):
    name = 'test_%d' % i
    case = _make_test(text, expected)
    case.__name__ = name
    setattr(BidiCleanTest, name, case)

# Run the unittest command-line runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
	import re

	# Characters that open an explicit embedding or override level; every one
	# of them has to be closed again by a pdf.
	explicits = (
		u'\u202a', # LEFT-TO-RIGHT EMBEDDING
		u'\u202b', # RIGHT-TO-LEFT EMBEDDING
		u'\u202d', # LEFT-TO-RIGHT OVERRIDE
		u'\u202e', # RIGHT-TO-LEFT OVERRIDE
	)
	pdf = u'\u202c' # POP DIRECTIONAL FORMATTING

	# Alternation of the five characters.  The separator has to be a bare
	# pipe: an escaped one is a literal pipe character, which would turn the
	# pattern into one long literal sequence that never matches a lone bidi
	# character.
	regex = re.compile('|'.join(explicits + (pdf,)))

	def bidiclean(data):
		"""
		Ensure Unicode bidi characters are correctly balanced, as described by
		Cal Henderson in http://www.iamcal.com/understanding-bidirectional-text/

		A pdf with no open embedding/override to close is removed, and one
		pdf is appended for every embedding/override still open at the end
		of `data`.  Returns the cleaned string.
		"""
		# One-element list so the nested function can update the counter
		# without rebinding a name in the enclosing scope.
		count = [0]

		def sub(m):
			ch = m.group(0)
			if ch == pdf:
				if count[0]:
					count[0] -= 1
					return ch
				else:
					return '' # Kill unbalanced pdfs
			else: # Not a pdf
				count[0] += 1
				return ch

		data = regex.sub(sub, data)
		# Close whatever is still open.
		return data + (pdf * count[0])
	import unittest
	import bidiclean

	# (input, expected output) pairs for bidiclean().
	bidi_tests = (
		# No bidi characters at all: returned unchanged.
		(u'Normal string',
			u'Normal string'),
		# Every unclosed opener gets one pdf appended at the end.
		(u'One explicit \u202a',
			u'One explicit \u202a\u202c'),
		(u'Two explicits \u202a\u202b',
			u'Two explicits \u202a\u202b\u202c\u202c'),
		(u'Three explicits \u202b\u202d\u202e',
			u'Three explicits \u202b\u202d\u202e\u202c\u202c\u202c'),
		# A pdf with nothing to close is removed.
		(u'Rogue pdf \u202c',
			u'Rogue pdf '),
		# An already balanced pair is left alone.
		(u'Valid pdf \u202b\u202c',
			u'Valid pdf \u202b\u202c'),
		# Only the surplus pdf is removed.
		(u'One valid pdf, one rogue \u202b\u202c\u202c',
			u'One valid pdf, one rogue \u202b\u202c'),
	)

	class BidiCleanTest(unittest.TestCase):
		"""Test case whose test methods are attached after the class is created."""

	def _make_test(text, expected):
		"""Build a test method asserting that bidiclean(text) equals expected.

		Each generated test needs its own (text, expected) pair.  A method
		defined directly in the loop body would read the loop variables only
		when the test runs, i.e. after the loop has finished, so every
		generated test would check nothing but the last entry of bidi_tests.
		Passing the pair through this factory binds it per test.
		"""
		def test(self):
			actual = bidiclean.bidiclean(text)
			self.assertEqual(expected, actual)
		return test

	# Attach one numbered test method (test_0, test_1, ...) per bidi_tests entry.
	for i, (text, expected) in enumerate(bidi_tests):
		name = 'test_%d' % i
		case = _make_test(text, expected)
		case.__name__ = name
		setattr(BidiCleanTest, name, case)

	# Run the unittest command-line runner when this file is executed as a
	# script.
	if __name__ == '__main__':
		unittest.main()