andjc/UAX_29.py

## UAX_29.py
# Word segmentation with ICU's UAX #29 word-break rules, followed by a
# regrouping step that merges consecutive non-word segments (spaces,
# punctuation) into single runs.

# Standard-library helper used further down to group neighbouring segments.
from itertools import groupby

# We start by loading up PyICU. The module is published under the name
# `icu`; the legacy `PyICU` name is only tried as a fallback for old
# installations.
try:
    import icu
except ImportError:
    import PyICU as icu

# Let's create a test text. Notice it contains some punctuation.
test = u"This is (\"a\") test!"


# We create a word-break iterator. All break iterators in ICU are really
# RuleBasedBreakIterators, and we need to tell it which locale to take the
# word-break rules from. Most locales have the same rules for UAX #29, so we
# will use English.
wb = icu.BreakIterator.createWordInstance(icu.Locale.getEnglish())

# An iterator is just that: it contains state and then we iterate over it.
# The state in this case is the text we want to break, so we set that.
wb.setText(test)


# Expanding the iterator gives us the break points in the text, i.e. the
# offset at which each segment ends (the implicit first boundary, 0, is not
# included).
# NOTE(review): ICU is understood to report boundaries as UTF-16 code-unit
# offsets. They line up with Python string indices for this BMP-only text,
# but confirm before slicing text that contains supplementary characters.
breaks = list(wb)
breaks


# Out[8]:

#     [4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19]

# That's a lot of breaks. The word-break iterator breaks up punctuation
# blocks as well, as we can see if we list the strings between consecutive
# boundaries.

[test[start:end] for start, end in zip([0] + breaks, breaks)]

# Out[16]:   (Python 2 repr; Python 3 prints the same strings without the u prefix)

#     [u'This', u' ', u'is', u' ', u'(', u'"', u'a', u'"', u')', u' ', u'test', u'!']

# What we really want is runs of text that have been properly word broken,
# with word-forming segments kept apart from runs of non-word-forming
# characters. To do this, we use a UnicodeSet of all the alphanumeric
# characters in Unicode and test the first character of each segment for
# membership. First we create the UnicodeSet.

alnum = icu.UnicodeSet("[:ALNUM:]")

# The easiest way to group things is to let groupby do the hard work. Each
# segment start is keyed by whether the character at that position is
# alphanumeric, and neighbouring starts with the same key end up in one
# group. Adjacent word segments with nothing between them would be merged
# by the same rule.
# ([0] + breaks)[:-1] lists every segment start; unlike [0] + breaks[:-1] it
# is empty when `breaks` is empty, so test[0] is never read for empty text.

starts = [
    list(group)
    for _, group in groupby(([0] + breaks)[:-1],
                            lambda pos: alnum.contains(test[pos]))
]
starts

# Out[13]:

#     [[0], [4], [5], [7, 8, 9], [10], [11, 12, 13], [14], [18]]

# Now with the start groups, we just need to turn those into runs: each run
# begins at the first start of its group and ends where the next run begins.
# The last run ends at the final break point; breaks[-1:] (rather than
# [breaks[-1]]) keeps this working when there are no breaks at all.

starts = [group[0] for group in starts]
ends = starts[1:] + breaks[-1:]
# Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator, so
# it would neither display as the list below nor survive being used twice.
res = list(zip(starts, ends))
res

# Out[14]:

#     [(0, 4), (4, 5), (5, 7), (7, 10), (10, 11), (11, 14), (14, 18), (18, 19)]

out = [test[start:end] for start, end in res]
out

# Out[15]:   (Python 2 repr; Python 3 prints the same strings without the u prefix)

#     [u'This', u' ', u'is', u' ("', u'a', u'") ', u'test', u'!']
	# Word segmentation with ICU's UAX #29 word-break rules, followed by a
	# regrouping step that merges consecutive non-word segments (spaces,
	# punctuation) into single runs.

	# Standard-library helper used further down to group neighbouring segments.
	from itertools import groupby

	# We start by loading up PyICU. The module is published under the name
	# `icu`; the legacy `PyICU` name is only tried as a fallback for old
	# installations.
	try:
		import icu
	except ImportError:
		import PyICU as icu

	# Let's create a test text. Notice it contains some punctuation.
	test = u"This is (\"a\") test!"


	# We create a word-break iterator. All break iterators in ICU are really
	# RuleBasedBreakIterators, and we need to tell it which locale to take the
	# word-break rules from. Most locales have the same rules for UAX #29, so we
	# will use English.
	wb = icu.BreakIterator.createWordInstance(icu.Locale.getEnglish())

	# An iterator is just that: it contains state and then we iterate over it.
	# The state in this case is the text we want to break, so we set that.
	wb.setText(test)


	# Expanding the iterator gives us the break points in the text, i.e. the
	# offset at which each segment ends (the implicit first boundary, 0, is not
	# included).
	# NOTE(review): ICU is understood to report boundaries as UTF-16 code-unit
	# offsets. They line up with Python string indices for this BMP-only text,
	# but confirm before slicing text that contains supplementary characters.
	breaks = list(wb)
	breaks


	# Out[8]:

	# [4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19]

	# That's a lot of breaks. The word-break iterator breaks up punctuation
	# blocks as well, as we can see if we list the strings between consecutive
	# boundaries.

	[test[start:end] for start, end in zip([0] + breaks, breaks)]

	# Out[16]:   (Python 2 repr; Python 3 prints the same strings without the u prefix)

	# [u'This', u' ', u'is', u' ', u'(', u'"', u'a', u'"', u')', u' ', u'test', u'!']

	# What we really want is runs of text that have been properly word broken,
	# with word-forming segments kept apart from runs of non-word-forming
	# characters. To do this, we use a UnicodeSet of all the alphanumeric
	# characters in Unicode and test the first character of each segment for
	# membership. First we create the UnicodeSet.

	alnum = icu.UnicodeSet("[:ALNUM:]")

	# The easiest way to group things is to let groupby do the hard work. Each
	# segment start is keyed by whether the character at that position is
	# alphanumeric, and neighbouring starts with the same key end up in one
	# group. Adjacent word segments with nothing between them would be merged
	# by the same rule.
	# ([0] + breaks)[:-1] lists every segment start; unlike [0] + breaks[:-1] it
	# is empty when `breaks` is empty, so test[0] is never read for empty text.

	starts = [
		list(group)
		for _, group in groupby(([0] + breaks)[:-1],
		                        lambda pos: alnum.contains(test[pos]))
	]
	starts

	# Out[13]:

	# [[0], [4], [5], [7, 8, 9], [10], [11, 12, 13], [14], [18]]

	# Now with the start groups, we just need to turn those into runs: each run
	# begins at the first start of its group and ends where the next run begins.
	# The last run ends at the final break point; breaks[-1:] (rather than
	# [breaks[-1]]) keeps this working when there are no breaks at all.

	starts = [group[0] for group in starts]
	ends = starts[1:] + breaks[-1:]
	# Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator, so
	# it would neither display as the list below nor survive being used twice.
	res = list(zip(starts, ends))
	res

	# Out[14]:

	# [(0, 4), (4, 5), (5, 7), (7, 10), (10, 11), (11, 14), (14, 18), (18, 19)]

	out = [test[start:end] for start, end in res]
	out

	# Out[15]:   (Python 2 repr; Python 3 prints the same strings without the u prefix)

	# [u'This', u' ', u'is', u' ("', u'a', u'") ', u'test', u'!']