# feerrenrut/testSpeedUnicodeCharacterLookup.py

## testSpeedUnicodeCharacterLookup.py
import bisect

import unicodeScriptData

# Range table used by both lookup functions. Each entry is indexed as
# [0] range start, [1] range end and [2] the name returned for that range.
# NOTE(review): both lookups presumably rely on the table being sorted by code point - confirm
# against unicodeScriptData.
scriptCode = unicodeScriptData.scriptRanges
# Only the range end values, in table order, so that bisect can search them directly.
unicodeScriptRangeEnd = [ k[1] for k in scriptCode]


def withBisect(chr):
	"""Find the script for a character by bisecting the list of range end values.

	Based on the following assumptions about the range table:
	- ranges are sorted and must not overlap
	- range end and start values are included in that range
	- there may be gaps between ranges.

	@param chr: character for which a script should be found
	@type chr: string
	@return: "Number" for ASCII digits, "FullWidthNumber" for full width digits, otherwise
		the name stored with the matching range, or None when no range contains the character
	"""
	codePoint = ord(chr)

	# Number should respect preferred language setting
	# FullWidthNumber is in Common category, however, it indicates Japanese language context
	if 0x30 <= codePoint <= 0x39:
		return "Number"
	if 0xff10 <= codePoint <= 0xff19:
		return "FullWidthNumber"

	# bisect_left gives the first position whose range end is greater than or equal to
	# codePoint, that is the only range that could still contain it.
	position = bisect.bisect_left(unicodeScriptRangeEnd, codePoint)
	if position == len(unicodeScriptRangeEnd):
		# codePoint is larger than every range end value, so no range can contain it.
		return None

	entry = scriptCode[position]
	# `codePoint <= rangeEnd` already holds for this entry; it is a match only when
	# `rangeStart <= codePoint` holds as well, otherwise codePoint falls in a gap.
	if entry[0] <= codePoint:
		return entry[2]
	return None

def customBinarySearch(chr):
	"""Find the script for a character with a hand written binary search over scriptCode.

	@param chr: character for which a script should be found
	@type chr: string
	@return: "Number" for ASCII digits, "FullWidthNumber" for full width digits, otherwise
		the name stored with the matching range, or None when no range contains the character
	"""
	codePoint = ord(chr)
	# Number should respect preferred language setting
	# FullWidthNumber is in Common category, however, it indicates Japanese language context
	if 0x30 <= codePoint <= 0x39:
		return "Number"
	if 0xff10 <= codePoint <= 0xff19:
		return "FullWidthNumber"

	# low and high are the inclusive bounds of the part of the table still to be searched.
	low, high = 0, len(scriptCode) - 1
	while low <= high:
		middle = (low + high) // 2
		entry = scriptCode[middle]
		if codePoint < entry[0]:
			# Before the start of this range: continue in the lower half.
			high = middle - 1
		elif codePoint > entry[1]:
			# After the end of this range: continue in the upper half.
			low = middle + 1
		else:
			# rangeStart <= codePoint <= rangeEnd
			return entry[2]
	return None


# First value covered by the range table. Use `start = 0` instead to also exercise
# the values that come before the first range.
start = scriptCode[0][0]
# The table itself runs up to scriptCode[-1][1] (which gives: 917631), but values over
# 65535 dont work with unichr(), so the test range is capped here.
# NOTE(review): doAllChars passes this to xrange() as an exclusive bound, so 65535 itself
# is not exercised - confirm whether that is intended.
end = 0x10000 - 1

def unicode_literal(n):
	"""Build the text of a u'...' literal for a code point and return it UTF-8 encoded.

	The u'...' wrapper is part of the result: for n = 0x41 the returned bytes are u'A'.

	@param n: unicode code point
	@type n: int
	@return: UTF-8 encoded bytes of the literal text
	"""
	# The backslash is escaped so the template is the same plain ASCII text on every Python
	# version; unescaped, "\U%08X" is a malformed escape and a SyntaxError on Python 3.
	escaped = "u'\\U%08X'" % n
	# Going through ASCII bytes first gives an object with a decode() method on both
	# Python 2 and Python 3 without changing the content.
	decoded = escaped.encode('ascii').decode('unicode-escape')
	return decoded.encode('utf-8')

def doAllChars(f):
	"""Call f once for every value in xrange(start, end), converted to a character.

	@param f: lookup function to exercise, called with a single character
	@return: the number of values that were skipped because unichr() rejected them
	@rtype: int
	"""
	numberOfExceptions = 0
	for c in xrange(start, end):
		# some values can not be converted back to a single unicode character.
		try:
			s = unichr(c)
		except (ValueError, OverflowError):
			# There is no character to look up for this value, so count it and move on
			# rather than calling f with an unbound or stale character.
			numberOfExceptions += 1
			continue
		f(s)
	return numberOfExceptions

import timeit
def measureIt():
	"""Time a repeated full pass of doAllChars for each lookup function and print the results."""
	iterations = 100
	print("using {} iterations".format(iterations))
	print("testing over range {}-{}, a total of {} values".format(start, end, end-start))
	# Each entry pairs the report line with the lookup function being timed.
	benchmarks = (
		("withBisect: %s", withBisect),
		("customBinarySearch: %s", customBinarySearch),
	)
	for template, lookup in benchmarks:
		elapsed = timeit.timeit(lambda: doAllChars(lookup), number=iterations)
		print(template % elapsed)

def doIt():
	"""Run one untimed pass of doAllChars with each lookup function."""
	for lookup in (withBisect, customBinarySearch):
		doAllChars(lookup)

# Script entry: the timing comparison runs as soon as this file is executed.
# Enable the doIt() line instead for a single untimed pass over both lookups.
#doIt()
measureIt()
	import bisect

	import unicodeScriptData

	# Range table used by both lookup functions. Each entry is indexed as
	# [0] range start, [1] range end and [2] the name returned for that range.
	# NOTE(review): both lookups presumably rely on the table being sorted by code point - confirm
	# against unicodeScriptData.
	scriptCode = unicodeScriptData.scriptRanges
	# Only the range end values, in table order, so that bisect can search them directly.
	unicodeScriptRangeEnd = [ k[1] for k in scriptCode]


	def withBisect(chr):
		"""Find the script for a character by bisecting the list of range end values.

		Based on the following assumptions about the range table:
		- ranges are sorted and must not overlap
		- range end and start values are included in that range
		- there may be gaps between ranges.

		@param chr: character for which a script should be found
		@type chr: string
		@return: "Number" for ASCII digits, "FullWidthNumber" for full width digits, otherwise
			the name stored with the matching range, or None when no range contains the character
		"""
		characterUnicodeCode = ord(chr)

		# Number should respect preferred language setting
		# FullWidthNumber is in Common category, however, it indicates Japanese language context
		if 0x30 <= characterUnicodeCode <= 0x39:
			return "Number"
		elif 0xff10 <= characterUnicodeCode <= 0xff19:
			return "FullWidthNumber"

		# bisect_left gives the first index whose range end is greater than or equal to
		# characterUnicodeCode, that is the only range that could still contain it.
		index = bisect.bisect_left(unicodeScriptRangeEnd, characterUnicodeCode)
		if index == len(unicodeScriptRangeEnd):
			# characterUnicodeCode is larger than all of the range end values so a range is not
			# found for the value:
			return None

		# Since the range at index is the first where `characterUnicodeCode <= rangeEnd` is True,
		# we now ensure that for the range at the index `characterUnicodeCode >= rangeStart`
		# is also True.
		candidateRange = scriptCode[index]
		rangeStart = candidateRange[0]
		if rangeStart > characterUnicodeCode:
			# characterUnicodeCode comes before the start of the range at index so a range
			# is not found for the value
			return None
		rangeName = candidateRange[2]
		return rangeName

	def customBinarySearch(chr):
		"""Find the script for a character with a hand written binary search over scriptCode.

		@param chr: character for which a script should be found
		@type chr: string
		@return: "Number" for ASCII digits, "FullWidthNumber" for full width digits, otherwise
			the name stored with the matching range, or None when no range contains the character
		"""
		# mStart and mEnd are the inclusive bounds of the part of the table still to be searched.
		mStart = 0
		mEnd = len(scriptCode)-1
		characterUnicodeCode = ord(chr)
		# Number should respect preferred language setting
		# FullWidthNumber is in Common category, however, it indicates Japanese language context
		if 0x30 <= characterUnicodeCode <= 0x39:
			return "Number"
		elif 0xff10 <= characterUnicodeCode <= 0xff19:
			return "FullWidthNumber"
		while mEnd >= mStart:
			midPoint = (mStart + mEnd) >> 1
			if characterUnicodeCode < scriptCode[midPoint][0]:
				# Before the start of this range: continue in the lower half.
				mEnd = midPoint - 1
			elif characterUnicodeCode > scriptCode[midPoint][1]:
				# After the end of this range: continue in the upper half.
				mStart = midPoint + 1
			else:
				# rangeStart <= characterUnicodeCode <= rangeEnd
				return scriptCode[midPoint][2]
		return None


	# First value covered by the range table. Use `start = 0` instead to also exercise
	# the values that come before the first range.
	start = scriptCode[0][0]
	# The table itself runs up to scriptCode[-1][1] (which gives: 917631), but values over
	# 65535 dont work with unichr(), so the test range is capped here.
	# NOTE(review): doAllChars passes this to xrange() as an exclusive bound, so 65535 itself
	# is not exercised - confirm whether that is intended.
	end = 0x10000 - 1

	def unicode_literal(n):
		"""Build the text of a u'...' literal for a code point and return it UTF-8 encoded.

		The u'...' wrapper is part of the result: for n = 0x41 the returned bytes are u'A'.

		@param n: unicode code point
		@type n: int
		@return: UTF-8 encoded bytes of the literal text
		"""
		# The backslash is escaped so the template is the same plain ASCII text on every Python
		# version; unescaped, "\U%08X" is a malformed escape and a SyntaxError on Python 3.
		escaped = "u'\\U%08X'" % n
		# Going through ASCII bytes first gives an object with a decode() method on both
		# Python 2 and Python 3 without changing the content.
		decoded = escaped.encode('ascii').decode('unicode-escape')
		return decoded.encode('utf-8')

	def doAllChars(f):
		"""Call f once for every value in xrange(start, end), converted to a character.

		@param f: lookup function to exercise, called with a single character
		@return: the number of values that were skipped because unichr() rejected them
		@rtype: int
		"""
		numberOfExceptions = 0
		for c in xrange(start, end):
			# some values can not be converted back to a single unicode character.
			try:
				s = unichr(c)
			except (ValueError, OverflowError):
				# There is no character to look up for this value, so count it and move on
				# rather than calling f with an unbound or stale character.
				numberOfExceptions += 1
				continue
			f(s)
		return numberOfExceptions

	import timeit
	def measureIt():
		"""Time a repeated full pass of doAllChars for each lookup function and print the results."""
		n = 100
		print("using {} iterations".format(n))
		print("testing over range {}-{}, a total of {} values".format(start, end, end-start))
		# Both lookups are timed over the same number of passes so the totals are comparable.
		result = timeit.timeit(lambda: doAllChars(withBisect), number=n)
		print("withBisect: %s"%result)
		result = timeit.timeit(lambda: doAllChars(customBinarySearch), number=n)
		print("customBinarySearch: %s"%result)

	def doIt():
		"""Run one untimed pass of doAllChars with each lookup function."""
		doAllChars(withBisect)
		doAllChars(customBinarySearch)

	# Script entry: the timing comparison runs as soon as this code is executed.
	# Enable the doIt() line instead for a single untimed pass over both lookups.
	#doIt()
	measureIt()