# ritwikmishra/lang_detect_unicode.py

## lang_detect_unicode.py


from collections import Counter

# Script lookup table. Each entry is [name, (first, last), ...] where every
# (first, last) pair is an inclusive range of Unicode code points, written as
# single-character strings, that is attributed to that script.
lang_unicodes = [
    ['English', ('\u0021', '\u007F')],
    ['Devnagri', ('\u0900', '\u097F'), ('\uA8E0', '\uA8FF')],
    ['Bangla', ('\u0980', '\u09FF')],
    ['Gujarati', ('\u0A80', '\u0AFF')],
    ['Urdu/Persian/Arabic', ('\u0600', '\u06FF'), ('\u08A0', '\u08FF')],
    ['Tamil', ('\u0B80', '\u0BFF')],
    ['Telegu', ('\u0C00', '\u0C7F')],
    ['punjabi/gurumukhi', ('\u0A00', '\u0A7F')],
    ['malayalam', ('\u0D00', '\u0D7F')],
    ['oriya', ('\u0B00', '\u0B7F')],
    ['kannada', ('\u0C80', '\u0CFF')],
    ['Sinhala', ('\u0D80', '\u0DFF')],
    ['Thai', ('\u0E00', '\u0E7F')],
    ['Lao', ('\u0E80', '\u0EFF')],
    ['Tibetan', ('\u0F00', '\u0FFF')],
    ['Myanmar', ('\u1000', '\u109F')],
    ['Georgian', ('\u10A0', '\u10FF')],
    ['Ethiopic', ('\u1200', '\u139F')],
    ['Chinese', ('\u2e80', '\u2fdf'), ('\u3190', '\u319f'), ('\u3400', '\u4dbf'),
     ('\u4e00', '\u9fcc'), ('\uf900', '\ufaad')],
    # U+0370-U+03FF carries the everyday Greek alphabet; U+1F00-U+1FFF only
    # carries the polytonic (accented) forms, so both ranges are listed.
    ['Greek', ('\u0370', '\u03FF'), ('\u1F00', '\u1FFF')],
    # To support another script, append ['Name', (first, last), ...] here.
]

def lang_detect_unicode(sentence, script_ranges=None):
    """Guess the dominant script of *sentence* from its Unicode code points.

    Every non-whitespace character is compared against the inclusive
    code-point ranges of each table entry. A script receives one vote per
    character that falls inside any of its ranges, and the script with the
    most votes wins. On a tie, the script that was matched first in the
    text is returned.

    Args:
        sentence: Text to classify.
        script_ranges: Optional lookup table laid out like the module-level
            ``lang_unicodes`` (one ``[name, (first, last), ...]`` list per
            script). When omitted, ``lang_unicodes`` is used.

    Returns:
        The name of the best-matching script, or ``None`` when no character
        falls inside any listed range (empty or whitespace-only input, or
        text written only in scripts the table does not cover).
    """
    if script_ranges is None:
        script_ranges = lang_unicodes

    votes = Counter()
    for word in sentence.split():
        for ch in word:
            for name, *blocks in script_ranges:
                if any(first <= ch <= last for first, last in blocks):
                    votes[name] += 1

    # Without this guard, indexing the empty ranking raises IndexError.
    if not votes:
        return None
    return votes.most_common(1)[0][0]

# Demo call: every character of this sample except the fullwidth comma lies
# inside the 'Chinese' ranges of lang_unicodes, so the detected name is printed.
print(lang_detect_unicode(sentence='你好，我的名字是瑞特維克'))


	from collections import Counter

	# Script lookup table. Each entry is [name, (first, last), ...] where every
	# (first, last) pair is an inclusive range of Unicode code points, written as
	# single-character strings, that is attributed to that script.
	lang_unicodes = [
		['English', ('\u0021', '\u007F')],
		['Devnagri', ('\u0900', '\u097F'), ('\uA8E0', '\uA8FF')],
		['Bangla', ('\u0980', '\u09FF')],
		['Gujarati', ('\u0A80', '\u0AFF')],
		['Urdu/Persian/Arabic', ('\u0600', '\u06FF'), ('\u08A0', '\u08FF')],
		['Tamil', ('\u0B80', '\u0BFF')],
		['Telegu', ('\u0C00', '\u0C7F')],
		['punjabi/gurumukhi', ('\u0A00', '\u0A7F')],
		['malayalam', ('\u0D00', '\u0D7F')],
		['oriya', ('\u0B00', '\u0B7F')],
		['kannada', ('\u0C80', '\u0CFF')],
		['Sinhala', ('\u0D80', '\u0DFF')],
		['Thai', ('\u0E00', '\u0E7F')],
		['Lao', ('\u0E80', '\u0EFF')],
		['Tibetan', ('\u0F00', '\u0FFF')],
		['Myanmar', ('\u1000', '\u109F')],
		['Georgian', ('\u10A0', '\u10FF')],
		['Ethiopic', ('\u1200', '\u139F')],
		['Chinese', ('\u2e80', '\u2fdf'), ('\u3190', '\u319f'), ('\u3400', '\u4dbf'),
			('\u4e00', '\u9fcc'), ('\uf900', '\ufaad')],
		# U+0370-U+03FF carries the everyday Greek alphabet; U+1F00-U+1FFF only
		# carries the polytonic (accented) forms, so both ranges are listed.
		['Greek', ('\u0370', '\u03FF'), ('\u1F00', '\u1FFF')],
		# To support another script, append ['Name', (first, last), ...] here.
	]

	def lang_detect_unicode(sentence, script_ranges=None):
		"""Guess the dominant script of *sentence* from its Unicode code points.

		Every non-whitespace character is compared against the inclusive
		code-point ranges of each table entry. A script receives one vote per
		character that falls inside any of its ranges, and the script with the
		most votes wins. On a tie, the script that was matched first in the
		text is returned.

		Args:
			sentence: Text to classify.
			script_ranges: Optional lookup table laid out like ``lang_unicodes``
				(one ``[name, (first, last), ...]`` list per script). When
				omitted, ``lang_unicodes`` is used.

		Returns:
			The name of the best-matching script, or ``None`` when no character
			falls inside any listed range (empty or whitespace-only input, or
			text written only in scripts the table does not cover).
		"""
		if script_ranges is None:
			script_ranges = lang_unicodes

		votes = Counter()
		for word in sentence.split():
			for ch in word:
				for name, *blocks in script_ranges:
					if any(first <= ch <= last for first, last in blocks):
						votes[name] += 1

		# Without this guard, indexing the empty ranking raises IndexError.
		if not votes:
			return None
		return votes.most_common(1)[0][0]

	# Demo call: every character of this sample except the fullwidth comma lies
	# inside the 'Chinese' ranges of lang_unicodes, so the detected name is printed.
	print(lang_detect_unicode(sentence='你好，我的名字是瑞特維克'))