Skip to content

Instantly share code, notes, and snippets.

@ritwikmishra
Created November 10, 2022 14:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ritwikmishra/bd46a4772e720aa5478283acc928b68f to your computer and use it in GitHub Desktop.
Save ritwikmishra/bd46a4772e720aa5478283acc928b68f to your computer and use it in GitHub Desktop.
Detect the script (writing system) of natural-language text using Unicode ranges.
from collections import Counter
# Lookup table of script/language labels and the Unicode ranges that identify
# them. Each entry is a list: [label, (first_char, last_char), ...], where every
# tuple is an inclusive range of single characters. A label may own several
# ranges. To support another script, append a new entry of the same shape.
lang_unicodes = [
    ['English', ('\u0021', '\u007F')],
    ['Devnagri', ('\u0900', '\u097F'), ('\uA8E0', '\uA8FF')],
    ['Bangla', ('\u0980', '\u09FF')],
    ['Gujarati', ('\u0A80', '\u0AFF')],
    ['Urdu/Persian/Arabic', ('\u0600', '\u06FF'), ('\u08A0', '\u08FF')],
    ['Tamil', ('\u0B80', '\u0BFF')],
    ['Telegu', ('\u0C00', '\u0C7F')],
    ['punjabi/gurumukhi', ('\u0A00', '\u0A7F')],
    ['malayalam', ('\u0D00', '\u0D7F')],
    ['oriya', ('\u0B00', '\u0B7F')],
    ['kannada', ('\u0C80', '\u0CFF')],
    ['Sinhala', ('\u0D80', '\u0DFF')],
    ['Thai', ('\u0E00', '\u0E7F')],
    ['Lao', ('\u0E80', '\u0EFF')],
    ['Tibetan', ('\u0F00', '\u0FFF')],
    ['Myanmar', ('\u1000', '\u109F')],
    ['Georgian', ('\u10A0', '\u10FF')],
    ['Ethiopic', ('\u1200', '\u139F')],
    [
        'Chinese',
        ('\u2e80', '\u2fdf'),
        ('\u3190', '\u319f'),
        ('\u3400', '\u4dbf'),
        ('\u4e00', '\u9fcc'),
        ('\uf900', '\ufaad'),
    ],
    # U+0370-U+03FF is the basic Greek and Coptic block used by everyday Greek
    # text; U+1F00-U+1FFF (Greek Extended) alone only covers polytonic forms.
    ['Greek', ('\u0370', '\u03FF'), ('\u1F00', '\u1FFF')],
]
def lang_detect_unicode(sentence):
    """Return the label of the script used by most characters of *sentence*.

    Every non-whitespace character is checked against the module-level
    ``lang_unicodes`` table and scores one vote for each entry that has a
    range containing it. The label with the most votes wins; on a tie the
    label that received its first vote earliest in the text is returned.

    Args:
        sentence: Text to classify.

    Returns:
        The winning label from ``lang_unicodes`` (for example ``'Chinese'``),
        or ``None`` when no character falls inside any listed range (empty
        string, whitespace only, or only unlisted scripts).
    """
    votes = Counter()
    for ch in sentence:
        # Whitespace only separates words; it never identifies a script.
        if ch.isspace():
            continue
        for name, *blocks in lang_unicodes:
            if any(low <= ch <= high for low, high in blocks):
                votes[name] += 1
    if not votes:
        # Nothing matched: indexing most_common() here would raise IndexError.
        return None
    return votes.most_common(1)[0][0]
# Demo: classify a sample Chinese sentence and print the detected script label.
_demo_sentence = '你好,我的名字是瑞特維克'
print(lang_detect_unicode(_demo_sentence))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment