Skip to content

Instantly share code, notes, and snippets.

@tathagata-raha
Created August 30, 2021 22:03
Show Gist options
  • Save tathagata-raha/68645e433d64ec1ee42039e4e3d9d3eb to your computer and use it in GitHub Desktop.
Save tathagata-raha/68645e433d64ec1ee42039e4e3d9d3eb to your computer and use it in GitHub Desktop.
Unit tests for the code-mixed language identification PR in the iNLTK toolkit
import unittest
from inltk.inltk import identify_language, reset_language_identifying_models
from inltk.codemixed_util import *
class TestLangIndenfier(unittest.TestCase):
    """Checks the language label returned by ``identify_language``.

    The first cases call ``identify_language`` with its default arguments;
    the ``*_cm`` cases pass ``check_codemixed=True`` and expect a
    ``'<lang>-en'`` label for romanized or code-mixed input.
    """

    def test_iden_sans(self):
        """Sanskrit text is labelled 'sanskrit'."""
        self.assertEqual(
            identify_language('न्यायदर्शनम् भारतीयदर्शनेषु अन्यतमम्। वैदिकदर्शनेषु '),
            'sanskrit',
        )

    def test_iden_beng(self):
        """Bengali text is labelled 'bengali'."""
        self.assertEqual(
            identify_language('বাংলা লিপি হলো একটি লিখন পদ্ধতি যেটা ব্যবহার করা হয় বাংলা, মণিপুরি, ককবরক, অসমীয়া ভাষায়‌। পূর্ব নাগরী লিপি থেকে এই লিপির উদ্ভব'),
            'bengali',
        )

    def test_iden_hindi(self):
        """Hindi text is labelled 'hindi'."""
        self.assertEqual(
            identify_language('बंगाली साहित्य अत्यन्त समृद्ध है'),
            'hindi',
        )

    def test_iden_english(self):
        """Plain English is labelled 'en' with default arguments."""
        self.assertEqual(
            identify_language('The model was trained on code-mixed data.'),
            'en',
        )

    def test_iden_english_cm(self):
        """Plain English stays 'en' when the code-mixed check is enabled."""
        self.assertEqual(
            identify_language(
                'The model was trained on code-mixed data.',
                check_codemixed=True,
            ),
            'en',
        )

    def test_hin_romanized_noncm(self):
        """Romanized Hindi is labelled 'en' when the code-mixed check is off."""
        self.assertEqual(
            identify_language('Waha ek achha balak hain.'),
            'en',
        )

    def test_hin_romanized_cm(self):
        """The same romanized Hindi is labelled 'hi-en' with the check on."""
        self.assertEqual(
            identify_language('Waha ek achha balak hain.', check_codemixed=True),
            'hi-en',
        )

    def test_codemixed_hin_cm(self):
        """A Hindi-English code-mixed sentence is labelled 'hi-en'."""
        self.assertEqual(
            identify_language(
                'thanks, nahi khoj paye to batana, i have a few tough ones, but will need to work together for them',
                check_codemixed=True,
            ),
            'hi-en',
        )

    def test_mal_romanized_cm(self):
        """Romanized Malayalam is labelled 'ml-en' with the check on."""
        self.assertEqual(
            identify_language(
                'oru kottaravum oru mahanagaravum vizhungikazhinittum',
                check_codemixed=True,
            ),
            'ml-en',
        )

    def test_codemixed_mal_cm(self):
        """A Malayalam-English code-mixed sentence is labelled 'ml-en'."""
        self.assertEqual(
            identify_language(
                "ithreyum kaalam ivide thamasichittum you don't know where to find a hospital?",
                check_codemixed=True,
            ),
            'ml-en',
        )

    def test_tam_romanized_cm(self):
        """Romanized Tamil is labelled 'ta-en' with the check on."""
        self.assertEqual(
            identify_language('aagama irrundha santhosam', check_codemixed=True),
            'ta-en',
        )

    def test_codemixed_tam_cm(self):
        """A second romanized Tamil sample is also labelled 'ta-en'."""
        self.assertEqual(
            identify_language(
                'naa introvert da enneye yenda pese vekkiringe',
                check_codemixed=True,
            ),
            'ta-en',
        )
if __name__ == '__main__':
    # Run the test cases only when this file is executed as a script,
    # not when it is imported by another module or a test runner.
    unittest.main()
'''
Output
----------------------------------------------------------------------
Ran 12 tests in 6.216s
OK
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment