Created
August 30, 2021 22:03
-
-
Save tathagata-raha/68645e433d64ec1ee42039e4e3d9d3eb to your computer and use it in GitHub Desktop.
Unit tests for the code-mixed language identification PR in the iNLTK toolkit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest

from inltk.inltk import identify_language, reset_language_identifying_models
from inltk.codemixed_util import *
class TestLangIndenfier(unittest.TestCase):
    """Checks the label returned by ``identify_language`` for sample sentences.

    Each test feeds one fixed sentence to ``identify_language`` and compares
    the returned label with the expected one. Tests whose name ends in ``_cm``
    pass ``check_codemixed=True``; the others rely on the function's default.
    """

    def test_iden_sans(self):
        """A Sanskrit sentence is expected to be labelled 'sanskrit'."""
        self.assertEqual(
            identify_language('न्यायदर्शनम् भारतीयदर्शनेषु अन्यतमम्। वैदिकदर्शनेषु '),
            'sanskrit',
        )

    def test_iden_beng(self):
        """A Bengali paragraph is expected to be labelled 'bengali'."""
        self.assertEqual(
            identify_language('বাংলা লিপি হলো একটি লিখন পদ্ধতি যেটা ব্যবহার করা হয় বাংলা, মণিপুরি, ককবরক, অসমীয়া ভাষায়। পূর্ব নাগরী লিপি থেকে এই লিপির উদ্ভব'),
            'bengali',
        )

    def test_iden_hindi(self):
        """A Hindi sentence is expected to be labelled 'hindi'."""
        self.assertEqual(
            identify_language('बंगाली साहित्य अत्यन्त समृद्ध है'),
            'hindi',
        )

    def test_iden_english(self):
        """Plain English without the code-mixed check is labelled 'en'."""
        self.assertEqual(
            identify_language('The model was trained on code-mixed data.'),
            'en',
        )

    def test_iden_english_cm(self):
        """Plain English stays 'en' even with the code-mixed check enabled."""
        self.assertEqual(
            identify_language(
                'The model was trained on code-mixed data.',
                check_codemixed=True,
            ),
            'en',
        )

    def test_hin_romanized_noncm(self):
        """Romanized Hindi is expected to come back as 'en' without the code-mixed check."""
        self.assertEqual(
            identify_language('Waha ek achha balak hain.'),
            'en',
        )

    def test_hin_romanized_cm(self):
        """Romanized Hindi is expected to be 'hi-en' with the code-mixed check."""
        self.assertEqual(
            identify_language('Waha ek achha balak hain.', check_codemixed=True),
            'hi-en',
        )

    def test_codemixed_hin_cm(self):
        """A Hindi/English mixed sentence is expected to be labelled 'hi-en'."""
        self.assertEqual(
            identify_language(
                'thanks, nahi khoj paye to batana, i have a few tough ones, but will need to work together for them',
                check_codemixed=True,
            ),
            'hi-en',
        )

    def test_mal_romanized_cm(self):
        """Romanized Malayalam is expected to be 'ml-en' with the code-mixed check."""
        self.assertEqual(
            identify_language(
                'oru kottaravum oru mahanagaravum vizhungikazhinittum',
                check_codemixed=True,
            ),
            'ml-en',
        )

    def test_codemixed_mal_cm(self):
        """A Malayalam/English mixed sentence is expected to be labelled 'ml-en'."""
        self.assertEqual(
            identify_language(
                "ithreyum kaalam ivide thamasichittum you don't know where to find a hospital?",
                check_codemixed=True,
            ),
            'ml-en',
        )

    def test_tam_romanized_cm(self):
        """Romanized Tamil is expected to be 'ta-en' with the code-mixed check."""
        self.assertEqual(
            identify_language('aagama irrundha santhosam', check_codemixed=True),
            'ta-en',
        )

    def test_codemixed_tam_cm(self):
        """A second romanized Tamil sample is expected to be labelled 'ta-en'."""
        self.assertEqual(
            identify_language(
                'naa introvert da enneye yenda pese vekkiringe',
                check_codemixed=True,
            ),
            'ta-en',
        )
if __name__ == '__main__':
    # Run every test case defined in this module when executed as a script.
    unittest.main()

# Recorded output of a run of this suite, kept verbatim as a bare string.
'''
Output
----------------------------------------------------------------------
Ran 12 tests in 6.216s
OK
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment