This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# If you come from bash you might have to change your $PATH.
# export PATH=$HOME/bin:/usr/local/bin:$PATH

# Path to your oh-my-zsh installation.
export ZSH="/home/tathagata.raha/.oh-my-zsh"

# Set name of the theme to load --- if set to "random", it will

# Append personal tool directories to PATH.
# The entries are joined with single ':' separators and NO trailing ':'.
# A zero-length PATH element (which a trailing ':' creates) makes the shell
# search the current working directory for commands, so any directory you
# `cd` into could shadow a real command.
# $HOME is used instead of '~' so the value does not depend on tilde
# expansion rules inside colon-separated assignments.
# ${PATH:+$PATH:} avoids a leading ':' if PATH happens to be empty.
PATH="${PATH:+$PATH:}$HOME/utils/rclone-v1.53.3-linux-amd64:$HOME/utils:/bin"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from inltk.inltk import identify_language, reset_language_identifying_models | |
from inltk.codemixed_util import * | |
class TestLangIndenfier(unittest.TestCase):
    """Checks that ``identify_language`` returns the expected language name."""

    def test_iden_sans(self):
        """The Sanskrit sample sentence must be identified as 'sanskrit'."""
        sample = 'न्यायदर्शनम् भारतीयदर्शनेषु अन्यतमम्। वैदिकदर्शनेषु '
        detected = identify_language(sample)
        self.assertEqual(detected, 'sanskrit')

    def test_iden_beng(self):
        """The Bengali sample sentence must be identified as 'bengali'."""
        sample = 'বাংলা লিপি হলো একটি লিখন পদ্ধতি যেটা ব্যবহার করা হয় বাংলা, মণিপুরি, ককবরক, অসমীয়া ভাষায়। পূর্ব নাগরী লিপি থেকে এই লিপির উদ্ভব'
        detected = identify_language(sample)
        self.assertEqual(detected, 'bengali')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from inltk.inltk import identify_language, reset_language_identifying_models

# Plain call without check_codemixed.
inp = 'The model was trained on code-mixed data'
print(
    'inp: ',
    identify_language(inp),
)

# In order to check code-mixed or romanized Indian languages, you have to
# import all the classes from codemixed_util. Else it will raise
# AttributeError. Comment this line and check for yourself.
from inltk.codemixed_util import *

# Passing the check_codemixed argument as True will check the romanised
# Indian languages and code-mixed instances.
inp2 = 'Tu achha insan hain'
print(
    'inp2: ',
    identify_language(inp2, check_codemixed=True),
)

# Same text as inp2, but without check_codemixed: per the original note, if
# check_codemixed is set to False, it will return 'en' for anything written
# in Latin script.
inp3 = 'Tu achha insan hain'
print(
    'inp3: ',
    identify_language(inp3),
)

# Mixed romanized-Hindi / English sample.
inp4 = 'thanks, nahi khoj paye to batana, i have a few tough ones, but will need to work together for them'