Last active
April 14, 2017 21:37
-
-
Save iandev/7fcf97f5ffbbc3ad1f7feecba0554762 to your computer and use it in GitHub Desktop.
Based on this paper: http://odur.let.rug.nl/~vannoord/TextCat/textcat.pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Creates an n-gram based text categorizer in the style of Cavnar & Trenkle's
 * "N-Gram-Based Text Categorization" (TextCat).
 *
 * The text is split into tokens, each token is padded with '_' on both sides,
 * and all n-grams of length 1..n are counted. The `max` most frequent n-grams
 * form the text profile, which is compared against every category profile
 * with the "out-of-place" rank distance. The closest category wins.
 *
 * @param {number} n Largest n-gram length to extract (lengths 1..n are used).
 * @param {number} max Maximum number of n-grams kept in the text profile.
 * @param {Object.<string, string[]>} languageProfiles Category name mapped to
 *     its list of n-grams; the index of an n-gram in the list is its rank.
 * @returns {{categorize: function(string): (string|undefined)}} Object whose
 *     `categorize(text)` returns the name of the closest category, or
 *     `undefined` when the text contains no letters or no profiles exist.
 */
function LanguageDetection(n, max, languageProfiles) {
  // Anything that is not a letter or an apostrophe separates tokens. Besides
  // ASCII letters this keeps accented Latin letters (Latin-1 Supplement and
  // Latin Extended-A/B, minus the multiplication and division signs) and both cases of
  // Cyrillic, so that words such as "été" or "ПРИВЕТ" are not torn apart.
  var TOKEN_SEPARATOR =
    /[^A-Za-z'\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F\u0400-\u0481\u048A-\u04FF]+/;

  /**
   * Counts every n-gram of length n1..n2 over the '_'-padded tokens of text.
   * Returns an array of {ngram, count} records in first-seen order.
   */
  function nGrams(n1, n2, text) {
    // Null-prototype object: n-grams such as "valueOf" must not collide with
    // members inherited from Object.prototype.
    var counts = Object.create(null);
    // split() yields '' when the text starts or ends with a separator; those
    // are not words and would otherwise be counted as the bogus token "__".
    var tokens = text.split(TOKEN_SEPARATOR).filter(function (token) {
      return token !== '';
    });
    for (var size = n1; size <= n2; size++) {
      for (var t = 0; t < tokens.length; t++) {
        var padded = '_' + tokens[t] + '_';
        for (var i = 0; i + size <= padded.length; i++) {
          var ngram = padded.substring(i, i + size);
          counts[ngram] = (counts[ngram] || 0) + 1;
        }
      }
    }
    return Object.keys(counts).map(function (ngram) {
      return { ngram: ngram, count: counts[ngram] };
    });
  }

  /**
   * Builds the profile of a text: its n-grams ordered by descending count,
   * truncated to the `max` most frequent ones.
   */
  function getProfile(text) {
    return nGrams(1, n, text)
      .sort(function (a, b) {
        return b.count - a.count;
      })
      .slice(0, max)
      .map(function (entry) {
        return entry.ngram;
      });
  }

  /**
   * Out-of-place distance between two profiles: for every n-gram of the text
   * profile add the difference between its rank in both profiles, or the
   * length of the category profile when the category does not contain it.
   */
  function profileDistance(textProfile, catProfile) {
    // Rank lookup table so the loop below does not rescan catProfile for
    // every n-gram. Only the first occurrence of a duplicate entry counts,
    // which is what indexOf() would report.
    var rankOf = Object.create(null);
    for (var r = 0; r < catProfile.length; r++) {
      if (rankOf[catProfile[r]] === undefined) {
        rankOf[catProfile[r]] = r;
      }
    }
    var distance = 0;
    for (var pos = 0; pos < textProfile.length; pos++) {
      var rank = rankOf[textProfile[pos]];
      distance += rank === undefined ? catProfile.length : Math.abs(pos - rank);
    }
    return distance;
  }

  return {
    categorize: function (text) {
      var profile = getProfile(text);
      // Without any n-gram every category would be at distance 0 and the
      // first one would win arbitrarily; report "unknown" instead.
      if (profile.length === 0) {
        return undefined;
      }
      var category;
      var minDistance = Infinity;
      // Own keys only: inherited enumerable properties are not profiles.
      var names = Object.keys(languageProfiles || {});
      for (var k = 0; k < names.length; k++) {
        var distance = profileDistance(profile, languageProfiles[names[k]]);
        if (distance < minDistance) {
          minDistance = distance;
          category = names[k];
        }
      }
      return category;
    }
  };
}
// Detector over the bundled language profiles. Each profile is a
// '|'-separated list of n-grams; the position of an n-gram in the list is the
// rank used by the distance computation (presumably most frequent first).
//
// The profiles contain n-grams of up to five characters ("_the_", "_die_",
// "tion_", ...), so the text profile is built from 1- to 5-grams as well.
// With a smaller maximum length the 5-gram entries could never match and
// would only shift the ranks of the entries that follow them.
// The text profile keeps the 400 most frequent n-grams.
var languageDetection = LanguageDetection(5, 400, {
  'afrikaans': "_|e|a|n|i|o|r|s|t|d|e_|l|k|ie|g|n_|m|_d|t_|er|h|u|ie_|y|w|s_|_s|_h|di|an|r_|aa|v|en|_di|.|y_|_v|et|._|die|die_|_n|_die|p|_m|_die_|_w|ee|ge|_o|b|te|,|in|k_|_e|,_|oo|et_|de|el|_g|f|ar|ni|nd|an_|en_|_i|he|g_|_t|oe|at|er_|om|wa|_a|_b|_k|nie|_he|aar|_ge|es|_ni|da|m_|ou|it|_nie|d_|l_|_wa|or|le|we|ek|het|me|_het|is|j|at_|on|se|_en|ma|st|as|va|_en_|re|\"|'|het_|_het_|om_|al|ar_|li|te_|aar_|_da|u_|nde|ou_|_l|be|_'|rd|_va|ig|ng|ns|ve|it_|_j|_me|sy|ke|_sy|aan|van|_in|is_|in_|sy_|_sy_|'n|ro|ko|_'n|ra|'n_|_'n_|so|D|ho|rs|eer|ik|la|_te|_van|_ma|as_|ui|ver|e.|der|to|op|van_|ag|_ve|and|_van_|ha|f_|ka|ne|_is|sk|e._|oor|_ver|ek_|_hy|hy|p_|_be|ri|ur|nie_|_so|_D|si|ll|no|_in_|_hy_|hy_|ed|ers|_r|ak|_ho|_nie_|eg|nt|de_|_p|_we|_is_|ei|es_|maa|wee|na|nder|a_|ing|ew|S|lle|_om|_te_|eu|ie.|wo|em|wat|_no|_\"|vo|E|H|_wat|ti|mo|A|e,|_ha|vi|el_|ter|e,_|dat|eer_|wat_|le_|ta|Di|dat_|_wat_|ie._|was|ste|_H|_se|se_|ul|al_|_was|_om_|_st|lik|\"_|_ko|_maa|lo|_to|ns_|aan_|nie.|_vi|met|_nie.|nk|_Di|-|_op|_oo|_on|ir|ord|uit|ens|_was_|was_|een|_met|os|_S|nie._|ig_|_sk|op_|_ek|_wee|ir_|met_|_met_|rt|ik_|end|nd_|gt|ond|ot|_aa|og|vir_|vir|_ka|hu|_mo|_vir_|_vir|_dit|kr|am|ol|dit|_ek_|ki|sa|_aan|man|jy|ng_|aak|lle_|_hu|_na|_vo|ewe|of|jy_|_dit_|dit_|_jy|der_|jo|_f|_u|sie|_dat|_jy_|daa|do|vr|wi|ry|_dat_|eur|rs_|_jo|_wo|_ne|jie|ji|pe|moe|my|ull|Die|maar|_hom|ulle|_maar|hom|_uit|_ui|ges|raa|or_|ies|jou|_la|maar_|ulle_|_daa|Die_|daar|_daar|ien|_my|_jou|ok|il|lik_|sta|_Die|ur_|ga|ag_|kan".split('|'),
  'english': "_|e|t|o|n|i|a|s|r|h|e_|d|_t|c|l|th|he|_th|u|f|m|p|_a|the|_the|s_|er|_o|he_|d_|t_|the_|_the_|on|in|y|n_|b|re|,|,_|an|g|w|_i|en|f_|y_|of|_of|es|ti|v|_of_|of_|nd|at|r_|_w|it|ed|_p|nt|_c|o_|io|_an|te|or|_b|nd_|to|st|is|_s|_in|ion|and|de|ve|ha|ar|_m|and_|_and|_and_|se|_to|me|to_|ed_|.|be|_f|._|_to_|co|ic|ns|al|le|ou|ce|ent|l_|_co|tio|on_|_d|tion|ri|_e|ng|hi|er_|ea|as|_be|pe|h_|_r|ec|ch|ro|ct|_h|pr|in_|ne|ll|rt|s,_|s,|li|ra|T|wh|a_|ac|_wh|_n|ts|di|es_|si|re_|at_|nc|ie|_a_|_in_|ing|us|_re|g_|ng_|op|con|tha|_l|_tha|ver|ma|ion_|_con|ci|ons|_it|po|ere|is_|ta|la|_pr|fo|ho|ir|ss|men|be_|un|ty|_be_|ing_|om|ot|hat|ly|_g|em|_T|rs|mo|ch_|wi|we|ad|ts_|res|_wi|I|hat_|ei|ly_|ni|os|ca|ur|A|ut|that|_that|ati|_fo|st_|il|or_|for|pa|ul|ate|ter|it_|nt_|that_|_ha|al_|el|as_|ll_|_ma|no|ment|an_|tion_|su|bl|_de|nce|pl|fe|tr|so|int|ov|e,|e,_|_u|ent_|Th|her|j|atio|ation|_Th|le_|ai|_it_|_on|_for|ect|k|hic|est|der|tu|na|_by_|by_|E|by|_by|ve_|_di|en_|vi|m_|_whi|iv|whi|ns_|_A|ich|ge|pro|ess|_whic|ers|hich|ce_|which|whic|all|ove|_is|ich_|ee|hich_|n,_|n,|im|ir_|hei|ions|sti|se_|per|The|_pa|heir|id|eir|eir_|ig|heir_|_no|ev|era|_int|ted|_The|ies|art|thei|_ar|_thei|their|_pro|et|_pe|_mo|ther|x|gh|S|_is_|ol|ty_|_I|nde|am|rn|nte|mp|_su|_we|par|_v|pu|his|ow|mi|go|N|ue|ple|ep|ab|;_|;|ex|ain|over|_un|q|qu|pp|ith|ry|_as|ber|ub|av|uc|s._|s.|enc|are|iti|gr|his_|ua|part|ff|eve|O|rea|ous|ia|The_|ag|mb|_go|fa|on,_|ern|t,_|on,|t,|_me".split('|'),
  'spanish': "_|e|a|o|s|n|i|r|l|d|c|t|u|a_|e_|s_|de|p|_d|m|_de|n_|o_|en|_e|es|_l|de_|la|os|_de_|_p|l_|ci|_c|_a|os_|ar|er|as|ra|nt|_la|re|,_|,|el|ta|ue|g|on|al|_s|co|b|an|v|la_|or|te|st|el_|_la_|y|to|r_|ad|ó|do|ro|se|as_|q|qu|.|._|en_|ca|in|un|_co|es_|ic|_en|ac|que|na|lo|_m|f|ent|da|ue_|po|le|_q|_qu|que_|_que|ie|h|pa|y_|ti|_que_|_en_|_y|tr|_el|ri|ia|_el_|_se|ió|_y_|io|pr|ón|ec|no|id|í|mi|_t|ión|nte|me|aci|do_|li|con|nd|est|ni|á|di|_es|_lo|ció|ma|ón_|_pr|_r|ción|z|ra_|si|ión_|oc|nc|_u|_po|los|or_|_con|is|del|_del|ado|se_|_i|los_|_re|por|_del_|sta|del_|al_|ne|_h|cu|_n|_a_|_v|_un|ce|so|ción_|res|vi|om|te_|_pa|ien|j|E|_los|_los_|to_|ol|it|am|ació|rt|ación|pe|ha|_se_|nto|_o|_E|on_|sa|na_|ta_|su|cia|mo|ct|par|_f|_por|eg|_in|ur|L|ve|im|ga|_est|ar_|ab|_L|tu|at|no_|s,|s,_|_por_|por_|las|ba|o,_|o,|ento|et|C|_ha|A|tra|ient|_al|a,_|ica|a,|pro|ado_|ici|_ca|an_|las_|ara|nci|ente|ú|rr|ir|da_|em|ll|il|ía|iv|_su|_par|ul|ant|_A|mp|_las_|_las|_C|_pro|men|P|des|com|ion|era|ed|ida|sp|gu|nte_|ns|za|dos|M|cio|les|_P|bl|_com|s._|s.|_M|ua|nta|mu|_no|dad|ñ|é|un_|va|ist|nes|iento|one|ara_|S|ada|_un_|fi|pre|tos|ter|ot|esta|_me|ido|ob|_g|br|go|ea|nto_|ona|pu|dos_|tro|ier|para|ment|ag|ero|gr|rec|bi|ia_|una|nic|ncia|ía_|a._|tos_|a.|ran|lo_|ones|rm|lu|ron|con_|ó_|nes_|_ci|ante|ch|_con_|_para|ntr|una_|para_|mie|ico|fe|les_|uc|ip|sto|_ma|ui|sta_|_ve|cion|\"|op|cal|_mu|_S|ro_|_pe|ste|ras|pl|_una|_di|ento_|ita|ione|ect|_una_|mien|tan|du|den|ndo|per|eri".split('|'),
  'french': "_|e|i|s|a|n|r|t|u|o|l|e_|d|s_|_d|c|p|é|m|es|t_|_l|de|on|_de|en|_p|nt|le|es_|re|,|,_|n_|de_|''|an|_de_|v|_s|r_|_c|er|ai|_a|_e|ou|q|qu|is|te|ti|ur|it|g|a_|f|la|in|_le|me|nt_|.|b|ra|io|ent|._|ne|ns|ion|h|ue|se|le_|ar|ie|co|at|tr|et|pr|ce|au|u_|il|_r|_la|un|eu|st|re_|ro|la_|on_|_m|_la_|que|_qu|_q|po|tio|tion|pa|li|_t|nc|si|_pr|ri|al|ui|_co|i_|ta|é_|x|em|l_|et_|_l'|l'|les|ns_|ir|_le_|ent_|or|ré|_f|ne_|à|ve|ch|it_|di|oi|-|ni|à_|les_|d'|el|ss|_n|ut|our|des|\"|ur_|nd|er_|ait|ion_|rs|_en|_et|j|_d'|ll|_des|des_|_pa|té|_et_|_à|_à_|om|ma|ati|_des_|L|so|_u|è|_\"|sa|_po|tre|dé|ue_|pe|en_|ont|_un|_L|us|_les|_les_|rt|is_|_i|du|e,_|e,|na|s,|s,_|as|men|M|ait_|'a|vi|ci|ant|_au|da|_M|ation|atio|con|que_|ons|eur|est|me_|mi|par|tion_|_so|te_|res|lo|ment|és|ans|_du|du_|ux|un_|y|pro|_du_|_dé|ce_|_se|_re|pl|A|ge|ic|su|x_|ien|nce|\"_|ac|il_|qui|_pro|no|av|_v|_o|rs_|ans_|eme|bl|emen|_en_|iqu|ct|iq|lle|nn|ts|ement|ét|_\"_|ér|té_|_ce|mp|ire|ui_|to|he|_é|ca|_j|ec|va|_par|ée|_con|se_|tre_|ique|dan|éc|ha|une|P|lu|ux_|_b|s.|pou|_pou|ier|C|ais|s._|ain|_un_|nte|'e|mo|mm|ment_|une_|com|_P|i|_ma|do|ant_|anc|che|ap|ont_|_que|os|urs|_di|fi|im|pour|_pour|ê|ts_|_g|our_|_sa|ntr|_da|_ré|rai|rm|_qui|e.|am|_com|uv|_C|D|qui_|e._|pu|_qui_|ia|_dan|_dans|dans|ter|fo|son|dans_|id|ag|ine|tu|ran|au_|ol|oc|est_|st_|enc|F|_tr|u|tai|ell|R|_su|S|ions|pré|sé|ab|né|_que_|_in|_av|pour_|fa|rr|air|_ch|_a_|ba|_pl|gr|tt|ssi|rd|pas|bi".split('|'),
  'german': "_|e|n|i|r|t|s|a|h|d|er|en|u|l|n_|c|ch|g|o|e_|r_|m|_d|de|en_|ei|er_|in|te|ie|b|t_|f|k|ge|s_|un|,|,_|w|z|nd|he|st|_s|_de|.|_e|ne|der|._|be|es|ic|_a|ie_|is|ich|an|re|di|ein|se|\"|ng|_i|sc|sch|it|der_|h_|ch_|S|le|p|ä|ü|au|v|che|_w|d_|die|_di|m_|_die|el|_S|_der|li|_der_|si|al|ns|on|or|ti|ten|ht|die_|_die_|D|rt|nd_|_u|nt|A|in_|den|cht|und|me|_z|ung|ll|_un|_ei|_n|hr|ine|_A|_ein|ar|ra|_v|_g|as|zu|et|em|_D|eine|gen|g_|da|we|K|lt|B|_\"|nde|ni|und_|E|ur|_m|ri|ha|eh|ten_|es_|_K|_und|ig|_b|hen|_und_|_au|_B|_da|_zu|_in|at|us|wi|n,|n,_|nn|te_|eit|_h|ter|M|n.|ß|ng_|sche|-|rs|den_|_si|G|im|_ge|chen|rd|_E|n._|icht|rn|uf|isch|isc|nen|_in_|_M|_er|ich_|ac|lic|_G|ber|la|vo|eb|ke|F|as_|hen_|ach|en,|ung_|lich|ste|en,_|_k|ben|_f|en.|_be|it_|L|_se|mi|ve|na|on_|P|ss|ist|ö|ht_|ru|st_|_F|ts|ab|W|ol|_eine|hi|so|em_|\"_|ren|en._|chen_|R|ta|ere|ische|ers|ert|_P|tr|ed|ze|eg|ens|ür|ah|_vo|ne_|cht_|uc|_wi|nge|lle|fe|_L|ver|hl|V|ma|wa|auf|H|_W|T|nte|uch|l_|sei|nen_|u_|_den|_al|_V|t.|lte|ut|ent|sich|sic|il|ier|am|gen_|sen|fü|um|t._|f_|he_|ner|nst|ls|_sei|ro|ir|ebe|mm|ag|ern|t,_|t,|eu|ft|icht_|hre|Be|nz|nder|_T|_den_|iche|tt|zu_|and|J|rde|rei|_we|_H|ige|_Be|rte|hei|das|aus|che_|_das|_zu_|tz|_ni|das_|_R|N|des|_ve|_J|I|_das_|men|_so|_ver|_auf|ine_|_ha|rg|ind|eben|kt|mit|_an|her|Ge|Sc|_sich|U|Sch|_sic|end|Di|abe|ck|sse|ür_|ell|ik|o_|nic|nich|sa|_fü|hn|zi|no|nicht|im_|von_|von|_nic|_nich|eine_|oc|wei|io|schen|gt".split('|'),
  'italian': "_|a|e|i|o|l|n|r|t|c|s|a_|e_|d|i_|o_|u|v|p|m|_c|,|,_|_s|_d|g|an|er|_a|_p|la|_l|re|ar|h|no|co|va|_e|n_|on|ra|to|f|di|_i|ch|ll|l_|la_|ta|el|in|_m|en|b|ri|_co|_n|_di|li|av|al|le|ia|se|ol|_f|or|te|_e_|ve|at|de|.|ne|va_|ca|._|tt|re_|nt|io|_v|pe|z|to_|_ch|na|si|'|he|no_|ci|_la|ro|_g|st|cc|he_|di_|ma|ev|che|es|me|pa|_t|ti|_di_|ss|che_|a,_|a,|nd|o,|o,_|ell|gl|sa|il|gli|da|as|do|_che|_che_|eva|_la_|lla|le_|un|_pe|_de|q|qu|ava|po|on_|r_|li_|_b|_il|_il_|il_|lo|om|e,|e,_|ni|tr|so|ra_|os|_in|_u|per|are|et|_se|ano|si_|_ca|_qu|lla_|_q|_a_|ac|_r|ic|_no|ie|fa|hi|del|ua|_per|ce|_ma|sc|_del|mi|_un|chi|era|i,|i,_|su|and|vo|_fa|eva_|ano_|gli_|non|pi|vi|er_|_al|se_|_ne|_non|am|is|ava_|_non_|non_|in_|ent|_si|_pa|com|!|_le|_su|uo|el_|!_|l'|ue|te_|_com|are_|pr|_in_|van|mo|ta_|gn|ere|na_|tto|it|_per_|per_|é|all|ess|ut|col|acc|gi|lo_|oc|vano|io_|_av|ndo|é_|ato|ave|_st|me_|a|ia_|con|mp|fi|ett|_si_|_pi|era_|ti_|ó|vano_|_gl|qua|ella|sta|ome|S|_gli|_S|ad|_ve|ant|ne_|ó_|sp|do_|_po|ro_|ov|_le_|ella_|sse|_con|ir|_vi|ig|_gli_|_ave|vev|un_|ot|veva|dell|que|a.|_o|a._|tu|cia|za|_que|_da|par|_pr|cch|_dell|eg|_sa|o._|o.|_col|lt|_un_|rt|ur|_vo|_me|ome_|L|ap|_L|zi|nto|og|_an|_so|em|ag|be|ni_|im|cchi|ver|lle|nz|cci|_ri|nc|_er|come_|come|aveva|ui|avev|tto_|_come|ed|P|man|_P|rs|occ|ndo_|ato_|_qua|_era|ari|ba|_mo|nel|id|men|_fi|_all|rr|_do|_avev|att|l'a|ei|zz|;|vol|pp|tra|;_|ere_|lle_|nda|utt|est|_nel|ul|ola|iv|ando|ale|lu|rn|e.|e._|ll'|tta|nte|_l'|uel".split('|'),
  'russian': "_|о|и|е|а|с|р|н|т|в|к|л|п|м|д|я|у|и_|ы|г|й|_в|е_|_п|ст|_с|а_|ро|я_|б|з|ос|_р|й_|о_|_и|в_|но|ов|ен|ра|ь|ко|ск|ре|ч|х|ни|на|_к|пр|ер|ан|сс|го|_о|_в_|_пр|то|ве|ки|ны|_н|те|си|ли|та|ка|во|ор|од|по|_м|ти|ц|_и_|_г|х_|ль|м_|ол|ле|ет|рос|ой|ал|_ро|ва|ес|ю|ж|ии|ия|ел|ии_|сси|осс|_рос|_д|ой_|росс|ин|_б|со|_росс|росси|_по|осси|ия_|де|ар|ри|т_|он|ми|ие|ий|не|ы_|ла|ит|_т|ру|ски|ат|ом|тр|ис|_на|че|ог|да|ав|ш|ь_|от|ед|ско|ф|ры|_со|нн|к_|ик|ие_|тв|_а|ас|_го|ци|ем|из|i|ме|га|го_|за|на_|ло|ств|a|e|ени|_ко|ев|ий_|s|ост|до|ай|ус|ир|_ст|об|тс|про|ля|но_|н_|ая|им|пе|се|_з|ого|ак|ров|ая_|r|ек|n|ма|их|у_|ви|мо|ове|ей|_про|ил|те_|ого_|o|с_|ов_|льн|ьн|ста|ич|йт|аз|э|ть|енн|сл|сии|ых|их_|сто|сии_|_ч|сти|ых_|ые|ссии|ссии_|ые_|ок|оссии|_ре|_во|год|бо|_год|стр|ом_|ыг|вл|нт|_ка|_ф|айт|йте|йте_|нс|тор|ый|гай|вер|ый_|чес|ам|ыга|прыг|рыг|айте_|пры|айте|ыгай|_пры|_прыг|рыгай|рыга|прыга|ыгайт|гайт|гайте|сп|йс|ере|ист|_у|ди|_э|ое|етс|_ру|нов|ги|щ|ся|ова|кр|бр|дн|ду|тра|ад|рус|ние|кой|кой_|t|иче|ран|еск|вы|_за|ани|йск|ю_|_на_|кон|л_|аль|вн|ся_|вр|_л|кий_|кий|же|ные|нны|гр|_ра|ные_|ичес|ческ|_рус|ив|бы|са|ет_|ест|_из|ли_|ных_|рс|ных|сов|оп|да_|_с_|ть_|па|ту|р_|пер|_я|_пе|лен|ние_|ац|аци|ти_|альн|ской|ской_|ическ|ель|ский|ский_|ной_|ной|бл|з_|ите|ур|ва_|ча|_ми|усс|кт|_е|ря|ка_|l|сия|ния|_об|тел|ио|пре|ей_|су|ийс|ния_|_бы|ийск|вс|аст|ым|сн|кие|оль|русс|пу|сск|ки_|ны_|ода|сия_|русск|рен|тн|ыл|д_|усск|ссия".split('|'),
});

// Categorize the visible text of the current page (requires a browser DOM).
// The text is lower-cased before it is handed to the detector.
var language = languageDetection.categorize(document.body.innerText.toLowerCase());
console.log(language);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment