// Skip to content
//
// Instantly share code, notes, and snippets.
//
// @iandev
// Last active April 14, 2017 21:37
// Show Gist options
// Save iandev/7fcf97f5ffbbc3ad1f7feecba0554762 to your computer and use it in GitHub Desktop.
/**
 * Builds an n-gram rank-order language categorizer.
 *
 * A text is turned into a "profile": its most frequent character n-grams
 * (sizes 1..n, words padded with '_' on both sides), ordered by descending
 * frequency and truncated to `max` entries. The profile is compared against
 * every category profile; the category with the smallest accumulated rank
 * displacement wins.
 *
 * @param {number} n - Largest n-gram size; sizes 1 through n are extracted.
 * @param {number} max - Maximum number of n-grams kept in a text profile.
 * @param {Object.<string, string[]>} languageProfiles - Maps a category name
 *   to its n-gram list; an n-gram's index in the list is its rank. Only own
 *   properties are considered. Each list is indexed once, on first use.
 * @returns {{categorize: function(string): (string|undefined)}} Object whose
 *   `categorize(text)` returns the closest category name, or `undefined` when
 *   the text contains no word characters or there are no categories.
 */
function LanguageDetection(n, max, languageProfiles) {
  // Everything that is not a letter or an apostrophe separates tokens.
  // Covers ASCII letters, accented Latin letters (Latin-1 Supplement without
  // the multiplication/division signs, Latin Extended-A/B) and the Cyrillic block, so
  // words such as "también" or "für" are no longer cut in the middle.
  var TOKEN_SEPARATOR =
    /[^A-Za-z'\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u024f\u0400-\u04ff]+/;
  var hasOwn = Object.prototype.hasOwnProperty;
  // category name -> {ranks, size}, filled lazily by indexedProfile().
  var profileIndex = Object.create(null);

  /**
   * Counts every n-gram of sizes n1..n2 over the '_'-padded words of `text`.
   * Returns an array of {ngram, count} in first-seen order.
   */
  function nGrams(n1, n2, text) {
    // Prototype-less map: n-grams like "constructor" must not hit
    // inherited Object.prototype members.
    var counts = Object.create(null);
    // split() yields '' for leading/trailing separators; such tokens would
    // become "__" and distort both the '_' count and the profile.
    var tokens = text.split(TOKEN_SEPARATOR).filter(function (token) {
      return token.length > 0;
    });
    for (var size = n1; size <= n2; size++) {
      for (var t = 0; t < tokens.length; t++) {
        var token = '_' + tokens[t] + '_';
        for (var i = 0; i + size <= token.length; i++) {
          var ngram = token.substring(i, i + size);
          counts[ngram] = (counts[ngram] || 0) + 1;
        }
      }
    }
    return Object.keys(counts).map(function (ngram) {
      return {ngram: ngram, count: counts[ngram]};
    });
  }

  /** Returns the `max` most frequent n-grams of `text`, most frequent first. */
  function getProfile(text) {
    return nGrams(1, n, text)
      .sort(function (a, b) {
        return b.count - a.count;
      })
      .slice(0, max)
      .map(function (entry) {
        return entry.ngram;
      });
  }

  /**
   * Returns the cached rank lookup for a category, building it on first use.
   * The first occurrence of a duplicated n-gram defines its rank, matching
   * what Array.prototype.indexOf would report.
   */
  function indexedProfile(cat) {
    var indexed = profileIndex[cat];
    if (indexed === undefined) {
      var catProfile = languageProfiles[cat];
      var ranks = Object.create(null);
      for (var pos = 0; pos < catProfile.length; pos++) {
        if (!(catProfile[pos] in ranks)) {
          ranks[catProfile[pos]] = pos;
        }
      }
      indexed = profileIndex[cat] = {ranks: ranks, size: catProfile.length};
    }
    return indexed;
  }

  /**
   * Sums, over every n-gram of the text profile, how far its rank is from
   * its rank in the category profile. An n-gram missing from the category
   * profile costs the category profile's full length.
   */
  function profileDistance(textProfile, indexed) {
    var distance = 0;
    for (var pos1 = 0; pos1 < textProfile.length; pos1++) {
      var pos2 = indexed.ranks[textProfile[pos1]];
      distance += pos2 === undefined ? indexed.size : Math.abs(pos1 - pos2);
    }
    return distance;
  }

  return {
    categorize: function (text) {
      var profile = getProfile(text == null ? '' : String(text));
      // Without any n-gram every category is at distance 0; report
      // "unknown" instead of an arbitrary category.
      if (profile.length === 0) {
        return undefined;
      }
      var category;
      var minDistance = Infinity;
      for (var cat in languageProfiles) {
        if (!hasOwn.call(languageProfiles, cat)) {
          continue;
        }
        var distance = profileDistance(profile, indexedProfile(cat));
        // Strict '<' keeps the first category on ties.
        if (distance < minDistance) {
          minDistance = distance;
          category = cat;
        }
      }
      return category;
    }
  };
}
// Detector instance: extracts n-grams of length 1..4 and keeps the 400 most
// frequent n-grams of the input text. Each language profile below is written
// as one '|'-separated string and turned into an array of n-grams with
// split('|'); '_' inside an n-gram marks a word boundary.
// NOTE(review): the lists are presumably ordered most-frequent first, since the
// distance measure uses an n-gram's array index as its rank — confirm against
// how the profiles were generated.
var languageDetection = LanguageDetection(4, 400, {
'afrikaans': "_|e|a|n|i|o|r|s|t|d|e_|l|k|ie|g|n_|m|_d|t_|er|h|u|ie_|y|w|s_|_s|_h|di|an|r_|aa|v|en|_di|.|y_|_v|et|._|die|die_|_n|_die|p|_m|_die_|_w|ee|ge|_o|b|te|,|in|k_|_e|,_|oo|et_|de|el|_g|f|ar|ni|nd|an_|en_|_i|he|g_|_t|oe|at|er_|om|wa|_a|_b|_k|nie|_he|aar|_ge|es|_ni|da|m_|ou|it|_nie|d_|l_|_wa|or|le|we|ek|het|me|_het|is|j|at_|on|se|_en|ma|st|as|va|_en_|re|\"|'|het_|_het_|om_|al|ar_|li|te_|aar_|_da|u_|nde|ou_|_l|be|_'|rd|_va|ig|ng|ns|ve|it_|_j|_me|sy|ke|_sy|aan|van|_in|is_|in_|sy_|_sy_|'n|ro|ko|_'n|ra|'n_|_'n_|so|D|ho|rs|eer|ik|la|_te|_van|_ma|as_|ui|ver|e.|der|to|op|van_|ag|_ve|and|_van_|ha|f_|ka|ne|_is|sk|e._|oor|_ver|ek_|_hy|hy|p_|_be|ri|ur|nie_|_so|_D|si|ll|no|_in_|_hy_|hy_|ed|ers|_r|ak|_ho|_nie_|eg|nt|de_|_p|_we|_is_|ei|es_|maa|wee|na|nder|a_|ing|ew|S|lle|_om|_te_|eu|ie.|wo|em|wat|_no|_\"|vo|E|H|_wat|ti|mo|A|e,|_ha|vi|el_|ter|e,_|dat|eer_|wat_|le_|ta|Di|dat_|_wat_|ie._|was|ste|_H|_se|se_|ul|al_|_was|_om_|_st|lik|\"_|_ko|_maa|lo|_to|ns_|aan_|nie.|_vi|met|_nie.|nk|_Di|-|_op|_oo|_on|ir|ord|uit|ens|_was_|was_|een|_met|os|_S|nie._|ig_|_sk|op_|_ek|_wee|ir_|met_|_met_|rt|ik_|end|nd_|gt|ond|ot|_aa|og|vir_|vir|_ka|hu|_mo|_vir_|_vir|_dit|kr|am|ol|dit|_ek_|ki|sa|_aan|man|jy|ng_|aak|lle_|_hu|_na|_vo|ewe|of|jy_|_dit_|dit_|_jy|der_|jo|_f|_u|sie|_dat|_jy_|daa|do|vr|wi|ry|_dat_|eur|rs_|_jo|_wo|_ne|jie|ji|pe|moe|my|ull|Die|maar|_hom|ulle|_maar|hom|_uit|_ui|ges|raa|or_|ies|jou|_la|maar_|ulle_|_daa|Die_|daar|_daar|ien|_my|_jou|ok|il|lik_|sta|_Die|ur_|ga|ag_|kan".split('|'),
'english': "_|e|t|o|n|i|a|s|r|h|e_|d|_t|c|l|th|he|_th|u|f|m|p|_a|the|_the|s_|er|_o|he_|d_|t_|the_|_the_|on|in|y|n_|b|re|,|,_|an|g|w|_i|en|f_|y_|of|_of|es|ti|v|_of_|of_|nd|at|r_|_w|it|ed|_p|nt|_c|o_|io|_an|te|or|_b|nd_|to|st|is|_s|_in|ion|and|de|ve|ha|ar|_m|and_|_and|_and_|se|_to|me|to_|ed_|.|be|_f|._|_to_|co|ic|ns|al|le|ou|ce|ent|l_|_co|tio|on_|_d|tion|ri|_e|ng|hi|er_|ea|as|_be|pe|h_|_r|ec|ch|ro|ct|_h|pr|in_|ne|ll|rt|s,_|s,|li|ra|T|wh|a_|ac|_wh|_n|ts|di|es_|si|re_|at_|nc|ie|_a_|_in_|ing|us|_re|g_|ng_|op|con|tha|_l|_tha|ver|ma|ion_|_con|ci|ons|_it|po|ere|is_|ta|la|_pr|fo|ho|ir|ss|men|be_|un|ty|_be_|ing_|om|ot|hat|ly|_g|em|_T|rs|mo|ch_|wi|we|ad|ts_|res|_wi|I|hat_|ei|ly_|ni|os|ca|ur|A|ut|that|_that|ati|_fo|st_|il|or_|for|pa|ul|ate|ter|it_|nt_|that_|_ha|al_|el|as_|ll_|_ma|no|ment|an_|tion_|su|bl|_de|nce|pl|fe|tr|so|int|ov|e,|e,_|_u|ent_|Th|her|j|atio|ation|_Th|le_|ai|_it_|_on|_for|ect|k|hic|est|der|tu|na|_by_|by_|E|by|_by|ve_|_di|en_|vi|m_|_whi|iv|whi|ns_|_A|ich|ge|pro|ess|_whic|ers|hich|ce_|which|whic|all|ove|_is|ich_|ee|hich_|n,_|n,|im|ir_|hei|ions|sti|se_|per|The|_pa|heir|id|eir|eir_|ig|heir_|_no|ev|era|_int|ted|_The|ies|art|thei|_ar|_thei|their|_pro|et|_pe|_mo|ther|x|gh|S|_is_|ol|ty_|_I|nde|am|rn|nte|mp|_su|_we|par|_v|pu|his|ow|mi|go|N|ue|ple|ep|ab|;_|;|ex|ain|over|_un|q|qu|pp|ith|ry|_as|ber|ub|av|uc|s._|s.|enc|are|iti|gr|his_|ua|part|ff|eve|O|rea|ous|ia|The_|ag|mb|_go|fa|on,_|ern|t,_|on,|t,|_me".split('|'),
'spanish': "_|e|a|o|s|n|i|r|l|d|c|t|u|a_|e_|s_|de|p|_d|m|_de|n_|o_|en|_e|es|_l|de_|la|os|_de_|_p|l_|ci|_c|_a|os_|ar|er|as|ra|nt|_la|re|,_|,|el|ta|ue|g|on|al|_s|co|b|an|v|la_|or|te|st|el_|_la_|y|to|r_|ad|ó|do|ro|se|as_|q|qu|.|._|en_|ca|in|un|_co|es_|ic|_en|ac|que|na|lo|_m|f|ent|da|ue_|po|le|_q|_qu|que_|_que|ie|h|pa|y_|ti|_que_|_en_|_y|tr|_el|ri|ia|_el_|_se|ió|_y_|io|pr|ón|ec|no|id|í|mi|_t|ión|nte|me|aci|do_|li|con|nd|est|ni|á|di|_es|_lo|ció|ma|ón_|_pr|_r|ción|z|ra_|si|ión_|oc|nc|_u|_po|los|or_|_con|is|del|_del|ado|se_|_i|los_|_re|por|_del_|sta|del_|al_|ne|_h|cu|_n|_a_|_v|_un|ce|so|ción_|res|vi|om|te_|_pa|ien|j|E|_los|_los_|to_|ol|it|am|ació|rt|ación|pe|ha|_se_|nto|_o|_E|on_|sa|na_|ta_|su|cia|mo|ct|par|_f|_por|eg|_in|ur|L|ve|im|ga|_est|ar_|ab|_L|tu|at|no_|s,|s,_|_por_|por_|las|ba|o,_|o,|ento|et|C|_ha|A|tra|ient|_al|a,_|ica|a,|pro|ado_|ici|_ca|an_|las_|ara|nci|ente|ú|rr|ir|da_|em|ll|il|ía|iv|_su|_par|ul|ant|_A|mp|_las_|_las|_C|_pro|men|P|des|com|ion|era|ed|ida|sp|gu|nte_|ns|za|dos|M|cio|les|_P|bl|_com|s._|s.|_M|ua|nta|mu|_no|dad|ñ|é|un_|va|ist|nes|iento|one|ara_|S|ada|_un_|fi|pre|tos|ter|ot|esta|_me|ido|ob|_g|br|go|ea|nto_|ona|pu|dos_|tro|ier|para|ment|ag|ero|gr|rec|bi|ia_|una|nic|ncia|ía_|a._|tos_|a.|ran|lo_|ones|rm|lu|ron|con_|ó_|nes_|_ci|ante|ch|_con_|_para|ntr|una_|para_|mie|ico|fe|les_|uc|ip|sto|_ma|ui|sta_|_ve|cion|\"|op|cal|_mu|_S|ro_|_pe|ste|ras|pl|_una|_di|ento_|ita|ione|ect|_una_|mien|tan|du|den|ndo|per|eri".split('|'),
'french': "_|e|i|s|a|n|r|t|u|o|l|e_|d|s_|_d|c|p|é|m|es|t_|_l|de|on|_de|en|_p|nt|le|es_|re|,|,_|n_|de_|''|an|_de_|v|_s|r_|_c|er|ai|_a|_e|ou|q|qu|is|te|ti|ur|it|g|a_|f|la|in|_le|me|nt_|.|b|ra|io|ent|._|ne|ns|ion|h|ue|se|le_|ar|ie|co|at|tr|et|pr|ce|au|u_|il|_r|_la|un|eu|st|re_|ro|la_|on_|_m|_la_|que|_qu|_q|po|tio|tion|pa|li|_t|nc|si|_pr|ri|al|ui|_co|i_|ta|é_|x|em|l_|et_|_l'|l'|les|ns_|ir|_le_|ent_|or|ré|_f|ne_|à|ve|ch|it_|di|oi|-|ni|à_|les_|d'|el|ss|_n|ut|our|des|\"|ur_|nd|er_|ait|ion_|rs|_en|_et|j|_d'|ll|_des|des_|_pa|té|_et_|_à|_à_|om|ma|ati|_des_|L|so|_u|è|_\"|sa|_po|tre|dé|ue_|pe|en_|ont|_un|_L|us|_les|_les_|rt|is_|_i|du|e,_|e,|na|s,|s,_|as|men|M|ait_|'a|vi|ci|ant|_au|da|_M|ation|atio|con|que_|ons|eur|est|me_|mi|par|tion_|_so|te_|res|lo|ment|és|ans|_du|du_|ux|un_|y|pro|_du_|_dé|ce_|_se|_re|pl|A|ge|ic|su|x_|ien|nce|\"_|ac|il_|qui|_pro|no|av|_v|_o|rs_|ans_|eme|bl|emen|_en_|iqu|ct|iq|lle|nn|ts|ement|ét|_\"_|ér|té_|_ce|mp|ire|ui_|to|he|_é|ca|_j|ec|va|_par|ée|_con|se_|tre_|ique|dan|éc|ha|une|P|lu|ux_|_b|s.|pou|_pou|ier|C|ais|s._|ain|_un_|nte|'e|mo|mm|ment_|une_|com|_P|i|_ma|do|ant_|anc|che|ap|ont_|_que|os|urs|_di|fi|im|pour|_pour|ê|ts_|_g|our_|_sa|ntr|_da|_ré|rai|rm|_qui|e.|am|_com|uv|_C|D|qui_|e._|pu|_qui_|ia|_dan|_dans|dans|ter|fo|son|dans_|id|ag|ine|tu|ran|au_|ol|oc|est_|st_|enc|F|_tr|u|tai|ell|R|_su|S|ions|pré|sé|ab|né|_que_|_in|_av|pour_|fa|rr|air|_ch|_a_|ba|_pl|gr|tt|ssi|rd|pas|bi".split('|'),
'german': "_|e|n|i|r|t|s|a|h|d|er|en|u|l|n_|c|ch|g|o|e_|r_|m|_d|de|en_|ei|er_|in|te|ie|b|t_|f|k|ge|s_|un|,|,_|w|z|nd|he|st|_s|_de|.|_e|ne|der|._|be|es|ic|_a|ie_|is|ich|an|re|di|ein|se|\"|ng|_i|sc|sch|it|der_|h_|ch_|S|le|p|ä|ü|au|v|che|_w|d_|die|_di|m_|_die|el|_S|_der|li|_der_|si|al|ns|on|or|ti|ten|ht|die_|_die_|D|rt|nd_|_u|nt|A|in_|den|cht|und|me|_z|ung|ll|_un|_ei|_n|hr|ine|_A|_ein|ar|ra|_v|_g|as|zu|et|em|_D|eine|gen|g_|da|we|K|lt|B|_\"|nde|ni|und_|E|ur|_m|ri|ha|eh|ten_|es_|_K|_und|ig|_b|hen|_und_|_au|_B|_da|_zu|_in|at|us|wi|n,|n,_|nn|te_|eit|_h|ter|M|n.|ß|ng_|sche|-|rs|den_|_si|G|im|_ge|chen|rd|_E|n._|icht|rn|uf|isch|isc|nen|_in_|_M|_er|ich_|ac|lic|_G|ber|la|vo|eb|ke|F|as_|hen_|ach|en,|ung_|lich|ste|en,_|_k|ben|_f|en.|_be|it_|L|_se|mi|ve|na|on_|P|ss|ist|ö|ht_|ru|st_|_F|ts|ab|W|ol|_eine|hi|so|em_|\"_|ren|en._|chen_|R|ta|ere|ische|ers|ert|_P|tr|ed|ze|eg|ens|ür|ah|_vo|ne_|cht_|uc|_wi|nge|lle|fe|_L|ver|hl|V|ma|wa|auf|H|_W|T|nte|uch|l_|sei|nen_|u_|_den|_al|_V|t.|lte|ut|ent|sich|sic|il|ier|am|gen_|sen|fü|um|t._|f_|he_|ner|nst|ls|_sei|ro|ir|ebe|mm|ag|ern|t,_|t,|eu|ft|icht_|hre|Be|nz|nder|_T|_den_|iche|tt|zu_|and|J|rde|rei|_we|_H|ige|_Be|rte|hei|das|aus|che_|_das|_zu_|tz|_ni|das_|_R|N|des|_ve|_J|I|_das_|men|_so|_ver|_auf|ine_|_ha|rg|ind|eben|kt|mit|_an|her|Ge|Sc|_sich|U|Sch|_sic|end|Di|abe|ck|sse|ür_|ell|ik|o_|nic|nich|sa|_fü|hn|zi|no|nicht|im_|von_|von|_nic|_nich|eine_|oc|wei|io|schen|gt".split('|'),
'italian': "_|a|e|i|o|l|n|r|t|c|s|a_|e_|d|i_|o_|u|v|p|m|_c|,|,_|_s|_d|g|an|er|_a|_p|la|_l|re|ar|h|no|co|va|_e|n_|on|ra|to|f|di|_i|ch|ll|l_|la_|ta|el|in|_m|en|b|ri|_co|_n|_di|li|av|al|le|ia|se|ol|_f|or|te|_e_|ve|at|de|.|ne|va_|ca|._|tt|re_|nt|io|_v|pe|z|to_|_ch|na|si|'|he|no_|ci|_la|ro|_g|st|cc|he_|di_|ma|ev|che|es|me|pa|_t|ti|_di_|ss|che_|a,_|a,|nd|o,|o,_|ell|gl|sa|il|gli|da|as|do|_che|_che_|eva|_la_|lla|le_|un|_pe|_de|q|qu|ava|po|on_|r_|li_|_b|_il|_il_|il_|lo|om|e,|e,_|ni|tr|so|ra_|os|_in|_u|per|are|et|_se|ano|si_|_ca|_qu|lla_|_q|_a_|ac|_r|ic|_no|ie|fa|hi|del|ua|_per|ce|_ma|sc|_del|mi|_un|chi|era|i,|i,_|su|and|vo|_fa|eva_|ano_|gli_|non|pi|vi|er_|_al|se_|_ne|_non|am|is|ava_|_non_|non_|in_|ent|_si|_pa|com|!|_le|_su|uo|el_|!_|l'|ue|te_|_com|are_|pr|_in_|van|mo|ta_|gn|ere|na_|tto|it|_per_|per_|é|all|ess|ut|col|acc|gi|lo_|oc|vano|io_|_av|ndo|é_|ato|ave|_st|me_|a|ia_|con|mp|fi|ett|_si_|_pi|era_|ti_|ó|vano_|_gl|qua|ella|sta|ome|S|_gli|_S|ad|_ve|ant|ne_|ó_|sp|do_|_po|ro_|ov|_le_|ella_|sse|_con|ir|_vi|ig|_gli_|_ave|vev|un_|ot|veva|dell|que|a.|_o|a._|tu|cia|za|_que|_da|par|_pr|cch|_dell|eg|_sa|o._|o.|_col|lt|_un_|rt|ur|_vo|_me|ome_|L|ap|_L|zi|nto|og|_an|_so|em|ag|be|ni_|im|cchi|ver|lle|nz|cci|_ri|nc|_er|come_|come|aveva|ui|avev|tto_|_come|ed|P|man|_P|rs|occ|ndo_|ato_|_qua|_era|ari|ba|_mo|nel|id|men|_fi|_all|rr|_do|_avev|att|l'a|ei|zz|;|vol|pp|tra|;_|ere_|lle_|nda|utt|est|_nel|ul|ola|iv|ando|ale|lu|rn|e.|e._|ll'|tta|nte|_l'|uel".split('|'),
'russian': "_|о|и|е|а|с|р|н|т|в|к|л|п|м|д|я|у|и_|ы|г|й|_в|е_|_п|ст|_с|а_|ро|я_|б|з|ос|_р|й_|о_|_и|в_|но|ов|ен|ра|ь|ко|ск|ре|ч|х|ни|на|_к|пр|ер|ан|сс|го|_о|_в_|_пр|то|ве|ки|ны|_н|те|си|ли|та|ка|во|ор|од|по|_м|ти|ц|_и_|_г|х_|ль|м_|ол|ле|ет|рос|ой|ал|_ро|ва|ес|ю|ж|ии|ия|ел|ии_|сси|осс|_рос|_д|ой_|росс|ин|_б|со|_росс|росси|_по|осси|ия_|де|ар|ри|т_|он|ми|ие|ий|не|ы_|ла|ит|_т|ру|ски|ат|ом|тр|ис|_на|че|ог|да|ав|ш|ь_|от|ед|ско|ф|ры|_со|нн|к_|ик|ие_|тв|_а|ас|_го|ци|ем|из|i|ме|га|го_|за|на_|ло|ств|a|e|ени|_ко|ев|ий_|s|ост|до|ай|ус|ир|_ст|об|тс|про|ля|но_|н_|ая|им|пе|се|_з|ого|ак|ров|ая_|r|ек|n|ма|их|у_|ви|мо|ове|ей|_про|ил|те_|ого_|o|с_|ов_|льн|ьн|ста|ич|йт|аз|э|ть|енн|сл|сии|ых|их_|сто|сии_|_ч|сти|ых_|ые|ссии|ссии_|ые_|ок|оссии|_ре|_во|год|бо|_год|стр|ом_|ыг|вл|нт|_ка|_ф|айт|йте|йте_|нс|тор|ый|гай|вер|ый_|чес|ам|ыга|прыг|рыг|айте_|пры|айте|ыгай|_пры|_прыг|рыгай|рыга|прыга|ыгайт|гайт|гайте|сп|йс|ере|ист|_у|ди|_э|ое|етс|_ру|нов|ги|щ|ся|ова|кр|бр|дн|ду|тра|ад|рус|ние|кой|кой_|t|иче|ран|еск|вы|_за|ани|йск|ю_|_на_|кон|л_|аль|вн|ся_|вр|_л|кий_|кий|же|ные|нны|гр|_ра|ные_|ичес|ческ|_рус|ив|бы|са|ет_|ест|_из|ли_|ных_|рс|ных|сов|оп|да_|_с_|ть_|па|ту|р_|пер|_я|_пе|лен|ние_|ац|аци|ти_|альн|ской|ской_|ическ|ель|ский|ский_|ной_|ной|бл|з_|ите|ур|ва_|ча|_ми|усс|кт|_е|ря|ка_|l|сия|ния|_об|тел|ио|пре|ей_|су|ийс|ния_|_бы|ийск|вс|аст|ым|сн|кие|оль|русс|пу|сск|ки_|ны_|ода|сия_|русск|рен|тн|ыл|д_|усск|ссия".split('|'),
});
// Classify the text of the current page (lower-cased first) and log the
// detected language name to the console.
// NOTE(review): the profiles contain capitalised n-grams (e.g. 'Die', 'The');
// lower-casing the input means those entries can never match — confirm intent.
var language = languageDetection.categorize(
  document.body.innerText.toLowerCase()
);
console.log(language);
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.