ThomasSmyth/naive_bayes.q

## naive_bayes.q
/ see https://github.com/Senthilvadivel-20/NLP_in_KDB/blob/main/Navie_Bayes.ipynb

/ Training text: one space-separated string per class
positive: "I am happy because I today am learning NLP I am happy not sad hello guys how are you this is good evening I going to office today great evening"
negative: "I am sad I am not learning NLP today I am sad not happy this is bad evening I take leave today today is not good worst of all"

/ Tokenise: split each string on single spaces, then cast the tokens to symbols

/ Tokens are kept case-sensitive here; apply lower to the strings first if case should not matter
pos_lis:`$vs[" ";positive];
neg_lis:`$vs[" ";negative];

/ It's my preference to not update globals inside a function
/ so I've removed that, and then update globals outside the function
/ also means this code is reusable, which is nice
/ Count one occurrence of key x in dictionary d and return the updated dictionary.
/ d: dictionary of key -> count; x: the key to count. A key not yet in d starts at 1.
/ The caller's dictionary is not modified; only the returned copy carries the update.
f:{[d;x] d[x]:1+$[x in key d;d x;0]; d};

/ positive list
/ use the 'over' iterator to accumulate updates to w while iterating over pos_lis,
/ seeded with an empty dictionary
/ https://code.kx.com/q/ref/accumulators/#binary-values
w:f/[()!();pos_lis];

/ negative list: the same fold over neg_lis
v:f/[()!();neg_lis];

/ a keyed table can be written literally, without 1!
pos_t:([word:key w] pos:value w);
neg_t:([word:key v] negv:value v);

/ Join the per-class counts into one frequency table.
/ lj keeps only the words that occur in pos_lis, so words seen only in neg_lis
/ (e.g. worst) were dropped and negv was totalled over a truncated vocabulary.
/ uj keeps every word from both lists; 0^ fills the count missing on either side with 0.
/ No smoothing is applied, so a word seen in only one class has a zero count for the other.
/
count pos_t lj neg_t
22
count pos_t uj neg_t
28
\
word_freq:0!0^pos_t uj neg_t;

/ pos_per and neg_per are each count as a fraction of its column total
/ the sum of pos and negv only needs to be calculated once, not once per row
/
\ts:1000 update pos_per:{x % sum word_freq`pos}'[pos], neg_per:{x % sum word_freq`negv}'[negv] from word_freq
29 3584
\ts:1000 update pos_per:pos%sum pos, neg_per:negv%sum negv from word_freq
1 2080
\
word_freq:update pos_per:pos%sum pos, neg_per:negv%sum negv from word_freq;

/ Ratio pos_per%neg_per for a single word, read from the global word_freq table.
/ x: symbol to look up. Returns a float list with one item per matching row (empty when no row matches).
get_val:{[x] hit:word_freq where x=word_freq`word; hit[`pos_per]%hit`neg_per}
/ can vectorise this function so you don't need to use each
/ should improve performance
/ Ratios pos_per%neg_per for a list of words, read from the global word_freq table.
/ x: symbol atom or list. Returns one float per token of x that is present in word_freq,
/ in the order of x, so a repeated token contributes once per occurrence (as get_val each does).
/ Tokens that are not in word_freq are skipped.
/ (A `where word in x` filter returns table order and counts a repeated token only once.)
get_val_vector:{[x] ratio:exec word!pos_per%neg_per from word_freq; toks:(),x; ratio toks where toks in key ratio}

/ Sample tokens used to compare the per-word and the vectorised lookups
test:`$vs[" ";"I am not learning today"];

get_val each test
get_val_vector test

/ Classify a sentence as `Positive or `Negative.
/ x: string of space-separated words. Each word is looked up with get_val and the
/ per-word ratios are multiplied; a product above 1 gives `Positive, otherwise `Negative.
/ get_val returns an empty list for a word it does not find, so the results are razed
/ before taking the product: such words are skipped rather than causing a length
/ error when mixed with found words. A sentence with no found words is `Negative.
check:{[x] toks:`$" " vs x; $[1<prd raze get_val'[toks];`Positive;`Negative]};
/ can use prd keyword instead of (*/)
/ can use a Boolean value to index into a list of possible values, rather than using a conditional
/ although that is arguably less readable
/ Classify a sentence using the vectorised lookup.
/ x: string of space-separated words. The ratios returned by get_val_vector are multiplied
/ and the Boolean result of 1<product indexes the label list: 0b -> `Negative, 1b -> `Positive.
check_vector:{[x] toks:`$" " vs x; `Negative`Positive[1<prd get_val_vector toks]};

/ Classify four sample sentences with each implementation and display the labels
show a:check'[("hello guys";"today not good";"great evening";"I going to office")];
show b:check_vector'[("hello guys";"today not good";"great evening";"I going to office")];

/ 1b when both implementations return the same labels
a~b
	/ see https://github.com/Senthilvadivel-20/NLP_in_KDB/blob/main/Navie_Bayes.ipynb

	/ Training text: one space-separated string per class
	positive: "I am happy because I today am learning NLP I am happy not sad hello guys how are you this is good evening I going to office today great evening"
	negative: "I am sad I am not learning NLP today I am sad not happy this is bad evening I take leave today today is not good worst of all"

	/ Tokenise: split each string on single spaces, then cast the tokens to symbols

	/ Tokens are kept case-sensitive here; apply lower to the strings first if case should not matter
	pos_lis:`$vs[" ";positive];
	neg_lis:`$vs[" ";negative];

	/ It's my preference to not update globals inside a function
	/ so I've removed that, and then update globals outside the function
	/ also means this code is reusable, which is nice
	/ Count one occurrence of key x in dictionary d and return the updated dictionary.
	/ d: dictionary of key -> count; x: the key to count. A key not yet in d starts at 1.
	/ The caller's dictionary is not modified; only the returned copy carries the update.
	f:{[d;x] d[x]:1+$[x in key d;d x;0]; d};

	/ positive list
	/ use the 'over' iterator to accumulate updates to w while iterating over pos_lis,
	/ seeded with an empty dictionary
	/ https://code.kx.com/q/ref/accumulators/#binary-values
	w:f/[()!();pos_lis];

	/ negative list: the same fold over neg_lis
	v:f/[()!();neg_lis];

	/ a keyed table can be written literally, without 1!
	pos_t:([word:key w] pos:value w);
	neg_t:([word:key v] negv:value v);

	/ Join the per-class counts into one frequency table.
	/ lj keeps only the words that occur in pos_lis, so words seen only in neg_lis
	/ (e.g. worst) were dropped and negv was totalled over a truncated vocabulary.
	/ uj keeps every word from both lists; 0^ fills the count missing on either side with 0.
	/ No smoothing is applied, so a word seen in only one class has a zero count for the other.
	/
	count pos_t lj neg_t
	22
	count pos_t uj neg_t
	28
	\
	word_freq:0!0^pos_t uj neg_t;

	/ pos_per and neg_per are each count as a fraction of its column total
	/ the sum of pos and negv only needs to be calculated once, not once per row
	/
	\ts:1000 update pos_per:{x % sum word_freq`pos}'[pos], neg_per:{x % sum word_freq`negv}'[negv] from word_freq
	29 3584
	\ts:1000 update pos_per:pos%sum pos, neg_per:negv%sum negv from word_freq
	1 2080
	\
	word_freq:update pos_per:pos%sum pos, neg_per:negv%sum negv from word_freq;

	/ Ratio pos_per%neg_per for a single word, read from the global word_freq table.
	/ x: symbol to look up. Returns a float list with one item per matching row (empty when no row matches).
	get_val:{[x] hit:word_freq where x=word_freq`word; hit[`pos_per]%hit`neg_per}
	/ can vectorise this function so you don't need to use each
	/ should improve performance
	/ Ratios pos_per%neg_per for a list of words, read from the global word_freq table.
	/ x: symbol atom or list. Returns one float per token of x that is present in word_freq,
	/ in the order of x, so a repeated token contributes once per occurrence (as get_val each does).
	/ Tokens that are not in word_freq are skipped.
	/ (A `where word in x` filter returns table order and counts a repeated token only once.)
	get_val_vector:{[x] ratio:exec word!pos_per%neg_per from word_freq; toks:(),x; ratio toks where toks in key ratio}

	/ Sample tokens used to compare the per-word and the vectorised lookups
	test:`$vs[" ";"I am not learning today"];

	get_val each test
	get_val_vector test

	/ Classify a sentence as `Positive or `Negative.
	/ x: string of space-separated words. Each word is looked up with get_val and the
	/ per-word ratios are multiplied; a product above 1 gives `Positive, otherwise `Negative.
	/ get_val returns an empty list for a word it does not find, so the results are razed
	/ before taking the product: such words are skipped rather than causing a length
	/ error when mixed with found words. A sentence with no found words is `Negative.
	check:{[x] toks:`$" " vs x; $[1<prd raze get_val'[toks];`Positive;`Negative]};
	/ can use prd keyword instead of (*/)
	/ can use a Boolean value to index into a list of possible values, rather than using a conditional
	/ although that is arguably less readable
	/ Classify a sentence using the vectorised lookup.
	/ x: string of space-separated words. The ratios returned by get_val_vector are multiplied
	/ and the Boolean result of 1<product indexes the label list: 0b -> `Negative, 1b -> `Positive.
	check_vector:{[x] toks:`$" " vs x; `Negative`Positive[1<prd get_val_vector toks]};

	/ Classify four sample sentences with each implementation and display the labels
	show a:check'[("hello guys";"today not good";"great evening";"I going to office")];
	show b:check_vector'[("hello guys";"today not good";"great evening";"I going to office")];

	/ 1b when both implementations return the same labels
	a~b