Created
November 8, 2018 18:05
-
-
Save gognjanovski/48969ff9202e4f49c8ff5895fa59c34f to your computer and use it in GitHub Desktop.
Create spam filter word dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Build the spam-filter word dictionary.
%
% Reads every text file matching data/*/*.txt, removes words of one or two
% characters, counts how often each remaining word occurs across all emails
% and writes the most frequent ones to dictionary.txt, one per line, in the
% form "<rank>. <word> <count>".

maxWords = 2500;  % upper bound on the number of dictionary entries

% - Load all the emails; each one is kept in its own cell and joined once
%   at the end instead of growing a string inside the loop.
Files = dir('data/*/*.txt');
emails = cell(1, numel(Files));
for k = 1:numel(Files)
    FileNames = Files(k).name;
    dr = Files(k).folder;
    % fullfile picks the correct path separator for the current platform.
    file = fileread(fullfile(dr, FileNames));
    % Drop words that are only one or two characters long.
    emails{k} = regexprep(file, '\<\w{1,2}\>', '');
end

% Join with an explicit blank between emails. strcat is avoided on purpose:
% it strips trailing whitespace from char arguments, so a ' ' separator
% passed to it is discarded and neighbouring emails get glued together.
if isempty(emails)
    allwords = '';
else
    allwords = strjoin(emails, ' ');
end

% - Split on any run of whitespace (blanks, tabs, line breaks) so we can
%   count the number of occurrences of each word, and discard the empty
%   tokens produced by leading/trailing whitespace.
words = regexp(allwords, '\s+', 'split');
words = words(~cellfun('isempty', words));
words = reshape(words, [], 1);

if isempty(words)
    warning('No words found under data/*/*.txt; the dictionary will be empty.');
    words_u = cell(0, 1);
    counts = zeros(0, 1);
else
    [words_u, ~, idxU] = unique(words);
    words_u = words_u(:);
    counts = accumarray(idxU(:), 1);
end

% - Sort entries by count and keep at most maxWords of them; a small corpus
%   may contain fewer unique words than that.
[~, idxS] = sort(counts, 'descend');
idxS = idxS(1:min(maxWords, numel(idxS)));
words_us = words_u(idxS);
counts_s = counts(idxS);

% - Build cell array of unique words and counts.
result = [words_us, num2cell(counts_s)];

% - Save the word count in dictionary.txt file
filePh = fopen('dictionary.txt', 'w');
if filePh < 0
    error('Could not open dictionary.txt for writing.');
end
rows = size(result, 1);
for r = 1:rows
    fprintf(filePh, '%d. %s %d\n', r, result{r, 1}, result{r, 2});
end
fclose(filePh);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment