Created
May 23, 2017 21:24
-
-
Save ASteinheiser/7ec16cb561edc46f0fe18a2b41b65197 to your computer and use it in GitHub Desktop.
Bag-of-words MATLAB function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function feat_vec = cse408_bow(filepath, voc)
% CSE408_BOW  Bag-of-words feature vector for a single text file.
%
%   feat_vec = cse408_bow(filepath, voc) reads the text file at FILEPATH,
%   splits each line into whitespace-delimited tokens, lowercases every
%   token, strips the punctuation characters * / ( ) - ! , . ? from it, and
%   counts how many times each vocabulary entry occurs.
%
%   Inputs:
%     filepath - path of the text file to read.
%     voc      - cell array of vocabulary words (indexed as voc{j}).
%                Matching is exact (strcmp) against the lowercased,
%                punctuation-stripped tokens, so entries are presumably
%                expected to be normalized the same way - confirm with the
%                code that builds the vocabulary.
%
%   Output:
%     feat_vec - numeric array with the same size as VOC; feat_vec(j) is
%                the number of tokens in the file equal to voc{j}.
%
%   Raises an error if the file cannot be opened.

[fid, msg] = fopen(filepath, 'rt');
if fid == -1
    error('cse408_bow:fileOpen', 'Cannot open "%s": %s', filepath, msg);
end
% Guarantee the handle is released even if an error occurs below.
closeFile = onCleanup(@() fclose(fid));

feat_vec = zeros(size(voc));  % one counter per vocabulary entry
words = {};                   % normalized tokens collected from the file

line = fgets(fid);
while ischar(line)            % fgets returns -1 (non-char) at end of file
    remain = line;
    while true
        [token, remain] = strtok(remain);
        if isempty(token)
            % Only whitespace (e.g. the trailing newline) was left.
            break;
        end

        % Normalize: lowercase, then drop punctuation. The hyphen is
        % escaped so it is a literal character rather than a range.
        token = regexprep(lower(token), '[*/()!,.?\-]', '');

        % Tokens that were pure punctuation become empty; skip them.
        if ~isempty(token)
            words{end + 1} = char(token); %#ok<AGROW>
        end
    end
    line = fgets(fid);
end

% Count occurrences of each vocabulary entry among the collected words.
for j = 1:numel(voc)
    feat_vec(j) = sum(strcmp(words, voc{j}));
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment