ASteinheiser/buildVoc.m

## buildVoc.m
% function to create a vocabulary from multiple text files under folders

function voc = buildVoc(folder, voc)
% BUILDVOC  Build a vocabulary from every *.txt file directly inside a folder.
%
%   voc = buildVoc(folder) starts from an empty vocabulary.
%   voc = buildVoc(folder, voc) extends an existing vocabulary.
%
%   Inputs:
%     folder - path of the directory whose *.txt files are read
%     voc    - (optional) cell array of char vectors to extend; default {}
%
%   Output:
%     voc    - the input vocabulary plus every accepted word, de-duplicated
%              and sorted by unique()
%
%   Each whitespace-separated token is lowercased and stripped of the
%   punctuation characters * / ) ( - ! , . ? before filtering. A token is
%   rejected when it is empty, contains an ASCII digit, contains the
%   character 'â' (typical residue of mis-decoded text), or is an English
%   stop word.
%
%   Raises buildVoc:fileOpen when a listed file cannot be opened.

if nargin < 2
    voc = {};
end

stopword = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', ...
    'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', ...
    'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', ...
    'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', ...
    'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', ...
    'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', ...
    'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', ...
    'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', ...
    'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', ...
    'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', ...
    'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', ...
    'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', ...
    'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', ...
    'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', ...
    'further', 'was', 'here', 'than'}; % English stop words, from NLTK

files = dir(fullfile(folder, '*.txt'));

for file = files'
    filePath = fullfile(folder, file.name);
    [fid, msg] = fopen(filePath, 'rt');
    if fid == -1
        error('buildVoc:fileOpen', 'Cannot open %s: %s', filePath, msg);
    end
    % Close the file even if an error is raised while reading it.
    closer = onCleanup(@() fclose(fid));

    line = fgets(fid);
    % fgets returns the numeric value -1 (not a char vector) at end of file.
    while ischar(line)
        remain = line;
        % Consume the whole line; strtok returns an empty remainder once
        % only whitespace is left, so this loop always terminates.
        while ~isempty(remain)
            [token, remain] = strtok(remain);
            token = lower(token);
            % The hyphen is placed last in the class so it is a literal
            % hyphen rather than a range operator.
            token = regexprep(token, '[*/)(!,.?-]', '');

            if isempty(token)
                continue;
            end
            if any(token >= '0' & token <= '9')
                continue; % contains a digit
            end
            if contains(token, 'â')
                continue; % residue of mis-decoded text
            end
            if any(strcmp(token, stopword))
                continue; % stop word
            end

            voc{end + 1} = token; %#ok<AGROW>
        end
        line = fgets(fid);
    end

    % Destroying the onCleanup object closes the file now.
    clear closer;
end

voc = unique(voc);
	% function to create a vocabulary from multiple text files under folders

	function voc = buildVoc(folder, voc)
	% BUILDVOC  Build a vocabulary from every *.txt file directly inside a folder.
	%
	%   voc = buildVoc(folder) starts from an empty vocabulary.
	%   voc = buildVoc(folder, voc) extends an existing vocabulary.
	%
	%   Inputs:
	%     folder - path of the directory whose *.txt files are read
	%     voc    - (optional) cell array of char vectors to extend; default {}
	%
	%   Output:
	%     voc    - the input vocabulary plus every accepted word, de-duplicated
	%              and sorted by unique()
	%
	%   Each whitespace-separated token is lowercased and stripped of the
	%   punctuation characters * / ) ( - ! , . ? before filtering. A token is
	%   rejected when it is empty, contains an ASCII digit, contains the
	%   character 'â' (typical residue of mis-decoded text), or is an English
	%   stop word.
	%
	%   Raises buildVoc:fileOpen when a listed file cannot be opened.

	if nargin < 2
	    voc = {};
	end

	stopword = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', ...
	    'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', ...
	    'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', ...
	    'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', ...
	    'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', ...
	    'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', ...
	    'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', ...
	    'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', ...
	    'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', ...
	    'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', ...
	    'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', ...
	    'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', ...
	    'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', ...
	    'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', ...
	    'further', 'was', 'here', 'than'}; % English stop words, from NLTK

	files = dir(fullfile(folder, '*.txt'));

	for file = files'
	    filePath = fullfile(folder, file.name);
	    [fid, msg] = fopen(filePath, 'rt');
	    if fid == -1
	        error('buildVoc:fileOpen', 'Cannot open %s: %s', filePath, msg);
	    end
	    % Close the file even if an error is raised while reading it.
	    closer = onCleanup(@() fclose(fid));

	    line = fgets(fid);
	    % fgets returns the numeric value -1 (not a char vector) at end of file.
	    while ischar(line)
	        remain = line;
	        % Consume the whole line; strtok returns an empty remainder once
	        % only whitespace is left, so this loop always terminates.
	        while ~isempty(remain)
	            [token, remain] = strtok(remain);
	            token = lower(token);
	            % The hyphen is placed last in the class so it is a literal
	            % hyphen rather than a range operator.
	            token = regexprep(token, '[*/)(!,.?-]', '');

	            if isempty(token)
	                continue;
	            end
	            if any(token >= '0' & token <= '9')
	                continue; % contains a digit
	            end
	            if contains(token, 'â')
	                continue; % residue of mis-decoded text
	            end
	            if any(strcmp(token, stopword))
	                continue; % stop word
	            end

	            voc{end + 1} = token; %#ok<AGROW>
	        end
	        line = fgets(fid);
	    end

	    % Destroying the onCleanup object closes the file now.
	    clear closer;
	end

	voc = unique(voc);