Last active
May 23, 2017 20:35
-
-
Save ASteinheiser/4423bb1fdf03c7581cbb0135aa4163f0 to your computer and use it in GitHub Desktop.
Build-vocabulary (buildVoc) function in MATLAB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% buildVoc  Build a vocabulary from every *.txt file directly inside a folder.
%
%   voc = buildVoc(folder)       starts from an empty vocabulary.
%   voc = buildVoc(folder, voc)  extends an existing cell array of words.
%
%   Each line of each file is split on whitespace. Every token is
%   lower-cased and stripped of the punctuation characters * / ) ( - ! , . ?
%   and is then discarded when it
%     - is empty after stripping,
%     - contains any ASCII digit,
%     - contains the character 'â' (typically left over from mis-decoded text),
%     - is an English stop word (list taken from NLTK).
%   All remaining tokens are appended to voc; the result is de-duplicated
%   (and therefore sorted) with unique().
%
%   Inputs:
%     folder - path of the directory to scan (sub-folders are not visited)
%     voc    - (optional) cell array of char vectors to extend; default {}
%   Output:
%     voc    - cell array of unique vocabulary words
%   Errors:
%     buildVoc:fileOpen - a matching file could not be opened for reading
function voc = buildVoc(folder, voc)
    % Allow callers to omit the starting vocabulary.
    if nargin < 2
        voc = {};
    end

    stopword = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', ...
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', ...
        'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', ...
        'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', ...
        'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', ...
        'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', ...
        'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', ...
        'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', ...
        'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', ...
        'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', ...
        'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', ...
        'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', ...
        'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', ...
        'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', ...
        'further', 'was', 'here', 'than'}; % English stop words, from NLTK

    files = dir(fullfile(folder, '*.txt'));
    for file = files'
        filePath = fullfile(folder, file.name);
        [fid, msg] = fopen(filePath, 'rt');
        if fid == -1
            error('buildVoc:fileOpen', 'Cannot open "%s": %s', filePath, msg);
        end
        % Guarantee the handle is released even if processing below throws.
        closeFile = onCleanup(@() fclose(fid));

        line = fgets(fid);
        % fgets returns the numeric value -1 at end of file, a char vector otherwise.
        while ischar(line)
            remain = line;
            % Consume the whole line token by token (no per-line token cap).
            while ~isempty(remain)
                [token, remain] = strtok(remain);

                token = lower(token);
                % The hyphen is placed last in the class so it is a literal
                % character instead of a range operator.
                token = regexprep(token, '[*/)(!,.?-]', '');

                if isempty(token)
                    continue;                      % nothing left after stripping
                end
                if any(token >= '0' & token <= '9')
                    continue;                      % contains a digit
                end
                if any(token == 'â')
                    continue;                      % mis-decoded character residue
                end
                if any(strcmp(token, stopword))
                    continue;                      % stop word
                end

                voc{end + 1} = token; %#ok<AGROW>
            end
            line = fgets(fid);
        end

        % Destroying the cleanup object closes the file now, before the next one is opened.
        clear closeFile;
    end

    voc = unique(voc);
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment