creates X.pkl
import os
import re
import pickle
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
def pre_process():
    # keep only Latin letters (basic A-Z/a-z, Latin-1 Supplement and
    # Latin Extended-A) plus the space character; everything else
    # matched by this class gets replaced with a space
    vanilla = (u'[^\u0041-\u005A'
               u'\u0061-\u007A'
               u'\u00C0-\u00D6'
               u'\u00D8-\u00F6'
               u'\u00F8-\u00FF'
               u'\u0100-\u017F'
               u'\u0020]')
    regex = re.compile(vanilla)
    st = RSLPStemmer()
    basepath = '/caminho/ate/arquivos/CSVs/'  # change as needed
    flist = [fname for fname in os.listdir(basepath) if '.csv' in fname]
    counter = 0
    for fname in flist:
        fbuffer = open(basepath + fname, mode='rb')
        for line in fbuffer:
            counter += 1
            # first comma-separated field is the class label; skip class 2
            idx1 = line.index(',')
            classe = line[:idx1]
            if classe[0] == '2':
                continue
            print counter
            # the description text is everything after the second comma
            subline1 = line[idx1 + 1:]
            idx2 = subline1.index(',')
            descricao = subline1[idx2 + 1:]
            lowercased = descricao.decode('utf8').lower()
            regexed = regex.sub(' ', lowercased)
            tokenized = regexed.split()
            # apply only the first RSLP rule set to singularize the tokens
            # rather than stemming them fully
            singularized = [st.apply_rule(token, 0) for token in tokenized]
            # drop single-character tokens and rejoin into one string
            remerged = ''
            for word in singularized:
                if len(word) > 1:
                    remerged += word + ' '
            # print remerged.encode('utf-8')
            # raw_input()
            yield remerged
        fbuffer.close()

tfidf_maker = TfidfVectorizer(lowercase=False,
                              min_df=2,
                              norm='l2',
                              smooth_idf=True)
tfidf = tfidf_maker.fit_transform(pre_process())
pickle.dump(tfidf, open('X.pkl', 'wb'))
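
# Minimal sketch of reading the matrix back for later use. It assumes only
# that X.pkl was written by the dump above; the shape and vocabulary checks
# are illustrative, not part of the pipeline itself.
X_loaded = pickle.load(open('X.pkl', 'rb'))
print X_loaded.shape  # (number of kept CSV lines, vocabulary size)
print tfidf_maker.get_feature_names()[:10]  # first few vocabulary terms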