@thiagomarzagao
Created December 7, 2015 16:18
Creates X.pkl (Python 2): reads the CSV files, cleans and stems the description field with NLTK's RSLP stemmer, and pickles the resulting TF-IDF matrix.
import os
import re
import pickle
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
def pre_process():
    # regex that matches anything that is NOT a Latin letter (plain or
    # accented) or a space; such characters are replaced by spaces below
    vanilla = u'[^\u0041-\u005A \
                 \u0061-\u007A \
                 \u00C0-\u00D6 \
                 \u00D8-\u00F6 \
                 \u00F8-\u00FF \
                 \u0100-\u017F \
                 \u0020]'
    regex = re.compile(vanilla)
    st = RSLPStemmer()
    basepath = '/caminho/ate/arquivos/CSVs/'  # change as needed
    flist = [fname for fname in os.listdir(basepath) if '.csv' in fname]
    counter = 0
    for fname in flist:
        fbuffer = open(basepath + fname, mode = 'rb')
        for line in fbuffer:
            counter += 1
            # first comma-separated field is the class; skip lines whose
            # class starts with '2'
            idx1 = line.index(',')
            classe = line[:idx1]
            if classe[0] == '2':
                continue
            print counter
            # drop the second field; everything after it is the description
            subline1 = line[idx1 + 1:]
            idx2 = subline1.index(',')
            descricao = subline1[idx2 + 1:]
            lowercased = descricao.decode('utf8').lower()
            regexed = regex.sub(' ', lowercased)
            tokenized = regexed.split()
            # apply only RSLP rule set 0 (plural reduction)
            singularized = [st.apply_rule(token, 0) for token in tokenized]
            # drop single-character tokens and rebuild the document string
            remerged = ''
            for word in singularized:
                if len(word) > 1:
                    remerged += word + ' '
            # print remerged.encode('utf-8')
            # raw_input()
            yield remerged

tfidf_maker = TfidfVectorizer(lowercase = False,
                              min_df = 2,
                              norm = 'l2',
                              smooth_idf = True)
tfidf = tfidf_maker.fit_transform(pre_process())
pickle.dump(tfidf, open('X.pkl', 'wb'))
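
A minimal sketch of loading the pickled matrix back for downstream use; only the file name X.pkl comes from the script above, the variable names here are illustrative:

import pickle

# load the matrix produced by the script above
with open('X.pkl', 'rb') as f:
    X = pickle.load(f)

# X is a scipy sparse TF-IDF matrix: one row per kept CSV line, one column per term
print X.shape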