Skip to content

Instantly share code, notes, and snippets.

View chumpblocckami's full-sized avatar
🎯
Focusing

Matteo Mazzola chumpblocckami

🎯
Focusing
View GitHub Profile
import nltk
from nltk import *
import random
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# Corpus root passed to the reader; file ids matching the pattern below are
# resolved relative to this directory.
file = "dataset"

# Per the NLTK reader API, each file's category is taken from the first
# capture group of cat_pattern matched against its file id.
collCategorized = CategorizedPlaintextCorpusReader(
    file,
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding="utf8",
)

# One (token list, category) pair per file, visited category by category in
# the order the reader reports them.
documents = [
    (list(collCategorized.words(doc_id)), label)
    for label in collCategorized.categories()
    for doc_id in collCategorized.fileids(label)
]

# Shuffle in place so the pairs are no longer grouped by category.
random.shuffle(documents)