
@ashokc
Created January 26, 2019 18:02
Tokenize Movies
# Read the text corpus, clean, and tokenize
import os
import string
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk_stopw = stopwords.words('english')

def tokenize(text):
    # Keep tokens that contain no punctuation, start with a letter,
    # and are 3-15 characters long; lowercase and drop NLTK stopwords
    tokens = [word.strip(string.punctuation) for word in RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(text)]
    return [f.lower() for f in tokens if f and f.lower() not in nltk_stopw]

def getMovies():
    X, labels, labelToName = [], [], {0: 'neg', 1: 'pos'}
    for dataset in ['train', 'test']:
        for classIndex, directory in enumerate(['neg', 'pos']):
            dirName = './data/' + dataset + '/' + directory
            for reviewFile in os.listdir(dirName):
                with open(dirName + '/' + reviewFile, 'r') as f:
                    tokens = tokenize(f.read())
                if len(tokens) == 0:    # skip reviews that tokenize to nothing
                    continue
                X.append(tokens)
                labels.append(classIndex)
    nTokens = [len(x) for x in X]       # per-review token counts
    return X, np.array(labels), labelToName, nTokens
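A minimal usage sketch follows. It assumes the IMDB-style layout ./data/{train,test}/{neg,pos}/ that getMovies expects, with one plain-text review per file; the printed statistics are illustrative, not part of the original gist.

# Sanity-check the tokenizer on a sample sentence
print(tokenize('This movie was absolutely wonderful!'))
# -> ['movie', 'absolutely', 'wonderful'] : stopwords and short/non-alphabetic tokens are dropped

# Load the corpus and inspect it
X, labels, labelToName, nTokens = getMovies()
print('Reviews:', len(X))
print('Class counts:', np.bincount(labels))                     # counts for 'neg' and 'pos'
print('Median tokens per review:', int(np.median(nTokens)))     # can guide a padding/truncation cutoff
print('Sample tokens:', X[0][:10], '->', labelToName[labels[0]])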