Removing stop words and tokenizing
import re

import nltk
import pandas as pd

stop_words = nltk.corpus.stopwords.words('english')

def clean(book, stop_words):
    book = book.lower()

    # tokenizing
    book_tokens_clean = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(book)
    book_clean = pd.DataFrame(book_tokens_clean, columns=['word'])

    # removing stop words
    book_clean = book_clean[~book_clean['word'].isin(stop_words)]

    # removing extraneous spaces and single-character tokens
    book_clean['word'] = book_clean['word'].apply(lambda x: re.sub(' +', ' ', x))
    book_clean = book_clean[book_clean['word'].str.len() > 1]

    return book_clean
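A minimal usage sketch, assuming the NLTK stopword corpus still needs to be downloaded and using a hypothetical sample string in place of a full book text:

# one-time download of the English stopword list (assumption: not already installed locally)
nltk.download('stopwords')

sample_text = "It was the best of times, it was the worst of times."  # hypothetical input
cleaned = clean(sample_text, stop_words)
print(cleaned['word'].tolist())  # e.g. ['best', 'times', 'worst', 'times']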