shayaf84/nltk.py

## nltk.py
#Import nltk preprocessing library to convert text into a readable format
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# NOTE(review): the header of this file suggests it is saved as nltk.py. If so,
# running it from its own directory makes `import nltk` resolve to this script
# rather than the library -- confirm, and rename the script if that is the case.

#Download the NLTK resources used below: tokenizer models, WordNet, stop words
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from 'punkt_tab' rather than
# 'punkt'; both are requested so word_tokenize works across versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

#Tokenize the string (create a list -> each index is a word)
# The tokenizer is applied to the 'title' column itself. A row-wise
# DataFrame.apply(axis=1) would build a Series for every row just to read one
# field, and its result shape is inferred from the lists that come back.
data['title'] = data['title'].apply(nltk.word_tokenize)

#Define text lemmatization model (eg: walks will be changed to walk)
lemmatizer = WordNetLemmatizer()

#Lemmatize every token of a single tokenized title
def lemma(data):
    """Return a new list holding the lemma of each item in ``data``.

    Every item is passed, in order, to ``lemmatize`` of the module-level
    ``lemmatizer``; the input iterable itself is not modified.
    """
    return list(map(lemmatizer.lemmatize, data))

#Define all stopwords in the English language (it, was, for, etc.)
stop = stopwords.words('english')
# Set copy used only for fast membership tests; `stop` stays a list so any
# later code that expects the original list keeps working.
stop_lookup = frozenset(stop)

#Remove them from our dataframe
# This runs BEFORE lemmatization: the lemmatizer rewrites some stop words
# (e.g. "was" -> "wa"), after which they no longer match the list and would
# slip through the filter. Tokens are compared in lower case because the
# stop-word list is lower-case, while title words are often capitalised.
data['title'] = data['title'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_lookup]
)

#Apply lemmatization to the remaining tokens of every title
data['title'] = data['title'].apply(lemma)

# Preview the first rows (only displayed when run interactively/in a notebook)
data.head()
	#Import nltk preprocessing library to convert text into a readable format
	import nltk
	from nltk.tokenize import sent_tokenize
	from nltk.stem import WordNetLemmatizer
	from nltk.corpus import stopwords

	#Download the NLTK resources used below: tokenizer models, WordNet, stop words
	nltk.download('punkt')
	# Recent NLTK releases load the Punkt models from 'punkt_tab' rather than
	# 'punkt'; both are requested so word_tokenize works across versions.
	nltk.download('punkt_tab')
	nltk.download('wordnet')
	nltk.download('stopwords')

	#Tokenize the string (create a list -> each index is a word)
	# Column-wise apply: a row-wise DataFrame.apply(axis=1) would construct a
	# Series per row only to read the 'title' field from it.
	data['title'] = data['title'].apply(nltk.word_tokenize)

	#Define text lemmatization model (eg: walks will be changed to walk)
	lemmatizer = WordNetLemmatizer()

	#Lemmatize every token of a single tokenized title
	def lemma(data):
		"""Return a list with the lemma of each item in ``data``, in order.

		Each item is passed to ``lemmatize`` of the module-level ``lemmatizer``.
		"""
		# The body must be indented under the def; at the def's own indent level
		# the function would have no body and the file would not parse.
		return [lemmatizer.lemmatize(w) for w in data]

	#Define all stopwords in the English language (it, was, for, etc.)
	stop = stopwords.words('english')
	# Set copy used only for fast membership tests; `stop` itself stays a list.
	stop_lookup = frozenset(stop)

	#Remove them from our dataframe
	# Filter first, lemmatize second: the lemmatizer rewrites some stop words
	# (e.g. "was" -> "wa"), which would then no longer match the list. The
	# comparison is done in lower case because the stop-word list is lower-case.
	data['title'] = data['title'].apply(
		lambda tokens: [tok for tok in tokens if tok.lower() not in stop_lookup]
	)

	#Apply lemmatization to the remaining tokens of every title
	data['title'] = data['title'].apply(lemma)

	# Preview the first rows (only displayed when run interactively/in a notebook)
	data.head()