#amitrani6/process_text.py

## process_text.py
#Import the necessary libraries
import nltk
from nltk.stem import WordNetLemmatizer

#Initialize the Wordnet Lemmatizer once at module level; lemmatize_text reuses this instance
lemmatizer = WordNetLemmatizer()

#A function to lemmatize tokenized text, returns the lemmatized tokens joined into one space-separated string
def lemmatize_text(tokenized_text):
    """Lemmatize every token and join the results with single spaces.

    tokenized_text: iterable of tokens; each one is passed on its own to
        the module-level lemmatizer's ``lemmatize`` method.

    Returns a single string (not a list) holding the lemmatized tokens
    separated by one space each.
    """
    lemmas = map(lemmatizer.lemmatize, tokenized_text)
    return ' '.join(lemmas)

#A function that ties all of the steps together
def process_text(file_name):
    """Run one file through every processing step and return the result.

    The steps run in this order: open_file -> cleaned_episode -> tokenize
    -> lemmatize_text, each receiving the previous step's output.

    file_name: passed unchanged to open_file.

    Returns whatever lemmatize_text produces for the tokenized text.
    """
    raw = open_file(file_name)
    cleaned = cleaned_episode(raw)
    tokens = tokenize(cleaned)
    return lemmatize_text(tokens)

#Run process_text on every entry of df.file_path and store the results in a new 'lemmatize_text' column
df['lemmatize_text'] = df.file_path.apply(process_text)
	#Import the necessary libraries
	import nltk
	from nltk.stem import WordNetLemmatizer

	#Initialize the Wordnet Lemmatizer once; lemmatize_text reuses this shared instance
	lemmatizer = WordNetLemmatizer()

	#A function to lemmatize tokenized text, returns the lemmatized tokens joined into one space-separated string
	def lemmatize_text(tokenized_text):
		"""Lemmatize every token and join the results with single spaces.

		tokenized_text: iterable of tokens; each one is passed on its own to
			the shared lemmatizer's ``lemmatize`` method.

		Returns a single string (not a list) holding the lemmatized tokens
		separated by one space each.
		"""
		# The body must be indented deeper than the ``def`` line; with both at
		# the same level Python raises IndentationError.
		return ' '.join([lemmatizer.lemmatize(w) for w in tokenized_text])

	#A function that ties all of the steps together
	def process_text(file_name):
		"""Run one file through every processing step and return the result.

		The steps run in this order: open_file -> cleaned_episode -> tokenize
		-> lemmatize_text, each receiving the previous step's output.

		file_name: passed unchanged to open_file.

		Returns whatever lemmatize_text produces for the tokenized text.
		"""
		# The statements below are indented under the ``def`` so that they form
		# its body; at the ``def``'s own level Python raises IndentationError.
		raw_episode_text = open_file(file_name)
		clean_episode_text = cleaned_episode(raw_episode_text)
		tokenize_episode_text = tokenize(clean_episode_text)
		lemmatize_episode_text = lemmatize_text(tokenize_episode_text)

		return lemmatize_episode_text

	#Run process_text on every entry of df.file_path and store the results in a new 'lemmatize_text' column
	df['lemmatize_text'] = df.file_path.apply(process_text)