Nithilaa Umasankar Nithilaa

## lower_case.py
%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x: x.lower())

## remove_punc_unicode.py
%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x: re.sub(r'[^\w\s]', '', x))

## remove_stopwords.py
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

## lemmatize.py
lemmer = WordNetLemmatizer()
%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))

## countvec_import.py
from sklearn.feature_extraction.text import CountVectorizer

## countvec_obj.py
# Token-count vectorizer over unigrams, bigrams and trigrams.
# min_df / max_df are floats, which scikit-learn interprets as proportions of
# documents; max_features caps the vocabulary at `no_features` terms.
# ngram_range is passed as a tuple: that is the documented type, and
# scikit-learn releases with parameter validation reject a list here.
tf_vectorizer = CountVectorizer(
    min_df=.015,
    max_df=.8,
    max_features=no_features,
    ngram_range=(1, 3),
)

## fit_transform.py
%time features = tf_vectorizer.fit_transform(df['user_review'])

## return_matrix.py
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    _feature_names = tf_vectorizer.get_feature_names_out()
else:
    _feature_names = tf_vectorizer.get_feature_names()

# Dense DataFrame with one column per vocabulary term.
# toarray() materialises the full matrix in memory.
features_df = pd.DataFrame(features.toarray(), columns=_feature_names)

## concat.py
# Attach the 'user_suggestion' column to the feature matrix.
# pd.concat(axis=1) aligns rows by index label, and features_df carries a
# fresh 0..n-1 index. Resetting the label column's index makes the join
# positional, so a filtered or re-indexed `df` cannot misalign rows or
# introduce NaNs. With a default index this is a no-op.
# Note: `df` is rebound to the combined frame.
df = pd.concat(
    [features_df, df['user_suggestion'].reset_index(drop=True)],
    axis=1,
)

## drop_num_cols.py
df_tf_m_columns = df_tf_m.columns
df_tf_m_columns

# Keep the columns whose name is purely alphabetic, plus 'Flag_1' (which
# str.isalpha() rejects because of its underscore and digit).
# NOTE(review): isalpha() is also False for names containing spaces, so any
# multi-word n-gram columns are dropped along with numeric ones - confirm
# that this is intended.
res = [sub for sub in df_tf_m_columns if sub.isalpha()]
res.append('Flag_1')

# Set lookup keeps the filter linear in the number of columns; testing against
# the list `res` would rescan it for every column.
_columns_to_keep = set(res)
df_tf_m = df_tf_m.drop(columns=[col for col in df_tf_m if col not in _columns_to_keep])
df_tf_m.head()
	%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
	%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x: re.sub(r'[^\w\s]', '', x))
	from nltk.stem import WordNetLemmatizer
	from nltk.corpus import stopwords

	stop_words = stopwords.words('english')
	%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))
	lemmer = WordNetLemmatizer()
	%time df2['Review_Processed'] = df2['Review_Processed'].map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))
	df_tf_m_columns = df_tf_m.columns
	df_tf_m_columns

	# Keep the columns whose name is purely alphabetic, plus 'Flag_1' (which
	# str.isalpha() rejects because of its underscore and digit).
	# NOTE(review): isalpha() is also False for names containing spaces, so any
	# multi-word n-gram columns are dropped along with numeric ones - confirm
	# that this is intended.
	res = [sub for sub in df_tf_m_columns if sub.isalpha()]
	res.append('Flag_1')

	# Set lookup keeps the filter linear in the number of columns; testing
	# against the list `res` would rescan it for every column.
	_columns_to_keep = set(res)
	df_tf_m = df_tf_m.drop(columns=[col for col in df_tf_m if col not in _columns_to_keep])
	df_tf_m.head()