# Arun Mohan arunm8489

## rnn19
def tockenize(X_train,X_test):
  """
  Tally the whitespace-separated tokens found in X_train.

  Each item of X_train is split with ``str.split()`` and every resulting
  token is counted in a ``Counter`` bound to the local name ``corpus``.
  X_test is accepted but not read by the code shown here.
  """
  # NOTE(review): the counter is built but never returned or used -- the
  # remainder of this function may be missing; confirm against the notebook.
  corpus = Counter(token for sentence in X_train for token in sentence.split())

## rnn18
# Numeric features: stack the four numeric columns side by side into an
# (n_rows, 4) array for the train and the test split.
train_numeric = np.concatenate(
    [
        X_train[name].values.reshape(-1, 1)
        for name in (
            'price',
            'teacher_number_of_previously_posted_projects',
            'quantity',
            'presence_of_num',
        )
    ],
    axis=1,
)
test_numeric = np.concatenate(
    [
        X_test[name].values.reshape(-1, 1)
        for name in (
            'price',
            'teacher_number_of_previously_posted_projects',
            'quantity',
            'presence_of_num',
        )
    ],
    axis=1,
)

# The scaler is fitted on the training matrix only; the test matrix is
# transformed with the statistics learned from the training rows.
stndardscalar = StandardScaler()
std_train_numeric = stndardscalar.fit_transform(train_numeric)
std_test_numeric = stndardscalar.transform(test_numeric)

## rnn17
# label encoding categorical features
def label_encoding(col):
  """
  Label-encode one categorical column of the global X_train / X_test frames.

  The encoder is fitted on the training column only.  Test values that did
  not occur in the training column are replaced by the placeholder
  'unknown'.  If 'unknown' is not already a training class it is appended
  to the encoder's classes so it receives its own code; otherwise unseen
  values share the code of the existing 'unknown' class.

  Parameters
  ----------
  col : label of a column present in both X_train and X_test.

  Returns
  -------
  (train_cols, test_cols) : encoded values of the train and test column.

  Side effects
  ------------
  X_test[col] is overwritten with the 'unknown'-substituted values.
  """
  label_encoder = LabelEncoder()
  train_cols = label_encoder.fit_transform(X_train[col])

  # Build the lookup once; scanning the classes_ array per row is O(n_classes).
  known = set(label_encoder.classes_)
  X_test[col] = X_test[col].apply(lambda x: x if x in known else 'unknown')

  if 'unknown' not in known:
    label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown')

  # transform(), not fit_transform(): refitting on the test column would
  # throw away the training mapping and assign codes that do not line up
  # with train_cols.
  test_cols = label_encoder.transform(X_test[col])
  return train_cols, test_cols


## rnn16
# Load the saved feature table and separate the target column from the rest.
dff = pd.read_csv('final_df.csv')
X = dff.drop('project_is_approved', axis=1)
y = dff['project_is_approved']

# Shuffled 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=100
)

# One shape per line, in the order X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## rnn15
# Select the listed columns from `data` (features plus the
# 'project_is_approved' column) and write them to final_df.csv without the
# index column.
df = data.loc[:, [
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'essay',
    'quantity',
    'price',
    'presence_of_num',
    'teacher_number_of_previously_posted_projects',
    'project_is_approved',
]]

df.to_csv('final_df.csv', index=False)

## rnn14
# Concatenate the four essay parts and the project title into a single
# 'essay' string per row, then replace it with its preprocessed form.
# NOTE(review): the parts are joined without a separator, and str() turns a
# missing value into the text 'nan' -- confirm both are intended.
data["essay"] = (
    data["project_essay_1"].map(str)
    + data["project_essay_2"].map(str)
    + data["project_essay_3"].map(str)
    + data["project_essay_4"].map(str)
    + data['project_title'].map(str)
)
processed_essays = preprocess_text(data['essay'].values)
data['essay'] = processed_essays

## rnn13
# Show the project titles at positions 91, 3 and 147, each prefixed with
# its position.
print("printing some random reviews")
for _row in (91, 3, 147):
    print(_row, data['project_title'].values[_row])

## rnn12
# Run preprocess_text over every project title and store the result back
# in the frame.
processed_titles = preprocess_text(data['project_title'].values)
data['project_title'] = processed_titles
# Print a few processed titles; the label in front of each line is the
# index of the element being shown.
print(91, processed_titles[91])
print(3, processed_titles[3])
print(147, processed_titles[147])

## rnn12
import re

def decontracted(phrase):
    """Expand a few English contractions in *phrase* and return the result.

    Handled forms: "won't" -> "will not", "can't" -> "can not",
    any remaining "n't" -> " not", and "'re" -> " are".

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        The text with the contractions above replaced.
    """
    # specific -- must run before the general "n't" rule, which would
    # otherwise turn "won't" into "wo not" and "can't" into "ca not"
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    return phrase

## rnn11
def presence_number(data):
  """Return 1 if *data* contains at least one digit character, else 0.

  *data* is iterated item by item and each item is tested with
  ``str.isdigit()``; an empty input yields 0.  A non-iterable value
  (for example a float NaN) raises TypeError.
  """
  return int(any(char.isdigit() for char in data))


# Flag (1/0) whether each project resource summary contains a digit.
data['presence_of_num'] = data['project_resource_summary'].map(presence_number)
	def tockenize(X_train,X_test):
	    """
	    Tally the whitespace-separated tokens found in X_train.

	    Each item of X_train is split with ``str.split()`` and every resulting
	    token is counted in a ``Counter`` bound to the local name ``corpus``.
	    X_test is accepted but not read by the code shown here.
	    """
	    # NOTE(review): the counter is built but never returned or used -- the
	    # remainder of this function may be missing; confirm against the notebook.
	    corpus = Counter(token for sentence in X_train for token in sentence.split())
	# Numeric features: stack the four numeric columns side by side into an
	# (n_rows, 4) array for the train and the test split.
	train_numeric = np.concatenate(
	    [
	        X_train[name].values.reshape(-1, 1)
	        for name in (
	            'price',
	            'teacher_number_of_previously_posted_projects',
	            'quantity',
	            'presence_of_num',
	        )
	    ],
	    axis=1,
	)
	test_numeric = np.concatenate(
	    [
	        X_test[name].values.reshape(-1, 1)
	        for name in (
	            'price',
	            'teacher_number_of_previously_posted_projects',
	            'quantity',
	            'presence_of_num',
	        )
	    ],
	    axis=1,
	)

	# The scaler is fitted on the training matrix only; the test matrix is
	# transformed with the statistics learned from the training rows.
	stndardscalar = StandardScaler()
	std_train_numeric = stndardscalar.fit_transform(train_numeric)
	std_test_numeric = stndardscalar.transform(test_numeric)
	# label encoding categorical features
	def label_encoding(col):
	    """
	    Label-encode one categorical column of the global X_train / X_test frames.

	    The encoder is fitted on the training column only.  Test values that did
	    not occur in the training column are replaced by the placeholder
	    'unknown'.  If 'unknown' is not already a training class it is appended
	    to the encoder's classes so it receives its own code; otherwise unseen
	    values share the code of the existing 'unknown' class.

	    Parameters
	    ----------
	    col : label of a column present in both X_train and X_test.

	    Returns
	    -------
	    (train_cols, test_cols) : encoded values of the train and test column.

	    Side effects
	    ------------
	    X_test[col] is overwritten with the 'unknown'-substituted values.
	    """
	    label_encoder = LabelEncoder()
	    train_cols = label_encoder.fit_transform(X_train[col])

	    # Build the lookup once; scanning the classes_ array per row is O(n_classes).
	    known = set(label_encoder.classes_)
	    X_test[col] = X_test[col].apply(lambda x: x if x in known else 'unknown')

	    if 'unknown' not in known:
	        label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown')

	    # transform(), not fit_transform(): refitting on the test column would
	    # throw away the training mapping and assign codes that do not line up
	    # with train_cols.
	    test_cols = label_encoder.transform(X_test[col])
	    return train_cols, test_cols
	# Load the saved feature table and separate the target column from the rest.
	dff = pd.read_csv('final_df.csv')
	X = dff.drop('project_is_approved', axis=1)
	y = dff['project_is_approved']

	# Shuffled 80/20 split with a fixed seed so the split is reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, shuffle=True, random_state=100
	)

	# One shape per line, in the order X_train, y_train, X_test, y_test.
	print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
	# Select the listed columns from `data` (features plus the
	# 'project_is_approved' column) and write them to final_df.csv without the
	# index column.
	df = data.loc[:, [
	    'teacher_prefix',
	    'school_state',
	    'project_grade_category',
	    'project_subject_categories',
	    'project_subject_subcategories',
	    'essay',
	    'quantity',
	    'price',
	    'presence_of_num',
	    'teacher_number_of_previously_posted_projects',
	    'project_is_approved',
	]]

	df.to_csv('final_df.csv', index=False)
	# Concatenate the four essay parts and the project title into a single
	# 'essay' string per row, then replace it with its preprocessed form.
	# NOTE(review): the parts are joined without a separator, and str() turns a
	# missing value into the text 'nan' -- confirm both are intended.
	data["essay"] = (
	    data["project_essay_1"].map(str)
	    + data["project_essay_2"].map(str)
	    + data["project_essay_3"].map(str)
	    + data["project_essay_4"].map(str)
	    + data['project_title'].map(str)
	)
	processed_essays = preprocess_text(data['essay'].values)
	data['essay'] = processed_essays
	# Show the project titles at positions 91, 3 and 147, each prefixed with
	# its position.
	print("printing some random reviews")
	for _row in (91, 3, 147):
	    print(_row, data['project_title'].values[_row])
	# Run preprocess_text over every project title and store the result back
	# in the frame.
	processed_titles = preprocess_text(data['project_title'].values)
	data['project_title'] = processed_titles
	# Print a few processed titles; the label in front of each line is the
	# index of the element being shown.
	print(91, processed_titles[91])
	print(3, processed_titles[3])
	print(147, processed_titles[147])
	import re

	def decontracted(phrase):
	    """Expand a few English contractions in *phrase* and return the result.

	    Handled forms: "won't" -> "will not", "can't" -> "can not",
	    any remaining "n't" -> " not", and "'re" -> " are".

	    Parameters
	    ----------
	    phrase : str
	        Text to expand.

	    Returns
	    -------
	    str
	        The text with the contractions above replaced.
	    """
	    # specific -- must run before the general "n't" rule, which would
	    # otherwise turn "won't" into "wo not" and "can't" into "ca not"
	    phrase = re.sub(r"won't", "will not", phrase)
	    phrase = re.sub(r"can\'t", "can not", phrase)

	    # general
	    phrase = re.sub(r"n\'t", " not", phrase)
	    phrase = re.sub(r"\'re", " are", phrase)
	    return phrase
	def presence_number(data):
	    """Return 1 if *data* contains at least one digit character, else 0.

	    *data* is iterated item by item and each item is tested with
	    ``str.isdigit()``; an empty input yields 0.  A non-iterable value
	    (for example a float NaN) raises TypeError.
	    """
	    return int(any(char.isdigit() for char in data))


	# Flag (1/0) whether each project resource summary contains a digit.
	data['presence_of_num'] = data['project_resource_summary'].map(presence_number)