Skip to content

Instantly share code, notes, and snippets.

View arunm8489's full-sized avatar

Arun Mohan arunm8489

View GitHub Profile
def tockenize(X_train,X_test):
    """
    Count how often each whitespace-separated token occurs in ``X_train``.

    Every item yielded by iterating ``X_train`` is split with ``str.split()``
    and the resulting tokens are tallied in a ``Counter``.

    NOTE(review): only the opening of this function is visible here; as
    written it does not use ``X_test`` and returns nothing. Confirm against
    the full definition.
    """
    corpus = Counter(
        token for sentence in X_train for token in sentence.split()
    )
# Numeric features: stack the selected columns side by side into one 2-D
# array per split, then standardise them.
_numeric_columns = [
    'price',
    'teacher_number_of_previously_posted_projects',
    'quantity',
    'presence_of_num',
]
train_numeric = np.concatenate(
    [X_train[name].values.reshape(-1, 1) for name in _numeric_columns],
    axis=1,
)
test_numeric = np.concatenate(
    [X_test[name].values.reshape(-1, 1) for name in _numeric_columns],
    axis=1,
)
# The scaler is fitted on the training array only; the test array is
# transformed with that same fitted scaler.
stndardscalar = StandardScaler()
std_train_numeric = stndardscalar.fit_transform(train_numeric)
std_test_numeric = stndardscalar.transform(test_numeric)
# label encoding categorical features
def label_encoding(col):
    """Label-encode column ``col`` of the module-level X_train / X_test.

    The encoder is fitted on ``X_train[col]`` only. Test values that were
    not seen during fitting are replaced by the string 'unknown', which is
    given the next free integer code after the training classes (or the
    existing code if 'unknown' already is a training class).

    Side effect: ``X_test[col]`` is overwritten in place with the
    'unknown'-substituted values, as the original code did.

    Parameters
    ----------
    col : column label present in both X_train and X_test.

    Returns
    -------
    (train_cols, test_cols) : two integer arrays with the encoded train and
        test values, using one shared label -> code mapping.
    """
    label_encoder = LabelEncoder()
    train_cols = label_encoder.fit_transform(X_train[col])

    # Set membership is O(1); testing `x in ndarray` scans the array per row.
    known_labels = set(label_encoder.classes_)
    X_test[col] = X_test[col].apply(
        lambda x: x if x in known_labels else 'unknown'
    )

    # Reuse the mapping learned on the training split. Refitting on the test
    # split (the previous behaviour) produced codes unrelated to the
    # training codes and discarded the 'unknown' class.
    code_of = {
        label: code for code, label in enumerate(label_encoder.classes_)
    }
    code_of.setdefault('unknown', len(code_of))
    test_cols = np.array(
        [code_of[label] for label in X_test[col]], dtype=train_cols.dtype
    )
    return train_cols, test_cols
# Load final_df.csv, separate the target column from the features and make
# a shuffled split with 20% of the rows held out for testing.
dff = pd.read_csv('final_df.csv')
target_column = 'project_is_approved'
y = dff[target_column]
X = dff.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100, shuffle=True
)
# One shape per line: X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
# Select a fixed subset of columns from `data` and write it to final_df.csv
# without the index column.
final_columns = [
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'essay',
    'quantity',
    'price',
    'presence_of_num',
    'teacher_number_of_previously_posted_projects',
    'project_is_approved',
]
df = data[final_columns]
df.to_csv('final_df.csv', index=False)
# Build a single "essay" text column by joining the four essay columns and
# the project title (in that order), then run it through preprocess_text.
# NOTE(review): the pieces are joined with no separator, and map(str) turns
# any missing value into the text "nan" - confirm this is intended.
essay_parts = [
    "project_essay_1",
    "project_essay_2",
    "project_essay_3",
    "project_essay_4",
    "project_title",
]
merged_essay = data[essay_parts[0]].map(str)
for part in essay_parts[1:]:
    merged_essay = merged_essay + data[part].map(str)
data["essay"] = merged_essay

processed_essays = preprocess_text(data['essay'].values)
data['essay'] = processed_essays
# Show a few project titles before and after preprocessing. Both groups of
# prints use the same row indices so the printed label always matches the
# row shown (the label for row 91 was previously printed as 9).
print("printing some random reviews")
sample_rows = (91, 3, 147)
for row in sample_rows:
    print(row, data['project_title'].values[row])

processed_titles = preprocess_text(data['project_title'].values)
data['project_title'] = processed_titles

# The same rows again, after preprocessing.
for row in sample_rows:
    print(row, processed_titles[row])
import re
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions such as "won't" or "they're".

    Returns
    -------
    str
        ``phrase`` with the handled contractions expanded. Previously the
        expanded text was computed but never returned.
    """
    # Specific cases first: the general "n't" rule below would otherwise
    # turn "won't" into "wo not" and "can't" into "ca not".
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can't", "can not", phrase)
    # General suffix rules.
    phrase = re.sub(r"n't", " not", phrase)
    phrase = re.sub(r"'re", " are", phrase)
    return phrase
def presence_number(data):
    """Return 1 if any character in ``data`` is a digit, otherwise 0.

    ``data`` is iterated element by element and ``str.isdigit`` is called on
    each element, so it is expected to be a string (or an iterable of
    strings). An empty input yields 0.
    """
    # any() already produces a bool, so no identity comparisons or fallback
    # branch are needed.
    return 1 if any(char.isdigit() for char in data) else 0
# Flag (1/0) whether each project's resource summary contains a digit.
data['presence_of_num'] = data['project_resource_summary'].map(presence_number)