This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def w2v_tokenize_text(text):
    """Tokenize *text* into words, sentence by sentence.

    Splits the text into English sentences, word-tokenizes each one, and
    keeps only tokens of length >= 2 (drops stray punctuation and
    single characters).
    """
    tokens = []
    for sentence in nltk.sent_tokenize(text, language='english'):
        tokens.extend(
            token
            for token in nltk.word_tokenize(sentence, language='english')
            if len(token) >= 2
        )
    return tokens
# 70/30 train/test split; fixed seed so the split is reproducible.
train, test = train_test_split(df, test_size=0.3, random_state = 42)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Distributed Bag-of-Words Doc2Vec: 300-dim vectors, negative sampling.
# The learning-rate schedule is driven manually by the loop below.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1,
                     alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab(list(tqdm(all_data)))
for _epoch in range(30):
    shuffled = utils.shuffle(list(tqdm(all_data)))
    model_dbow.train(shuffled, total_examples=len(all_data), epochs=1)
    # Decay the learning rate after each pass and keep min_alpha pinned
    # to it, so the effective rate is constant within each single epoch.
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    # Pre-allocate a (corpus_size, vectors_size) array of zeros.
    # NOTE(review): the rest of this function (filling rows from the
    # model and the return statement) continues beyond this excerpt.
    vectors = np.zeros((corpus_size, vectors_size))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import os | |
%matplotlib inline | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import tensorflow as tf | |
from sklearn.preprocessing import LabelBinarizer, LabelEncoder |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
def randomiseMissingData(df2):
    """Randomise missing data for DataFrame (within a column).

    For every column, fill NaN cells by sampling (with replacement) from
    that column's observed, non-missing values.

    :param df2: input DataFrame; not modified.
    :return: a new DataFrame with missing values imputed per column.
    """
    df = df2.copy()
    for col in df.columns:
        # BUGFIX: the original read df['prop_review_score'] on every
        # iteration, so only that one column was ever imputed.
        mask = df[col].isnull()
        # Skip columns with nothing to fill, or nothing to sample from
        # (an all-NaN column would make random.choices raise).
        if not mask.any() or mask.all():
            continue
        samples = random.choices(df.loc[~mask, col].values, k=mask.sum())
        # .loc assignment avoids pandas chained-assignment pitfalls.
        df.loc[mask, col] = samples
    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
# Undersample for class balance: keep every clicked row (in a random
# order) plus an equally sized random draw of non-clicked rows.
click_indices = us[us.click_bool == 1].index
# Sampling len(clicked) without replacement from the clicked indices is
# effectively a shuffle of all clicked rows.
random_indices = np.random.choice(click_indices, len(us.loc[us.click_bool == 1]), replace=False)
click_sample = us.loc[random_indices]
not_click = us[us.click_bool == 0].index
# sum(us['click_bool']) == number of clicked rows, so both samples match in size.
random_indices = np.random.choice(not_click, sum(us['click_bool']), replace=False)
not_click_sample = us.loc[random_indices]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ensemble_models():
    """Build the ensemble classifiers compared in this experiment.

    :return: (classifier_list, classifier_name_list), index-aligned.
    """
    # CONSISTENCY FIX: the RandomForestClassifier was the only model
    # without random_state=42, making its results non-reproducible and
    # inconsistent with the sibling constructors below.
    rf = RandomForestClassifier(n_estimators=51, min_samples_leaf=5,
                                min_samples_split=3, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=51, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests','Bagging','Extra Trees','AdaBoost','Gradient Boost']
    return classifier_list, classifier_name_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_naive_bayes_models():
    """Return the three Naive Bayes variants with their display names.

    :return: (classifier_list, classifier_name_list), index-aligned.
    """
    models = [GaussianNB(), MultinomialNB(), BernoulliNB()]
    names = ['Gaussian NB', 'Multinomial NB', 'Bernoulli NB']
    return models, names
# Evaluate each Naive Bayes variant in turn.
classifier_list, classifier_name_list = get_naive_bayes_models()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    # NOTE(review): the loop body continues beyond this excerpt
    # (presumably fit + evaluate, as in the MLP loop elsewhere — verify).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_neural_network(hidden_layer_size=50):
    """Return a one-element classifier list holding an MLP, plus its name.

    :param hidden_layer_size: size passed to MLPClassifier's
        hidden_layer_sizes (default 50).
    :return: ([classifier], [display_name]) matching the sibling helpers.
    """
    return [MLPClassifier(hidden_layer_sizes=hidden_layer_size)], ['MultiLayer Perceptron']
# Train and evaluate the MLP on the prepared train/test split.
classifier_list, classifier_name_list = get_neural_network()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    classifier.fit(X_train,y_train)
    # print_evaluation_metrics is defined elsewhere in the project;
    # presumably it reports test-set metrics for the fitted model.
    print_evaluation_metrics(classifier,classifier_name,X_test,y_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split
# 70/30 split of the taxi data; fixed seed so the split is reproducible.
train, test = train_test_split(taxi, test_size=0.3, random_state=42)
import numpy as np
import shutil
def distance_between(lat1, lon1, lat2, lon2):
    """Great-circle distance between two lat/lon points, in km.

    Uses the spherical law of cosines (the original comment said
    "Haversine"; this arccos form is a different identity and is less
    accurate for very small separations).

    :param lat1: latitude of point 1, decimal degrees (scalar or array)
    :param lon1: longitude of point 1, decimal degrees
    :param lat2: latitude of point 2, decimal degrees
    :param lon2: longitude of point 2, decimal degrees
    :return: distance in kilometres (NumPy broadcasting applies)
    """
    lat1_r = np.radians(lat1)
    lat2_r = np.radians(lat2)
    cos_angle = (np.sin(lat1_r) * np.sin(lat2_r)
                 + np.cos(lat1_r) * np.cos(lat2_r)
                 * np.cos(np.radians(lon2 - lon1)))
    # ROBUSTNESS FIX: floating-point error can push the cosine slightly
    # outside [-1, 1] (e.g. for identical points), making arccos return
    # NaN. Clamp before taking the arccos.
    cos_angle = np.clip(cos_angle, -1.0, 1.0)
    # degrees -> nautical miles (60 per degree) -> miles -> km.
    # NOTE(review): 1.515 looks like a typo for 1.1515 (statute miles per
    # nautical mile); kept as-is to preserve the original feature scaling
    # — confirm before changing, as trained models depend on it.
    return np.degrees(np.arccos(cos_angle)) * 60 * 1.515 * 1.609344