Skip to content

Instantly share code, notes, and snippets.

View susanli2016's full-sized avatar
💃
<script>Nice meet you here</script>

Susan Li susanli2016

💃
<script>Nice meet you here</script>
View GitHub Profile
def w2v_tokenize_text(text):
tokens = []
for sent in nltk.sent_tokenize(text, language='english'):
for word in nltk.word_tokenize(sent, language='english'):
if len(word) < 2:
continue
tokens.append(word)
return tokens
train, test = train_test_split(df, test_size=0.3, random_state = 42)
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])
for epoch in range(30):
model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
model_dbow.alpha -= 0.002
model_dbow.min_alpha = model_dbow.alpha
def get_vectors(model, corpus_size, vectors_size, vectors_type):
"""
Get vectors from trained doc2vec model
:param doc2vec_model: Trained Doc2Vec model
:param corpus_size: Size of the data
:param vectors_size: Size of the embedding vectors
:param vectors_type: Training or Testing vectors
:return: list of vectors
"""
vectors = np.zeros((corpus_size, vectors_size))
import itertools
import os
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import random
def randomiseMissingData(df2):
"randomise missing data for DataFrame (within a column)"
df = df2.copy()
for col in df.columns:
data = df['prop_review_score']
mask = data.isnull()
samples = random.choices( data[~mask].values , k = mask.sum() )
data[mask] = samples
return df
import numpy as np
click_indices = us[us.click_bool == 1].index
random_indices = np.random.choice(click_indices, len(us.loc[us.click_bool == 1]), replace=False)
click_sample = us.loc[random_indices]
not_click = us[us.click_bool == 0].index
random_indices = np.random.choice(not_click, sum(us['click_bool']), replace=False)
not_click_sample = us.loc[random_indices]
def get_ensemble_models():
rf =RandomForestClassifier(n_estimators=51,min_samples_leaf=5,min_samples_split=3)
bagg = BaggingClassifier(n_estimators=51,random_state=42)
extra = ExtraTreesClassifier(n_estimators=51,random_state=42)
ada = AdaBoostClassifier(n_estimators=51,random_state=42)
grad = GradientBoostingClassifier(n_estimators=51,random_state=42)
classifier_list = [rf,bagg,extra,ada,grad]
classifier_name_list = ['Random Forests','Bagging','Extra Trees','AdaBoost','Gradient Boost']
return classifier_list,classifier_name_list
def get_naive_bayes_models():
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
classifier_list = [gnb,mnb,bnb]
classifier_name_list = ['Gaussian NB','Multinomial NB','Bernoulli NB']
return classifier_list,classifier_name_list
classifier_list, classifier_name_list = get_naive_bayes_models()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
def get_neural_network(hidden_layer_size=50):
mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_size)
return [mlp], ['MultiLayer Perceptron']
classifier_list, classifier_name_list = get_neural_network()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
classifier.fit(X_train,y_train)
print_evaluation_metrics(classifier,classifier_name,X_test,y_test)
from sklearn.model_selection import train_test_split
train, test = train_test_split(taxi, test_size=0.3, random_state=42)
import numpy as np
import shutil
def distance_between(lat1, lon1, lat2, lon2):
# Haversine formula to compute distance
dist = np.degrees(np.arccos(np.sin(np.radians(lat1)) * np.sin(np.radians(lat2)) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.cos(np.radians(lon2 - lon1)))) * 60 * 1.515 * 1.609344
return dist