import datetime

# Display total time elapsed since program started (t0 is assumed to be set
# when the program begins)
t1 = datetime.datetime.now()
print(f'Time Elapsed: {t1 - t0}\n')

# Hyperparameter grid
cdims = [128, 256]
ksizes = [3, 5, 7]
pools1 = [3, 5, 7]
pools2 = [25, 35]
densities = [64, 128]

def create_cnn_model(cdim=128, ksize=5, pool1=5, pool2=35, density=128, lr=.001,
                     epochs=100, batch_size=128, validation_split=.3, patience=5,
                     verbose=0):
    # Start timer
    start = datetime.datetime.now()
    # Display hyperparameter settings
    print('Convolution Dimensions\tWindow Size\tPool 1\t\tPool 2\t\tDensity')
    print(f'{cdim}\t\t\t{ksize}\t\t{pool1}\t\t{pool2}\t\t{density}')
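
# The grid lists above pair naturally with this function. A minimal sketch of
# driving the search, assuming create_cnn_model returns something comparable
# (a score or training history); the loop and the `results` dict are
# illustrative, not part of the original code.
from itertools import product

results = {}
for cdim, ksize, pool1, pool2, density in product(cdims, ksizes, pools1, pools2, densities):
    results[(cdim, ksize, pool1, pool2, density)] = create_cnn_model(
        cdim=cdim, ksize=ksize, pool1=pool1, pool2=pool2, density=density)
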
import numpy as np

# Create a matrix of random vectors sized to match the word index
embedding_matrix = np.random.random((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # For each word found in the embedding index, assign the known vector;
    # otherwise, keep the randomly assigned vector
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
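
# A sketch of how such a matrix is typically loaded into a Keras Embedding
# layer; the original snippet stops before the model, so treat this call as
# an assumption rather than the author's exact code.
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=100,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)  # freeze the pretrained vectors
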
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
               ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
                ("Support Vector Machine", SVC())])
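
# W2vVectorizer is defined elsewhere in the original project. A minimal sketch
# of what such a transformer usually looks like, assuming it averages the word
# vectors of each tokenized document and falls back to zeros when no word is
# in vocabulary; this is an illustration, not the author's implementation.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class W2vVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, embeddings_index, dim=100):
        self.embeddings_index = embeddings_index
        self.dim = dim

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Mean of the vectors of all in-vocabulary words in each document
        return np.array([
            np.mean([self.embeddings_index[w] for w in doc if w in self.embeddings_index]
                    or [np.zeros(self.dim)], axis=0)
            for doc in X])

# cross_val_score is imported above but unused in this excerpt; a likely usage,
# assuming token lists X and president labels y:
# rf_scores = cross_val_score(rf, X, y, cv=5)
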
# Shuffle the dataset
shuffled_df = df.sample(frac=1)
# Create a dictionary to store scaled dataframes for each president
scaled_dfs = {}
# Cap samples per president at the count for the president with the fewest samples
n = shuffled_df.loc[shuffled_df['name'] == df['name'].value_counts().index[-1]].shape[0]
# Randomly select the appropriate number of observations from each larger class
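
# The snippet is cut off after the comment above; a minimal sketch of the
# downsampling it implies, assuming each president is sampled down to n rows
# and the pieces are recombined (the final concat is an assumption):
import pandas as pd

for name in shuffled_df['name'].unique():
    scaled_dfs[name] = shuffled_df.loc[shuffled_df['name'] == name].sample(n=n)

balanced_df = pd.concat(scaled_dfs.values()).sample(frac=1).reset_index(drop=True)
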
from gensim.models import phrases

# df['clean_text'] is assumed to hold token lists, as gensim's Phrases expects
data = df['clean_text'].values
# Identify and create bigrams
bigrams = phrases.Phrases(data)
bigrams_data = [bigrams[doc] for doc in data]
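
# Illustrative effect of the transform (hypothetical tokens; actual merges
# depend on pair frequencies in this corpus): sufficiently frequent pairs come
# back joined with an underscore, e.g.
# bigrams[['united', 'states', 'of', 'america']] -> ['united_states', 'of', 'america']
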
# Create a dictionary of GloVe vectors for the total vocabulary
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        # Keep only words that actually occur in the corpus vocabulary
        if word in total_vocabulary:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
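
# A small sanity check, illustrative rather than from the original code: how
# much of the corpus vocabulary has a pretrained GloVe vector.
coverage = len(embeddings_index) / len(total_vocabulary)
print(f'GloVe covers {coverage:.1%} of the vocabulary')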