Skip to content

Instantly share code, notes, and snippets.

@rwilleynyc
rwilleynyc / understanding-word-vectors.ipynb
Created June 9, 2019 13:37 — forked from aparrish/understanding-word-vectors.ipynb
Understanding word vectors: A tutorial for "Reading and Writing Electronic Text," a class I teach at ITP. (Python 2.7) Code examples released under CC0 https://creativecommons.org/choose/zero/, other text released under CC BY 4.0 https://creativecommons.org/licenses/by/4.0/
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@rwilleynyc
rwilleynyc / glove_w2v.py
Last active July 31, 2019 18:53
GloVe & Word2Vec
# Create dictionary of vectors from GloVe based on total vocabulary.
# Only words that appear in total_vocabulary are kept, so the lookup stays
# no larger than the vocabulary actually used.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # Whitespace-split each line: the first token is the word, the
        # remaining tokens are its vector coefficients.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        if word in total_vocabulary:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
@rwilleynyc
rwilleynyc / create_bigrams.py
Created July 31, 2019 18:55
Create Bigrams
from gensim.models import phrases
# Documents to process, taken from the 'clean_text' column.
# NOTE(review): presumably each entry is a list of tokens — confirm against
# the code that builds df['clean_text'].
data = df['clean_text'].values
# Identify and create bigrams: fit a Phrases model over every document.
bigrams = phrases.Phrases(data)
# Apply the fitted model to each document by indexing it with that document,
# collecting the results in the same order as `data`.
bigrams_data = [bigrams[doc] for doc in data]
# Shuffle the dataset: sampling the full fraction returns every row in random order.
shuffled_df = df.sample(frac=1)
# Holds the scaled dataframe for each president (filled in later).
scaled_dfs = {}
# Cap the sample size at the row count of the president with the fewest samples.
# value_counts() lists names from most to least frequent by default, so the
# last index entry is the least frequent name.
rarest_name = df['name'].value_counts().index[-1]
n = len(shuffled_df.loc[shuffled_df['name'] == rarest_name])
#Randomly select appropriate number of observations from the majority class
@rwilleynyc
rwilleynyc / presNLP_bench.py
Created July 31, 2019 19:00
Create benchmark using traditional ML methods.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Benchmark pipelines: each one first vectorizes the text with W2vVectorizer
# (built from embeddings_index) and then fits a classic classifier on the result.

# Random forest benchmark.
rf = Pipeline(steps=[
    ("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1)),
])

# Support vector machine benchmark (default SVC settings).
svc = Pipeline(steps=[
    ("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
    ('Support Vector Machine', SVC()),
])
@rwilleynyc
rwilleynyc / presNLP_embedding.py
Created July 31, 2019 19:05
Create embedding layer for RNNs & CNNs
# Start from a matrix of random vectors with one row per word-index entry plus
# one extra row, and 100 columns.
# NOTE(review): the extra row presumably exists because word indices start at 1
# (index 0 reserved for padding) — confirm against the tokenizer.
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word has no known vector: keep the randomly assigned row.
        continue
    # Word found in the embedding index: overwrite its row with the known vector.
    embedding_matrix[i] = embedding_vector
@rwilleynyc
rwilleynyc / presNLP_createCNN.py
Created July 31, 2019 19:06
Function to create a CNN for NLP classification.
def create_cnn_model(cdim=128, ksize=5, pool1=5, pool2=35, density=128, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
    """Start a timer and print the CNN hyperparameter settings.

    The values of ``cdim``, ``ksize``, ``pool1``, ``pool2`` and ``density``
    are printed as one tab-separated row under the headings
    'Convolution Dimensions', 'Window Size', 'Pool 1', 'Pool 2' and 'Density'.

    NOTE(review): only the opening of this function is visible in this
    excerpt. ``lr``, ``epochs``, ``batch_size``, ``validation_split``,
    ``patience`` and ``verbose`` are accepted but not used by the code shown;
    presumably the model-building and training code follows — confirm
    against the full source.
    """
    # Start Timer
    # (kept although unused below; presumably read later to report elapsed time)
    start = datetime.datetime.now()

    # Display Hyperparameter Settings
    print('Convolution Dimensions\tWindow Size\tPool 1\t\tPool 2\t\tDensity')
    print(f'{cdim}\t\t\t{ksize}\t\t{pool1}\t\t{pool2}\t\t{density}')
@rwilleynyc
rwilleynyc / presNLP_CNNgrid.py
Last active July 31, 2019 19:08
Perform grid search to find best CNN
# Display total time elapsed since program started
# NOTE(review): t0 is not defined in this snippet; presumably it is a datetime
# captured when the program started — confirm against the full script.
t1 = datetime.datetime.now()
print(f'Time Elapsed: {t1 - t0}\n')
# Hyperparameter grid: candidate values for each setting to be searched.
# NOTE(review): the names mirror create_cnn_model's cdim / ksize / pool1 /
# pool2 / density parameters — the loop that consumes them is not shown here.
cdims = [128, 256]
ksizes = [3, 5, 7]
pools1 = [3, 5, 7]
pools2 = [25, 35]
densities = [64, 128]
@rwilleynyc
rwilleynyc / presNLP_bestCNN.py
Created July 31, 2019 19:11
Train best CNN
# Load the results dictionary into a dataframe.
cnn_results_df = pd.DataFrame.from_dict(cnn_results)
# Keep the row(s) whose accuracy equals the highest accuracy recorded.
top_accuracy = cnn_results_df['Accuracy'].max()
best_cnn_model = cnn_results_df.loc[cnn_results_df['Accuracy'] == top_accuracy]
# Save hyperparameters to variables, reading from the first matching row
# (if several rows tie for best accuracy, the first one wins).
# NOTE(review): cdim is kept as the raw stored value, while ksize and pool1 are
# cast to int — confirm whether cdim should be cast as well.
cdim = best_cnn_model['Convolution Dimensions'].values[0]
ksize = int(best_cnn_model['Window Size'].values[0])
pool1 = int(best_cnn_model['Pool 1'].values[0])
@rwilleynyc
rwilleynyc / presNLP_createRNN.py
Last active July 31, 2019 19:15
Function to create single RNN for NLP.
def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
# Start Timer
start = datetime.datetime.now()
# Display Hyperparameter Settings
model_type = 'GRU' if rnn_type == 'gru' else 'LSTM'
print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')