Raymond Willey rwilleynyc

## understanding-word-vectors.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                rwilleynyc
                / understanding-word-vectors.ipynb
            
            
              Created
              June 9, 2019 13:37
                — forked from aparrish/understanding-word-vectors.ipynb
            
              
                Understanding word vectors: A tutorial for "Reading and Writing Electronic Text," a class I teach at ITP. (Python 2.7) Code examples released under CC0 https://creativecommons.org/choose/zero/, other text released under CC BY 4.0 https://creativecommons.org/licenses/by/4.0/
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## glove_w2v.py
# Build a {word: vector} lookup from the GloVe file, keeping only the words
# that appear in total_vocabulary.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # First whitespace-separated field is the token, the rest its vector
        values = line.split()
        word = values[0]
        if word in total_vocabulary:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

## create_bigrams.py
from gensim.models import phrases

# NOTE(review): presumably each entry of clean_text is already a list of
# tokens, since it is fed straight to Phrases — confirm upstream.
data = df['clean_text'].values

# Identify bigrams across the whole corpus, then pass every document through
# the fitted phrase model to create them.
bigrams = phrases.Phrases(data)
bigrams_data = [bigrams[doc] for doc in data]

## presNLP_bal_dataset.py
# Randomly reorder every row (frac=1 samples the full dataframe).
shuffled_df = df.sample(frac=1)

# Scaled dataframes for each president are stored in this dictionary.
scaled_dfs = {}

# Max samples per president: the number of rows belonging to the name that
# value_counts() lists last, i.e. the president with the fewest samples.
name_counts = df['name'].value_counts()
rarest_name = name_counts.index[-1]
is_rarest = shuffled_df['name'] == rarest_name
n = shuffled_df.loc[is_rarest].shape[0]

# Randomly select appropriate number of observations from the majority class

## presNLP_bench.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Benchmark pipelines: both start with a W2vVectorizer built from
# embeddings_index and differ only in the classifier that follows.
rf_steps = [
    ("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1)),
]
rf = Pipeline(rf_steps)

svc_steps = [
    ("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
    ('Support Vector Machine', SVC()),
]
svc = Pipeline(svc_steps)

## presNLP_embedding.py
# Start from random vectors: one row per word_index entry plus one extra row
# (word_index values are used directly as row numbers below).
embedding_matrix = np.random.random((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)

    # No known vector for this word: keep its randomly assigned row
    if embedding_vector is None:
        continue

    # Known vector: overwrite the random row
    embedding_matrix[i] = embedding_vector

## presNLP_createCNN.py
def create_cnn_model(cdim=128, ksize=5, pool1=5, pool2=35, density=128, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
    """Record a start timestamp and print the CNN hyperparameter settings.

    Prints a tab-separated header row followed by the values of cdim, ksize,
    pool1, pool2 and density.

    NOTE(review): the visible body ends after the printout. lr, epochs,
    batch_size, validation_split, patience and verbose are not used in it and
    nothing is returned, so the rest of the function appears to be missing.
    """
    # Start timer (not read anywhere in the visible part of the function)
    start = datetime.datetime.now()

    # Display hyperparameter settings: header row, then the matching values
    header = 'Convolution Dimensions\tWindow Size\tPool 1\t\tPool 2\t\tDensity'
    settings = f'{cdim}\t\t\t{ksize}\t\t{pool1}\t\t{pool2}\t\t{density}'
    print(header, settings, sep='\n')


## presNLP_CNNgrid.py
# Display total time elapsed since program started
# (t0 is the earlier timestamp; it is defined outside this snippet)
t1 = datetime.datetime.now()
print(f'Time Elapsed: {t1 - t0}\n')

# Hyperparameter grid: candidate values for each setting.
# NOTE(review): the names appear to mirror create_cnn_model's cdim, ksize,
# pool1, pool2 and density parameters — confirm against the search loop.
cdims = [128, 256]
ksizes = [3, 5, 7]
pools1 = [3, 5, 7]
pools2 = [25, 35]
densities = [64, 128]

## presNLP_bestCNN.py
# Load the collected results dictionary into a dataframe
cnn_results_df = pd.DataFrame.from_dict(cnn_results)

# Keep the row(s) whose accuracy equals the highest accuracy recorded
accuracy = cnn_results_df['Accuracy']
is_best = accuracy == accuracy.max()
best_cnn_model = cnn_results_df[is_best]

# Save hyperparameters of the first best row to variables
# (if several rows tie, the earliest one is used)
cdim = best_cnn_model['Convolution Dimensions'].values[0]
ksize = int(best_cnn_model['Window Size'].values[0])
pool1 = int(best_cnn_model['Pool 1'].values[0])

## presNLP_createRNN.py
def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
    """Record a start timestamp and print the RNN hyperparameter settings.

    The printed model type is 'GRU' when rnn_type is exactly 'gru' and
    'LSTM' for any other value.

    NOTE(review): the visible body ends after the printout. lr, epochs,
    batch_size, validation_split, patience and verbose are not used in it and
    nothing is returned, so the rest of the function appears to be missing.
    """
    # Start timer (not read anywhere in the visible part of the function)
    start = datetime.datetime.now()

    # Display hyperparameter settings
    if rnn_type == 'gru':
        model_type = 'GRU'
    else:
        model_type = 'LSTM'
    print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')
	# Create dictionary of vectors from GloVe based on total vocabulary
	embeddings_index = {}
	# The context manager closes the file even if a line fails to parse.
	with open('glove.6B.100d.txt', encoding='utf8') as f:
		for line in f:
			# First whitespace-separated field is the token, the rest its vector
			values = line.split()
			word = values[0]
			if word in total_vocabulary:
				coefs = np.asarray(values[1:], dtype='float32')
				embeddings_index[word] = coefs
	from gensim.models import phrases

	# NOTE(review): presumably each entry of clean_text is already a list of
	# tokens, since it is fed straight to Phrases — confirm upstream.
	data = df['clean_text'].values

	# Identify bigrams across the whole corpus, then pass every document
	# through the fitted phrase model to create them.
	bigrams = phrases.Phrases(data)
	bigrams_data = [bigrams[doc] for doc in data]
	# Randomly reorder every row (frac=1 samples the full dataframe).
	shuffled_df = df.sample(frac=1)

	# Scaled dataframes for each president are stored in this dictionary.
	scaled_dfs = {}

	# Max samples per president: the number of rows belonging to the name that
	# value_counts() lists last, i.e. the president with the fewest samples.
	name_counts = df['name'].value_counts()
	rarest_name = name_counts.index[-1]
	is_rarest = shuffled_df['name'] == rarest_name
	n = shuffled_df.loc[is_rarest].shape[0]

	# Randomly select appropriate number of observations from the majority class
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.svm import SVC
	from sklearn.linear_model import LogisticRegression
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import cross_val_score

	# Benchmark pipelines: both start with a W2vVectorizer built from
	# embeddings_index and differ only in the classifier that follows.
	rf_steps = [
		("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
		("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True, n_jobs=-1)),
	]
	rf = Pipeline(rf_steps)

	svc_steps = [
		("Word2Vec Vectorizer", W2vVectorizer(embeddings_index)),
		('Support Vector Machine', SVC()),
	]
	svc = Pipeline(svc_steps)
	# Create matrix of random vectors which matches size of the word index
	# (one extra row; word_index values are used directly as row numbers).
	embedding_matrix = np.random.random((len(word_index) + 1, 100))
	for word, i in word_index.items():

		# For each word found in the embedding index, assign the known vector
		embedding_vector = embeddings_index.get(word)

		# Otherwise, keep the randomly assigned vector
		if embedding_vector is not None:
			embedding_matrix[i] = embedding_vector
	def create_cnn_model(cdim=128, ksize=5, pool1=5, pool2=35, density=128, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
		"""Record a start timestamp and print the CNN hyperparameter settings.

		Prints a tab-separated header row followed by the values of cdim,
		ksize, pool1, pool2 and density.

		NOTE(review): the visible body ends after the printout. lr, epochs,
		batch_size, validation_split, patience and verbose are not used in it
		and nothing is returned, so the rest of the function appears to be
		missing.
		"""
		# Start Timer (not read anywhere in the visible part of the function)
		start = datetime.datetime.now()

		# Display Hyperparameter Settings
		print('Convolution Dimensions\tWindow Size\tPool 1\t\tPool 2\t\tDensity')
		print(f'{cdim}\t\t\t{ksize}\t\t{pool1}\t\t{pool2}\t\t{density}')
	# Display total time elapsed since program started
	# (t0 is the earlier timestamp; it is defined outside this snippet)
	t1 = datetime.datetime.now()
	print(f'Time Elapsed: {t1 - t0}\n')

	# Hyperparameter grid: candidate values for each setting.
	# NOTE(review): the names appear to mirror create_cnn_model's cdim, ksize,
	# pool1, pool2 and density parameters — confirm against the search loop.
	cdims = [128, 256]
	ksizes = [3, 5, 7]
	pools1 = [3, 5, 7]
	pools2 = [25, 35]
	densities = [64, 128]
	# Load the collected results dictionary into a dataframe
	cnn_results_df = pd.DataFrame.from_dict(cnn_results)

	# Keep the row(s) whose accuracy equals the highest accuracy recorded
	accuracy = cnn_results_df['Accuracy']
	is_best = accuracy == accuracy.max()
	best_cnn_model = cnn_results_df[is_best]

	# Save hyperparameters of the first best row to variables
	# (if several rows tie, the earliest one is used)
	cdim = best_cnn_model['Convolution Dimensions'].values[0]
	ksize = int(best_cnn_model['Window Size'].values[0])
	pool1 = int(best_cnn_model['Pool 1'].values[0])
	def create_rnn_model(rnn_type='gru', units=50, drop=.5, density=50, lr=.001, epochs=100, batch_size=128, validation_split=.3, patience=5, verbose=0):
		"""Record a start timestamp and print the RNN hyperparameter settings.

		The printed model type is 'GRU' when rnn_type is exactly 'gru' and
		'LSTM' for any other value.

		NOTE(review): the visible body ends after the printout. lr, epochs,
		batch_size, validation_split, patience and verbose are not used in it
		and nothing is returned, so the rest of the function appears to be
		missing.
		"""
		# Start Timer (not read anywhere in the visible part of the function)
		start = datetime.datetime.now()

		# Display Hyperparameter Settings
		model_type = 'GRU' if rnn_type == 'gru' else 'LSTM'
		print(f'Model Type:\t{model_type}\tUnits:\t{units}\tDropout Rate:\t{drop}\t\tDensity: {density}')