Last active
November 14, 2023 13:27
-
-
Save gugarosa/fbbc294da27b163caed924062ad69a7e to your computer and use it in GitHub Desktop.
Optimization of t-SNE's perplexity parameter through meta-heuristics. It uses NALP to load the word embeddings and Opytimizer to perform the optimization.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from opytimizer import Opytimizer | |
from opytimizer.core.function import Function | |
from opytimizer.optimizers.abc import ABC | |
from opytimizer.spaces.search import SearchSpace | |
from sklearn.manifold import TSNE | |
import word2vec | |
# Word embeddings are learned once at import time and shared by every
# evaluation of the objective below
w2v = word2vec.load_word_vectors()


def tsne(opytimizer):
    """Objective function: fit t-SNE on the word embeddings and score it.

    Args:
        opytimizer: Indexable candidate solution; the value at ``[0][0]`` is
            rounded and used as the t-SNE perplexity.

    Returns:
        The ``kl_divergence_`` attribute of the fitted t-SNE model.

    """

    # The single decision variable is the perplexity, rounded to the nearest integer
    candidate_perplexity = round(opytimizer[0][0])

    # One embedding row per vocabulary entry
    embeddings = w2v.encoder[w2v.encoder.wv.vocab]

    # Fitting a fresh model for this candidate
    model = TSNE(perplexity=candidate_perplexity)
    model.fit(embeddings)

    return model.kl_divergence_
# Wrapping the objective so the optimizer can evaluate it
f = Function(pointer=tsne)

# Search configuration: agents, decision variables and running iterations
n_agents, n_variables, n_iterations = 5, 1, 10

# One lower/upper bound per decision variable
lower_bound, upper_bound = [1], [100]

# Search space built from the configuration above
s = SearchSpace(
    n_agents=n_agents,
    n_iterations=n_iterations,
    n_variables=n_variables,
    lower_bound=lower_bound,
    upper_bound=upper_bound,
)

# ABC optimizer and its hyperparameters
hyperparams = dict(n_trials=10)
p = ABC(hyperparams=hyperparams)

# Bundling space, optimizer and function into the optimization task
o = Opytimizer(space=s, optimizer=p, function=f)

# Running the task and persisting its history
history = o.start()
history.save('abc.pkl')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
from opytimizer.utils.history import History | |
def _first_time_entry(path):
    """Load the pickled history at `path` and return its first `time` entry."""
    record = History()
    record.load(path)
    return record.time[0]


# Algorithms whose runs are compared
algorithms = ['ABC', 'BA', 'GP', 'PSO']

# Dataset to be analyzed
dataset = 'Reuters'

# One shared axis holds a curve per algorithm
fig, ax = plt.subplots()

for name in algorithms:
    # Time recorded for each of the ten runs of this algorithm
    run_times = np.zeros(10)

    for run in range(10):
        # Result files are laid out as <dataset>/<algorithm>/<algorithm>_<dataset>_<run>.pkl
        run_times[run] = _first_time_entry(
            f'../results/{dataset}/{name}/{name}_{dataset}_{run}.pkl')

    # Runs are numbered from 1 on the x axis
    ax.plot(np.arange(1, 11), run_times, '-', label=name)

# Title, axis labels and limits
ax.set(xlabel='running', ylabel='time (s)',
       title=f'Computational load for Reuters-21578 (ApteMod) dataset')
ax.set_xlim([1, 10])

# Grid and legend
ax.grid()
ax.legend()

# Displaying plot
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
from opytimizer.utils.history import History | |
# A single History object is reused for every file that gets loaded
h = History()

# Algorithms whose runs are compared
algorithms = ['ABC', 'BA', 'PSO']

# Dataset to be analyzed
dataset = 'Reuters'

# One shared axis holds a curve per algorithm
fig, ax = plt.subplots()

for name in algorithms:
    # Per-iteration fitness, accumulated over the ten runs
    fitness_sum = np.zeros(10)

    for run in range(10):
        # Result files are laid out as <dataset>/<algorithm>/<algorithm>_<dataset>_<run>.pkl
        h.load(f'../results/{dataset}/{name}/{name}_{dataset}_{run}.pkl')

        # Element 1 of each best-agent entry is added to that iteration's slot
        for step, best in enumerate(h.best_agent):
            fitness_sum[step] += best[1]

    # Averaging over the ten runs; iterations are numbered from 1 on the x axis
    mean_fitness = fitness_sum / 10
    ax.plot(np.arange(1, 11), mean_fitness, '-', label=name)

# Title, axis labels and limits
ax.set(xlabel='iteration', ylabel='fitness',
       title=f'Mean convergence plot for Reuters-21578 (ApteMod) dataset')
ax.set_xlim([1, 10])

# Grid and legend
ax.grid()
ax.legend()

# Displaying plot
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
from sklearn.manifold import TSNE | |
import word2vec | |
# Loading word2vec word embeddings
w2v = word2vec.load_word_vectors()

# Vocabulary mapping used for every lookup below
vocab = w2v.encoder.wv.vocab

# Row of each word in the matrix handed to t-SNE. That matrix is built by
# iterating `vocab`, so its rows follow the vocabulary's iteration order;
# looking rows up this way does not rely on `Vocab.index` matching that order.
row_of = {token: row for row, token in enumerate(vocab)}

# Defining main word to be searched and its row in the embedding matrix
word = 'corp'
word_index = row_of[word]

# Gathering the similarity values for all vocabulary
similar_words = w2v.encoder.similar_by_word(word, topn=len(vocab))

# Number of positive and negative words
n = 30

# First n entries of the similarity list (the "positive" words)
positive_words = similar_words[:n]

# Last n entries, last one first (the "negative" words). The stop index is
# `-n - 1` because a reversed slice excludes its stop position; the previous
# `[:-n:-1]` returned only n - 1 entries.
negative_words = similar_words[:-n - 1:-1]

# Embedding-matrix rows of the selected words (entry[0] is the word itself)
pos_index = [row_of[entry[0]] for entry in positive_words]
neg_index = [row_of[entry[0]] for entry in negative_words]

# Gathering perplexity value
perplexity = 30

# Training and transforming data using t-SNE
tsne = TSNE(perplexity=perplexity).fit_transform(w2v.encoder[vocab])

# Creating a figure and its subplots
fig, ax = plt.subplots()

# Positive words as blue points
for i in pos_index:
    ax.plot(tsne[i, 0], tsne[i, 1], 'bo')

# Negative words as red points
for i in neg_index:
    ax.plot(tsne[i, 0], tsne[i, 1], 'ro')

# Main word as a yellow point
ax.plot(tsne[word_index, 0], tsne[word_index, 1], 'yo')

# Setting title
ax.set_title(f'Top-{n} positive and negative words similar to: `{word}`')

# Removing the axis ticks
# ax.set_yticklabels([])
# ax.set_xticklabels([])

# Annotating every selected word with its similarity level (entry[1])
for (s, index) in zip(positive_words, pos_index):
    plt.annotate(f'{s[0]} ({s[1]:.2f})', xy=(tsne[index, 0], tsne[index, 1]))

for (s, index) in zip(negative_words, neg_index):
    plt.annotate(f'{s[0]} ({s[1]:.2f})', xy=(tsne[index, 0], tsne[index, 1]))

# Annotating the main word
plt.annotate(word, xy=(tsne[word_index, 0], tsne[word_index, 1]))

# Shows the plot
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
from sklearn.manifold import TSNE | |
import word2vec | |
# Loading word2vec word embeddings
w2v = word2vec.load_word_vectors()

# Gathering perplexity value
perplexity = 99.99

# Training and transforming data using t-SNE (fixed seed for a repeatable layout)
tsne = TSNE(perplexity=perplexity, random_state=0).fit_transform(
    w2v.encoder[w2v.encoder.wv.vocab])

# Vocabulary words in iteration order, i.e. the order of the rows fed to t-SNE
words = list(w2v.encoder.wv.vocab)

# Number of words to display. The same count drives both the points and the
# labels (previously 49 points were drawn but 51 labels were added), and it
# is capped by the vocabulary size so small vocabularies do not over-index.
n_words = min(50, len(words))

# Plotting first 50 words from the dataset
fig, ax = plt.subplots()
for i in range(n_words):
    ax.plot(tsne[i, 0], tsne[i, 1], 'o')

ax.set_title(f'Reuters-21578 (ApteMod) dataset: Perplexity = {perplexity}')

# Hiding tick labels
ax.set_yticklabels([])
ax.set_xticklabels([])

# Adding words' labels to each plotted point
for i, word in enumerate(words[:n_words]):
    plt.annotate(word, xy=(tsne[i, 0], tsne[i, 1]))

plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nalp.stream.loader as l | |
import nalp.stream.preprocess as p | |
from nalp.encoders.word2vec import Word2Vec | |
from sklearn.datasets import fetch_20newsgroups, load_files | |
def load_word_vectors():
    """Learn a Word2Vec encoding over the 20 Newsgroups documents.

    Every document is lower-cased, filtered to valid characters and tokenized
    into words before being passed to the encoder.

    Returns:
        The `Word2Vec` instance after its `learn` method has been called.

    """

    # Active corpus: 20 Newsgroups. Alternatives are kept below for reference.
    corpus = fetch_20newsgroups(subset='all')
    # corpus = load_files('data/movie_reviews')  # Movie reviews dataset
    # corpus = load_files('data/reuters')  # Reuters-21578 (ApteMod) dataset

    # Pre-processing steps are applied in the listed order
    preprocess = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

    # Each raw document is coerced to `str` before pre-processing
    documents = []
    for raw in corpus.data:
        documents.append(preprocess(str(raw)))

    # Learning the encoding representation
    encoder = Word2Vec()
    encoder.learn(documents, max_features=300, min_count=100)

    return encoder
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment