Optimization of t-SNE's perplexity parameter through meta-heuristics. It uses NALP to load word embeddings and Opytimizer to perform the optimization.
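The gist is organized as a set of standalone scripts: the optimization task itself, two plotting scripts that analyze the saved optimization histories, two t-SNE visualization scripts, and a small word2vec helper module (listed last) that the other scripts import. Assuming the usual PyPI package names, the dependencies can be installed with pip install opytimizer nalp scikit-learn matplotlib.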
from opytimizer import Opytimizer
from opytimizer.core.function import Function
from opytimizer.optimizers.abc import ABC
from opytimizer.spaces.search import SearchSpace
from sklearn.manifold import TSNE

import word2vec

# Loading word2vec word embeddings
w2v = word2vec.load_word_vectors()


def tsne(opytimizer):
    # Gathering hyperparameters
    perplexity = round(opytimizer[0][0])

    # Training a t-SNE model
    model = TSNE(perplexity=perplexity).fit(w2v.encoder[w2v.encoder.wv.vocab])

    # The KL divergence is the fitness to be minimized
    return model.kl_divergence_


# Creating the Function object
f = Function(pointer=tsne)

# Number of agents
n_agents = 5

# Number of decision variables
n_variables = 1

# Number of running iterations
n_iterations = 10

# Lower and upper bounds (must have the same size as n_variables)
lower_bound = [1]
upper_bound = [100]

# Creating the SearchSpace object
s = SearchSpace(n_agents=n_agents, n_iterations=n_iterations,
                n_variables=n_variables, lower_bound=lower_bound,
                upper_bound=upper_bound)

# Hyperparameters for the optimizer
hyperparams = {
    'n_trials': 10
}

# Creating the ABC optimizer
p = ABC(hyperparams=hyperparams)

# Finally, we can create an Opytimizer instance
o = Opytimizer(space=s, optimizer=p, function=f)

# Running the optimization task
history = o.start()

# Saving the optimization history
history.save('abc.pkl')
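Once the optimization has finished, the best perplexity found can be recovered from the saved pickle file. A minimal sketch, assuming the same History attributes used by the plotting scripts below (each best_agent entry is a (position, fitness) tuple, with the position shaped as n_variables x 1, just like the argument of tsne() above):

from opytimizer.utils.history import History

# Creating an empty History object and loading the saved optimization
h = History()
h.load('abc.pkl')

# Position and fitness of the best agent at the last iteration
best_position, best_fitness = h.best_agent[-1]

# Rounding the position to recover the perplexity, as done in tsne() above
print(f'Best perplexity: {round(best_position[0][0])} | KL divergence: {best_fitness}')

The first plotting script below compares the computational load (elapsed time per run) of the algorithms. It assumes the optimization above was repeated 10 times per algorithm, with the resulting histories saved under ../results/<dataset>/<algorithm>/.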
import matplotlib.pyplot as plt
import numpy as np

from opytimizer.utils.history import History

# Declaring all possible algorithms
algorithms = ['ABC', 'BA', 'GP', 'PSO']

# Dataset to be analyzed
dataset = 'Reuters'

# Creating an empty plot
fig, ax = plt.subplots()

# For every algorithm
for a in algorithms:
    # Creating an empty vector of runs
    y = np.zeros(10)

    # For every run
    for i in range(10):
        # Declaring the file name to be loaded
        file_name = f'../results/{dataset}/{a}/{a}_{dataset}_{i}.pkl'

        # Creating an empty History object
        h = History()

        # Loading content from the pickle file
        h.load(file_name)

        # Recovering the task's time
        y[i] = h.time[0]

    # Plotting the algorithm's computational load
    ax.plot(np.arange(1, 11, 1), y, '-', label=a)

# Setting title and labels
ax.set(xlabel='run', ylabel='time (s)',
       title='Computational load for the Reuters-21578 (ApteMod) dataset')

# Setting axis limits
ax.set_xlim([1, 10])

# Setting global properties
ax.grid()
ax.legend()

# Displaying the plot
plt.show()
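The next script builds a mean convergence plot from the same history files, accumulating the best agent's fitness at each iteration and averaging it over the 10 runs.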
import matplotlib.pyplot as plt
import numpy as np

from opytimizer.utils.history import History

# Creating an empty History object
h = History()

# Declaring all possible algorithms
algorithms = ['ABC', 'BA', 'PSO']

# Dataset to be analyzed
dataset = 'Reuters'

# Creating an empty plot
fig, ax = plt.subplots()

# For every algorithm
for a in algorithms:
    # Creating an empty vector of convergence values
    y = np.zeros(10)

    # For every run
    for i in range(10):
        # Declaring the file name to be loaded
        file_name = f'../results/{dataset}/{a}/{a}_{dataset}_{i}.pkl'

        # Loading content from the pickle file
        h.load(file_name)

        # For every iteration's best agent
        for j, best in enumerate(h.best_agent):
            # Accumulating the fitness at its corresponding iteration
            y[j] += best[1]

    # Averaging over the 10 runs
    y = y / 10

    # Plotting the algorithm's convergence
    ax.plot(np.arange(1, 11, 1), y, '-', label=a)

# Setting title and labels
ax.set(xlabel='iteration', ylabel='fitness',
       title='Mean convergence plot for the Reuters-21578 (ApteMod) dataset')

# Setting axis limits
ax.set_xlim([1, 10])

# Setting global properties
ax.grid()
ax.legend()

# Displaying the plot
plt.show()
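With a perplexity value in hand, the following script inspects the embedding space itself: it picks a word from the vocabulary, gathers its similarity against every other word, and plots the top-n most similar (blue) and least similar (red) words in the t-SNE projection, with the query word in yellow.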
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import word2vec

# Loading word2vec word embeddings
w2v = word2vec.load_word_vectors()

# Defining the main word to be searched and its index
word = 'corp'
word_index = w2v.encoder.wv.vocab[word].index

# Gathering the similarity values for the whole vocabulary
similar_words = w2v.encoder.similar_by_word(word, topn=len(w2v.encoder.wv.vocab))

# Creating empty lists for positive and negative words
pos_index, neg_index = [], []

# Number of positive and negative words
n = 30

# For the top-n positive words
for s in similar_words[:n]:
    # Appending the index of the word
    pos_index.append(w2v.encoder.wv.vocab[s[0]].index)

# For the top-n negative words (note the `n + 1`, so exactly `n` are gathered)
for s in similar_words[:-(n + 1):-1]:
    # Appending the index of the word
    neg_index.append(w2v.encoder.wv.vocab[s[0]].index)

# Gathering the perplexity value
perplexity = 30

# Training and transforming data using t-SNE
tsne = TSNE(perplexity=perplexity).fit_transform(
    w2v.encoder[w2v.encoder.wv.vocab])

# Creating a figure and its subplots
fig, ax = plt.subplots()

# For the top-n positive words
for i in pos_index:
    # Plotting the words as blue points
    ax.plot(tsne[i, 0], tsne[i, 1], 'bo')

# For the top-n negative words
for i in neg_index:
    # Plotting the words as red points
    ax.plot(tsne[i, 0], tsne[i, 1], 'ro')

# Plotting the main word as a yellow point
ax.plot(tsne[word_index, 0], tsne[word_index, 1], 'yo')

# Setting the title
ax.set_title(f'Top-{n} positive and negative words similar to: `{word}`')

# Removing the axis ticks
# ax.set_yticklabels([])
# ax.set_xticklabels([])

# For the top-n positive words
for (s, index) in zip(similar_words[:n], pos_index):
    # Annotating the word and its similarity level
    plt.annotate(f'{s[0]} ({s[1]:.2f})', xy=(tsne[index, 0], tsne[index, 1]))

# For the top-n negative words
for (s, index) in zip(similar_words[:-(n + 1):-1], neg_index):
    # Annotating the word and its similarity level
    plt.annotate(f'{s[0]} ({s[1]:.2f})', xy=(tsne[index, 0], tsne[index, 1]))

# Annotating the main word
plt.annotate(word, xy=(tsne[word_index, 0], tsne[word_index, 1]))

# Showing the plot
plt.show()
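The next script projects the vocabulary with a fixed perplexity (99.99 here, presumably the value found by the optimization) and labels the first 50 words of the dataset.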
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import word2vec

# Loading word2vec word embeddings
w2v = word2vec.load_word_vectors()

# Gathering the perplexity value
perplexity = 99.99

# Training and transforming data using t-SNE
tsne = TSNE(perplexity=perplexity, random_state=0).fit_transform(
    w2v.encoder[w2v.encoder.wv.vocab])

# Plotting the first 50 words from the dataset
fig, ax = plt.subplots()
for i in range(50):
    ax.plot(tsne[i, 0], tsne[i, 1], 'o')
ax.set_title(f'Reuters-21578 (ApteMod) dataset: Perplexity = {perplexity}')
ax.set_yticklabels([])
ax.set_xticklabels([])

# Adding word labels to each point
words = list(w2v.encoder.wv.vocab)
for i, word in enumerate(words):
    if i >= 50:
        break
    plt.annotate(word, xy=(tsne[i, 0], tsne[i, 1]))
plt.show()
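Finally, the word2vec helper module imported by the scripts above: it loads a text dataset, runs it through a NALP pre-processing pipeline, and learns a Word2Vec encoding over the resulting tokens.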
import nalp.stream.preprocess as p
from nalp.encoders.word2vec import Word2Vec
from sklearn.datasets import fetch_20newsgroups, load_files


def load_word_vectors():
    # 20 Newsgroups dataset
    dataset = fetch_20newsgroups(subset='all')

    # Movie reviews dataset
    # dataset = load_files('data/movie_reviews')

    # Reuters-21578 (ApteMod) dataset
    # dataset = load_files('data/reuters')

    # Creating a pre-processing pipeline
    pipe = p.pipeline(
        p.lower_case,
        p.valid_char,
        p.tokenize_to_word
    )

    # Applying the pre-processing pipeline to the data
    X = [pipe(str(x)) for x in dataset.data]

    # Creating a Word2Vec (Encoder's child) object
    e = Word2Vec()

    # Calling its internal method to learn an encoding representation
    e.learn(X, max_features=300, min_count=100)

    return e