Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
NLP Tutorials - Part 2: Text Representation & Word Embeddings
# Importing Libraries
!python -m spacy download en
!python -m spacy download en_vectors_web_lg
import spacy
nlp = spacy.load('en_vectors_web_lg')
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Reading Data
# Load the dataset from a CSV file (path is relative to the working directory).
data = pd.read_csv('medium_data.csv')
# Notebook display: preview the first rows.
data.head()
# Notebook display: the 'title' column, which is the text used by the rest of
# the script.
data['title']
# Text Processing
# Stop words are stored in a set: process_text tests membership once per word,
# and a set lookup is O(1) whereas scanning the list returned by NLTK is O(n).
STOPWORDS = set(stopwords.words('english'))
# Single WordNet lemmatizer instance shared by every process_text call.
lemmatizer = WordNetLemmatizer()
def process_text(text, *, stop_words=None, lemmatize=None):
    """Normalize raw text into a string of space-separated lemmatized words.

    Steps, in order: strip accents and any other non-ASCII characters, delete
    every character that is not an ASCII letter or whitespace, lowercase,
    drop stop words, and lemmatize the words that remain.

    Args:
        text: The text to clean. ``None`` and float NaN (how pandas represents
            a missing cell) yield an empty string; any other non-string value
            is converted with ``str`` first.
        stop_words: Optional container of lowercase words to drop. Defaults
            to the module-level ``STOPWORDS``.
        lemmatize: Optional callable mapping one word to its lemma. Defaults
            to ``lemmatizer.lemmatize`` of the module-level lemmatizer.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Coerce up front: unicodedata.normalize raises TypeError on non-strings.
    text = str(text)
    if stop_words is None:
        stop_words = STOPWORDS
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and all other non-ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Characters are deleted rather than replaced by a space, so
    # "state-of-the-art" becomes "stateoftheart".
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return " ".join(lemmatize(word) for word in text.split() if word not in stop_words)
# Collect every title into a plain Python list.
data_sentences = data['title'].to_list()
# Defining our Corpus
# Only the first 50 titles are used as the corpus.
corpus = data_sentences[:50]
corpus
# Processing the Corpus
# For an entire pandas column instead: data['title'] = data['title'].map(process_text)
# np.vectorize wraps process_text so it can be applied to the whole list in one
# call; the result is a NumPy array of cleaned strings.
process_corpus = np.vectorize(process_text)
processed_corpus = process_corpus(corpus)
processed_corpus
# Bag of Words Model - Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Float min_df / max_df are document-frequency proportions; 0. and 1. mean no
# term is discarded for being too rare or too common.
count_vect = CountVectorizer(min_df = 0., max_df = 1.)
# Learn the vocabulary and count each term per document.
matrix = count_vect.fit_transform(processed_corpus)
# Convert the result to a dense array for display.
matrix = matrix.toarray()
matrix
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out() - confirm against the installed version.
vocabulary = count_vect.get_feature_names()
# One row per document, one column per vocabulary term.
pd.DataFrame(matrix, columns = vocabulary)
# Bag of Words Model - n-gram
# ngram_range = (2, 2) restricts the features to bigrams (pairs of adjacent
# words).
count_vect_n_gram = CountVectorizer(ngram_range = (2, 2))
# matrix and vocabulary are reassigned here, replacing the unigram results.
matrix = count_vect_n_gram.fit_transform(processed_corpus)
matrix = matrix.toarray()
matrix
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out() - confirm against the installed version.
vocabulary = count_vect_n_gram.get_feature_names()
# One row per document, one column per bigram.
pd.DataFrame(matrix, columns = vocabulary)
# Tf-Idf Model
from sklearn.feature_extraction.text import TfidfVectorizer
# Same document-frequency bounds as the count vectorizer (keep every term);
# use_idf = True enables inverse-document-frequency reweighting.
tf_idf = TfidfVectorizer(min_df = 0., max_df = 1., use_idf = True)
tf_idf_matrix = tf_idf.fit_transform(processed_corpus)
# Convert the result to a dense array for display.
tf_idf_matrix = tf_idf_matrix.toarray()
tf_idf_matrix
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out() - confirm against the installed version.
vocabulary = tf_idf.get_feature_names()
# Weights are rounded to two decimals for display only.
pd.DataFrame(np.round(tf_idf_matrix, 2), columns = vocabulary)
# Word Embeddings
# Word2Vec Model
from gensim.models import word2vec
# Split each cleaned document into a list of word tokens, the input format
# passed to Word2Vec below.
tokenized_corpus = [nltk.word_tokenize(doc) for doc in processed_corpus]
# Parameters for Word2Vec model
# Word vector dimensionality
feature_size = 15
# Context window size
window_context = 20
# Minimum word count (1 keeps every word, even those seen once)
min_word_count = 1
# Downsample setting for frequent words
sample = 1e-3
# Skip-gram model configuration. If not specified, the configuration is CBOW
skg = 1
# NOTE(review): size= and iter= are the gensim 3.x keyword names; gensim 4
# renamed them to vector_size= and epochs= - confirm the installed version.
w2v_model = word2vec.Word2Vec(tokenized_corpus, size = feature_size,
window = window_context, min_count = min_word_count,
sg = skg, sample=sample, iter = 5000)
w2v_model
# Visualizing the data points
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Vocabulary of the trained model and the matching matrix of word vectors.
words = w2v_model.wv.index2word
wvs = w2v_model.wv[words]

# Reduce the word vectors to two dimensions so they can be drawn.
tsne = TSNE(
    n_components=2,
    random_state=42,
    n_iter=5000,
    perplexity=5,
)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
# Every row of T is one (x, y) point; label it with its word.
for label, (x, y) in zip(labels, T):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

# Show the learned vector for one word and its dimensionality.
print('Embedding')
print(w2v_model.wv['ai'])
print('\nEmbedding Shape')
print(w2v_model.wv['ai'].shape)
# Visualizing the matrix
# One row per vocabulary word, one column per embedding dimension.
vec_df = pd.DataFrame(wvs, index = words)
vec_df
# Similarity Matrix for the words in the given corpus
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of word vectors.
similarity_matrix = cosine_similarity(vec_df.values)
similarity_df = pd.DataFrame(similarity_matrix, index=words, columns=words)
similarity_df
feature_names = np.array(words)
# For each word, rank all words by descending similarity (argsort of the
# negated row) and keep ranks 1-3. Rank 0 is skipped because it is expected to
# be the word itself - TODO confirm no other word ties with it.
similarity_df.apply(lambda row: feature_names[np.argsort(-row.values)[1:4]], axis=1)
# GloVe Model
import spacy
# Load the pretrained spaCy vectors package (the same package that is loaded
# at the top of the script).
nlp = spacy.load('en_vectors_web_lg')
# Number of rows in the pretrained vector table.
total_vectors = len(nlp.vocab.vectors)
print('Total word vectors:', total_vectors)
# Distinct corpus words, sorted: plain set iteration order depends on string
# hashing and can change between runs, which would reorder the rows below and
# alter the t-SNE layout even with a fixed random_state.
unique_words = sorted({word for sublist in tokenized_corpus for word in sublist})
# Look up the pretrained vector of every word.
word_glove_vectors = np.array([nlp(word).vector for word in unique_words])
# One row per word, one column per vector dimension.
vec_df = pd.DataFrame(word_glove_vectors, index=unique_words)
vec_df
# Reduce the pretrained word vectors to two dimensions so they can be drawn.
tsne = TSNE(
    n_components=2,
    random_state=42,
    n_iter=5000,
    perplexity=3,
)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(word_glove_vectors)
labels = unique_words

plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='red', edgecolors='r')
# Every row of T is one (x, y) point; label it with its word.
for label, (x, y) in zip(labels, T):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
# Similarity matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of pretrained word vectors.
similarity_matrix = cosine_similarity(vec_df.values)
similarity_df = pd.DataFrame(similarity_matrix, index=unique_words, columns=unique_words)
similarity_df
feature_names = np.array(unique_words)
# For each word, rank all words by descending similarity (argsort of the
# negated row) and keep ranks 1-3. Rank 0 is skipped because it is expected to
# be the word itself - TODO confirm no other word ties with it.
similarity_df.apply(lambda row: feature_names[np.argsort(-row.values)[1:4]],
axis=1)
# FastText Model
from gensim.models.fasttext import FastText
# Various Parameters
# Word vector dimensionality
feature_size = 15
# Context window size
window_context = 20
# Minimum word count (1 keeps every word, even those seen once)
min_word_count = 1
# Downsample setting for frequent words
sample = 1e-3
# 1 selects the skip-gram training algorithm
sg = 1
# NOTE(review): size= and iter= are the gensim 3.x keyword names; gensim 4
# renamed them to vector_size= and epochs= - confirm the installed version.
ft_model = FastText(tokenized_corpus, size=feature_size,
window=window_context, min_count = min_word_count,
sg=sg, sample=sample, iter=5000)
ft_model
# Visualizing the Embeddings
from sklearn.manifold import TSNE

# Vocabulary of the trained model and the matching matrix of word vectors.
words = ft_model.wv.index2word
wvs = ft_model.wv[words]

# Reduce the word vectors to two dimensions so they can be drawn.
tsne = TSNE(
    n_components=2,
    random_state=42,
    n_iter=5000,
    perplexity=5,
)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='green', edgecolors='k')
# Every row of T is one (x, y) point; label it with its word.
for label, (x, y) in zip(labels, T):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
# Embedding Operations
# Show the FastText vector for one word and its dimensionality.
print('Embedding')
print(ft_model.wv['ai'])
print('\nEmbedding Shape')
print(ft_model.wv['ai'].shape)
# Similarity score between the vectors of two words, for two different pairs.
print(ft_model.wv.similarity(w1='ai', w2='pytorch'))
print(ft_model.wv.similarity(w1='ai', w2='interview'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment