NLP Tutorials - Part 2: Text Representation & Word Embeddings
# Importing Libraries
!python -m spacy download en
!python -m spacy download en_vectors_web_lg
# Note: 'en_vectors_web_lg' is a spaCy 2.x package; under spaCy 3.x,
# 'en_core_web_lg' ships a comparable set of pretrained word vectors
import spacy
nlp = spacy.load('en_vectors_web_lg')
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Reading Data
data = pd.read_csv('medium_data.csv')
data.head()
data['title']
# Text Processing
STOPWORDS = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def process_text(text):
    # Strip accents, drop non-alphabetic characters, lowercase,
    # remove stopwords and lemmatize
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = " ".join([word for word in str(text).split() if word not in STOPWORDS])
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text
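# Quick sanity check on a made-up title (illustrative, not from the dataset):
# accents are stripped, digits and punctuation removed, stopwords dropped,
# and plural nouns lemmatized
print(process_text('Déjà vu: 10 Reasons Why AI Models Fail!'))
# -> something along the lines of 'deja vu reason ai model fail'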
data_sentences = data['title'].to_list()
# Defining our Corpus
corpus = data_sentences[:50]
corpus
# Processing the Corpus
# data['title'] = data['title'].map(process_text) - for an entire pandas column
process_corpus = np.vectorize(process_text)
processed_corpus = process_corpus(corpus)
processed_corpus
# Bag of Words Model - Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df = 0., max_df = 1.)
matrix = count_vect.fit_transform(processed_corpus)
matrix = matrix.toarray()
matrix
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
vocabulary = count_vect.get_feature_names_out()
pd.DataFrame(matrix, columns = vocabulary)
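# Unseen text can be encoded against the same fitted vocabulary with transform();
# words outside the vocabulary are simply ignored (the title below is made up)
new_doc = ['machine learning model for beginners']
count_vect.transform(new_doc).toarray()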
# Bag of Words Model - n-gram
count_vect_n_gram = CountVectorizer(ngram_range = (2, 2))
matrix = count_vect_n_gram.fit_transform(processed_corpus)
matrix = matrix.toarray()
matrix
vocabulary = count_vect_n_gram.get_feature_names_out()
pd.DataFrame(matrix, columns = vocabulary)
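# A sketch of a combined setting: ngram_range=(1, 2) keeps unigrams and bigrams
# together in a single vocabulary, whereas (2, 2) above keeps bigrams only
count_vect_uni_bi = CountVectorizer(ngram_range = (1, 2))
count_vect_uni_bi.fit_transform(processed_corpus).shape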
# Tf-Idf Model
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(min_df = 0., max_df = 1., use_idf = True)
tf_idf_matrix = tf_idf.fit_transform(processed_corpus)
tf_idf_matrix = tf_idf_matrix.toarray()
tf_idf_matrix
vocabulary = tf_idf.get_feature_names_out()
pd.DataFrame(np.round(tf_idf_matrix, 2), columns = vocabulary)
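# The learned idf weights are exposed on the fitted vectorizer. With scikit-learn's
# defaults (smooth_idf=True), idf(t) = ln((1 + n) / (1 + df(t))) + 1 and each row
# is then L2-normalized, so the values above are not raw tf * idf products
pd.DataFrame({'term': vocabulary, 'idf': tf_idf.idf_}).head(10)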
# Word Embeddings
# Word2Vec Model
from gensim.models import word2vec
tokenized_corpus = [nltk.word_tokenize(doc) for doc in processed_corpus]
# Parameters for Word2Vec model
# Word vector dimensionality
feature_size = 15
# Context window size
window_context = 20
# Minimum word count
min_word_count = 1
# Downsample setting for frequent words
sample = 1e-3
# Skip-gram model configuration. If not specified, the configuration is CBOW
skg = 1
# gensim 4.x renamed size -> vector_size and iter -> epochs
w2v_model = word2vec.Word2Vec(tokenized_corpus, vector_size = feature_size,
                              window = window_context, min_count = min_word_count,
                              sg = skg, sample = sample, epochs = 5000)
w2v_model
# Visualizing the data points
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# gensim 4.x renamed index2word -> index_to_key
words = w2v_model.wv.index_to_key
wvs = w2v_model.wv[words]
tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=5)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words
plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
print('Embedding')
print(w2v_model.wv['ai'])
print('\nEmbedding Shape')
print(w2v_model.wv['ai'].shape)
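# Nearest neighbours in the learned space; 'ai' appears in the processed titles
# above, but any other query word must also be in this small vocabulary
print(w2v_model.wv.most_similar('ai', topn=5))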
# Visualizing the matrix
vec_df = pd.DataFrame(wvs, index = words)
vec_df
# Similarity Matrix for the words in the given corpus
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(vec_df.values)
similarity_df = pd.DataFrame(similarity_matrix, index=words, columns=words)
similarity_df
# Top-3 most similar words for each word in the vocabulary
feature_names = np.array(words)
similarity_df.apply(lambda row: feature_names[np.argsort(-row.values)[1:4]], axis=1)
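# The trained model can be persisted and reloaded later (the path is illustrative)
w2v_model.save('w2v_medium_titles.model')
# w2v_model = word2vec.Word2Vec.load('w2v_medium_titles.model')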
# GloVe Model
import spacy
nlp = spacy.load('en_vectors_web_lg')
total_vectors = len(nlp.vocab.vectors)
print('Total word vectors:', total_vectors)
unique_words = list(set([word for sublist in tokenized_corpus for word in sublist]))
word_glove_vectors = np.array([nlp(word).vector for word in unique_words])
vec_df = pd.DataFrame(word_glove_vectors, index=unique_words)
vec_df
tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=3)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(word_glove_vectors)
labels = unique_words
plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='red', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
# Similarity matrix
similarity_matrix = cosine_similarity(vec_df.values)
similarity_df = pd.DataFrame(similarity_matrix, index=unique_words, columns=unique_words)
similarity_df
feature_names = np.array(unique_words)
similarity_df.apply(lambda row: feature_names[np.argsort(-row.values)[1:4]], axis=1)
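# spaCy computes cosine similarity over these same pretrained vectors directly;
# both query words here are illustrative and assumed to have vectors
print(nlp('ai').similarity(nlp('machine')))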
# FastText Model
from gensim.models.fasttext import FastText
# Various Parameters
feature_size = 15
window_context = 20
min_word_count = 1
# Downsample setting for frequent words
sample = 1e-3
sg = 1
# gensim 4.x renamed size -> vector_size and iter -> epochs
ft_model = FastText(tokenized_corpus, vector_size=feature_size,
                    window=window_context, min_count=min_word_count,
                    sg=sg, sample=sample, epochs=5000)
ft_model
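# Unlike Word2Vec, FastText builds vectors from character n-grams, so it can embed
# words it never saw in training (the out-of-vocabulary token below is illustrative)
print('chatbots' in ft_model.wv.key_to_index)  # likely False for this small corpus
print(ft_model.wv['chatbots'].shape)           # a vector is still returned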
# Visualizing the Embeddings
from sklearn.manifold import TSNE
words = ft_model.wv.index_to_key
wvs = ft_model.wv[words]
tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=5)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words
plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='green', edgecolors='k')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
# Embedding Operations
print('Embedding')
print(ft_model.wv['ai'])
print('\nEmbedding Shape')
print(ft_model.wv['ai'].shape)
print(ft_model.wv.similarity(w1='ai', w2='pytorch'))
print(ft_model.wv.similarity(w1='ai', w2='interview'))
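# Nearest-neighbour queries work the same way as with Word2Vec
print(ft_model.wv.most_similar('ai', topn=5))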