Skip to content

Instantly share code, notes, and snippets.

@ivopbernardo
Created August 16, 2021 12:49
Show Gist options
  • Save ivopbernardo/38b7b4f2a41333426add8bf39e1c9994 to your computer and use it in GitHub Desktop.
Save ivopbernardo/38b7b4f2a41333426add8bf39e1c9994 to your computer and use it in GitHub Desktop.
word_vectors_cooccurrence
import wikipedia
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
def retrieve_page(page_name: str) -> list:
    '''
    Fetch a Wikipedia article and return its text as a list of tokens.

    The article content is stripped of every character found in
    string.punctuation, converted to lower case and then split
    into tokens with the NLTK word tokenizer.
    '''
    raw_text = wikipedia.page(page_name).content
    # Translation table that deletes all ASCII punctuation characters
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized_text = raw_text.translate(punctuation_remover).lower()
    return word_tokenize(normalized_text)
def build_vocabulary(page: list) -> dict:
    '''
    Builds the vocabulary of the tokenized page.

    Returns a dictionary that maps every distinct word in page to
    its position in the sorted list of distinct words, so the
    indices are deterministic for a given set of words. An empty
    page yields an empty dictionary.
    '''
    # Deduplicate first, then sort so that index order is stable
    return {word: index for index, word in enumerate(sorted(set(page)))}
def build_context(
    page: list,
    co_occurrence_vectors: pd.DataFrame,
    window: int = 2
) -> pd.DataFrame:
    '''
    Updates co-occurrence vectors based on the tokens read from the page.

    For every token in page, each token located at most `window`
    positions before or after it counts as one co-occurrence, and the
    cell [token, neighbour] of co_occurrence_vectors is incremented
    accordingly.

    page: list of tokens, in reading order.
    co_occurrence_vectors: DataFrame whose row and column labels
        contain every token of page; it is updated in place.
    window: number of neighbours considered on each side (default 2).

    Returns the same DataFrame object that was passed in.
    Raises ValueError if window is negative, and KeyError if a token
    of page is missing from the DataFrame labels.
    '''
    if window < 0:
        raise ValueError('window must be non-negative')

    # Tally every (word, neighbour) pair first so the DataFrame is
    # touched once per distinct pair instead of once per occurrence.
    pair_counts = {}
    for index, element in enumerate(page):
        # Clamp the left edge explicitly; a negative slice start would
        # wrap around to the end of the list.
        start = max(index - window, 0)
        # Slicing past the end of the list is safe, so the right edge
        # needs no clamping.
        context = page[start:index] + page[index + 1:index + window + 1]
        for word in context:
            key = (element, word)
            pair_counts[key] = pair_counts.get(key, 0) + 1

    # Update Co-Occurrence Matrix
    for (element, word), count in pair_counts.items():
        co_occurrence_vectors.loc[element, word] += count
    return co_occurrence_vectors
# Download the article and turn it into a list of lower-case tokens
usa_article_token = retrieve_page('United States of America')

# Word -> index lookup covering every distinct token of the article
vocab_dict = build_vocabulary(usa_article_token)

# Square table of zeros with one row and one column per vocabulary word
co_occurrence_vectors = pd.DataFrame(
    np.zeros((len(vocab_dict), len(vocab_dict))),
    index=list(vocab_dict),
    columns=list(vocab_dict),
)

# Fill the table with the co-occurrence counts found in the article
co_occurrence_vectors = build_context(
    usa_article_token,
    co_occurrence_vectors,
)

# Pairwise cosine similarity between the rows (word vectors) of the table
similarity_words = pd.DataFrame(
    cosine_similarity(co_occurrence_vectors),
    index=list(vocab_dict),
    columns=list(vocab_dict),
)

# Example of Top 10 words by similarity
# (a bare expression: the result is only displayed in an interactive session)
similarity_words.loc['china'].sort_values(ascending=False).head(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment