# ivopbernardo/cooccurrence_example.py

## cooccurrence_example.py
import wikipedia
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_page(page_name: str) -> list:
    '''
    Downloads the wikipedia page called page_name and
    returns its text as a list of lower case tokens
    with the punctuation removed.
    '''
    raw_text = wikipedia.page(page_name).content
    # Translation table that deletes every character found
    # in string.punctuation
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized_text = raw_text.translate(punctuation_remover).lower()
    # Split the cleaned text with the NLTK word tokenizer
    return word_tokenize(normalized_text)

def build_vocabulary(page: list) -> dict:
    '''
    Builds the vocabulary of the tokens present in page.

    Parameters:
        page: list of tokens; repeated tokens are allowed.

    Returns:
        dict that maps every distinct token to its position
        in the sorted list of distinct tokens. An empty page
        produces an empty dict.
    '''
    # set() drops the repeated tokens and sorted() makes the
    # index given to each token deterministic.
    return {word: index for index, word in enumerate(sorted(set(page)))}

def build_context(
    page: list,
    co_occurrence_vectors: pd.DataFrame,
    window: int = 2
) -> pd.DataFrame:
    '''
    Updates co-occurrence vectors based on
    text read from the page.

    For every token of page, the cell [token, neighbour] is
    increased by one for each neighbour found at most window
    positions before or after the token.

    Parameters:
        page: sequence of tokens, in reading order.
        co_occurrence_vectors: DataFrame that has every token of
            page both as a row label and as a column label. It is
            modified in place.
        window: number of positions looked at on each side of a
            token. The default of 2 keeps the original behaviour.

    Returns:
        The same co_occurrence_vectors object, after the update.

    Raises:
        ValueError: if window is negative.
        KeyError: if a token of page is not a label of
            co_occurrence_vectors.
    '''
    from collections import Counter

    if window < 0:
        raise ValueError('window must be zero or positive')

    # Count the pairs in plain Python first: writing to the DataFrame
    # once per distinct pair is far cheaper than one .loc read and
    # write for every single occurrence.
    pair_counts = Counter()
    for index, element in enumerate(page):
        # Slices are clamped at the end of the sequence, so only the
        # start has to be protected against negative positions.
        start = max(0, index - window)
        context = page[start:index] + page[index + 1:index + window + 1]
        for word in context:
            pair_counts[(element, word)] += 1

    # Update Co-Occurrence Matrix
    for (element, word), count in pair_counts.items():
        co_occurrence_vectors.loc[element, word] += count

    return co_occurrence_vectors

# Download and tokenize the article, then index its vocabulary
usa_article_token = retrieve_page('United States of America')
vocab_dict = build_vocabulary(usa_article_token)

# Square matrix of zeros with one row and one column per vocabulary word
co_occurrence_vectors = pd.DataFrame(
    np.zeros([len(vocab_dict), len(vocab_dict)]),
    index = vocab_dict.keys(),
    columns = vocab_dict.keys()
)


# Fill the matrix with the co-occurrence counts of the article
co_occurrence_vectors = build_context(
    usa_article_token,
    co_occurrence_vectors
)

# Cosine similarity between the rows (one per word) of the matrix
similarity_words = pd.DataFrame(
    cosine_similarity(co_occurrence_vectors),
    columns = vocab_dict.keys(),
    index = vocab_dict.keys()
)

# Example of Top 10 words by similarity
# The result is printed because a bare expression is only echoed by
# an interactive session; run as a script it would be thrown away.
top_similar_words = (
    similarity_words.loc['china'].sort_values(ascending=False).head(10)
)
print(top_similar_words)
	import wikipedia
	import pandas as pd
	import numpy as np
	import string
	from nltk.tokenize import word_tokenize
	from sklearn.metrics.pairwise import cosine_similarity

	def retrieve_page(page_name: str) -> list:
		'''
		Retrieves page data from wikipedia
		and stores words in lower case format in
		a list - tokenized format.
		'''
		article = wikipedia.page(page_name)
		# Strip punctuation from page
		article_text = (
			article.content.translate(str.maketrans('', '', string.punctuation))
		)
		# Lower text case
		article_text = article_text.lower()
		# Tokenize using NLTK word tokenizer
		article_tokens = word_tokenize(article_text)
		return article_tokens

	def build_vocabulary(page: list) -> dict:
		'''
		Builds the vocabulary of the tokens present in page.

		Parameters:
			page: list of tokens; repeated tokens are allowed.

		Returns:
			dict that maps every distinct token to its position
			in the sorted list of distinct tokens. An empty page
			produces an empty dict.
		'''
		# set() drops the repeated tokens and sorted() makes the
		# index given to each token deterministic.
		return {word: index for index, word in enumerate(sorted(set(page)))}

	def build_context(
		page: list,
		co_occurrence_vectors: pd.DataFrame,
		window: int = 2
	) -> pd.DataFrame:
		'''
		Updates co-occurrence vectors based on
		text read from the page.

		For every token of page, the cell [token, neighbour] is
		increased by one for each neighbour found at most window
		positions before or after the token.

		Parameters:
			page: sequence of tokens, in reading order.
			co_occurrence_vectors: DataFrame that has every token of
				page both as a row label and as a column label. It is
				modified in place.
			window: number of positions looked at on each side of a
				token. The default of 2 keeps the original behaviour.

		Returns:
			The same co_occurrence_vectors object, after the update.

		Raises:
			ValueError: if window is negative.
			KeyError: if a token of page is not a label of
				co_occurrence_vectors.
		'''
		from collections import Counter

		if window < 0:
			raise ValueError('window must be zero or positive')

		# Count the pairs in plain Python first: writing to the DataFrame
		# once per distinct pair is far cheaper than one .loc read and
		# write for every single occurrence.
		pair_counts = Counter()
		for index, element in enumerate(page):
			# Slices are clamped at the end of the sequence, so only the
			# start has to be protected against negative positions.
			start = max(0, index - window)
			context = page[start:index] + page[index + 1:index + window + 1]
			for word in context:
				pair_counts[(element, word)] += 1

		# Update Co-Occurrence Matrix
		for (element, word), count in pair_counts.items():
			co_occurrence_vectors.loc[element, word] += count

		return co_occurrence_vectors

	# Download and tokenize the article, then index its vocabulary
	usa_article_token = retrieve_page('United States of America')
	vocab_dict = build_vocabulary(usa_article_token)

	# Square matrix of zeros with one row and one column per vocabulary word
	co_occurrence_vectors = pd.DataFrame(
		np.zeros([len(vocab_dict), len(vocab_dict)]),
		index = vocab_dict.keys(),
		columns = vocab_dict.keys()
	)


	# Fill the matrix with the co-occurrence counts of the article
	co_occurrence_vectors = build_context(
		usa_article_token,
		co_occurrence_vectors
	)

	# Cosine similarity between the rows (one per word) of the matrix
	similarity_words = pd.DataFrame(
		cosine_similarity(co_occurrence_vectors),
		columns = vocab_dict.keys(),
		index = vocab_dict.keys()
	)

	# Example of Top 10 words by similarity
	# The result is printed because a bare expression is only echoed by
	# an interactive session; run as a script it would be thrown away.
	top_similar_words = (
		similarity_words.loc['china'].sort_values(ascending=False).head(10)
	)
	print(top_similar_words)