@sevperez
Created October 14, 2020 08:35

import numpy as np
import pandas as pd


def build_corpus(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A list of documents in bag-of-words format, containing
      tuples with (token_id, token_count) for each token in the text.
    """
    return [dictionary.doc2bow(get_token_texts(doc)) for doc in doc_list]
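

# Note: get_token_texts is not defined in this gist; it presumably extracts
# the token strings from a spaCy Doc before they are passed to doc2bow. A
# minimal sketch of what it might do (the name reuse and the filtering below
# are assumptions, not the original implementation):
def get_token_texts(doc):
    # Assumed behavior: lowercased text of each token, skipping whitespace.
    return [token.text.lower() for token in doc if not token.is_space]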


def build_td_matrix(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A term-document matrix in the form of a 2D NumPy array,
      where each row contains the count of a token in the corresponding
      document and each column index is the id of a token in the
      dictionary.
    """
    corpus = build_corpus(doc_list, dictionary)
    tdm = []
    for bow in corpus:
        vector = np.zeros(len(dictionary))
        for token_id, token_count in bow:
            vector[token_id] = token_count
        tdm.append(vector)
    return np.array(tdm)
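

# For reference, Gensim can build the same dense matrix directly from the
# bag-of-words corpus with matutils.corpus2dense, which returns a
# (num_terms, num_docs) array that we transpose to get one row per document.
# This alternative is a sketch for comparison, not part of the original gist.
def build_td_matrix_gensim(doc_list, dictionary):
    from gensim import matutils
    corpus = build_corpus(doc_list, dictionary)
    # corpus2dense defaults to float32; the loop-based version above uses
    # float64, so values match but dtypes may differ.
    return matutils.corpus2dense(corpus, num_terms=len(dictionary)).T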


def build_term_document_df(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A term-document matrix in the form of a Pandas DataFrame,
      where each row is a document and each column is a token. Values in
      the DataFrame are token counts for the given document / token.
    """
    tdm = build_td_matrix(doc_list, dictionary)
    # Order the column labels by token id so they line up with the matrix
    # columns produced by build_td_matrix.
    cols = [dictionary[token_id] for token_id in range(len(dictionary))]
    return pd.DataFrame(tdm, columns=cols, dtype="int64")

sotu_td_df = build_term_document_df(sotu_docs, sotu_dictionary)
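
# A quick sanity check on the resulting DataFrame: summing over the rows
# gives corpus-wide counts per token, so the ten most frequent terms in the
# sotu corpus can be listed like this.
print(sotu_td_df.sum(axis=0).sort_values(ascending=False).head(10))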