@sevperez
Created October 14, 2020 08:35

import numpy as np
import pandas as pd


def build_corpus(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A list of documents in bag-of-words format, containing
      tuples with (token_id, token_count) for each token in the text.
    """
    return [dictionary.doc2bow(get_token_texts(doc)) for doc in doc_list]
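

# Note: get_token_texts is not defined in this gist; it presumably extracts
# the token strings from a spaCy Doc before they are passed to doc2bow. A
# minimal sketch of what it might do (the name reuse and the filtering below
# are assumptions, not the original implementation):
def get_token_texts(doc):
    # Assumed behavior: lowercased text of each token, skipping whitespace.
    return [token.text.lower() for token in doc if not token.is_space]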


def build_td_matrix(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A term-document matrix in the form of a 2D NumPy array,
      where each row contains the count of a token in the corresponding
      document and each column index is the id of a token in the
      dictionary.
    """
    corpus = build_corpus(doc_list, dictionary)
    tdm = []
    for bow in corpus:
        vector = np.zeros(len(dictionary))
        for token_id, token_count in bow:
            vector[token_id] = token_count
        tdm.append(vector)
    return np.array(tdm)
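

# For reference, Gensim can build the same dense matrix directly from the
# bag-of-words corpus with matutils.corpus2dense, which returns a
# (num_terms, num_docs) array that we transpose to get one row per document.
# This alternative is a sketch for comparison, not part of the original gist.
def build_td_matrix_gensim(doc_list, dictionary):
    from gensim import matutils
    corpus = build_corpus(doc_list, dictionary)
    # corpus2dense defaults to float32; the loop-based version above uses
    # float64, so values match but dtypes may differ.
    return matutils.corpus2dense(corpus, num_terms=len(dictionary)).T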


def build_term_document_df(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A term-document matrix in the form of a Pandas DataFrame,
      where each row is a document and each column is a token. Values in
      the DataFrame are token counts for the given document / token.
    """
    tdm = build_td_matrix(doc_list, dictionary)
    # Order the column labels by token id so they line up with the matrix
    # columns produced by build_td_matrix.
    cols = [dictionary[token_id] for token_id in range(len(dictionary))]
    return pd.DataFrame(tdm, columns=cols, dtype="int64")

sotu_td_df = build_term_document_df(sotu_docs, sotu_dictionary)
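
# A quick sanity check on the resulting DataFrame: summing over the rows
# gives corpus-wide counts per token, so the ten most frequent terms in the
# sotu corpus can be listed like this.
print(sotu_td_df.sum(axis=0).sort_values(ascending=False).head(10))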