# imports
import spacy
from gensim.corpora import Dictionary

# load spaCy model
nlp = spacy.load("en_core_web_md")
# tokenize documents
def spacy_doc(model, text, lower=True):
    """
    - Parameters: model (spaCy model), text (string), lower (bool).
    - Returns: A spaCy Document object processed using the provided
      model. Document is all lowercase if lower is True.
    """
    if lower:
        text = text.lower()
    return model(text)

sotu_docs = [spacy_doc(nlp, text) for text in sotu_df["text"]]
# build dictionary
def get_token_texts(doc):
    """
    - Parameters: doc (spaCy Document object).
    - Returns: A list of strings based on the text value of each token
      in doc.
    """
    return [token.text for token in doc]

def build_dictionary(doc_list):
    """
    - Parameters: doc_list (list of spaCy Document objects).
    - Returns: A Gensim Dictionary, built using the tokens in each
      document contained in doc_list.
    """
    return Dictionary([get_token_texts(doc) for doc in doc_list])

sotu_dictionary = build_dictionary(sotu_docs)
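
# Usage sketch (not part of the original gist): one common next step is to
# convert each document into a bag-of-words vector with the dictionary,
# e.g. as input to a Gensim topic model. Assumes sotu_docs and
# sotu_dictionary defined above; the variable name sotu_corpus is illustrative.
sotu_corpus = [sotu_dictionary.doc2bow(get_token_texts(doc)) for doc in sotu_docs]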