Skip to content

Instantly share code, notes, and snippets.

View sevperez's full-sized avatar

Severin Perez sevperez

View GitHub Profile
def search_tfidf_df(tfidf_df, text_df, query_string: str):
    """
    - Parameters: tfidf_df (Pandas DataFrame) representing a tf-idf
      matrix, text_df (Pandas DataFrame) with a "text" column and rows
      that correspond to the tfidf_df, and query_string (string).
    - Returns: A new dataframe that only contains rows from text_df where
      the corresponding tf-idf value was greater than zero for each of
      the terms in query_string. Additional columns are added to show the
      tf-idf value for each term and the sum of the tf-idf values.
    """
    # NOTE(review): no statements follow the docstring in this excerpt, so
    # the behavior described above is not implemented here -- confirm
    # against the full source.
def document_frequency(td_df, term: str):
    """
    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix, and term (string) naming one of its columns.
    - Returns: The document frequency value (int) showing the number of
      documents (rows) in td_df where term occurs at least once.
    - Raises: KeyError if term is not a column of td_df.
    """
    # Count the True entries of the boolean mask directly instead of
    # building a filtered copy of the whole matrix just to read its length.
    return int((td_df[term] > 0).sum())
def inverse_document_frequency(td_df, term: str):
def search_td_df(td_df, text_df, query_string: str):
    """
    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix, text_df (Pandas DataFrame) with a "text" column and rows
      that correspond to the td_df, and query_string (string).
    - Returns: A new dataframe that only contains rows from text_df where
      the "text" column had at least one occurrence of each term in
      query_string. Additional columns are added to show the count of
      each term and the total count of all terms.
    """
    # NOTE(review): no statements follow the docstring in this excerpt, so
    # the behavior described above is not implemented here -- confirm
    # against the full source.
def build_corpus(doc_list, dictionary):
    """
    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A list with one bag-of-words entry per document, in the
      same order as doc_list. Each entry is whatever dictionary.doc2bow
      produces for that document's token texts, i.e. tuples of
      (token_id, token_count).
    """
    corpus = []
    for document in doc_list:
        token_texts = get_token_texts(document)
        corpus.append(dictionary.doc2bow(token_texts))
    return corpus
def build_td_matrix(doc_list, dictionary):
# Load the spaCy model named "en_core_web_md" and bind it to nlp.
nlp = spacy.load("en_core_web_md")
# tokenize documents
# NOTE(review): the tokenization code announced above is not visible in
# this excerpt.
def spacy_doc(model, text, lower=True):
    """
    - Parameters: model (spaCy model), text (string), lower (bool,
      defaults to True).
    - Returns: A spaCy Document object processed using the provided
      model. Document is all lowercase if lower is True.
    """
    # NOTE(review): no statements follow the docstring in this excerpt, so
    # the behavior described above is not implemented here -- confirm
    # against the full source.
def search_df_texts(df, query_string: str):
    """
    - Parameters: df (Pandas DataFrame), query_string (string). df must
      contain a "text" column.
    - Returns: A subset of df containing only rows where each term in
      query_string appeared as a substring in df["text"]. Matching is
      case-insensitive and literal (terms are not interpreted as regular
      expressions). Rows whose text is missing never match a term. A
      query containing no terms returns every row.
    """
    # split() without an argument collapses runs of whitespace, so doubled
    # spaces or tabs in the query do not produce empty terms.
    terms = query_string.lower().split()
    # Lowercase the column once rather than once per term.
    lowered = df["text"].str.lower()

    keep = np.ones(len(df), dtype=bool)
    for term in terms:
        # regex=False: terms such as "c++" or "(" must match literally
        # instead of being compiled as patterns.
        # na=False: a missing text value counts as "no match" rather than
        # yielding NaN, which would be treated as truthy when combined.
        hits = lowered.str.contains(term, regex=False, na=False)
        keep &= hits.to_numpy(dtype=bool)
    return df[keep]
def binary_search(items, target):
    """
    Report whether target occurs in items using binary search.

    - Parameters: items (sequence sorted in ascending order), target
      (value comparable with the elements of items).
    - Returns: True if target is found in items, otherwise False
      (including when items is empty).
    """
    left = 0
    right = len(items) - 1
    while left <= right:
        mid = (left + right) // 2
        if items[mid] == target:
            return True
        # Discard the half that cannot contain target; without this step
        # the window never shrinks and the loop would not terminate.
        if items[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return False
class Receipt:
    """A receipt for a single item and its cost."""

    def __init__(self, item, cost):
        """
        - Parameters: item (value shown as the item name), cost (number).
        """
        self.item = item
        self.cost = cost

    def receipt_msg(self):
        """
        - Returns: The receipt text in the form "<item>, $<cost>", with
          the cost always shown to two decimal places.
        """
        # A fixed ".2f" format keeps trailing zeros; round() would render
        # 5.5 as "5.5" and 2 as "2", which is not a valid money amount.
        return f"{self.item}, ${self.cost:.2f}"

    def deliver(self):
        # NOTE(review): only the message construction is visible in this
        # excerpt; whatever consumes msg is not shown -- confirm against
        # the full source.
        msg = self.receipt_msg()
class Car:
    """A car that speeds up in fixed steps until it reaches a maximum."""

    def __init__(self, max_speed):
        """
        - Parameters: max_speed (number), the highest value current_speed
          may reach. The car starts at speed 0 with an acceleration rate
          of 1 per call to accelerate().
        """
        self.max_speed = max_speed
        self.current_speed = 0
        self.acceleration_rate = 1

    def accelerate(self):
        """
        Raise current_speed by acceleration_rate without exceeding
        max_speed. Does nothing once current_speed has reached max_speed.
        """
        if self.current_speed < self.max_speed:
            # Clamp the step: when the remaining headroom is smaller than
            # acceleration_rate (e.g. a fractional max_speed), a plain +=
            # would push current_speed past max_speed.
            self.current_speed = min(
                self.current_speed + self.acceleration_rate,
                self.max_speed,
            )
def double(num):
    """Return num multiplied by two."""
    doubled = num * 2
    return doubled
# Apply double() to each number and show the resulting list.
my_numbers = [1, 2, 3, 4, 5]
doubled_numbers = [double(number) for number in my_numbers]
print(doubled_numbers) # [2, 4, 6, 8, 10]