Last active
July 3, 2021 17:20
-
-
Save chelseatroy/b3bc4dee31c47a7944aa71ed40ea273d to your computer and use it in GitHub Desktop.
Example Class with Singleton State and Fluent API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import sklearn | |
import nltk | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
class Content:
    """Dataset wrapper with singleton state and a fluent interface.

    The cleaned dataframe is stored once on the class (``_cached_data``) and
    shared by every instance, so constructing ``Content()`` repeatedly only
    pays the loading cost the first time. Methods that enrich the data return
    ``self`` so calls can be chained, e.g.
    ``Content().tfidf_encoded().to_dataframe()``.
    """

    # Shared by all instances: the cleaned dataframe, or None until first load.
    _cached_data = None

    def __init__(self):
        """Ensure the shared dataframe is loaded, reusing it if it already is."""
        self._refresh_data_cache()

    def _refresh_data_cache(self):
        """Load and clean the data into the class-level cache if it is empty.

        Does nothing when the cache is already populated.
        """
        if Content._cached_data is not None:
            return

        # Extract the data into a dataframe. This pulls from a file, but you
        # might do this by fetching data from an endpoint or pulling it from
        # a database.
        df = pd.read_csv('all_my_data.csv')

        # Example data cleaning: drop rows that are useless without a value
        # and fill in defaults. You might also calculate means or do other
        # cheap preprocessing steps you know you will want later.
        df = df.dropna(subset=['column_where_null_values_render_row_useless_for_analysis'])
        df['column_where_null_values_mess_up_my_analysis'] = (
            df['column_where_null_values_mess_up_my_analysis'].fillna('')
        )

        Content._cached_data = df

    def tfidf_encoded(self) -> "Content":
        """Add a ``'tfidf'`` column to the shared dataframe if it is missing.

        This is kept separate from the cheap cleaning done in
        ``_refresh_data_cache`` because it is computationally expensive
        (about 15 seconds per the original author), so that time is only
        invested when a client explicitly needs the encoding. Once the column
        exists, later calls skip the work.

        Returns:
            This object, so the client can chain further method calls.
        """
        df = Content._cached_data
        if 'tfidf' not in df.columns:
            base_contents = np.array(df['base_content'])

            vocab_length = 40000
            tfidf_transformer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
            tfidf_encodings = tfidf_transformer.fit_transform(base_contents)

            # One dense row vector per record. `df` *is* the cached object,
            # so assigning the column updates the shared state in place and
            # no reassignment of the cache is needed.
            df['tfidf'] = list(tfidf_encodings.toarray())

        # Fluent interface methods return the object calling the method so
        # the client can chain methods together.
        return self

    def to_dataframe(self) -> pd.DataFrame:
        """Hand over the internal state as a dataframe, ending a fluent chain.

        In the 'original' fluent interfaces in Java, this method is often
        named ``.get()``. It is needed because the value at the end of a
        fluent statement should be a dataframe rather than a ``Content``
        object, and ``__init__`` cannot return one (initializers must return
        ``None``).

        Returns:
            A copy of the cached dataframe, so adding columns or reassigning
            values on the result does not alter the shared cache. (Thanks to
            Bijay Gurung for catching an issue with this.)
        """
        return Content._cached_data.copy()
# You might note that the to_dataframe() method above is not needed in the pandas fluent interface.
# That's because the pandas callables that return dataframes are _not_ initializers that a chain has to escape from.
# pd.DataFrame() is a class constructor, and pd.read_csv() and pd.read_excel() are module-level functions; each one returns a dataframe object directly.
# There is no "pd" instance. This setup is convenient for users, but it's tough to mock and test.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment