# Example class with singleton state and a fluent API
import numpy as np | |
import pandas as pd | |
import sklearn | |
import nltk | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
class Content:
    """Shared, lazily loaded dataframe exposed through a small fluent interface.

    The cleaned data is stored on the class attribute ``_cached_data``, so all
    instances share one dataframe and the source file is read only while that
    attribute is still ``None``. Methods that enrich the shared data return
    ``self`` so that calls can be chained, for example
    ``Content().tfidf_encoded().to_dataframe()``.
    """

    # Shared by every instance; stays None until the first load succeeds.
    _cached_data = None

    def __init__(self):
        self._refresh_data_cache()

    def _refresh_data_cache(self):
        """Load and clean the source data unless it is already cached."""
        if Content._cached_data is not None:
            return

        # extract my data into a dataframe. I'm pulling from a file,
        # but you might do this by fetching data from an endpoint
        # or pulling it from a database.
        df = pd.read_csv('all_my_data.csv')

        # example data cleaning. I'm dropping nulls and filling in defaults,
        # but you might do things like calculate means
        # or do other preprocessing steps you know you will want for later.
        df = df.dropna(subset=['column_where_null_values_render_row_useless_for_analysis'])
        df['column_where_null_values_mess_up_my_analysis'] = (
            df['column_where_null_values_mess_up_my_analysis'].fillna('')
        )

        Content._cached_data = df

    # example method that does additional computationally-expensive pre-processing on the data.
    # I made a separate method for this instead of the 'cheap' pre-processing done in
    # _refresh_data_cache because it takes a while (15 seconds). So we only invest that
    # time if the client explicitly _needs_ this done.
    def tfidf_encoded(self):
        """Add a 'tfidf' column to the shared dataframe if it is missing.

        The column holds one dense TF-IDF vector per row, fitted on the
        'base_content' column with 1- to 3-grams and at most 40000 features.
        Nothing is recomputed when the column already exists.

        Returns:
            This ``Content`` object, so further calls can be chained.
        """
        if 'tfidf' not in Content._cached_data.columns:
            df = Content._cached_data
            base_contents = np.array(df['base_content'])

            vocab_length = 40000
            tfidf_transformer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
            tfidf_encodings = tfidf_transformer.fit_transform(base_contents)

            # list(...) splits the dense matrix into one row vector per dataframe row.
            df['tfidf'] = list(tfidf_encodings.toarray())
            Content._cached_data = df

        # Fluent interface methods return the object calling the method so the client can chain methods together.
        return self

    # In the 'original' fluent interfaces in Java, this method is often named .get().
    # I need it because I want the value at the end of my fluent statement to be a dataframe, not a Content object,
    # but initializers must return None (so Python can give back the initialized object we initialized).
    # This method hands over the internal state as a dataframe.
    def to_dataframe(self):
        """Return a copy of the shared dataframe.

        A copy is returned so that adding or replacing columns on the result
        does not alter the cache.

        NOTE(review): pandas documents that ``DataFrame.copy`` does not
        recursively copy Python objects stored inside object-dtype columns,
        so the arrays in a 'tfidf' column are presumably still shared with
        the cache - confirm before mutating them in place.
        """
        return Content._cached_data.copy()  # Many thanks to Bijay Gurung for catching an issue with this! Now updated to work properly :)

    # You might note that the above method is not needed in the pandas fluent interface.
    # That's because the pandas callables that return dataframes are _not_ initializers of a "pd" object.
    # pd.DataFrame(), pd.read_csv(), and pd.read_excel() are all module-level callables that return a dataframe object.
    # There is no "pd" instance. This setup is convenient for users, but it's tough to mock and test.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment