# chelseatroy/prepare_data.py

## prepare_data.py
import numpy as np
import pandas as pd
import sklearn
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer

class Content:
    """Fluent accessor for a dataframe that is loaded once and cached on the class.

    Constructing an instance makes sure the cleaned data is cached; chainable
    methods such as ``tfidf_encoded`` enrich that cache and return ``self``;
    ``to_dataframe`` ends the chain by handing back a copy of the cached
    dataframe.
    """

    # Class-level cache shared by every instance. It stays None until the
    # first Content() is constructed.
    _cached_data = None

    def __init__(self):
        self._refresh_data_cache()

    def _refresh_data_cache(self):
        """Read and clean the source data unless a cached copy already exists."""
        if Content._cached_data is not None:
            return

        # Extraction step. This example pulls from a file, but the data could
        # just as well be fetched from an endpoint or pulled from a database.
        frame = pd.read_csv('all_my_data.csv')

        # Example of cheap cleaning: drop rows that are useless without a
        # value, and fill in a default elsewhere. Other inexpensive
        # preprocessing you know you will want later (means, etc.) fits here.
        required_column = 'column_where_null_values_render_row_useless_for_analysis'
        defaulted_column = 'column_where_null_values_mess_up_my_analysis'
        frame = frame.dropna(subset=[required_column])
        frame[defaulted_column] = frame[defaulted_column].fillna('')

        Content._cached_data = frame

    def tfidf_encoded(self):
        """Add a 'tfidf' column to the cached dataframe if it is not there yet.

        This is kept apart from the cheap cleaning in ``_refresh_data_cache``
        because it is computationally expensive (the original author reports
        roughly 15 seconds), so the cost is only paid when a client
        explicitly asks for the encoding.

        Returns:
            This same object: fluent interface methods return the object
            they were called on so the client can chain further calls.
        """
        cached = Content._cached_data
        if 'tfidf' in cached.columns:
            return self

        documents = np.array(cached['base_content'])
        max_vocabulary = 40000

        vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_vocabulary)
        encoded = vectorizer.fit_transform(documents)

        # One dense vector per row, stored as the cell value of that row.
        cached['tfidf'] = list(encoded.toarray())
        Content._cached_data = cached
        return self

    def to_dataframe(self):
        """Return a copy of the cached dataframe, ending a fluent chain.

        In the 'original' fluent interfaces in Java this method is often
        named ``.get()``. It exists because the value at the end of a fluent
        statement should be a dataframe rather than a Content object, and an
        initializer cannot provide that: ``__init__`` must return None.
        """
        # Many thanks to Bijay Gurung for catching an issue with this!
        # The dataframe is copied before it is handed over.
        snapshot = Content._cached_data.copy()
        return snapshot
    # You might note that the above method is not needed in the pandas fluent interface.
    # That's because the pandas calls that return dataframes are _not_ initializers of a wrapper object like Content.
    # pd.DataFrame(), pd.read_csv(), and pd.read_excel() are all called on the pandas _module_ and hand back a dataframe object directly.
    # There is no "pd" instance. This setup is convenient for users, but it's tough to mock and test.
	import numpy as np
	import pandas as pd
	import sklearn
	import nltk

	from sklearn.feature_extraction.text import TfidfVectorizer

	class Content():
		"""Fluent accessor for a dataframe that is loaded once and cached on the class.

		Creating an instance populates the class-level cache if it is empty.
		``tfidf_encoded`` adds an expensive derived column on demand and
		returns ``self`` for chaining; ``to_dataframe`` returns a copy of the
		cached dataframe.
		"""

		# Shared by every instance; None until the data has been loaded once.
		_cached_data = None

		def __init__(self):
			self._refresh_data_cache()

		def _refresh_data_cache(self):
			"""Load and clean the data into the class-level cache, once."""
			if Content._cached_data is None:
				# extract my data into a dataframe. I'm pulling from a file,
				# but you might do this by fetching data from an endpoint
				# or pulling it from a database.
				df = pd.read_csv('all_my_data.csv')

				# example data cleaning. I'm dropping nulls and filling in defaults,
				# but you might do things like calculate means
				# or do other preprocessing steps you know you will want for later.
				df = df.dropna(subset=['column_where_null_values_render_row_useless_for_analysis'])
				df['column_where_null_values_mess_up_my_analysis'] = df['column_where_null_values_mess_up_my_analysis'].fillna('')

				Content._cached_data = df

		def tfidf_encoded(self):
			"""Ensure the cached dataframe has a 'tfidf' column, then return self.

			Example of additional, computationally expensive pre-processing.
			It lives in its own method, separate from the cheap cleaning in
			``_refresh_data_cache``, because it takes a while (the original
			author reports about 15 seconds), so that time is only invested
			when the client explicitly needs it.

			Returns:
				The object the method was called on, so calls can be chained.
			"""
			if 'tfidf' not in Content._cached_data.columns:
				df = Content._cached_data

				base_contents = np.array(df['base_content'])
				vocab_length = 40000

				tfidf_transformer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
				tfidf_encodings = tfidf_transformer.fit_transform(base_contents)

				# One dense vector per row, stored as that row's cell value.
				df['tfidf'] = list(tfidf_encodings.toarray())
				Content._cached_data = df
			# Fluent interface methods return the object calling the method so the client can chain methods together.
			return self

		def to_dataframe(self):
			"""Return a copy of the cached dataframe.

			In the 'original' fluent interfaces in Java, this method is often
			named ``.get()``. It is needed because the value at the end of a
			fluent statement should be a dataframe, not a Content object, and
			initializers must return None.
			"""
			# Many thanks to Bijay Gurung for catching an issue with this! Now updated to work properly :)
			return Content._cached_data.copy()
	# You might note that the above method is not needed in the pandas fluent interface.
	# That's because the pandas calls that return dataframes are _not_ initializers of a wrapper object like Content.
	# pd.DataFrame(), pd.read_csv(), and pd.read_excel() are all called on the pandas _module_ and hand back a dataframe object directly.
	# There is no "pd" instance. This setup is convenient for users, but it's tough to mock and test.