evanfrisch/Function definition of create_categorical_feature in data_cleaner.py Secret

## Function definition of create_categorical_feature in data_cleaner.py
    def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
        """Add an integer categorical column derived from the quantiles of another column.

        A ``QuantileDiscretizer`` with ``levels`` buckets is fitted on ``base_field``
        and its output is written to ``categorical_field``. That output is then cast
        to ``int`` and shifted by ``increment``.

        :param dataframe: the PySpark dataframe to extend
        :param base_field: name of the column whose values are bucketed
        :param categorical_field: name of the categorical column to create
        :param levels: number of buckets requested from the discretizer
        :param increment: value added to every level after the int cast (Default value = 0)
        :returns: a PySpark dataframe with all fields of the input plus ``categorical_field``
        """
        # fix_data_type is defined elsewhere; presumably it casts base_field to
        # 'double' so the discretizer gets numeric input -- confirm against the helper.
        numeric_df = self.fix_data_type(dataframe, [base_field], 'double')

        fitted_model = QuantileDiscretizer(
            numBuckets=levels,
            inputCol=base_field,
            outputCol=categorical_field,
        ).fit(numeric_df)
        bucketed_df = fitted_model.transform(numeric_df)

        # Replace the discretizer's output with an int level offset by `increment`.
        level_column = bucketed_df[categorical_field].cast('int') + increment
        return bucketed_df.withColumn(categorical_field, level_column)
	def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
		"""Produce a PySpark dataframe containing a categorical field based on a specified field.

		The base field is passed through ``self.fix_data_type`` with ``'double'``,
		bucketed by a ``QuantileDiscretizer`` into ``levels`` buckets, and the
		resulting column is cast to ``int`` with ``increment`` added.

		:param dataframe: the PySpark dataframe
		:param base_field: the field that provides the values used to create the categorical field
		:param categorical_field: the name of the categorical field to be created
		:param levels: the number of levels to be created in the categorical field
		:param increment: the value to add to each level (Default value = 0)
		:returns: the PySpark dataframe containing a categorical field and all fields in the supplied dataframe
		"""
		# fix_data_type lives elsewhere in the class; presumably it casts the
		# listed columns to the given type -- confirm against the helper.
		dataframe = self.fix_data_type(dataframe, [base_field], 'double')
		discretizer = QuantileDiscretizer(numBuckets=levels, inputCol=base_field, outputCol=categorical_field)
		dataframe = discretizer.fit(dataframe).transform(dataframe)
		# Overwrite the discretizer output with an int level shifted by `increment`.
		return dataframe.withColumn(categorical_field, dataframe[categorical_field].cast('int') + increment)