Skip to content

Instantly share code, notes, and snippets.

@shaypal5
Last active August 5, 2022 15:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shaypal5/8a4d03696744b7f7339ba3e362b355c2 to your computer and use it in GitHub Desktop.
An example of an advanced initialization of a complex pdpipe pipeline for processing pandas dataframes. 🐼🚿
from typing import Optional
import pdpipe as pdp
from pdpipe import df
from sklearn.linear_model import LogisticRegression
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator
class MyPipelineAndModel(PdPipelineAndSklearnEstimator):
    """A configurable pdpipe preprocessing pipeline paired with a
    scikit-learn logistic-regression estimator.

    Parameters
    ----------
    savings_max_val : int, optional
        Rows with a 'Savings' value above this are dropped. Defaults to 100.
    drop_gender : bool, optional
        If True, drop the 'Gender' column; otherwise label-encode it.
    standardize : bool, optional
        If True, scale all numeric columns with a StandardScaler.
    ohencode_country : bool, optional
        If True, one-hot encode 'Country'; otherwise drop it.
    savings_bin_val : int, optional
        If given, bin 'Savings' around this value (keeping the original
        column) and label-encode the resulting bin column.
    pca_threshold : int, optional
        Minimum number of tf-idf ('Quote*') columns required before PCA
        decomposition is applied to them. Defaults to 20.
    fit_intercept : bool, optional
        Passed through to the underlying LogisticRegression.
    """

    def __init__(
        self,
        savings_max_val: Optional[int] = 100,
        drop_gender: Optional[bool] = False,
        standardize: Optional[bool] = False,
        ohencode_country: Optional[bool] = True,
        savings_bin_val: Optional[int] = None,
        pca_threshold: Optional[int] = 20,
        fit_intercept: Optional[bool] = True,
    ):
        # Persist each constructor argument as a same-named attribute
        # (the sklearn estimator convention; enables get_params/cloning).
        self.savings_max_val = savings_max_val
        self.drop_gender = drop_gender
        self.standardize = standardize
        self.ohencode_country = ohencode_country
        self.savings_bin_val = savings_bin_val
        self.pca_threshold = pca_threshold
        self.fit_intercept = fit_intercept

        # Columns accumulated for a late drop / label-encode, depending on
        # the configuration flags handled below.
        drop_cols = ['Bearded']
        encode_cols = []

        # --- mandatory leading stages -------------------------------------
        stage_list = [
            # standard pipeline stages
            pdp.ColDrop(columns=pdp.cq.WithAtLeastMissingValueRate(0.2)),
            pdp.DropLabelsByValues(not_in_set=['Smoking', 'Non-Smoking']),
            pdp.EncodeLabel(),
            pdp.ColDrop(['Name'], errors='ignore'),
            # pdpipe fly-handles 🚀
            df.set_index(keys='id'),
            pdp.drop_rows_where['Savings'] > savings_max_val,
            df['Viking'] << (df['Country'].isin(['Denmark', 'Finland']) & ~df['Bearded']),
            df['YearlyGrands'] << (df['Savings'] * 1000) / df['Age'],
        ]

        # --- configuration-dependent stages -------------------------------
        if savings_bin_val:
            stage_list.append(pdp.Bin({'Savings': [savings_bin_val]}, drop=False))
            encode_cols.append('Savings_bin')
        if drop_gender:
            drop_cols.append('Gender')
        else:
            encode_cols.append('Gender')
        if ohencode_country:
            stage_list.append(pdp.OneHotEncode('Country'))
        else:
            drop_cols.append('Country')

        # Text-column processing:
        # 1. done before standardization so the tf-idf representation is
        #    also standardized;
        # 2. done after everything else so all tf-idf columns end up last
        #    in column order (for ease of presentation).
        stage_list += [
            pdp.TokenizeText('Quote'),
            pdp.SnowballStem('EnglishStemmer', columns=['Quote']),
            pdp.RemoveStopwords('English', 'Quote'),
            pdp.TfidfVectorizeTokenLists('Quote', hierarchical_labels=True),
        ]

        # PCA the tf-idf columns, but only if there are too many of them
        # (at least `pca_threshold` columns starting with 'Quote').
        stage_list.append(
            pdp.Decompose(
                transformer='PCA',
                columns=pdp.cq.StartsWith('Quote'),
                prec=pdp.cond.HasAtLeastNQualifyingColumns(
                    n=pca_threshold,
                    qualifier=pdp.cq.StartsWith('Quote'),
                ),
                exraise=False,
            )
        )

        # --- more configuration-dependent stages --------------------------
        if encode_cols:
            stage_list.append(pdp.Encode(encode_cols))
        if standardize:
            stage_list.append(pdp.Scale('StandardScaler'))

        # --- mandatory trailing stages ------------------------------------
        stage_list += [
            pdp.ColDrop(drop_cols, errors='ignore'),
            pdp.Schematize(),
            pdp.ConditionValidator([
                pdp.cond.HasAtMostNQualifyingColumns(
                    n=150,
                    qualifier=pdp.cq.AllColumns(fittable=False),
                ),
                pdp.cond.HasNoMissingValues(),
            ]),
        ]

        super().__init__(
            pipeline=pdp.PdPipeline(stage_list),
            estimator=LogisticRegression(fit_intercept=fit_intercept),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment