APA interactive plots
import numpy as np


def standardize(gamma, upper_val=0.3, lower_val=0.1):
    s = (gamma - np.min(gamma)) / (np.max(gamma) - np.min(gamma))
    out = s * (upper_val - lower_val) + lower_val
    return out


def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0)


def adjusting_assignment_level(level):
    # Shift applied to the standardized assignment score; +/-0.545 moves the
    # average propensity score to roughly 0.65 / 0.35 for an approximately
    # standard-normal score, matching the 'high' / 'low' labels.
    adjustment_dict = {'low': -0.545, 'medium': 0, 'high': 0.545}
    return adjustment_dict[level]


def revert_string_prob(string):
    reverse_dict = {'low': 0.35, 'medium': 0.5, 'high': 0.65}
    return reverse_dict[string]
# functions describing the relation between X and y (g_0(X))
def linear_simple(self):
    return np.dot(self.X, self.weights_covariates_to_outputs)


def linear_interaction(self):
    return np.dot(self.X, self.weights_covariates_to_outputs) \
        + np.dot(self.X_interaction, self.weights_interaction)


def partial_nonlinear_simple(self):
    return 2.5*np.cos(np.dot(self.X, self.weights_covariates_to_outputs))**3 \
        + 2.5*0.2*np.dot(self.X, self.weights_covariates_to_outputs)


def partial_nonlinear_interaction(self):
    return 2.5*np.cos(np.dot(self.X, self.weights_covariates_to_outputs)
                      + np.dot(self.X_interaction, self.weights_interaction))**3 \
        + 2.5*0.2*(np.dot(self.X, self.weights_covariates_to_outputs)
                   + np.dot(self.X_interaction, self.weights_interaction))


def nonlinear_simple(self):
    return 3*np.cos(np.dot(self.X, self.weights_covariates_to_outputs))**3


def nonlinear_interaction(self):
    return 3*np.cos(np.dot(self.X, self.weights_covariates_to_outputs)
                    + np.dot(self.X_interaction, self.weights_interaction))**3


relation_dict = {'linear_simple': linear_simple,
                 'linear_interaction': linear_interaction,
                 'partial_nonlinear_simple': partial_nonlinear_simple,
                 'partial_nonlinear_interaction': partial_nonlinear_interaction,
                 'nonlinear_simple': nonlinear_simple,
                 'nonlinear_interaction': nonlinear_interaction}


# function to call in the main class
def relation_fct(self, x_y_relation):
    function = relation_dict.get(x_y_relation)
    return function(self)
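For orientation, a minimal sketch (not part of the gist) of how these helpers are meant to be used; the SimpleNamespace object merely stands in for a SimData instance carrying the attributes the relation functions expect:

import numpy as np
from types import SimpleNamespace
from helpers import relation_fct, standardize  # assumes the file above is saved as helpers.py

# stand-in for a SimData object with the attributes the relation functions use
X = np.random.normal(size=(5, 3))
dummy = SimpleNamespace(
    X=X,
    weights_covariates_to_outputs=np.random.beta(1, 5, 3),
    X_interaction=X[:, [0]] * X[:, [1]],
    weights_interaction=np.random.beta(1, 5, 1),
)
print(relation_fct(dummy, 'linear_interaction'))  # X*b plus interaction term
print(relation_fct(dummy, 'nonlinear_simple'))    # 3*cos(X*b)**3
print(standardize(np.arange(5.0), upper_val=0.9, lower_val=0.1))  # maps to [0.1, 0.9]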
import matplotlib.pyplot as plt
import seaborn as sns


def output_difference_plt(y_treated_continuous, y_not_treated_continuous,
                          y_treated_binary, y_not_treated_binary):
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].set_title('Continuous output distributions')
    axes[1].set_title('Binary output distributions')
    axes[0].set_xlabel('y')
    axes[1].set_xlabel('y')
    axes[0].set_ylabel('Density')
    axes[1].set_ylabel('Density')
    # Note: sns.distplot is deprecated in recent seaborn releases;
    # kdeplot is the closest replacement for hist=False, kde=True.
    sns.distplot(y_not_treated_continuous, hist=False, kde=True, ax=axes[0],
                 kde_kws={'linewidth': 4, 'color': 'darkblue'}, label='y not treated')
    sns.distplot(y_treated_continuous, hist=False, kde=True, ax=axes[0],
                 kde_kws={'linewidth': 4, 'color': 'darkred'}, label='y treated')
    sns.distplot(y_not_treated_binary, hist=False, kde=True, ax=axes[1],
                 kde_kws={'linewidth': 4, 'color': 'darkblue'}, label='y not treated')
    sns.distplot(y_treated_binary, hist=False, kde=True, ax=axes[1],
                 kde_kws={'linewidth': 4, 'color': 'darkred'}, label='y treated')
    sns.despine(right=True, top=True)
    plt.setp(axes[1], xticks=[0, 1])
    plt.tight_layout()


def x_y_relation_plot(y, g_0_X):
    fig, axes = plt.subplots(1, 1, figsize=(6, 5))
    axes.set_title('y ~ X relation')
    axes.set_xlabel('X * b')
    axes.set_ylabel('y')
    axes.set_ylim([-7, 7])
    axes.set_xlim([-5, 5])
    sns.scatterplot(g_0_X, y, ax=axes, color='darkblue', s=18)
    sns.despine(left=False, right=True, top=True)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
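Below, a small self-contained sketch (assuming the file above is saved as interactive_plots_functions.py and an older seaborn version matching the gist, i.e. one that still provides distplot and positional scatterplot arguments) that drives the plotting helpers with synthetic stand-in data:

import numpy as np
import matplotlib.pyplot as plt
from interactive_plots_functions import output_difference_plt, x_y_relation_plot

rng = np.random.default_rng(0)
y_not_treated = rng.normal(0.0, 1.0, 5000)     # stand-ins for opossum outputs
y_treated = rng.normal(1.0, 1.0, 5000)
y_not_treated_bin = rng.binomial(1, 0.4, 5000)
y_treated_bin = rng.binomial(1, 0.6, 5000)

output_difference_plt(y_treated, y_not_treated, y_treated_bin, y_not_treated_bin)
x_y_relation_plot(y_treated, rng.normal(0.0, 1.5, 5000))
plt.show()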
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a11a12339a6146518a44beb067cefe6c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"interactive(children=(IntSlider(value=5, description='intensity', max=10, min=1), Output()), _dom_classes=('wi…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import ipywidgets as widgets\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"\n",
"from interactive_plots_functions import output_difference_plt\n",
"from opossum import UserInterface\n",
"\n",
"\n",
"def output_difference_interaction_fct(intensity):\n",
" u = UserInterface(10000,10, seed=7, categorical_covariates = None)\n",
" u.generate_treatment(intensity = intensity)\n",
"\n",
" y_continuous, X_continuous, assignment_continuous, treatment_continuous = u.output_data(False)\n",
" y_binary, X_binary, assignment_binary, treatment_binary = u.output_data(True)\n",
"\n",
"\n",
"\n",
" y_treated_continuous = y_continuous[assignment_continuous==1]\n",
" y_not_treated_continuous = y_continuous[assignment_continuous==0]\n",
" y_treated_binary = y_binary[assignment_binary==1]\n",
" y_not_treated_binary = y_binary[assignment_binary==0]\n",
"\n",
" \n",
" output_difference_plt(y_treated_continuous, y_not_treated_continuous, \n",
" y_treated_binary, y_not_treated_binary)\n",
" \n",
"\n",
"\n",
"interactive(output_difference_interaction_fct, intensity=(1,10))\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
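Outside of Jupyter, the cell above can be run as a plain script with one fixed intensity instead of the slider; a sketch under the same import assumptions as the notebook:

import matplotlib.pyplot as plt
from interactive_plots_functions import output_difference_plt
from opossum import UserInterface

intensity = 5  # fixed value replacing the IntSlider
u = UserInterface(10000, 10, seed=7, categorical_covariates=None)
u.generate_treatment(intensity=intensity)

y_c, X_c, assignment_c, treatment_c = u.output_data(False)
y_b, X_b, assignment_b, treatment_b = u.output_data(True)

output_difference_plt(y_c[assignment_c == 1], y_c[assignment_c == 0],
                      y_b[assignment_b == 1], y_b[assignment_b == 0])
plt.show()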
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from helpers import standardize, is_pos_def, adjusting_assignment_level, \
    revert_string_prob, relation_fct


class SimData:
    """
    Main class that the package is built on
    """
    def __init__(self, N, k, seed):
        '''
        Parameters:
            N (int): Number of observations
            k (int): Number of covariates

        Attributes:
            weights_treatment_assignment (numpy array): Weight vector, drawn
                from a uniform distribution U(0,1), of length k. It is used to
                weight covariates when assigning treatment non-randomly and
                when creating heterogeneous treatment effects.
            weights_covariates_to_outputs (numpy array): Weight vector, drawn
                from a beta distribution Beta(1,5), of length k. It is used to
                weight covariate importance when creating output y from X.
            z_set_size_assignment (int): Number of covariates in subset Z of X
                that are used to assign treatment non-randomly.
            z_set_size_treatment (int): Number of covariates in subset Z of X
                that are used to create heterogeneous treatment effects.
            interaction_num (int): Number of interaction terms that are
                randomly created if chosen in output creation.
        '''
        if seed is not None:
            np.random.seed(seed)  # for reproducibility

        self.N = N  # number of observations
        self.k = k  # number of covariates

        # initializing weight vector for treatment assignment
        # using random weights from U[0,1]
        self.weights_treatment_assignment = np.random.uniform(0, 1, self.k)
        # doing the same for the relation of X and y with a
        # beta distribution (alpha=1, beta=5)
        self.weights_covariates_to_outputs = np.random.beta(1, 5, self.k)

        # set size of subset Z of X for heterogeneous treatment creation
        self.z_set_size_treatment = int(self.k/2)
        # set size of subset Z of X for non-random treatment assignment
        self.z_set_size_assignment = int(self.k/2)
        # set number of covariates used for creating interaction terms of X
        self.interaction_num = int(np.sqrt(self.k))
    def generate_covariates(self, categorical_covariates):
        """
        Generates the covariates matrix

        Parameters:
            categorical_covariates (int or list): Either an int, indicating the
                number of categories that all covariates are made of; a list
                with 2 ints, the first int indicating the number of covariates
                and the second the number of categories; or a list with one int
                and a list of ints, where the list of ints includes the
                different numbers of categories wanted.
        ...
        Returns:
            None
        """
        A = np.random.rand(self.k, self.k)
        # to allow for negative correlations
        overlay_matrix = np.random.randint(2, size=(self.k, self.k))
        overlay_matrix[overlay_matrix == 0] = -1
        # correcting for the number of covariates
        A = (10/(self.k)) * A * overlay_matrix

        # assuring positive definiteness
        sigma = np.dot(A, A.transpose())
        # positive definite check
        if not is_pos_def(sigma):
            raise ValueError('sigma is not positive definite!')

        # expected values
        mu = np.repeat(0, self.k)
        # final covariates
        X = np.random.multivariate_normal(mu, sigma, self.N)

        ### Categorical variables ###
        if categorical_covariates is None:
            self.X = X
            return None

        # Single integer: all covariates become categorical with int categories
        if type(categorical_covariates) == int:
            # standardizing column-wise to [0,1]
            X = (X - np.min(X, axis=0))/(np.max(X, axis=0)-np.min(X, axis=0))
            X_categorical = np.zeros(X.shape)
            # creating categorical variables with the chosen number of categories
            for c in range(categorical_covariates-1):
                X_categorical += np.random.binomial(1, X)
            X = X_categorical

        elif type(categorical_covariates) == list and len(categorical_covariates) == 2:
            num_cat_covariates = categorical_covariates[0]
            if num_cat_covariates > self.k:
                raise Warning("Number of categorical variables ({}) is greater "
                              "than number of covariates ({}). \nAll {} "
                              "covariates are made categorical."
                              .format(num_cat_covariates, self.k, self.k))
            X_cat_part = X[:, :num_cat_covariates]
            # standardizing column-wise to [0,1]
            X_cat_part = (X_cat_part - np.min(X_cat_part, axis=0)) \
                / (np.max(X_cat_part, axis=0)-np.min(X_cat_part, axis=0))
            X_categorical = np.zeros(X_cat_part.shape)

            # List with 2 ints: the chosen number of covariates becomes
            # categorical with the chosen number of categories
            if type(categorical_covariates[1]) == int:
                num_categories = categorical_covariates[1]
                # creating categorical variables with the chosen number of categories
                for c in range(num_categories-1):
                    X_categorical += np.random.binomial(1, X_cat_part)

            # List with int and list of ints: the chosen number of covariates
            # becomes categorical according to the chosen list of category numbers
            elif type(categorical_covariates[1]) == list:
                num_categories_list = categorical_covariates[1]
                # creating a vector with the wanted category numbers
                category_type_vector = np.array((num_cat_covariates //
                                                 len(num_categories_list) + 1)
                                                * num_categories_list)
                # making sure it has the wanted length, which is the number of
                # wanted categorical covariates
                category_type_vector = category_type_vector[:num_cat_covariates]

                start = 0
                end = 0
                # selecting the wanted category numbers one by one
                for num_categories in num_categories_list:
                    end += np.sum(category_type_vector == num_categories)
                    # adding up Bernoulli outcomes to get categorical variables
                    for c in range(num_categories-1):
                        X_categorical[:, start:end] += np.random.binomial(
                            1, X_cat_part[:, start:end])
                    start = end
            else:
                raise ValueError("categorical_covariates needs to be either an "
                                 "int, a list of 2 ints, or a list of one int "
                                 "and a list of ints. \nMake sure that the "
                                 "second item of the list is an int or a list "
                                 "of ints")
            X[:, :num_cat_covariates] = X_categorical
        else:
            raise ValueError("categorical_covariates needs to be either an int, "
                             "a list of 2 ints, or a list of one int and a list "
                             "of ints. \nMake sure that it is a list of length "
                             "2 or a single int")
        self.X = X
        return None
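    # Illustrative only (not executed): the three accepted shapes of
    # `categorical_covariates`, per the docstring above; the numbers are
    # example values.
    #   generate_covariates(3)           -> all k covariates categorical with 3 categories
    #   generate_covariates([5, 3])      -> first 5 covariates categorical with 3 categories
    #   generate_covariates([5, [2, 4]]) -> first 5 covariates categorical, with 2 or 4 categories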
    def generate_treatment_assignment(self, random, assignment_prob):
        """
        Generates the treatment assignment vector

        Parameters:
            random (boolean): If True, treatment is assigned randomly
                according to the assignment_prob parameter. If False, treatment
                assignment is determined depending on covariates.
                (default is True)
            assignment_prob (float or string): The probability with which
                treatment is assigned. In the case of random assignment, it can
                be a float with 0 < prob < 1. If assignment is not random it
                should be one of the following strings: 'low', 'medium', 'high'.
                The strings stand for the values 0.35, 0.5, 0.65 respectively
                and can also be used in the random case.
                (default is 0.5)
        ...
        Returns:
            None
        """
        # random treatment assignment
        if random:
            m_0 = assignment_prob  # probability
            # reverting strings like 'low' to a float prob like 0.35, if necessary
            if isinstance(m_0, str):
                m_0 = revert_string_prob(m_0)
            # propensity scores for each observation
            self.propensity_scores = np.repeat(m_0, self.N)
        else:
            # creating an index for the selection of covariates for assignment
            a_idx = np.concatenate((np.zeros(self.k - self.z_set_size_assignment),
                                    np.ones(self.z_set_size_assignment)))
            np.random.shuffle(a_idx)
            # selecting covariates
            X_a = self.X[:, a_idx == 1].copy()

            noise = np.random.uniform(0, 0.25, self.N)
            a = np.dot(X_a, self.weights_treatment_assignment[a_idx == 1]) \
                + noise
            try:
                # get value that adjusts z and thus the propensity scores
                z_adjustment = adjusting_assignment_level(assignment_prob)
            except KeyError:
                z_adjustment = 0
                # making sure the default of 0.5 does not give a warning
                if assignment_prob != 0.5:
                    raise Warning('When assignment is not random, '
                                  'assignment_prob can only be \'low\': 0.35, '
                                  '\'medium\': 0.5, or \'high\': 0.65. '
                                  'Medium is chosen instead.')
            # using the empirical mean and standard deviation
            a_mean = np.mean(a)
            a_sigma = np.std(a)
            # normalizing the 'a' vector and adjusting it if chosen
            z = (a - a_mean) / a_sigma + z_adjustment
            # using the normalized vector z to get probabilities from the normal
            # cdf, to later assign treatment with a binomial draw in D
            m_0 = stats.norm.cdf(z)
            # propensity scores for each observation
            self.propensity_scores = m_0

        # creating an array from a binomial distribution that assigns treatment
        # according to probability m_0
        self.D = np.random.binomial(1, m_0, self.N)
        return None
    def generate_treatment_effect(self, treatment_option_weights, constant_pos,
                                  constant_neg, heterogeneity_pos,
                                  heterogeneity_neg, no_treatment,
                                  discrete_heterogeneity, intensity):
        """
        Generates the chosen kinds of treatment effects

        Parameters:
            constant_pos (boolean): If True, the treatment effect is a positive
                constant.
                (default is True)
            constant_neg (boolean): If True, the treatment effect is a negative
                constant.
                (default is False)
            heterogeneity_pos (boolean): If True, the treatment effect is
                positive and heterogeneous, i.e. it depends on a number of
                covariates and varies in size.
                (default is False)
            heterogeneity_neg (boolean): If True, the treatment effect is
                negative and heterogeneous, i.e. it depends on a number of
                covariates and varies in size.
                (default is False)
            no_treatment (boolean): If True, then there is no treatment effect.
                (default is False)
            discrete_heterogeneity (boolean): If True, then the treatment
                effect consists of 2 values of different size. The size
                is determined by a subset of covariates.
                (default is False)
            treatment_option_weights (list): List of length 6 with weights of
                the wanted treatment effects in the following order:
                [const_pos, const_neg, heterogeneous_pos, heterogeneous_neg,
                no_treatment, discrete_heterogeneous]. Its values need to sum
                up to 1. If given, the values overwrite the boolean parameters
                for each treatment effect.
                (default is None)
            intensity (int or float): Value that affects the size of the
                treatment effect. Needs to be between 1 and 10. The actual
                magnitudes of the treatment effects are:
                const: intensity*0.2, heterogeneous: [0, intensity*0.4],
                discrete_heterogeneous: {intensity*0.1, intensity*0.2}
                (default is 5)

        When creating the treatment effect, there are two ways to choose which
        kinds of effects are used. Either one sets all wanted treatment effect
        booleans to True, in which case they are created with equal weights, or
        one passes a list of length 6 with the wanted distribution of effects
        to the parameter treatment_option_weights, which overwrites whatever
        booleans were chosen before.

        Returns:
            None
        """
        # length of the treatment_option_weights vector /
        # number of treatment effect options
        tow_length = 6

        if intensity > 10 or intensity < 1:
            raise ValueError("intensity needs to be an int or float value in "
                             "[1,10]")

        if treatment_option_weights is not None:
            # make sure it's a numpy array
            treatment_option_weights = np.array(treatment_option_weights)
            if np.around(np.sum(treatment_option_weights), 3) != 1:
                raise ValueError('Values in treatment_option_weights-vector '
                                 'must sum up to 1')
            if len(treatment_option_weights) != tow_length:
                raise ValueError('Treatment_option_weights-vector must be of '
                                 'length {}'.format(tow_length))
            # multiply by N to get the absolute number of each option
            absolute_ratio = (self.N*treatment_option_weights).astype(int)
            # adjusting for possible rounding errors by increasing the highest value
            if sum(absolute_ratio) < self.N:
                index_max = np.argmax(treatment_option_weights)
                absolute_ratio[index_max] = absolute_ratio[index_max] \
                    + (self.N-sum(absolute_ratio))
            # fill up the index-array with options 1-6 according to the weights
            weight_ratio_index = np.zeros((self.N,))
            counter = 0
            for i in range(len(absolute_ratio)):
                weight_ratio_index[counter:counter+absolute_ratio[i]] = i+1
                counter += absolute_ratio[i]
            # shuffle
            np.random.shuffle(weight_ratio_index)
            n_idx = weight_ratio_index

            # overwriting booleans according to the given treatment_option_weights
            options_boolean = treatment_option_weights > 0
            constant_pos, constant_neg, heterogeneity_pos, heterogeneity_neg, \
                no_treatment, discrete_heterogeneity = tuple(options_boolean)

        # process options
        options = []
        options_boolean = np.array([constant_pos, constant_neg, heterogeneity_pos,
                                    heterogeneity_neg, no_treatment,
                                    discrete_heterogeneity])
        # selecting the wanted treatment options into a list
        for i in range(len(options_boolean)):
            if options_boolean[i]:
                options.append(i+1)

        if treatment_option_weights is None:
            if options == []:
                raise ValueError("At least one treatment effect option must be "
                                 "True")
            # assigning which individual gets which kind of treatment effect
            treatment_option_weights = np.zeros(len(options_boolean))
            treatment_option_weights[options_boolean] = 1/np.sum(options_boolean)
            # from options 1-6
            n_idx = np.random.choice(options, self.N, True)

        # array to fill up with theta values
        theta_combined = np.zeros(self.N)

        if constant_pos:
            # Option 1
            con = 0.2*intensity
            theta_combined[n_idx == 1] = con

        if constant_neg:
            # Option 2
            con = -0.2*intensity
            theta_combined[n_idx == 2] = con

        if heterogeneity_pos or heterogeneity_neg:
            # Options 3 & 4
            # creating an index vector that assigns which covariates are part of Z
            h_idx = np.concatenate((np.zeros(self.k - self.z_set_size_treatment),
                                    np.ones(self.z_set_size_treatment)))
            np.random.shuffle(h_idx)
            X_h = self.X[:, h_idx == 1].copy()

            w = np.random.normal(0, 0.25, self.N)
            weight_vector_adj = self.weights_treatment_assignment[h_idx == 1]
            gamma = np.sin(np.dot(X_h, weight_vector_adj)) + w
            # standardize on [0, g(intensity)], here g(x) = 0.4x
            theta_option2 = standardize(gamma, intensity*0.4, 0)
            # calculating the share of negative treatment effect weights
            percentage_neg = treatment_option_weights[3] \
                / (treatment_option_weights[2] +
                   treatment_option_weights[3])
            # get the quantile value that splits the distribution into two groups
            quantile_value = np.quantile(theta_option2, percentage_neg)
            # move the distribution into the negative range by the quantile value
            theta_option2 = theta_option2 - quantile_value

            theta_combined[(n_idx == 3) | (n_idx == 4)] \
                = theta_option2[(n_idx == 3) | (n_idx == 4)]

        if no_treatment:
            # Option 5
            theta_combined[n_idx == 5] = 0

        if discrete_heterogeneity:
            # Option 6
            # assigning randomly which covariates affect the treatment effect
            # by creating an index vector
            dh_idx = np.concatenate((np.zeros(self.k - self.z_set_size_treatment),
                                     np.ones(self.z_set_size_treatment)))
            np.random.shuffle(dh_idx)
            # choosing covariates in Z
            X_dh = self.X[:, dh_idx == 1].copy()
            # adjusting the weight vector to the length of Z
            weight_vector_adj = self.weights_treatment_assignment[dh_idx == 1]

            a = np.sin(np.dot(X_dh, weight_vector_adj))
            a = standardize(a, 1, 0)
            theta_dh = np.random.binomial(1, a).astype(float) * -1

            # # normalizing 'a' vector
            # a_mean = np.mean(a)
            # a_sigma = np.std(a)
            # z = (a - a_mean) / a_sigma
            # # create probabilities
            # dh_effect_prob = stats.norm.cdf(z)
            #
            # # assigning low and high treatment outcome
            # theta_dh = np.random.binomial(1, dh_effect_prob).astype(float) * -1

            low_effect = 0.1 * intensity
            high_effect = 0.2 * intensity

            theta_dh[theta_dh == 0] = low_effect
            theta_dh[theta_dh == -1] = high_effect

            theta_combined[n_idx == 6] = theta_dh[n_idx == 6]

        # assign identifier 0 to each observation that did not get assigned to
        # treatment
        n_idx[self.D == 0] = 0
        # create a vector that shows 0 for not-assigned observations and the
        # treatment type (1-6) for assigned ones
        self.treatment_effect_type = n_idx
        # vector with the size of the treatment effect for each observation
        self.treatment_effect = theta_combined
        return None
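    # Illustrative only (not executed): an example treatment_option_weights
    # vector, following the ordering in the docstring above.
    #   [0.5, 0, 0.3, 0, 0.2, 0] gives 50% of observations a positive constant
    #   effect, 30% a positive heterogeneous effect and 20% no effect, and
    #   overrides whatever booleans were passed alongside it.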
    def generate_realized_treatment_effect(self):
        """
        Model-wise: Theta_0 * D

        :return: Treatment effect where treatment has been assigned
        """
        return self.get_treatment_effect() * self.get_treatment_assignment()

    def generate_noise(self):
        """
        Model-wise: U or V
        Restriction: the expectation must be zero conditional on X, D.
        Here the noise is drawn independently of X and D anyway.

        :return: One-dim. array of normally distributed rv with mean 0 and sd 1
        """
        return np.random.normal(0, 1, self.N)

    def generate_outcome_variable(self, binary, x_y_relation):
        """
        Generates g_0(X) and the output variable y and returns the simulated
        variables

        Parameters:
            binary (boolean): If True the output is going to be binary,
                otherwise continuous.
            x_y_relation (string): Chooses the simulated relationship between
                X and y. Possible values are:
                'linear_simple', 'linear_interaction',
                'partial_nonlinear_simple', 'partial_nonlinear_interaction',
                'nonlinear_simple', 'nonlinear_interaction'
        ...
        Returns:
            tuple
        """
        # creating random interaction terms of covariates
        interaction_idx_1 = np.random.choice(np.arange(self.k),
                                             self.interaction_num)
        interaction_idx_2 = np.random.choice(np.arange(self.k),
                                             self.interaction_num)
        self.X_interaction \
            = self.X[:, interaction_idx_1] * self.X[:, interaction_idx_2]
        self.weights_interaction \
            = self.weights_covariates_to_outputs[interaction_idx_1]

        try:
            self.g_0_X = relation_fct(self, x_y_relation)
        except TypeError:
            raise ValueError('x_y_relation needs to be one of the following '
                             'strings:\n"linear_simple", "linear_interaction", '
                             '"partial_nonlinear_simple", '
                             '"partial_nonlinear_interaction", '
                             '"nonlinear_simple", "nonlinear_interaction"')

        if not binary:
            # Theta_0 * D
            realized_treatment_effect = self.generate_realized_treatment_effect()
            # + g_0(X) + U
            y = realized_treatment_effect + self.g_0_X + self.generate_noise()

        if binary:
            # generating y as a probability between 0.1 and 0.9
            y = self.g_0_X  # + self.generate_noise()
            y_probs = standardize(y, 0.1, 0.9)
            # generate the treatment effect as a probability
            realized_treatment_effect = self.generate_realized_treatment_effect()/10
            # the max. range of the treatment effect is [-4,4] (with intensity 10
            # and only pos. or only neg. effects chosen), thus dividing by 10
            # assures that the additional probability is at most 0.4
            y_probs += realized_treatment_effect
            y_probs = np.clip(y_probs, 0, 1)

            y = np.random.binomial(1, y_probs, self.N)

        return y, self.X, self.D, realized_treatment_effect

    def visualize_correlation(self):
        """ Generates the correlation matrix of the covariates """
        corr = np.corrcoef(self.X, rowvar=False)
        sns.heatmap(corr, annot=True)
        plt.show()
        return None

    def __str__(self):
        return "N = " + str(self.N) + ", k = " + str(self.k)

    def get_N(self):
        return self.N

    def get_k(self):
        return self.k

    def set_N(self, new_N):
        self.N = new_N

    def set_k(self, new_k):
        self.k = new_k

    def get_X(self):
        return self.X

    def get_g_0_X(self):
        return self.g_0_X

    def get_treatment_assignment(self):
        return self.D

    def get_treatment_effect(self):
        return self.treatment_effect
##### New class that wraps the SimData class by initializing it internally and
##### exposes only a few simple functions to the user
class UserInterface:
    '''
    Class that wraps up all functionalities and gives the user just the
    functions that are necessary to create the wanted variables y, X, D, and
    treatment.
    '''
    def __init__(self, N, k, seed=None, categorical_covariates=None):
        '''
        Initializes the needed classes and generates the covariates

        Parameters:
            N (int): Number of observations
            k (int): Number of covariates
            seed (int): Random seed to allow reproducing results
                (default is None)
            categorical_covariates (int or list): Either an int, indicating the
                number of categories that all covariates are made of; a list
                with 2 ints, the first int indicating the number of covariates
                and the second the number of categories; or a list with one int
                and a list of ints, where the list of ints includes the
                different numbers of categories wanted.

        Attributes:
            weights_treatment_assignment (numpy array): Weight vector, drawn
                from a uniform distribution U(0,1), of length k. It is used to
                weight covariates when assigning treatment non-randomly and
                when creating heterogeneous treatment effects.
            weights_covariates_to_outputs (numpy array): Weight vector, drawn
                from a beta distribution Beta(1,5), of length k. It is used to
                weight covariate importance when creating output y from X.
            z_set_size_assignment (int): Number of covariates in subset Z of X
                that are used to assign treatment non-randomly.
            z_set_size_treatment (int): Number of covariates in subset Z of X
                that are used to create heterogeneous treatment effects.
        '''
        self.backend = SimData(N, k, seed)
        self.backend.generate_covariates(
            categorical_covariates=categorical_covariates)

    def generate_treatment(self, random_assignment=True,
                           assignment_prob=0.5,
                           constant_pos=True,
                           constant_neg=False,
                           heterogeneous_pos=False,
                           heterogeneous_neg=False,
                           no_treatment=False,
                           discrete_heterogeneous=False,
                           treatment_option_weights=None,
                           intensity=5):
        '''
        Assigns treatment and generates the treatment effect

        Parameters:
            random_assignment (boolean): If True, treatment is assigned
                randomly according to the assignment_prob parameter. If False,
                treatment assignment is determined depending on covariates.
                (default is True)
            assignment_prob (float or string): The probability with which
                treatment is assigned. In the case of random assignment, it can
                be a float with 0 < prob < 1. If assignment is not random it
                should be one of the following strings: 'low', 'medium', 'high'.
                The strings stand for the values 0.35, 0.5, 0.65 respectively
                and can also be used in the random case.
                (default is 0.5)
            constant_pos (boolean): If True, the treatment effect is a positive
                constant.
                (default is True)
            constant_neg (boolean): If True, the treatment effect is a negative
                constant.
                (default is False)
            heterogeneous_pos (boolean): If True, the treatment effect is
                positive and heterogeneous, i.e. it depends on a number of
                covariates and varies in size.
                (default is False)
            heterogeneous_neg (boolean): If True, the treatment effect is
                negative and heterogeneous, i.e. it depends on a number of
                covariates and varies in size.
                (default is False)
            no_treatment (boolean): If True, then there is no treatment effect.
                (default is False)
            discrete_heterogeneous (boolean): If True, then the treatment
                effect consists of 2 values of different size. The size
                is determined by a subset of covariates.
                (default is False)
            treatment_option_weights (list): List of length 6 with weights of
                the wanted treatment effects in the following order:
                [const_pos, const_neg, heterogeneous_pos, heterogeneous_neg,
                no_treatment, discrete_heterogeneous]. Its values need to sum
                up to 1. If given, the values overwrite the boolean parameters
                for each treatment effect.
                (default is None)
            intensity (int or float): Value that affects the size of the
                treatment effect. Needs to be between 1 and 10. The actual
                magnitudes of the treatment effects are:
                const: intensity*0.2, heterogeneous: [0, intensity*0.4],
                discrete_heterogeneous: {intensity*0.1, intensity*0.2}
                (default is 5)

        Treatment assignment can be done randomly or determined by a subset Z
        of covariates. The assignment probability can be freely chosen between
        0 and 1 in the random case and from 3 levels ('low', 'medium', 'high')
        in the non-random case.

        When creating the treatment effect, there are two ways to choose which
        kinds of effects are used. Either one sets all wanted treatment effect
        booleans to True, in which case they are created with equal weights, or
        one passes a list of length 6 with the wanted distribution of effects
        to the parameter treatment_option_weights, which overwrites whatever
        booleans were chosen before.

        Returns:
            None
        '''
        self.backend.generate_treatment_assignment(random_assignment,
                                                   assignment_prob)
        self.backend.generate_treatment_effect(treatment_option_weights,
                                               constant_pos,
                                               constant_neg, heterogeneous_pos,
                                               heterogeneous_neg, no_treatment,
                                               discrete_heterogeneous,
                                               intensity)
        return None

    def output_data(self, binary=False,
                    x_y_relation='partial_nonlinear_simple'):
        '''
        Generates g_0(X) and the output variable y and returns the simulated
        variables

        Parameters:
            binary (boolean): If True the output is going to be binary,
                otherwise continuous.
                (default is False)
            x_y_relation (string): Chooses the simulated relationship between
                X and y. Possible values are:
                'linear_simple', 'linear_interaction',
                'partial_nonlinear_simple', 'partial_nonlinear_interaction',
                'nonlinear_simple', 'nonlinear_interaction'
                (default is 'partial_nonlinear_simple')

        When simulating a dataset, there are different options to transform X
        into y. It can be linear or non-linear in different ways. The options
        that can be chosen for x_y_relation correspond to the following
        functions:
            linear             -> y ~ X
            partial non-linear -> y ~ 2.5*cos(X)^3 + 0.5*X
            non-linear         -> y ~ 3*cos(X)^3

        Depending on the suffix "simple" or "interaction", X consists only of
        the single covariates x_i or additionally of random interaction terms
        of some of the covariates x_i*x_j; i,j in {1,...,k}.

        Generates the output array "y" in the following way:
            Y = Theta_0 * D + g_0(X) + U,
        where Theta_0 is the treatment effect of each observation, D the dummy
        vector assigning treatment, g_0() the transformation function, and
        U a normally distributed noise/error term.

        Returns:
            tuple: A tuple with variables y, X, assignment_vector,
                treatment_vector
        '''
        return self.backend.generate_outcome_variable(binary, x_y_relation)

    def plot_covariates_correlation(self):
        '''
        Shows a correlation heatmap of the covariates
        '''
        self.backend.visualize_correlation()
        return None

    def get_propensity_scores(self):
        '''
        Returns the probabilities that were used in treatment assignment
        '''
        return self.backend.propensity_scores

    def get_weights_treatment_assignment(self):
        return self.backend.weights_treatment_assignment

    def get_weigths_covariates_to_outputs(self):
        return self.backend.weights_covariates_to_outputs

    def get_treatment_effect_type(self):
        '''
        Gives a vector with the type of treatment effect of each observation

        Treatment types are coded as follows:
            0 no treatment assigned
            1 positive constant effect
            2 negative constant effect
            3 positive heterogeneous effect
            4 negative heterogeneous effect
            5 no treatment effect (but assigned)
            6 discrete heterogeneous effect

        Returns:
            numpy array: N*1 array with the treatment type of each observation
        '''
        return self.backend.treatment_effect_type

    def set_weights_treatment_assignment(self, new_weight_vector):
        '''
        Changes the weight vector that is applied in treatment assignment and
        treatment effect creation
        '''
        if len(new_weight_vector) != self.backend.get_k():
            raise ValueError('New weight vector needs to be of dimension k')
        self.backend.weights_treatment_assignment = np.array(new_weight_vector)

    def set_weights_covariates_to_outputs(self, new_weight_vector):
        '''
        Changes the weight vector that is applied in the translation of X to y
        '''
        if len(new_weight_vector) != self.backend.get_k():
            raise ValueError('New weight vector needs to be of dimension k')
        self.backend.weights_covariates_to_outputs = np.array(new_weight_vector)

    def set_subset_z_size_treatment(self, new_size):
        '''
        Adjusts the number of covariates used to create heterogeneous treatment

        Parameters:
            new_size (int): Wanted number of covariates to determine
                heterogeneous treatment effects. Needs to be in {1,...,k}.

        For the heterogeneous treatment effects, the resulting effects depend
        on the values of covariates in a subset Z of X. This method adjusts how
        many covariates are randomly chosen to be in Z. Apply before using
        generate_treatment().
        '''
        if new_size < 1 or new_size > self.backend.get_k():
            raise ValueError('Size of subset Z needs to be within [1,k]')
        self.backend.z_set_size_treatment = new_size

    def set_subset_z_size_assignment(self, new_size):
        '''
        Adjusts the number of covariates used to non-randomly assign treatment

        Parameters:
            new_size (int): Wanted number of covariates to determine
                treatment assignment. Needs to be in {1,...,k}.

        Non-random treatment assignment depends on the values of covariates in
        a subset Z of X. This method adjusts how many covariates are randomly
        chosen to be in Z. Apply before using generate_treatment().
        '''
        if new_size < 1 or new_size > self.backend.get_k():
            raise ValueError('Size of subset Z needs to be within [1,k]')
        self.backend.z_set_size_assignment = new_size

    def set_interaction_num(self, new_num):
        '''
        Adjusts the number of interaction terms used in output creation

        Parameters:
            new_num (int): Wanted number of interaction terms x_i*x_j that
                should be added to the single covariates.

        When choosing one of the '..._interaction' options for x_y_relation
        in output_data(), interaction_num interaction terms are randomly
        created and added. The internal default value for that attribute is
        sqrt(k). This method changes it to the chosen integer. Use between
        initializing the class and calling output_data().

        Returns:
            None
        '''
        if not isinstance(new_num, int):
            raise ValueError('new_num needs to be of type int')
        # set on the backend SimData object, which is where the value is read
        self.backend.interaction_num = new_num

    def __str__(self):
        return "N = {}, k = {} \n".format(self.backend.get_N(),
                                          self.backend.get_k())
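For completeness, a short end-to-end sketch of the UserInterface API defined above (the import path opossum follows the notebook cell earlier in this gist; all parameter values are illustrative):

import numpy as np
from opossum import UserInterface

# 1000 observations, 8 covariates; the first 4 covariates categorical with 3 categories
u = UserInterface(1000, 8, seed=42, categorical_covariates=[4, 3])

# non-random assignment with a high average propensity score (~0.65) and a mix
# of constant positive (60%) and heterogeneous positive (40%) effects
u.generate_treatment(random_assignment=False, assignment_prob='high',
                     treatment_option_weights=[0.6, 0, 0.4, 0, 0, 0],
                     intensity=5)

y, X, assignment, treatment = u.output_data(binary=False,
                                            x_y_relation='linear_interaction')
print(y.shape, X.shape, assignment.mean())
print(np.unique(u.get_treatment_effect_type(), return_counts=True))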
numpy
scipy
seaborn
matplotlib
ipywidgets