rcsmit/logistic_regression_following_sachinsdate

## logistic_regression_following_sachinsdate

import pandas as pd
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices


def log_regression(a,b,c,d):
    """Calculate VE and CI's according
    https://timeseriesreasoning.com/contents/estimation-of-vaccine-efficacy-using-logistic-regression/

    Args:
        a ([type]): sick vax
        b ([type]): sick unvax
        c ([type]): total vax
        d ([type]): total unvax

    Returns:
        0"""
    def VE_(x):
        """Calculate VE and CI's

        Args:
        x (float): The coeff. or CI
        Returns
        x (float): The VE or CI in %"""

        odds_ratio = np.exp(x)
        IRR = (odds_ratio / ((1-p_sick_unvax) + (p_sick_unvax*odds_ratio)))
        VE = round((1-IRR)*100,2)
        return VE
    p_sick_unvax = b/d
    l=[]
    for i in range(c):
        if i<a:
            l.append([1,1])
        else:
            l.append([1,0])
    for i in range(d):
        if i<b:
            l.append([0,1])
        else:
            l.append([0,0])

    df = pd.DataFrame(l, columns = ['VACCINATED', 'INFECTED'])

    #Form the regression equation
    expr = 'INFECTED ~  VACCINATED'

    #We'll use Patsy to carve out the X and y matrices
    y_train, X_train = dmatrices(expr, df, return_type='dataframe')

    #Build and train a Logit model
    logit_model = sm.Logit(endog=y_train, exog=X_train)
    logit_results = logit_model.fit()
    params = logit_results.params

    #Print the model summary
    print(logit_results.summary2())


    VE = VE_(params[1])

    conf =  logit_results.conf_int()
    high, low  = conf[0][1],  conf[1][1]
    VE_low, VE_high = VE_(low), VE_(high)
    print (f"VE = {VE} % | [{VE_low} , {VE_high}]")


def main():
    a,b,c,d = 47498,56063, 12_365_333, 3_342_667 # Netherlands october 2021 without 0-9-years sick_vaxxed, sick_unvaxed, people_vaxxed, people_unvaxxed
    x = 10000 # we make the dataframe smaller for faster results. The smaller the df, the bigger the CI's! (and vv)
    a,b,c,d = int(a/x), int (b/x), int(c/x), int(d/x)
    log_regression(a,b,c,d )
main()

	import pandas as pd
	import statsmodels.api as sm
	import numpy as np
	from patsy import dmatrices


	def log_regression(a,b,c,d):
	"""Calculate VE and CI's according
	https://timeseriesreasoning.com/contents/estimation-of-vaccine-efficacy-using-logistic-regression/

	Args:
	a ([type]): sick vax
	b ([type]): sick unvax
	c ([type]): total vax
	d ([type]): total unvax

	Returns:
	0"""
	def VE_(x):
	"""Calculate VE and CI's

	Args:
	x (float): The coeff. or CI
	Returns
	x (float): The VE or CI in %"""

	odds_ratio = np.exp(x)
	IRR = (odds_ratio / ((1-p_sick_unvax) + (p_sick_unvax*odds_ratio)))
	VE = round((1-IRR)*100,2)
	return VE
	p_sick_unvax = b/d
	l=[]
	for i in range(c):
	if i<a:
	l.append([1,1])
	else:
	l.append([1,0])
	for i in range(d):
	if i<b:
	l.append([0,1])
	else:
	l.append([0,0])

	df = pd.DataFrame(l, columns = ['VACCINATED', 'INFECTED'])

	#Form the regression equation
	expr = 'INFECTED ~ VACCINATED'

	#We'll use Patsy to carve out the X and y matrices
	y_train, X_train = dmatrices(expr, df, return_type='dataframe')

	#Build and train a Logit model
	logit_model = sm.Logit(endog=y_train, exog=X_train)
	logit_results = logit_model.fit()
	params = logit_results.params

	#Print the model summary
	print(logit_results.summary2())


	VE = VE_(params[1])

	conf = logit_results.conf_int()
	high, low = conf[0][1], conf[1][1]
	VE_low, VE_high = VE_(low), VE_(high)
	print (f"VE = {VE} % \| [{VE_low} , {VE_high}]")


	def main():
	a,b,c,d = 47498,56063, 12_365_333, 3_342_667 # Netherlands october 2021 without 0-9-years sick_vaxxed, sick_unvaxed, people_vaxxed, people_unvaxxed
	x = 10000 # we make the dataframe smaller for faster results. The smaller the df, the bigger the CI's! (and vv)
	a,b,c,d = int(a/x), int (b/x), int(c/x), int(d/x)
	log_regression(a,b,c,d )
	main()