Skip to content

Instantly share code, notes, and snippets.

@threecourse
Created April 27, 2014 06:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save threecourse/11339125 to your computer and use it in GitHub Desktop.
Save threecourse/11339125 to your computer and use it in GitHub Desktop.
pythonGLM20140427
# -*- coding: utf-8 -*-
import pandas
import statsmodels.api as sm
import patsy as patsy
import numpy as np
from os import path
import matplotlib.pyplot as plt
# read csv file ======================================================
workdir = path.dirname( path.abspath( __file__ ) )
datafile_path = workdir + '\\' + 'diabetes.csv'
df = pandas.read_csv(datafile_path)
df = df[:-1] # cut off last row
print df
# create model matrix ======================================================
# only for modelE and F, set base level
offset_term = df['l_popn']
y_N, X_N = patsy.dmatrices('deaths ~ 1', data=df, return_type='dataframe') # null model
y_A, X_A = patsy.dmatrices('deaths ~ gender + agemidpt', data=df, return_type='dataframe')
y_E, X_E = patsy.dmatrices(
'deaths ~ C(gender, Treatment("Male"))+ C(age,Treatment("45-54"))',
data=df, return_type='dataframe')
y_F, X_F = patsy.dmatrices(
'deaths ~ C(gender, Treatment("Male")) + agemidpt + I(agemidpt ** 2) + agemidpt:C(gender, Treatment("Male")) ',
data=df, return_type='dataframe')
y_H, X_H = patsy.dmatrices('deaths ~ gender * age', data=df, return_type='dataframe')
# create GLM models ======================================================
model_N = sm.GLM(y_N, X_N, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term )
model_A = sm.GLM(y_A, X_A, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term )
model_E = sm.GLM(y_E, X_E, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term )
model_F = sm.GLM(y_F, X_F, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term )
model_H = sm.GLM(y_H, X_H, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term )
models = [model_N, model_A, model_E, model_F, model_H]
models_name = ["model_N", "model_A", "model_E", "model_F", "model_H"]
# GLM modeling finished!
# show summary, model pararameters and fittedvalues ===================================
# show summary
for m in models:
print m.fit().summary()
# calculate deviance
# deviance shown in summary is wrong due to bug in stable version, and fixed in developed version.
def poisson_deviance(fit):
Y = fit.model.endog
mu = fit.fittedvalues
return np.where(Y == 0, 0, Y * np.log(Y / mu)).sum() * 2
params_df = pandas.DataFrame()
params_df['name'] = models_name
params_df['aic'] = [m.fit().aic for m in models]
params_df['deviance'] = [poisson_deviance(m.fit()) for m in models]
print params_df
fitted_df = pandas.DataFrame()
fitted_df[['gender', 'agemidpt', 'popn', 'observed']] = df[['gender', 'agemidpt', 'popn', 'deaths']]
for i in range(0, len(models)):
fitted_df[models_name[i]] = models[i].fit().fittedvalues
print fitted_df
# plot mortality observed and fitted ===================================
mort_df = fitted_df
obs_and_models_name = ['observed'] + models_name
for name in obs_and_models_name:
mort_df[name] = mort_df[name] * 1.0 / mort_df['popn']
print mort_df
# Female mortality
mort_df_F = mort_df[mort_df['gender'] == "Female"]
fig_F = plt.figure()
ax_F = fig_F.add_subplot(111)
mort_df_F.plot(ax = ax_F, x = mort_df_F['agemidpt'], y = obs_and_models_name,
marker='o', xticks = mort_df_F['agemidpt'], xlim = [15, 95],
title = "Mortality_Female", label = obs_and_models_name)
ax_F.set_xlabel("")
ax_F.legend(loc='best')
#fig_F.show()
fig_F.savefig(workdir + "\\" + 'fig_F.pdf')
# Male mortality
mort_df_M = mort_df[mort_df['gender'] == "Male"]
fig_M = plt.figure()
ax_M = fig_M.add_subplot(111)
mort_df_M.plot(ax = ax_M, x = mort_df_M['agemidpt'], y = obs_and_models_name,
marker='o', xticks = mort_df_M['agemidpt'], xlim = [15, 95],
title = "Mortality_Male", label = obs_and_models_name)
ax_M.set_xlabel("")
ax_M.legend(loc='best')
#fig_M.show()
fig_M.savefig(workdir + "\\" + 'fig_M.pdf')
In [1]: %run C:/Users/dai/Documents/Python/GLM/glm.py
gender age deaths popn l_popn agemidpt
0 Male <25 3 1141100 13.947503 20
1 Male 25-34 0 485571 13.093081 30
2 Male 35-44 12 504312 13.130950 40
3 Male 45-54 25 447315 13.011018 50
4 Male 55-64 61 330902 12.709578 60
5 Male 65-74 130 226403 12.330072 70
6 Male 75-84 192 130527 11.779335 80
7 Male 85+ 102 29785 10.301760 90
8 Female <25 2 1086408 13.898387 20
9 Female 25-34 1 489948 13.102055 30
10 Female 35-44 3 504030 13.130391 40
11 Female 45-54 11 445763 13.007543 50
12 Female 55-64 30 323669 12.687477 60
13 Female 65-74 63 241488 12.394575 70
14 Female 75-84 174 179686 12.098966 80
15 Female 85+ 159 67203 11.115473 90
[16 rows x 6 columns]
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: deaths No. Observations: 16
Model: GLM Df Residuals: 15
Model Family: Poisson Df Model: 0
Link Function: log Scale: 1.0
Method: IRLS Log-Likelihood: -1691.0
Date: Sun, 27 Apr 2014 Deviance: 3325.8
Time: 15:05:34 Pearson chi2: 7.83e+03
No. Iterations: 8
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept -8.8325 0.032 -274.803 0.000 -8.895 -8.770
==============================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: deaths No. Observations: 16
Model: GLM Df Residuals: 13
Model Family: Poisson Df Model: 2
Link Function: log Scale: 1.0
Method: IRLS Log-Likelihood: -49.218
Date: Sun, 27 Apr 2014 Deviance: 19.646
Time: 15:05:34 Pearson chi2: 18.8
No. Iterations: 8
==================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
----------------------------------------------------------------------------------
Intercept -15.5919 0.201 -77.663 0.000 -15.985 -15.198
gender[T.Male] 0.5374 0.065 8.256 0.000 0.410 0.665
agemidpt 0.1064 0.002 42.892 0.000 0.102 0.111
==================================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: deaths No. Observations: 16
Model: GLM Df Residuals: 7
Model Family: Poisson Df Model: 8
Link Function: log Scale: 1.0
Method: IRLS Log-Likelihood: -43.245
Date: Sun, 27 Apr 2014 Deviance: 4.7005
Time: 15:05:34 Pearson chi2: 10.3
No. Iterations: 10
==========================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
----------------------------------------------------------------------------------------------------------
Intercept -9.8915 0.168 -58.732 0.000 -10.222 -9.561
C(gender, Treatment("Male"))[T.Female] -0.5233 0.065 -8.017 0.000 -0.651 -0.395
C(age, Treatment("45-54"))[T.25-34] -3.6702 1.014 -3.620 0.000 -5.657 -1.683
C(age, Treatment("45-54"))[T.35-44] -0.9965 0.307 -3.243 0.001 -1.599 -0.394
C(age, Treatment("45-54"))[T.55-64] 1.2357 0.197 6.276 0.000 0.850 1.622
C(age, Treatment("45-54"))[T.65-74] 2.3343 0.182 12.858 0.000 1.979 2.690
C(age, Treatment("45-54"))[T.75-84] 3.4184 0.175 19.562 0.000 3.076 3.761
C(age, Treatment("45-54"))[T.85+] 4.3055 0.178 24.151 0.000 3.956 4.655
C(age, Treatment("45-54"))[T.<25] -2.8939 0.477 -6.063 0.000 -3.829 -1.958
==========================================================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: deaths No. Observations: 16
Model: GLM Df Residuals: 11
Model Family: Poisson Df Model: 4
Link Function: log Scale: 1.0
Method: IRLS Log-Likelihood: -44.488
Date: Sun, 27 Apr 2014 Deviance: 11.838
Time: 15:05:34 Pearson chi2: 13.0
No. Iterations: 9
===================================================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------------------
Intercept -16.2965 0.695 -23.450 0.000 -17.659 -14.934
C(gender, Treatment("Male"))[T.Female] -1.4182 0.432 -3.279 0.001 -2.266 -0.571
agemidpt 0.1500 0.020 7.462 0.000 0.111 0.189
agemidpt:C(gender, Treatment("Male"))[T.Female] 0.0116 0.006 2.091 0.037 0.001 0.022
I(agemidpt ** 2) -0.0004 0.000 -2.409 0.016 -0.001 -6.52e-05
===================================================================================================================
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: deaths No. Observations: 16
Model: GLM Df Residuals: 0
Model Family: Poisson Df Model: 15
Link Function: log Scale: 1.0
Method: IRLS Log-Likelihood: -37.802
Date: Sun, 27 Apr 2014 Deviance: -1.1663e-11
Time: 15:05:34 Pearson chi2: 0.00137
No. Iterations: 10
===============================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------
Intercept -13.1021 1.000 -13.102 0.000 -15.062 -11.142
gender[T.Male] -6.5815 16.397 -0.401 0.688 -38.720 25.557
age[T.35-44] 1.0703 1.155 0.927 0.354 -1.193 3.333
age[T.45-54] 2.4924 1.044 2.386 0.017 0.445 4.540
age[T.55-64] 3.8158 1.017 3.754 0.000 1.823 5.808
age[T.65-74] 4.8506 1.008 4.813 0.000 2.875 6.826
age[T.75-84] 6.1621 1.003 6.145 0.000 4.197 8.128
age[T.85+] 7.0555 1.003 7.033 0.000 5.089 9.022
age[T.<25] -0.1032 1.225 -0.084 0.933 -2.504 2.297
gender[T.Male]:age[T.35-44] 7.9673 16.410 0.486 0.627 -24.196 40.130
gender[T.Male]:age[T.45-54] 7.3990 16.401 0.451 0.652 -24.747 39.545
gender[T.Male]:age[T.55-64] 7.2691 16.399 0.443 0.658 -24.872 39.410
gender[T.Male]:age[T.65-74] 7.3704 16.398 0.449 0.653 -24.769 39.510
gender[T.Male]:age[T.75-84] 6.9996 16.398 0.427 0.669 -25.139 39.138
gender[T.Male]:age[T.85+] 6.9513 16.398 0.424 0.672 -25.188 39.090
gender[T.Male]:age[T.<25] 6.9379 16.423 0.422 0.673 -25.250 39.126
===============================================================================================
name aic deviance
0 model_N 3383.984681 3.306383e+03
1 model_A 104.435428 2.283412e+01
2 model_E 104.490450 1.088914e+01
3 model_F 98.976325 1.337501e+01
4 model_H 107.604059 1.224176e-11
[5 rows x 3 columns]
gender agemidpt popn observed model_N model_A model_E 0 Male 20 1141100 3 166.500827 2.776322 3.196614
1 Male 30 485571 0 70.850910 3.423832 0.625823
2 Male 40 504312 12 73.585457 10.305586 9.420784
3 Male 50 447315 25 65.268879 26.491128 22.634397
4 Male 60 330902 61 48.282759 56.793680 57.609404
5 Male 70 226403 130 33.035042 112.615046 118.257129
6 Male 80 130527 192 19.045529 188.160397 201.572546
7 Male 90 29785 102 4.346006 124.434008 111.683303
8 Female 20 1086408 2 158.520577 1.544296 1.803386
9 Female 30 489948 1 71.489569 2.018372 0.374177
10 Female 40 504030 3 73.544310 6.017571 5.579216
11 Female 50 445763 11 65.042422 15.423485 13.365603
12 Female 60 323669 30 47.227374 32.455867 33.390596
13 Female 70 241488 63 35.236133 70.178048 74.742871
14 Female 80 179686 174 26.218445 151.332983 164.427454
15 Female 90 67203 159 9.805762 164.029377 149.316697
model_F model_H
0 1.666557 3.000000
1 2.667646 0.001373
2 9.717352 12.000000
3 28.185737 25.000000
4 63.573595 61.000000
5 123.655991 130.000000
6 188.965751 192.000000
7 106.567371 102.000000
8 0.484227 2.000000
9 0.922189 1.000000
10 3.735376 3.000000
11 12.127859 11.000000
12 30.142383 30.000000
13 71.773233 63.000000
14 158.915345 174.000000
15 164.899387 159.000000
[16 rows x 9 columns]
gender agemidpt popn observed model_N model_A model_E 0 Male 20 1141100 0.000003 0.000146 0.000002 0.000003
1 Male 30 485571 0.000000 0.000146 0.000007 0.000001
2 Male 40 504312 0.000024 0.000146 0.000020 0.000019
3 Male 50 447315 0.000056 0.000146 0.000059 0.000051
4 Male 60 330902 0.000184 0.000146 0.000172 0.000174
5 Male 70 226403 0.000574 0.000146 0.000497 0.000522
6 Male 80 130527 0.001471 0.000146 0.001442 0.001544
7 Male 90 29785 0.003425 0.000146 0.004178 0.003750
8 Female 20 1086408 0.000002 0.000146 0.000001 0.000002
9 Female 30 489948 0.000002 0.000146 0.000004 0.000001
10 Female 40 504030 0.000006 0.000146 0.000012 0.000011
11 Female 50 445763 0.000025 0.000146 0.000035 0.000030
12 Female 60 323669 0.000093 0.000146 0.000100 0.000103
13 Female 70 241488 0.000261 0.000146 0.000291 0.000310
14 Female 80 179686 0.000968 0.000146 0.000842 0.000915
15 Female 90 67203 0.002366 0.000146 0.002441 0.002222
model_F model_H
0 0.000001 2.629042e-06
1 0.000005 2.828315e-09
2 0.000019 2.379479e-05
3 0.000063 5.588903e-05
4 0.000192 1.843446e-04
5 0.000546 5.741973e-04
6 0.001448 1.470960e-03
7 0.003578 3.424543e-03
8 0.000000 1.840929e-06
9 0.000002 2.041033e-06
10 0.000007 5.952027e-06
11 0.000027 2.467679e-05
12 0.000093 9.268728e-05
13 0.000297 2.608825e-04
14 0.000884 9.683559e-04
15 0.002454 2.365966e-03
[16 rows x 9 columns]
# csvにしたものから読み込む
data <- read.csv("diabetes.csv")
# exclude NA
data <- na.omit(data)
# str(data)で見ると、因子に余計なものが混じっている
data$gender <- droplevels(data$gender)
data$age <- droplevels(data$age)
# 一般化線形モデルの作成
attach(data)
Model_N <- glm(deaths ~ 1, family = poisson(link = log), offset = l_popn)
Model_A <- glm(deaths ~ gender + agemidpt, family = poisson(link = log), offset = l_popn)
Model_E <- glm(deaths ~ C(gender, base=2) + C(age,base=4) , family = poisson(link = log), offset = l_popn)
Model_F <- glm(deaths ~ C(gender, base=2) + agemidpt + I( agemidpt^2) + agemidpt:C(gender, base=2) ,family = poisson(link = log), offset = l_popn)
Model_H <- glm(deaths ~ gender * age, family = poisson(link = log), offset = l_popn)
# 結果をまとめてみる
Model_E
ModelList <- list(Model_N,Model_A, Model_E, Model_F, Model_H)
names <- c("Model_N", "Model_A", "Model_E", "Model_F", "Model_H")
deviances <- sapply(ModelList, function(m){m$deviance})
df.residuals <- sapply(ModelList, function(m){m$df.residual})
aics <- sapply(ModelList, function(m){m$aic})
ModelListDataFrame <- data.frame(NAME=names, DEVIANCE=deviances, DF.RESIDUAL=df.residuals, AIC=aics)
ModelListDataFrame
> # 結果をまとめてみる
> Model_E
Call: glm(formula = deaths ~ C(gender, base = 2) + C(age, base = 4),
family = poisson(link = log), offset = l_popn)
Coefficients:
(Intercept) C(gender, base = 2)1 C(age, base = 4)1 C(age, base = 4)2 C(age, base = 4)3
-9.8915 -0.5233 -2.8939 -3.6702 -0.9965
C(age, base = 4)5 C(age, base = 4)6 C(age, base = 4)7 C(age, base = 4)8
1.2357 2.3343 3.4184 4.3055
Degrees of Freedom: 15 Total (i.e. Null); 7 Residual
Null Deviance: 3306
Residual Deviance: 10.89 AIC: 104.5
> ModelListDataFrame
NAME DEVIANCE DF.RESIDUAL AIC
1 Model_N 3.306383e+03 15 3383.98468
2 Model_A 2.283412e+01 13 104.43543
3 Model_E 1.088914e+01 7 104.49045
4 Model_F 1.337501e+01 11 98.97633
5 Model_H 4.123213e-10 0 107.60131
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment