Created
April 27, 2014 06:46
-
-
Save threecourse/11339125 to your computer and use it in GitHub Desktop.
pythonGLM20140427
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import pandas | |
import statsmodels.api as sm | |
import patsy as patsy | |
import numpy as np | |
from os import path | |
import matplotlib.pyplot as plt | |
# read csv file ====================================================== | |
workdir = path.dirname( path.abspath( __file__ ) ) | |
datafile_path = workdir + '\\' + 'diabetes.csv' | |
df = pandas.read_csv(datafile_path) | |
df = df[:-1] # cut off last row | |
print df | |
# create model matrix ====================================================== | |
# only for modelE and F, set base level | |
offset_term = df['l_popn'] | |
y_N, X_N = patsy.dmatrices('deaths ~ 1', data=df, return_type='dataframe') # null model | |
y_A, X_A = patsy.dmatrices('deaths ~ gender + agemidpt', data=df, return_type='dataframe') | |
y_E, X_E = patsy.dmatrices( | |
'deaths ~ C(gender, Treatment("Male"))+ C(age,Treatment("45-54"))', | |
data=df, return_type='dataframe') | |
y_F, X_F = patsy.dmatrices( | |
'deaths ~ C(gender, Treatment("Male")) + agemidpt + I(agemidpt ** 2) + agemidpt:C(gender, Treatment("Male")) ', | |
data=df, return_type='dataframe') | |
y_H, X_H = patsy.dmatrices('deaths ~ gender * age', data=df, return_type='dataframe') | |
# create GLM models ====================================================== | |
model_N = sm.GLM(y_N, X_N, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term ) | |
model_A = sm.GLM(y_A, X_A, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term ) | |
model_E = sm.GLM(y_E, X_E, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term ) | |
model_F = sm.GLM(y_F, X_F, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term ) | |
model_H = sm.GLM(y_H, X_H, family=sm.families.Poisson(link=sm.families.links.log), offset = offset_term ) | |
models = [model_N, model_A, model_E, model_F, model_H] | |
models_name = ["model_N", "model_A", "model_E", "model_F", "model_H"] | |
# GLM modeling finished! | |
# show summary, model pararameters and fittedvalues =================================== | |
# show summary | |
for m in models: | |
print m.fit().summary() | |
# calculate deviance | |
# deviance shown in summary is wrong due to bug in stable version, and fixed in developed version. | |
def poisson_deviance(fit): | |
Y = fit.model.endog | |
mu = fit.fittedvalues | |
return np.where(Y == 0, 0, Y * np.log(Y / mu)).sum() * 2 | |
params_df = pandas.DataFrame() | |
params_df['name'] = models_name | |
params_df['aic'] = [m.fit().aic for m in models] | |
params_df['deviance'] = [poisson_deviance(m.fit()) for m in models] | |
print params_df | |
fitted_df = pandas.DataFrame() | |
fitted_df[['gender', 'agemidpt', 'popn', 'observed']] = df[['gender', 'agemidpt', 'popn', 'deaths']] | |
for i in range(0, len(models)): | |
fitted_df[models_name[i]] = models[i].fit().fittedvalues | |
print fitted_df | |
# plot mortality observed and fitted =================================== | |
mort_df = fitted_df | |
obs_and_models_name = ['observed'] + models_name | |
for name in obs_and_models_name: | |
mort_df[name] = mort_df[name] * 1.0 / mort_df['popn'] | |
print mort_df | |
# Female mortality | |
mort_df_F = mort_df[mort_df['gender'] == "Female"] | |
fig_F = plt.figure() | |
ax_F = fig_F.add_subplot(111) | |
mort_df_F.plot(ax = ax_F, x = mort_df_F['agemidpt'], y = obs_and_models_name, | |
marker='o', xticks = mort_df_F['agemidpt'], xlim = [15, 95], | |
title = "Mortality_Female", label = obs_and_models_name) | |
ax_F.set_xlabel("") | |
ax_F.legend(loc='best') | |
#fig_F.show() | |
fig_F.savefig(workdir + "\\" + 'fig_F.pdf') | |
# Male mortality | |
mort_df_M = mort_df[mort_df['gender'] == "Male"] | |
fig_M = plt.figure() | |
ax_M = fig_M.add_subplot(111) | |
mort_df_M.plot(ax = ax_M, x = mort_df_M['agemidpt'], y = obs_and_models_name, | |
marker='o', xticks = mort_df_M['agemidpt'], xlim = [15, 95], | |
title = "Mortality_Male", label = obs_and_models_name) | |
ax_M.set_xlabel("") | |
ax_M.legend(loc='best') | |
#fig_M.show() | |
fig_M.savefig(workdir + "\\" + 'fig_M.pdf') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
In [1]: %run C:/Users/dai/Documents/Python/GLM/glm.py | |
gender age deaths popn l_popn agemidpt | |
0 Male <25 3 1141100 13.947503 20 | |
1 Male 25-34 0 485571 13.093081 30 | |
2 Male 35-44 12 504312 13.130950 40 | |
3 Male 45-54 25 447315 13.011018 50 | |
4 Male 55-64 61 330902 12.709578 60 | |
5 Male 65-74 130 226403 12.330072 70 | |
6 Male 75-84 192 130527 11.779335 80 | |
7 Male 85+ 102 29785 10.301760 90 | |
8 Female <25 2 1086408 13.898387 20 | |
9 Female 25-34 1 489948 13.102055 30 | |
10 Female 35-44 3 504030 13.130391 40 | |
11 Female 45-54 11 445763 13.007543 50 | |
12 Female 55-64 30 323669 12.687477 60 | |
13 Female 65-74 63 241488 12.394575 70 | |
14 Female 75-84 174 179686 12.098966 80 | |
15 Female 85+ 159 67203 11.115473 90 | |
[16 rows x 6 columns] | |
Generalized Linear Model Regression Results | |
============================================================================== | |
Dep. Variable: deaths No. Observations: 16 | |
Model: GLM Df Residuals: 15 | |
Model Family: Poisson Df Model: 0 | |
Link Function: log Scale: 1.0 | |
Method: IRLS Log-Likelihood: -1691.0 | |
Date: Sun, 27 Apr 2014 Deviance: 3325.8 | |
Time: 15:05:34 Pearson chi2: 7.83e+03 | |
No. Iterations: 8 | |
============================================================================== | |
coef std err t P>|t| [95.0% Conf. Int.] | |
------------------------------------------------------------------------------ | |
Intercept -8.8325 0.032 -274.803 0.000 -8.895 -8.770 | |
============================================================================== | |
Generalized Linear Model Regression Results | |
============================================================================== | |
Dep. Variable: deaths No. Observations: 16 | |
Model: GLM Df Residuals: 13 | |
Model Family: Poisson Df Model: 2 | |
Link Function: log Scale: 1.0 | |
Method: IRLS Log-Likelihood: -49.218 | |
Date: Sun, 27 Apr 2014 Deviance: 19.646 | |
Time: 15:05:34 Pearson chi2: 18.8 | |
No. Iterations: 8 | |
================================================================================== | |
coef std err t P>|t| [95.0% Conf. Int.] | |
---------------------------------------------------------------------------------- | |
Intercept -15.5919 0.201 -77.663 0.000 -15.985 -15.198 | |
gender[T.Male] 0.5374 0.065 8.256 0.000 0.410 0.665 | |
agemidpt 0.1064 0.002 42.892 0.000 0.102 0.111 | |
================================================================================== | |
Generalized Linear Model Regression Results | |
============================================================================== | |
Dep. Variable: deaths No. Observations: 16 | |
Model: GLM Df Residuals: 7 | |
Model Family: Poisson Df Model: 8 | |
Link Function: log Scale: 1.0 | |
Method: IRLS Log-Likelihood: -43.245 | |
Date: Sun, 27 Apr 2014 Deviance: 4.7005 | |
Time: 15:05:34 Pearson chi2: 10.3 | |
No. Iterations: 10 | |
========================================================================================================== | |
coef std err t P>|t| [95.0% Conf. Int.] | |
---------------------------------------------------------------------------------------------------------- | |
Intercept -9.8915 0.168 -58.732 0.000 -10.222 -9.561 | |
C(gender, Treatment("Male"))[T.Female] -0.5233 0.065 -8.017 0.000 -0.651 -0.395 | |
C(age, Treatment("45-54"))[T.25-34] -3.6702 1.014 -3.620 0.000 -5.657 -1.683 | |
C(age, Treatment("45-54"))[T.35-44] -0.9965 0.307 -3.243 0.001 -1.599 -0.394 | |
C(age, Treatment("45-54"))[T.55-64] 1.2357 0.197 6.276 0.000 0.850 1.622 | |
C(age, Treatment("45-54"))[T.65-74] 2.3343 0.182 12.858 0.000 1.979 2.690 | |
C(age, Treatment("45-54"))[T.75-84] 3.4184 0.175 19.562 0.000 3.076 3.761 | |
C(age, Treatment("45-54"))[T.85+] 4.3055 0.178 24.151 0.000 3.956 4.655 | |
C(age, Treatment("45-54"))[T.<25] -2.8939 0.477 -6.063 0.000 -3.829 -1.958 | |
========================================================================================================== | |
Generalized Linear Model Regression Results | |
============================================================================== | |
Dep. Variable: deaths No. Observations: 16 | |
Model: GLM Df Residuals: 11 | |
Model Family: Poisson Df Model: 4 | |
Link Function: log Scale: 1.0 | |
Method: IRLS Log-Likelihood: -44.488 | |
Date: Sun, 27 Apr 2014 Deviance: 11.838 | |
Time: 15:05:34 Pearson chi2: 13.0 | |
No. Iterations: 9 | |
=================================================================================================================== | |
coef std err t P>|t| [95.0% Conf. Int.] | |
------------------------------------------------------------------------------------------------------------------- | |
Intercept -16.2965 0.695 -23.450 0.000 -17.659 -14.934 | |
C(gender, Treatment("Male"))[T.Female] -1.4182 0.432 -3.279 0.001 -2.266 -0.571 | |
agemidpt 0.1500 0.020 7.462 0.000 0.111 0.189 | |
agemidpt:C(gender, Treatment("Male"))[T.Female] 0.0116 0.006 2.091 0.037 0.001 0.022 | |
I(agemidpt ** 2) -0.0004 0.000 -2.409 0.016 -0.001 -6.52e-05 | |
=================================================================================================================== | |
Generalized Linear Model Regression Results | |
============================================================================== | |
Dep. Variable: deaths No. Observations: 16 | |
Model: GLM Df Residuals: 0 | |
Model Family: Poisson Df Model: 15 | |
Link Function: log Scale: 1.0 | |
Method: IRLS Log-Likelihood: -37.802 | |
Date: Sun, 27 Apr 2014 Deviance: -1.1663e-11 | |
Time: 15:05:34 Pearson chi2: 0.00137 | |
No. Iterations: 10 | |
=============================================================================================== | |
coef std err t P>|t| [95.0% Conf. Int.] | |
----------------------------------------------------------------------------------------------- | |
Intercept -13.1021 1.000 -13.102 0.000 -15.062 -11.142 | |
gender[T.Male] -6.5815 16.397 -0.401 0.688 -38.720 25.557 | |
age[T.35-44] 1.0703 1.155 0.927 0.354 -1.193 3.333 | |
age[T.45-54] 2.4924 1.044 2.386 0.017 0.445 4.540 | |
age[T.55-64] 3.8158 1.017 3.754 0.000 1.823 5.808 | |
age[T.65-74] 4.8506 1.008 4.813 0.000 2.875 6.826 | |
age[T.75-84] 6.1621 1.003 6.145 0.000 4.197 8.128 | |
age[T.85+] 7.0555 1.003 7.033 0.000 5.089 9.022 | |
age[T.<25] -0.1032 1.225 -0.084 0.933 -2.504 2.297 | |
gender[T.Male]:age[T.35-44] 7.9673 16.410 0.486 0.627 -24.196 40.130 | |
gender[T.Male]:age[T.45-54] 7.3990 16.401 0.451 0.652 -24.747 39.545 | |
gender[T.Male]:age[T.55-64] 7.2691 16.399 0.443 0.658 -24.872 39.410 | |
gender[T.Male]:age[T.65-74] 7.3704 16.398 0.449 0.653 -24.769 39.510 | |
gender[T.Male]:age[T.75-84] 6.9996 16.398 0.427 0.669 -25.139 39.138 | |
gender[T.Male]:age[T.85+] 6.9513 16.398 0.424 0.672 -25.188 39.090 | |
gender[T.Male]:age[T.<25] 6.9379 16.423 0.422 0.673 -25.250 39.126 | |
=============================================================================================== | |
name aic deviance | |
0 model_N 3383.984681 3.306383e+03 | |
1 model_A 104.435428 2.283412e+01 | |
2 model_E 104.490450 1.088914e+01 | |
3 model_F 98.976325 1.337501e+01 | |
4 model_H 107.604059 1.224176e-11 | |
[5 rows x 3 columns] | |
gender agemidpt popn observed model_N model_A model_E 0 Male 20 1141100 3 166.500827 2.776322 3.196614 | |
1 Male 30 485571 0 70.850910 3.423832 0.625823 | |
2 Male 40 504312 12 73.585457 10.305586 9.420784 | |
3 Male 50 447315 25 65.268879 26.491128 22.634397 | |
4 Male 60 330902 61 48.282759 56.793680 57.609404 | |
5 Male 70 226403 130 33.035042 112.615046 118.257129 | |
6 Male 80 130527 192 19.045529 188.160397 201.572546 | |
7 Male 90 29785 102 4.346006 124.434008 111.683303 | |
8 Female 20 1086408 2 158.520577 1.544296 1.803386 | |
9 Female 30 489948 1 71.489569 2.018372 0.374177 | |
10 Female 40 504030 3 73.544310 6.017571 5.579216 | |
11 Female 50 445763 11 65.042422 15.423485 13.365603 | |
12 Female 60 323669 30 47.227374 32.455867 33.390596 | |
13 Female 70 241488 63 35.236133 70.178048 74.742871 | |
14 Female 80 179686 174 26.218445 151.332983 164.427454 | |
15 Female 90 67203 159 9.805762 164.029377 149.316697 | |
model_F model_H | |
0 1.666557 3.000000 | |
1 2.667646 0.001373 | |
2 9.717352 12.000000 | |
3 28.185737 25.000000 | |
4 63.573595 61.000000 | |
5 123.655991 130.000000 | |
6 188.965751 192.000000 | |
7 106.567371 102.000000 | |
8 0.484227 2.000000 | |
9 0.922189 1.000000 | |
10 3.735376 3.000000 | |
11 12.127859 11.000000 | |
12 30.142383 30.000000 | |
13 71.773233 63.000000 | |
14 158.915345 174.000000 | |
15 164.899387 159.000000 | |
[16 rows x 9 columns] | |
gender agemidpt popn observed model_N model_A model_E 0 Male 20 1141100 0.000003 0.000146 0.000002 0.000003 | |
1 Male 30 485571 0.000000 0.000146 0.000007 0.000001 | |
2 Male 40 504312 0.000024 0.000146 0.000020 0.000019 | |
3 Male 50 447315 0.000056 0.000146 0.000059 0.000051 | |
4 Male 60 330902 0.000184 0.000146 0.000172 0.000174 | |
5 Male 70 226403 0.000574 0.000146 0.000497 0.000522 | |
6 Male 80 130527 0.001471 0.000146 0.001442 0.001544 | |
7 Male 90 29785 0.003425 0.000146 0.004178 0.003750 | |
8 Female 20 1086408 0.000002 0.000146 0.000001 0.000002 | |
9 Female 30 489948 0.000002 0.000146 0.000004 0.000001 | |
10 Female 40 504030 0.000006 0.000146 0.000012 0.000011 | |
11 Female 50 445763 0.000025 0.000146 0.000035 0.000030 | |
12 Female 60 323669 0.000093 0.000146 0.000100 0.000103 | |
13 Female 70 241488 0.000261 0.000146 0.000291 0.000310 | |
14 Female 80 179686 0.000968 0.000146 0.000842 0.000915 | |
15 Female 90 67203 0.002366 0.000146 0.002441 0.002222 | |
model_F model_H | |
0 0.000001 2.629042e-06 | |
1 0.000005 2.828315e-09 | |
2 0.000019 2.379479e-05 | |
3 0.000063 5.588903e-05 | |
4 0.000192 1.843446e-04 | |
5 0.000546 5.741973e-04 | |
6 0.001448 1.470960e-03 | |
7 0.003578 3.424543e-03 | |
8 0.000000 1.840929e-06 | |
9 0.000002 2.041033e-06 | |
10 0.000007 5.952027e-06 | |
11 0.000027 2.467679e-05 | |
12 0.000093 9.268728e-05 | |
13 0.000297 2.608825e-04 | |
14 0.000884 9.683559e-04 | |
15 0.002454 2.365966e-03 | |
[16 rows x 9 columns] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# csvにしたものから読み込む | |
data <- read.csv("diabetes.csv") | |
# exclude NA | |
data <- na.omit(data) | |
# str(data)で見ると、因子に余計なものが混じっている | |
data$gender <- droplevels(data$gender) | |
data$age <- droplevels(data$age) | |
# 一般化線形モデルの作成 | |
attach(data) | |
Model_N <- glm(deaths ~ 1, family = poisson(link = log), offset = l_popn) | |
Model_A <- glm(deaths ~ gender + agemidpt, family = poisson(link = log), offset = l_popn) | |
Model_E <- glm(deaths ~ C(gender, base=2) + C(age,base=4) , family = poisson(link = log), offset = l_popn) | |
Model_F <- glm(deaths ~ C(gender, base=2) + agemidpt + I( agemidpt^2) + agemidpt:C(gender, base=2) ,family = poisson(link = log), offset = l_popn) | |
Model_H <- glm(deaths ~ gender * age, family = poisson(link = log), offset = l_popn) | |
# 結果をまとめてみる | |
Model_E | |
ModelList <- list(Model_N,Model_A, Model_E, Model_F, Model_H) | |
names <- c("Model_N", "Model_A", "Model_E", "Model_F", "Model_H") | |
deviances <- sapply(ModelList, function(m){m$deviance}) | |
df.residuals <- sapply(ModelList, function(m){m$df.residual}) | |
aics <- sapply(ModelList, function(m){m$aic}) | |
ModelListDataFrame <- data.frame(NAME=names, DEVIANCE=deviances, DF.RESIDUAL=df.residuals, AIC=aics) | |
ModelListDataFrame |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> # 結果をまとめてみる | |
> Model_E | |
Call: glm(formula = deaths ~ C(gender, base = 2) + C(age, base = 4), | |
family = poisson(link = log), offset = l_popn) | |
Coefficients: | |
(Intercept) C(gender, base = 2)1 C(age, base = 4)1 C(age, base = 4)2 C(age, base = 4)3 | |
-9.8915 -0.5233 -2.8939 -3.6702 -0.9965 | |
C(age, base = 4)5 C(age, base = 4)6 C(age, base = 4)7 C(age, base = 4)8 | |
1.2357 2.3343 3.4184 4.3055 | |
Degrees of Freedom: 15 Total (i.e. Null); 7 Residual | |
Null Deviance: 3306 | |
Residual Deviance: 10.89 AIC: 104.5 | |
> ModelListDataFrame | |
NAME DEVIANCE DF.RESIDUAL AIC | |
1 Model_N 3.306383e+03 15 3383.98468 | |
2 Model_A 2.283412e+01 13 104.43543 | |
3 Model_E 1.088914e+01 7 104.49045 | |
4 Model_F 1.337501e+01 11 98.97633 | |
5 Model_H 4.123213e-10 0 107.60131 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment