# Regresión logística múltiple
# Load the diabetes data set; header = TRUE takes the column names from the
# first line of the file (TRUE is spelled out because `T` can be reassigned).
Diabetes.data <- read.table(
  file = "http://tarwi.lamolina.edu.pe/~clopez/Categoricos/Diabetes.txt",
  header = TRUE
)
# attach() is kept because later statements in this script refer to columns
# by bare name (e.g. `Diabetes`); prefer passing `data =` in new code.
attach(Diabetes.data)
# Full logistic regression: Diabetes against every other column of the data.
Diabetes.m1 <- glm(
  Diabetes ~ .,
  family = binomial(link = logit),
  data = Diabetes.data
)
summary(Diabetes.m1)
##
## Call:
## glm(formula = Diabetes ~ ., family = binomial(link = logit),
## data = Diabetes.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## Embarazos 0.1231823 0.0320776 3.840 0.000123 ***
## Plasma 0.0351637 0.0037087 9.481 < 2e-16 ***
## Presion -0.0132955 0.0052336 -2.540 0.011072 *
## Triceps 0.0006190 0.0068994 0.090 0.928515
## Suero -0.0011917 0.0009012 -1.322 0.186065
## Indice 0.0897010 0.0150876 5.945 2.76e-09 ***
## Pedigri 0.9451797 0.2991475 3.160 0.001580 **
## Edad 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
# Reduced logistic model with five predictors. `data =` is passed explicitly
# so the fit does not depend on attach(); the Call echoed in the recorded
# output below predates this argument.
Diabetes.m2 <- glm(
  Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri,
  family = binomial(link = logit),
  data = Diabetes.data
)
summary(Diabetes.m2)
##
## Call:
## glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Indice +
## Pedigri, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7931 -0.7362 -0.4188 0.7251 2.9555
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.954952 0.675823 -11.771 < 2e-16 ***
## Embarazos 0.153492 0.027835 5.514 3.5e-08 ***
## Plasma 0.034658 0.003394 10.213 < 2e-16 ***
## Presion -0.012007 0.005031 -2.387 0.01700 *
## Indice 0.084832 0.014125 6.006 1.9e-09 ***
## Pedigri 0.910628 0.294027 3.097 0.00195 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 728.56 on 762 degrees of freedom
## AIC: 740.56
##
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test: reduced model (Diabetes.m2) against the full model
# (Diabetes.m1).
anova(
  Diabetes.m2,
  Diabetes.m1,
  test = "Chisq"
)
## Analysis of Deviance Table
##
## Model 1: Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri
## Model 2: Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice +
## Pedigri + Edad
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 762 728.56
## 2 759 723.45 3 5.1142 0.1636
# Hosmer-Lemeshow goodness-of-fit test for the reduced model, using 10 groups.
library(ResourceSelection)
## ResourceSelection 0.3-0 2016-11-04
# The observed response is read from the fitted model object (`$y`) instead of
# the attached bare name, so it does not depend on attach() and comes from the
# same fit as fitted(). The `data:` line of the recorded output below shows
# the earlier argument.
hoslem.test(Diabetes.m2$y, fitted(Diabetes.m2), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: Diabetes, fitted(Diabetes.m2)
## X-squared = 9.4733, df = 8, p-value = 0.304
# Statistic C: likelihood-ratio statistic of the reduced model against the
# intercept-only model. `data =` is explicit so the null fit does not depend
# on attach().
Diabetes.m0 <- glm(
  Diabetes ~ 1,
  family = binomial(link = logit),
  data = Diabetes.data
)
C <- 2 * (logLik(Diabetes.m2)[1] - logLik(Diabetes.m0)[1])
C
## [1] 264.9243
# Upper-tail p-value. lower.tail = FALSE evaluates the tail directly, whereas
# 1 - pchisq() rounds to exactly 0 for a statistic this large (that is the
# value shown in the recorded output below). The degrees of freedom are the
# difference in residual df between the two fits instead of a hard-coded 5.
pchisq(
  q = C,
  df = Diabetes.m0$df.residual - Diabetes.m2$df.residual,
  lower.tail = FALSE
)
## [1] 0
library(pscl)
## Loading required package: MASS
## Loading required package: lattice
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# G2 as reported by pR2(), selected by name rather than by position so the
# lookup does not depend on the order of the returned vector.
pR2(Diabetes.m2)["G2"]
## G2
## 264.9243
# Pseudo R-squared from the log-likelihoods of the intercept-only and reduced
# models: (ll_null - ll_model) / ll_null. local() keeps the two intermediate
# values out of the workspace.
pseudoR2 <- local({
  ll.nulo <- logLik(Diabetes.m0)[1]
  ll.modelo <- logLik(Diabetes.m2)[1]
  (ll.nulo - ll.modelo) / ll.nulo
})
pseudoR2
## [1] 0.2666619
# Cross-check against the McFadden entry of pR2(), selected by name rather
# than by position.
pR2(Diabetes.m2)["McFadden"]
## McFadden
## 0.2666619
# Variable importance for the reduced model; the recorded scores equal the
# absolute z statistics shown in the model summary.
library(caret)
## Loading required package: ggplot2
varImp(object = Diabetes.m2)
## Overall
## Embarazos 5.514303
## Plasma 10.213000
## Presion 2.386814
## Indice 6.005878
## Pedigri 3.097083
# Classification rate on a hold-out sample (70 % train / 30 % test).
set.seed(300)  # fixed seed so the partition is reproducible
indices <- createDataPartition(Diabetes.data$Diabetes, p = 0.7, list = FALSE)
Diabetes.train <- Diabetes.data[indices, ]
Diabetes.test <- Diabetes.data[-indices, ]
# Refit the reduced model on the training rows only.
Diabetes.train.m1 <- glm(
  Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri,
  family = binomial(link = logit),
  data = Diabetes.train
)
# Predicted probabilities on the test rows, cut at 0.5 into 0/1 labels.
prob <- predict(Diabetes.train.m1, newdata = Diabetes.test, type = "response")
pred <- ifelse(prob >= 0.5, 1, 0)
# confusionMatrix() requires factors with identical levels; fixing the levels
# to c(0, 1) on both sides also keeps the table 2 x 2 when one class is never
# predicted.
confusionMatrix(
  data = factor(pred, levels = c(0, 1)),
  reference = factor(Diabetes.test$Diabetes, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 129 37
## 1 22 42
##
## Accuracy : 0.7435
## 95% CI : (0.6819, 0.7986)
## No Information Rate : 0.6565
## P-Value [Acc > NIR] : 0.00286
##
## Kappa : 0.4042
## Mcnemar's Test P-Value : 0.06836
##
## Sensitivity : 0.5316
## Specificity : 0.8543
## Pos Pred Value : 0.6563
## Neg Pred Value : 0.7771
## Prevalence : 0.3435
## Detection Rate : 0.1826
## Detection Prevalence : 0.2783
## Balanced Accuracy : 0.6930
##
## 'Positive' Class : 1
##
# Residuals of the reduced model: Pearson and deviance, each raw and
# standardized by sqrt(1 - leverage).
# The hat values are computed once and reused for both standardizations.
hat.m2 <- hatvalues(Diabetes.m2)
res.pearson <- resid(Diabetes.m2, type = "pearson")
res.est.p <- res.pearson / sqrt(1 - hat.m2)
res.devianza <- resid(Diabetes.m2, type = "deviance")
res.est.d <- res.devianza / sqrt(1 - hat.m2)
residuales <- cbind(res.pearson, res.est.p, res.devianza, res.est.d)
# Show the first ten rows; head() also copes with fewer than ten observations.
head(residuales, 10)
## res.pearson res.est.p res.devianza res.est.d
## 1 0.7217376 0.7232687 0.9157573 0.9177000
## 2 -0.2152579 -0.2154809 -0.3009801 -0.3012919
## 3 0.4878578 0.4909426 0.6534546 0.6575865
## 4 -0.2261049 -0.2263629 -0.3157832 -0.3161435
## 5 0.3583168 0.3645645 0.4915134 0.5000834
## 6 -0.4272110 -0.4280076 -0.5790313 -0.5801110
## 7 3.5535251 3.5591605 2.2856477 2.2892725
## 8 -1.4065088 -1.4374257 -1.4773907 -1.5098656
## 9 0.5855506 0.5899804 0.7678656 0.7736745
## 10 5.3020283 5.3256525 2.5965875 2.6081571
# Selección de modelos
library(MASS)
# Backward elimination by AIC, starting from the full model.
stepAIC(object = Diabetes.m1, direction = "backward")
## Start: AIC=741.45
## Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice +
## Pedigri + Edad
##
## Df Deviance AIC
## - Triceps 1 723.45 739.45
## - Suero 1 725.19 741.19
## <none> 723.45 741.45
## - Edad 1 725.97 741.97
## - Presion 1 729.99 745.99
## - Pedigri 1 733.78 749.78
## - Embarazos 1 738.68 754.68
## - Indice 1 764.22 780.22
## - Plasma 1 838.37 854.37
##
## Step: AIC=739.45
## Diabetes ~ Embarazos + Plasma + Presion + Suero + Indice + Pedigri +
## Edad
##
## Df Deviance AIC
## <none> 723.45 739.45
## - Suero 1 725.46 739.46
## - Edad 1 725.97 739.97
## - Presion 1 730.13 744.13
## - Pedigri 1 733.92 747.92
## - Embarazos 1 738.69 752.69
## - Indice 1 768.77 782.77
## - Plasma 1 840.87 854.87
##
## Call: glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Suero +
## Indice + Pedigri + Edad, family = binomial(link = logit),
## data = Diabetes.data)
##
## Coefficients:
## (Intercept) Embarazos Plasma Presion Suero
## -8.405136 0.123172 0.035112 -0.013214 -0.001157
## Indice Pedigri Edad
## 0.090089 0.947595 0.014789
##
## Degrees of Freedom: 767 Total (i.e. Null); 760 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.5 AIC: 739.5
# Stepwise selection by AIC in both directions, starting from the full model.
stepAIC(object = Diabetes.m1, direction = "both")
## Start: AIC=741.45
## Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice +
## Pedigri + Edad
##
## Df Deviance AIC
## - Triceps 1 723.45 739.45
## - Suero 1 725.19 741.19
## <none> 723.45 741.45
## - Edad 1 725.97 741.97
## - Presion 1 729.99 745.99
## - Pedigri 1 733.78 749.78
## - Embarazos 1 738.68 754.68
## - Indice 1 764.22 780.22
## - Plasma 1 838.37 854.37
##
## Step: AIC=739.45
## Diabetes ~ Embarazos + Plasma + Presion + Suero + Indice + Pedigri +
## Edad
##
## Df Deviance AIC
## <none> 723.45 739.45
## - Suero 1 725.46 739.46
## - Edad 1 725.97 739.97
## + Triceps 1 723.45 741.45
## - Presion 1 730.13 744.13
## - Pedigri 1 733.92 747.92
## - Embarazos 1 738.69 752.69
## - Indice 1 768.77 782.77
## - Plasma 1 840.87 854.87
##
## Call: glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Suero +
## Indice + Pedigri + Edad, family = binomial(link = logit),
## data = Diabetes.data)
##
## Coefficients:
## (Intercept) Embarazos Plasma Presion Suero
## -8.405136 0.123172 0.035112 -0.013214 -0.001157
## Indice Pedigri Edad
## 0.090089 0.947595 0.014789
##
## Degrees of Freedom: 767 Total (i.e. Null); 760 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.5 AIC: 739.5
# Forward selection by AIC: start from the intercept-only model; `scope`
# lists the candidate terms that may be added.
Diabetes.m0 <- glm(
  Diabetes ~ 1,
  family = binomial(link = logit),
  data = Diabetes.data
)
stepAIC(
  Diabetes.m0,
  direction = "forward",
  scope = ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice +
    Pedigri + Edad
)
## Start: AIC=995.48
## Diabetes ~ 1
##
## Df Deviance AIC
## + Plasma 1 808.72 812.72
## + Indice 1 920.71 924.71
## + Edad 1 950.72 954.72
## + Embarazos 1 956.21 960.21
## + Pedigri 1 970.86 974.86
## + Suero 1 980.81 984.81
## + Triceps 1 989.19 993.19
## + Presion 1 990.13 994.13
## <none> 993.48 995.48
##
## Step: AIC=812.72
## Diabetes ~ Plasma
##
## Df Deviance AIC
## + Indice 1 771.40 777.40
## + Embarazos 1 784.95 790.95
## + Pedigri 1 796.99 802.99
## + Edad 1 797.36 803.36
## <none> 808.72 812.72
## + Triceps 1 807.07 813.07
## + Suero 1 807.77 813.77
## + Presion 1 808.59 814.59
##
## Step: AIC=777.4
## Diabetes ~ Plasma + Indice
##
## Df Deviance AIC
## + Embarazos 1 744.12 752.12
## + Edad 1 755.68 763.68
## + Pedigri 1 762.87 770.87
## + Suero 1 767.79 775.79
## + Presion 1 769.07 777.07
## <none> 771.40 777.40
## + Triceps 1 770.20 778.20
##
## Step: AIC=752.12
## Diabetes ~ Plasma + Indice + Embarazos
##
## Df Deviance AIC
## + Pedigri 1 734.31 744.31
## + Presion 1 738.43 748.43
## + Edad 1 742.10 752.10
## <none> 744.12 752.12
## + Suero 1 742.43 752.43
## + Triceps 1 743.60 753.60
##
## Step: AIC=744.31
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri
##
## Df Deviance AIC
## + Presion 1 728.56 740.56
## + Suero 1 731.51 743.51
## <none> 734.31 744.31
## + Edad 1 732.51 744.51
## + Triceps 1 733.06 745.06
##
## Step: AIC=740.56
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion
##
## Df Deviance AIC
## + Edad 1 725.46 739.46
## + Suero 1 725.97 739.97
## <none> 728.56 740.56
## + Triceps 1 728.00 742.00
##
## Step: AIC=739.46
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion +
## Edad
##
## Df Deviance AIC
## + Suero 1 723.45 739.45
## <none> 725.46 739.46
## + Triceps 1 725.19 741.19
##
## Step: AIC=739.45
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion +
## Edad + Suero
##
## Df Deviance AIC
## <none> 723.45 739.45
## + Triceps 1 723.45 741.45
##
## Call: glm(formula = Diabetes ~ Plasma + Indice + Embarazos + Pedigri +
## Presion + Edad + Suero, family = binomial(link = logit),
## data = Diabetes.data)
##
## Coefficients:
## (Intercept) Plasma Indice Embarazos Pedigri
## -8.405136 0.035112 0.090089 0.123172 0.947595
## Presion Edad Suero
## -0.013214 0.014789 -0.001157
##
## Degrees of Freedom: 767 Total (i.e. Null); 760 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.5 AIC: 739.5