Regresión Logística

Regresión logística simple

# Simple logistic regression: read the Pulso data set and model Pulso as a
# function of Peso using a binomial family with a logit link.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
Pulso.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Categoricos/Pulso.txt", header = TRUE)
# The columns are attached so the formula below can name them without a `data`
# argument; the matching detach(Pulso.data) comes after the model output.
attach(Pulso.data)
Pulso.m1 <- glm(Pulso ~ Peso, family = binomial(link = logit))
summary(Pulso.m1)
## 
## Call:
## glm(formula = Pulso ~ Peso, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0072  -0.7881  -0.6682  -0.4573   1.9954  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.45452    1.59763   0.910    0.363
## Peso        -0.01832    0.01127  -1.626    0.104
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.214  on 91  degrees of freedom
## Residual deviance:  98.377  on 90  degrees of freedom
## AIC: 102.38
## 
## Number of Fisher Scoring iterations: 4
# Estimated variance-covariance matrix of the coefficients of Pulso.m1.
vcov(Pulso.m1)
##             (Intercept)          Peso
## (Intercept)  2.55243657 -0.0177859492
## Peso        -0.01778595  0.0001270017
# Take Pulso.data off the search path again; it was attached before fitting.
detach(Pulso.data)

Regresión logística múltiple

# Multiple logistic regression: read the Diabetes data set and fit the full
# model, where `Diabetes ~ .` uses every other column as a predictor.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
Diabetes.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Categoricos/Diabetes.txt", header = TRUE)
# Attached because later calls in this script (the reduced model, the
# Hosmer-Lemeshow test, the first intercept-only model) name the columns
# without a `data` argument.
attach(Diabetes.data)
Diabetes.m1 <- glm(Diabetes ~ ., family = binomial(link = logit), data = Diabetes.data)
summary(Diabetes.m1)
## 
## Call:
## glm(formula = Diabetes ~ ., family = binomial(link = logit), 
##     data = Diabetes.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## Embarazos    0.1231823  0.0320776   3.840 0.000123 ***
## Plasma       0.0351637  0.0037087   9.481  < 2e-16 ***
## Presion     -0.0132955  0.0052336  -2.540 0.011072 *  
## Triceps      0.0006190  0.0068994   0.090 0.928515    
## Suero       -0.0011917  0.0009012  -1.322 0.186065    
## Indice       0.0897010  0.0150876   5.945 2.76e-09 ***
## Pedigri      0.9451797  0.2991475   3.160 0.001580 ** 
## Edad         0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
# Reduced model: keeps Embarazos, Plasma, Presion, Indice and Pedigri.
# No `data` argument is given, so the variables are looked up on the search
# path (Diabetes.data is attached earlier in the script).
Diabetes.m2 <- glm(
  Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri,
  family = binomial(link = logit)
)
summary(Diabetes.m2)
## 
## Call:
## glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Indice + 
##     Pedigri, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7931  -0.7362  -0.4188   0.7251   2.9555  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.954952   0.675823 -11.771  < 2e-16 ***
## Embarazos    0.153492   0.027835   5.514  3.5e-08 ***
## Plasma       0.034658   0.003394  10.213  < 2e-16 ***
## Presion     -0.012007   0.005031  -2.387  0.01700 *  
## Indice       0.084832   0.014125   6.006  1.9e-09 ***
## Pedigri      0.910628   0.294027   3.097  0.00195 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 728.56  on 762  degrees of freedom
## AIC: 740.56
## 
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test: reduced model (Diabetes.m2) against the full model
# (Diabetes.m1), using a chi-square reference distribution.
anova(Diabetes.m2, Diabetes.m1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri
## Model 2: Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice + 
##     Pedigri + Edad
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       762     728.56                     
## 2       759     723.45  3   5.1142   0.1636
# Hosmer-Lemeshow goodness-of-fit test
library(ResourceSelection)
## ResourceSelection 0.3-0   2016-11-04
# Compares the observed response with the fitted probabilities of Diabetes.m2
# using g = 10 groups. `Diabetes` is given unqualified, so it is resolved
# through the earlier attach(Diabetes.data).
hoslem.test(Diabetes, fitted(Diabetes.m2), g = 10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  Diabetes, fitted(Diabetes.m2)
## X-squared = 9.4733, df = 8, p-value = 0.304
# C statistic: twice the difference between the log-likelihood of the reduced
# model (Diabetes.m2) and that of the intercept-only model (Diabetes.m0).
Diabetes.m0 <- glm(Diabetes ~ 1, family = binomial(link = logit))
C <- 2*(logLik(Diabetes.m2)[1] - logLik(Diabetes.m0)[1])
C
## [1] 264.9243
# Upper-tail chi-square probability for C; df = 5 matches the five predictors
# in Diabetes.m2.
# NOTE(review): `1 - pchisq(...)` evaluates to exactly 0 here. Presumably
# `pchisq(C, df = 5, lower.tail = FALSE)` would report the tiny tail
# probability instead of losing it to cancellation -- confirm before changing,
# because the recorded output below would then differ.
1 - pchisq(q = C, df = 5)
## [1] 0
library(pscl)
## Loading required package: MASS
## Loading required package: lattice
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Cross-check of the C statistic with pscl::pR2(). The element is selected by
# its name ("G2") rather than by position, so the lookup does not depend on
# the order in which pR2() returns its measures.
pR2(Diabetes.m2)["G2"]
##       G2 
## 264.9243
# Pseudo R-squared: relative reduction in log-likelihood of the reduced model
# (Diabetes.m2) with respect to the intercept-only model (Diabetes.m0).
pseudoR2 <- (logLik(Diabetes.m0)[1] - logLik(Diabetes.m2)[1]) / logLik(Diabetes.m0)[1]
pseudoR2
## [1] 0.2666619
# Cross-check with pscl::pR2(). The element is selected by its name
# ("McFadden") rather than by position, so the lookup does not depend on the
# order in which pR2() returns its measures.
pR2(Diabetes.m2)["McFadden"]
##  McFadden 
## 0.2666619
# Variable importance
library(caret)
## Loading required package: ggplot2
# For this model the reported Overall values coincide with the absolute
# z statistics shown by summary(Diabetes.m2).
varImp(Diabetes.m2)
##             Overall
## Embarazos  5.514303
## Plasma    10.213000
## Presion    2.386814
## Indice     6.005878
## Pedigri    3.097083
# Classification rate on a hold-out sample
set.seed(300)  # fixed seed so the train/test partition is reproducible
# p = 0.7 sends 70% of the rows to the training set; the rest form the test set.
indices <- createDataPartition(Diabetes.data$Diabetes, p = 0.7, list = FALSE)
Diabetes.train <- Diabetes.data[indices, ]
Diabetes.test <- Diabetes.data[-indices, ]

# Refit the reduced model on the training rows only.
Diabetes.train.m1 <- glm(Diabetes ~ Embarazos + Plasma + Presion + Indice + Pedigri, family = binomial(link = logit), data = Diabetes.train)

# Predicted probabilities for the test rows, classified as 1 at a 0.5 cut-off.
prob <- predict(Diabetes.train.m1, newdata = Diabetes.test, type = "response")
pred <- ifelse(prob >= 0.5, 1, 0)
# confusionMatrix() in current caret releases rejects plain numeric vectors:
# `data` and `reference` must be factors with the same levels. Both are
# converted with explicit levels so that a class missing from the predictions
# still appears in the table. `pred` itself is left numeric.
confusionMatrix(
  data = factor(pred, levels = c(0, 1)),
  reference = factor(Diabetes.test$Diabetes, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 129  37
##          1  22  42
##                                           
##                Accuracy : 0.7435          
##                  95% CI : (0.6819, 0.7986)
##     No Information Rate : 0.6565          
##     P-Value [Acc > NIR] : 0.00286         
##                                           
##                   Kappa : 0.4042          
##  Mcnemar's Test P-Value : 0.06836         
##                                           
##             Sensitivity : 0.5316          
##             Specificity : 0.8543          
##          Pos Pred Value : 0.6563          
##          Neg Pred Value : 0.7771          
##              Prevalence : 0.3435          
##          Detection Rate : 0.1826          
##    Detection Prevalence : 0.2783          
##       Balanced Accuracy : 0.6930          
##                                           
##        'Positive' Class : 1               
## 
# Residuals of the reduced model (Diabetes.m2): raw and standardised versions
# of the Pearson and deviance residuals.
# The hat values are computed once and reused for both standardisations
# instead of calling lm.influence() twice.
hat.m2 <- lm.influence(Diabetes.m2)$hat
res.pearson <- resid(Diabetes.m2, type = "pearson")
res.est.p <- res.pearson / sqrt(1 - hat.m2)
res.devianza <- resid(Diabetes.m2, type = "deviance")
res.est.d <- res.devianza / sqrt(1 - hat.m2)
# cbind() takes the column names from the variable names, so these stay as is.
residuales <- cbind(res.pearson, res.est.p, res.devianza, res.est.d)
# Show the first ten observations.
residuales[1:10, ]
##    res.pearson  res.est.p res.devianza  res.est.d
## 1    0.7217376  0.7232687    0.9157573  0.9177000
## 2   -0.2152579 -0.2154809   -0.3009801 -0.3012919
## 3    0.4878578  0.4909426    0.6534546  0.6575865
## 4   -0.2261049 -0.2263629   -0.3157832 -0.3161435
## 5    0.3583168  0.3645645    0.4915134  0.5000834
## 6   -0.4272110 -0.4280076   -0.5790313 -0.5801110
## 7    3.5535251  3.5591605    2.2856477  2.2892725
## 8   -1.4065088 -1.4374257   -1.4773907 -1.5098656
## 9    0.5855506  0.5899804    0.7678656  0.7736745
## 10   5.3020283  5.3256525    2.5965875  2.6081571

Selección de modelos

library(MASS)
# Backward elimination by AIC, starting from the full model (Diabetes.m1).
# The result is printed, not stored.
stepAIC(Diabetes.m1, direction = "backward")
## Start:  AIC=741.45
## Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice + 
##     Pedigri + Edad
## 
##             Df Deviance    AIC
## - Triceps    1   723.45 739.45
## - Suero      1   725.19 741.19
## <none>           723.45 741.45
## - Edad       1   725.97 741.97
## - Presion    1   729.99 745.99
## - Pedigri    1   733.78 749.78
## - Embarazos  1   738.68 754.68
## - Indice     1   764.22 780.22
## - Plasma     1   838.37 854.37
## 
## Step:  AIC=739.45
## Diabetes ~ Embarazos + Plasma + Presion + Suero + Indice + Pedigri + 
##     Edad
## 
##             Df Deviance    AIC
## <none>           723.45 739.45
## - Suero      1   725.46 739.46
## - Edad       1   725.97 739.97
## - Presion    1   730.13 744.13
## - Pedigri    1   733.92 747.92
## - Embarazos  1   738.69 752.69
## - Indice     1   768.77 782.77
## - Plasma     1   840.87 854.87
## 
## Call:  glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Suero + 
##     Indice + Pedigri + Edad, family = binomial(link = logit), 
##     data = Diabetes.data)
## 
## Coefficients:
## (Intercept)    Embarazos       Plasma      Presion        Suero  
##   -8.405136     0.123172     0.035112    -0.013214    -0.001157  
##      Indice      Pedigri         Edad  
##    0.090089     0.947595     0.014789  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  760 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.5     AIC: 739.5
# Stepwise selection by AIC in both directions, starting from the full model
# (Diabetes.m1). The result is printed, not stored.
stepAIC(Diabetes.m1, direction = "both")
## Start:  AIC=741.45
## Diabetes ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice + 
##     Pedigri + Edad
## 
##             Df Deviance    AIC
## - Triceps    1   723.45 739.45
## - Suero      1   725.19 741.19
## <none>           723.45 741.45
## - Edad       1   725.97 741.97
## - Presion    1   729.99 745.99
## - Pedigri    1   733.78 749.78
## - Embarazos  1   738.68 754.68
## - Indice     1   764.22 780.22
## - Plasma     1   838.37 854.37
## 
## Step:  AIC=739.45
## Diabetes ~ Embarazos + Plasma + Presion + Suero + Indice + Pedigri + 
##     Edad
## 
##             Df Deviance    AIC
## <none>           723.45 739.45
## - Suero      1   725.46 739.46
## - Edad       1   725.97 739.97
## + Triceps    1   723.45 741.45
## - Presion    1   730.13 744.13
## - Pedigri    1   733.92 747.92
## - Embarazos  1   738.69 752.69
## - Indice     1   768.77 782.77
## - Plasma     1   840.87 854.87
## 
## Call:  glm(formula = Diabetes ~ Embarazos + Plasma + Presion + Suero + 
##     Indice + Pedigri + Edad, family = binomial(link = logit), 
##     data = Diabetes.data)
## 
## Coefficients:
## (Intercept)    Embarazos       Plasma      Presion        Suero  
##   -8.405136     0.123172     0.035112    -0.013214    -0.001157  
##      Indice      Pedigri         Edad  
##    0.090089     0.947595     0.014789  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  760 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.5     AIC: 739.5
# Forward selection by AIC: refit the intercept-only model, this time with an
# explicit `data` argument, and let stepAIC() consider adding each of the
# eight predictors named in `scope`.
Diabetes.m0 <- glm(
  Diabetes ~ 1,
  family = binomial(link = logit),
  data = Diabetes.data
)
stepAIC(
  Diabetes.m0,
  direction = "forward",
  scope = ~ Embarazos + Plasma + Presion + Triceps + Suero + Indice + Pedigri + Edad
)
## Start:  AIC=995.48
## Diabetes ~ 1
## 
##             Df Deviance    AIC
## + Plasma     1   808.72 812.72
## + Indice     1   920.71 924.71
## + Edad       1   950.72 954.72
## + Embarazos  1   956.21 960.21
## + Pedigri    1   970.86 974.86
## + Suero      1   980.81 984.81
## + Triceps    1   989.19 993.19
## + Presion    1   990.13 994.13
## <none>           993.48 995.48
## 
## Step:  AIC=812.72
## Diabetes ~ Plasma
## 
##             Df Deviance    AIC
## + Indice     1   771.40 777.40
## + Embarazos  1   784.95 790.95
## + Pedigri    1   796.99 802.99
## + Edad       1   797.36 803.36
## <none>           808.72 812.72
## + Triceps    1   807.07 813.07
## + Suero      1   807.77 813.77
## + Presion    1   808.59 814.59
## 
## Step:  AIC=777.4
## Diabetes ~ Plasma + Indice
## 
##             Df Deviance    AIC
## + Embarazos  1   744.12 752.12
## + Edad       1   755.68 763.68
## + Pedigri    1   762.87 770.87
## + Suero      1   767.79 775.79
## + Presion    1   769.07 777.07
## <none>           771.40 777.40
## + Triceps    1   770.20 778.20
## 
## Step:  AIC=752.12
## Diabetes ~ Plasma + Indice + Embarazos
## 
##           Df Deviance    AIC
## + Pedigri  1   734.31 744.31
## + Presion  1   738.43 748.43
## + Edad     1   742.10 752.10
## <none>         744.12 752.12
## + Suero    1   742.43 752.43
## + Triceps  1   743.60 753.60
## 
## Step:  AIC=744.31
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri
## 
##           Df Deviance    AIC
## + Presion  1   728.56 740.56
## + Suero    1   731.51 743.51
## <none>         734.31 744.31
## + Edad     1   732.51 744.51
## + Triceps  1   733.06 745.06
## 
## Step:  AIC=740.56
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion
## 
##           Df Deviance    AIC
## + Edad     1   725.46 739.46
## + Suero    1   725.97 739.97
## <none>         728.56 740.56
## + Triceps  1   728.00 742.00
## 
## Step:  AIC=739.46
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion + 
##     Edad
## 
##           Df Deviance    AIC
## + Suero    1   723.45 739.45
## <none>         725.46 739.46
## + Triceps  1   725.19 741.19
## 
## Step:  AIC=739.45
## Diabetes ~ Plasma + Indice + Embarazos + Pedigri + Presion + 
##     Edad + Suero
## 
##           Df Deviance    AIC
## <none>         723.45 739.45
## + Triceps  1   723.45 741.45
## 
## Call:  glm(formula = Diabetes ~ Plasma + Indice + Embarazos + Pedigri + 
##     Presion + Edad + Suero, family = binomial(link = logit), 
##     data = Diabetes.data)
## 
## Coefficients:
## (Intercept)       Plasma       Indice    Embarazos      Pedigri  
##   -8.405136     0.035112     0.090089     0.123172     0.947595  
##     Presion         Edad        Suero  
##   -0.013214     0.014789    -0.001157  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  760 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.5     AIC: 739.5