Selección de modelos

Factor de inflación de variancia

Grasa.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Grasa.txt", header = T)
Grasa.m1 <- lm(Grasa ~ ., data = Grasa.data)
library(car)
## Warning: package 'car' was built under R version 3.1.3
vif(Grasa.m1)
##      Edad      Peso    Altura    Cuello     Pecho   Abdomen    Cadera 
##  2.250450 33.509320  1.674591  4.324463  9.460877 11.767073 14.796520 
##     Muslo   Rodilla   Tobillo    Biceps Antebrazo    Muñeca 
##  7.777865  4.612147  1.907961  3.619744  2.192492  3.377515
Grasa.m2 <- lm(Grasa ~ . - Peso - Abdomen - Cadera, data = Grasa.data)
vif(Grasa.m2)
##      Edad    Altura    Cuello     Pecho     Muslo   Rodilla   Tobillo 
##  1.755439  1.328693  3.921041  3.899393  5.309081  4.066763  1.835487 
##    Biceps Antebrazo    Muñeca 
##  3.479701  2.142100  3.242098

Validación cruzada

# Conjunto de entrenamiento y de prueba

library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
dim(Grasa.data)
## [1] 252  14
set.seed(100)
muestra <- sample.split(Grasa.data, SplitRatio = 0.75)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)
dim(Grasa.train)
## [1] 180  14
dim(Grasa.test)
## [1] 72 14
Grasa.train.m1 <- lm(Grasa ~ . - Peso - Abdomen - Cadera, data = Grasa.train)
mean((predict(Grasa.train.m1, data.frame(Grasa.test)) - Grasa.test[, 1])^2)
## [1] 35.44589
# Suma de cuadrados de los residuales de predicción (PRESS)

Forbes.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Forbes.txt", header = T)
attach(Forbes.data)
Forbes.data
##    Temperatura Presion LogPresion
## 1        194.5   20.79     131.79
## 2        194.3   20.79     131.79
## 3        197.9   22.40     135.02
## 4        198.4   22.67     135.55
## 5        199.4   23.15     136.46
## 6        199.9   23.35     136.83
## 7        200.9   23.89     137.82
## 8        201.1   23.99     138.00
## 9        201.4   24.02     138.06
## 10       201.3   24.01     138.04
## 11       203.6   25.14     140.04
## 12       204.6   26.57     142.44
## 13       209.5   28.49     145.47
## 14       208.6   27.76     144.34
## 15       210.7   29.04     146.30
## 16       211.9   29.88     147.54
## 17       212.2   30.06     147.80
Forbes.m1 <- lm(LogPresion ~ Temperatura)
residual.press <- residuals(Forbes.m1)/(1 - lm.influence(Forbes.m1)$hat)
residual.press
##            1            2            3            4            5 
## -0.304675322 -0.083288239 -0.067650981  0.024204999  0.039614613 
##            6            7            8            9           10 
## -0.044828805  0.056900605  0.057775235 -0.165107507 -0.090911465 
##           11           12           13           14           15 
## -0.153905885  1.452649531  0.002157286 -0.365680092 -0.293122910 
##           16           17 
## -0.097839552 -0.110215517
PRESS <- sum(residual.press^2)
PRESS
## [1] 2.525852

Best subset selection

library(leaps)
Grasa.bss <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13)
summary(Grasa.bss)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13)
## 13 Variables  (and intercept)
##           Forced in Forced out
## Edad          FALSE      FALSE
## Peso          FALSE      FALSE
## Altura        FALSE      FALSE
## Cuello        FALSE      FALSE
## Pecho         FALSE      FALSE
## Abdomen       FALSE      FALSE
## Cadera        FALSE      FALSE
## Muslo         FALSE      FALSE
## Rodilla       FALSE      FALSE
## Tobillo       FALSE      FALSE
## Biceps        FALSE      FALSE
## Antebrazo     FALSE      FALSE
## Muñeca        FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
##           Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1  ( 1 )  " "  " "  " "    " "    " "   "*"     " "    " "   " "    
## 2  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 3  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 4  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 5  ( 1 )  " "  "*"  " "    "*"    " "   "*"     " "    " "   " "    
## 6  ( 1 )  "*"  "*"  " "    " "    " "   "*"     " "    "*"   " "    
## 7  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     " "    "*"   " "    
## 8  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 9  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 10  ( 1 ) "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 11  ( 1 ) "*"  "*"  "*"    "*"    " "   "*"     "*"    "*"   " "    
## 12  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   " "    
## 13  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   "*"    
##           Tobillo Biceps Antebrazo Muñeca
## 1  ( 1 )  " "     " "    " "       " "   
## 2  ( 1 )  " "     " "    " "       " "   
## 3  ( 1 )  " "     " "    " "       "*"   
## 4  ( 1 )  " "     " "    "*"       "*"   
## 5  ( 1 )  " "     " "    "*"       "*"   
## 6  ( 1 )  " "     " "    "*"       "*"   
## 7  ( 1 )  " "     " "    "*"       "*"   
## 8  ( 1 )  " "     " "    "*"       "*"   
## 9  ( 1 )  " "     "*"    "*"       "*"   
## 10  ( 1 ) "*"     "*"    "*"       "*"   
## 11  ( 1 ) "*"     "*"    "*"       "*"   
## 12  ( 1 ) "*"     "*"    "*"       "*"   
## 13  ( 1 ) "*"     "*"    "*"       "*"
reg.summary <- summary(Grasa.bss)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.6616721 0.7187981 0.7277401 0.7350112 0.7379161 0.7409681 0.7444763
##  [8] 0.7465540 0.7477322 0.7484878 0.7489813 0.7490458 0.7490500
plot(reg.summary$rss, xlab = "Número de variables", ylab = "RSS", type = "l"

plot(reg.summary$adjr2, xlab = "Número de variables", ylab = "Adjusted RSq", type = "l"
which.max(reg.summary$adjr2)
## [1] 9
points(9, reg.summary$adjr2[9], col = "red", cex = 2, pch = 20)

plot(reg.summary$cp, xlab = "Número de variables", ylab = "Cp", type = 'l'
which.min(reg.summary$cp)
## [1] 7
points(7, reg.summary$cp[7], col = "red", cex = 2, pch = 20)

which.min(reg.summary$bic)
## [1] 4
plot(reg.summary$bic, xlab = "Número de variables", ylab = "BIC", type = 'l'
points(4, reg.summary$bic[4], col = "red", cex = 2, pch = 20)

plot(Grasa.bss, scale = "r2")

plot(Grasa.bss, scale = "adjr2")

plot(Grasa.bss, scale = "Cp")

plot(Grasa.bss, scale = "bic")

Forward y Backward Stepwise Selection

Grasa.fwd <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13, method = "forward")
summary(Grasa.fwd)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13, 
##     method = "forward")
## 13 Variables  (and intercept)
##           Forced in Forced out
## Edad          FALSE      FALSE
## Peso          FALSE      FALSE
## Altura        FALSE      FALSE
## Cuello        FALSE      FALSE
## Pecho         FALSE      FALSE
## Abdomen       FALSE      FALSE
## Cadera        FALSE      FALSE
## Muslo         FALSE      FALSE
## Rodilla       FALSE      FALSE
## Tobillo       FALSE      FALSE
## Biceps        FALSE      FALSE
## Antebrazo     FALSE      FALSE
## Muñeca        FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: forward
##           Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1  ( 1 )  " "  " "  " "    " "    " "   "*"     " "    " "   " "    
## 2  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 3  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 4  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 5  ( 1 )  " "  "*"  " "    "*"    " "   "*"     " "    " "   " "    
## 6  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     " "    " "   " "    
## 7  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     " "    "*"   " "    
## 8  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 9  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 10  ( 1 ) "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 11  ( 1 ) "*"  "*"  "*"    "*"    " "   "*"     "*"    "*"   " "    
## 12  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   " "    
## 13  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   "*"    
##           Tobillo Biceps Antebrazo Muñeca
## 1  ( 1 )  " "     " "    " "       " "   
## 2  ( 1 )  " "     " "    " "       " "   
## 3  ( 1 )  " "     " "    " "       "*"   
## 4  ( 1 )  " "     " "    "*"       "*"   
## 5  ( 1 )  " "     " "    "*"       "*"   
## 6  ( 1 )  " "     " "    "*"       "*"   
## 7  ( 1 )  " "     " "    "*"       "*"   
## 8  ( 1 )  " "     " "    "*"       "*"   
## 9  ( 1 )  " "     "*"    "*"       "*"   
## 10  ( 1 ) "*"     "*"    "*"       "*"   
## 11  ( 1 ) "*"     "*"    "*"       "*"   
## 12  ( 1 ) "*"     "*"    "*"       "*"   
## 13  ( 1 ) "*"     "*"    "*"       "*"
which.max(summary(Grasa.fwd)$adjr2)
## [1] 9
which.min(summary(Grasa.fwd)$cp)
## [1] 7
which.min(summary(Grasa.fwd)$bic)
## [1] 4
Grasa.bwd <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13, method = "backward")
summary(Grasa.bwd)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13, 
##     method = "backward")
## 13 Variables  (and intercept)
##           Forced in Forced out
## Edad          FALSE      FALSE
## Peso          FALSE      FALSE
## Altura        FALSE      FALSE
## Cuello        FALSE      FALSE
## Pecho         FALSE      FALSE
## Abdomen       FALSE      FALSE
## Cadera        FALSE      FALSE
## Muslo         FALSE      FALSE
## Rodilla       FALSE      FALSE
## Tobillo       FALSE      FALSE
## Biceps        FALSE      FALSE
## Antebrazo     FALSE      FALSE
## Muñeca        FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: backward
##           Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1  ( 1 )  " "  " "  " "    " "    " "   "*"     " "    " "   " "    
## 2  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 3  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 4  ( 1 )  " "  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 5  ( 1 )  "*"  "*"  " "    " "    " "   "*"     " "    " "   " "    
## 6  ( 1 )  "*"  "*"  " "    " "    " "   "*"     " "    "*"   " "    
## 7  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     " "    "*"   " "    
## 8  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 9  ( 1 )  "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 10  ( 1 ) "*"  "*"  " "    "*"    " "   "*"     "*"    "*"   " "    
## 11  ( 1 ) "*"  "*"  "*"    "*"    " "   "*"     "*"    "*"   " "    
## 12  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   " "    
## 13  ( 1 ) "*"  "*"  "*"    "*"    "*"   "*"     "*"    "*"   "*"    
##           Tobillo Biceps Antebrazo Muñeca
## 1  ( 1 )  " "     " "    " "       " "   
## 2  ( 1 )  " "     " "    " "       " "   
## 3  ( 1 )  " "     " "    " "       "*"   
## 4  ( 1 )  " "     " "    "*"       "*"   
## 5  ( 1 )  " "     " "    "*"       "*"   
## 6  ( 1 )  " "     " "    "*"       "*"   
## 7  ( 1 )  " "     " "    "*"       "*"   
## 8  ( 1 )  " "     " "    "*"       "*"   
## 9  ( 1 )  " "     "*"    "*"       "*"   
## 10  ( 1 ) "*"     "*"    "*"       "*"   
## 11  ( 1 ) "*"     "*"    "*"       "*"   
## 12  ( 1 ) "*"     "*"    "*"       "*"   
## 13  ( 1 ) "*"     "*"    "*"       "*"
which.max(summary(Grasa.bwd)$adjr2)
## [1] 9
which.min(summary(Grasa.bwd)$cp)
## [1] 7
which.min(summary(Grasa.bwd)$bic)
## [1] 4
coef(Grasa.bss, 7)
## (Intercept)        Edad        Peso      Cuello     Abdomen       Muslo 
## -33.2579912   0.0681658  -0.1194405  -0.4038021   0.9178850   0.2219598 
##   Antebrazo      Muñeca 
##   0.5531394  -1.5324011
coef(Grasa.fwd, 7)
## (Intercept)        Edad        Peso      Cuello     Abdomen       Muslo 
## -33.2579912   0.0681658  -0.1194405  -0.4038021   0.9178850   0.2219598 
##   Antebrazo      Muñeca 
##   0.5531394  -1.5324011
coef(Grasa.bwd, 7)
## (Intercept)        Edad        Peso      Cuello     Abdomen       Muslo 
## -33.2579912   0.0681658  -0.1194405  -0.4038021   0.9178850   0.2219598 
##   Antebrazo      Muñeca 
##   0.5531394  -1.5324011
set.seed(150)
muestra <- sample.split(Grasa.data, SplitRatio = 0.70)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)

Grasa.train.m9 <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Cadera + Muslo + Biceps + Antebrazo + Muñeca, data = Grasa.train
ETTm9 <- mean((Grasa.test[, 1] - predict(Grasa.train.m9, data.frame(Grasa.test)))^2)
ETTm9
## [1] 22.21635
Grasa.train.m7 <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Muslo + Antebrazo + Muñeca, data = Grasa.train
ETTm7 <- mean((Grasa.test[, 1] - predict(Grasa.train.m7, data.frame(Grasa.test)))^2)
ETTm7
## [1] 22.52712
Grasa.train.m4 <- lm(Grasa ~ Peso + Abdomen + Antebrazo + Muñeca, data = Grasa.train
ETTm4 <- mean((Grasa.test[, 1] - predict(Grasa.train.m4, data.frame(Grasa.test)))^2)
ETTm4
## [1] 23.02023
set.seed(80)
muestra <- sample.split(Grasa.data, SplitRatio = 0.70)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)

ETTm9 <- mean((Grasa.test[, 1] - predict(Grasa.train.m9, data.frame(Grasa.test)))^2)
ETTm9
## [1] 16.78038
ETTm7 <- mean((Grasa.test[, 1] - predict(Grasa.train.m7, data.frame(Grasa.test)))^2)
ETTm7
## [1] 16.81096
ETTm4 <- mean((Grasa.test[, 1] - predict(Grasa.train.m4, data.frame(Grasa.test)))^2)
ETTm4
## [1] 16.70344

Validación Cruzada 10

library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
set.seed(500)
folds <- createFolds(Grasa.data[,1], k = 10, list = T, returnTrain = FALSE)

ETT <- rep(0, times = 10)
for (i in 1:10)
{
  Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
  Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
  Grasa.train.fit <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Cadera + Muslo + Biceps + Antebrazo + Muñeca, data = Grasa.train)
  ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}

ETT
##  [1] 18.59544 19.59803 14.34408 17.63973 17.27257 16.66673 30.15685
##  [8] 22.02368 18.83509 16.31175
EVC10.m9 <- mean(ETT)
EVC10.m9
## [1] 19.1444
ETT <- rep(0, times = 10)
for (i in 1:10)
{
  Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
  Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
  Grasa.train.fit <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Muslo + Antebrazo + Muñeca, data = Grasa.train)
  ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}
ETT
##  [1] 19.22187 19.12905 14.85992 17.31363 17.11515 17.39918 30.67429
##  [8] 19.77790 19.71245 16.84276
EVC10.m7 <- mean(ETT)
EVC10.m7
## [1] 19.20462
ETT <- rep(0, times = 10)
for (i in 1:10)
{
  Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
  Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
  Grasa.train.fit <- lm(Grasa ~ Peso + Abdomen + Antebrazo + Muñeca, data = Grasa.train)
  ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}
ETT
##  [1] 19.18556 18.30235 15.91159 16.44879 16.46345 18.69414 32.40676
##  [8] 17.97621 18.22337 18.58063
EVC10.m4 <- mean(ETT)
EVC10.m4
## [1] 19.21928