Selección de modelos
Factor de inflación de variancia
Grasa.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Grasa.txt", header = T)
Grasa.m1 <- lm(Grasa ~ ., data = Grasa.data)
library(car)
## Warning: package 'car' was built under R version 3.1.3
vif(Grasa.m1)
## Edad Peso Altura Cuello Pecho Abdomen Cadera
## 2.250450 33.509320 1.674591 4.324463 9.460877 11.767073 14.796520
## Muslo Rodilla Tobillo Biceps Antebrazo Muñeca
## 7.777865 4.612147 1.907961 3.619744 2.192492 3.377515
Grasa.m2 <- lm(Grasa ~ . - Peso - Abdomen - Cadera, data = Grasa.data)
vif(Grasa.m2)
## Edad Altura Cuello Pecho Muslo Rodilla Tobillo
## 1.755439 1.328693 3.921041 3.899393 5.309081 4.066763 1.835487
## Biceps Antebrazo Muñeca
## 3.479701 2.142100 3.242098
Validación cruzada
# Conjunto de entrenamiento y de prueba
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
dim(Grasa.data)
## [1] 252 14
set.seed(100)
muestra <- sample.split(Grasa.data, SplitRatio = 0.75)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)
dim(Grasa.train)
## [1] 180 14
dim(Grasa.test)
## [1] 72 14
Grasa.train.m1 <- lm(Grasa ~ . - Peso - Abdomen - Cadera, data = Grasa.train)
mean((predict(Grasa.train.m1, data.frame(Grasa.test)) - Grasa.test[, 1])^2)
## [1] 35.44589
# Suma de cuadrados de los residuales de predicción (PRESS)
Forbes.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Forbes.txt", header = T)
attach(Forbes.data)
Forbes.data
## Temperatura Presion LogPresion
## 1 194.5 20.79 131.79
## 2 194.3 20.79 131.79
## 3 197.9 22.40 135.02
## 4 198.4 22.67 135.55
## 5 199.4 23.15 136.46
## 6 199.9 23.35 136.83
## 7 200.9 23.89 137.82
## 8 201.1 23.99 138.00
## 9 201.4 24.02 138.06
## 10 201.3 24.01 138.04
## 11 203.6 25.14 140.04
## 12 204.6 26.57 142.44
## 13 209.5 28.49 145.47
## 14 208.6 27.76 144.34
## 15 210.7 29.04 146.30
## 16 211.9 29.88 147.54
## 17 212.2 30.06 147.80
Forbes.m1 <- lm(LogPresion ~ Temperatura)
residual.press <- residuals(Forbes.m1)/(1 - lm.influence(Forbes.m1)$hat)
residual.press
## 1 2 3 4 5
## -0.304675322 -0.083288239 -0.067650981 0.024204999 0.039614613
## 6 7 8 9 10
## -0.044828805 0.056900605 0.057775235 -0.165107507 -0.090911465
## 11 12 13 14 15
## -0.153905885 1.452649531 0.002157286 -0.365680092 -0.293122910
## 16 17
## -0.097839552 -0.110215517
PRESS <- sum(residual.press^2)
PRESS
## [1] 2.525852
Best subset selection
library(leaps)
Grasa.bss <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13)
summary(Grasa.bss)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## Edad FALSE FALSE
## Peso FALSE FALSE
## Altura FALSE FALSE
## Cuello FALSE FALSE
## Pecho FALSE FALSE
## Abdomen FALSE FALSE
## Cadera FALSE FALSE
## Muslo FALSE FALSE
## Rodilla FALSE FALSE
## Tobillo FALSE FALSE
## Biceps FALSE FALSE
## Antebrazo FALSE FALSE
## Muñeca FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1 ( 1 ) " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 3 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" " " "*" " " "*" " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " "*" " "
## 7 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " "
## 8 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 10 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## Tobillo Biceps Antebrazo Muñeca
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) " " " " "*" "*"
## 5 ( 1 ) " " " " "*" "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## 9 ( 1 ) " " "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*"
reg.summary <- summary(Grasa.bss)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.6616721 0.7187981 0.7277401 0.7350112 0.7379161 0.7409681 0.7444763
## [8] 0.7465540 0.7477322 0.7484878 0.7489813 0.7490458 0.7490500
plot(reg.summary$rss, xlab = "Número de variables", ylab = "RSS", type = "l"

plot(reg.summary$adjr2, xlab = "Número de variables", ylab = "Adjusted RSq", type = "l"
which.max(reg.summary$adjr2)
## [1] 9
points(9, reg.summary$adjr2[9], col = "red", cex = 2, pch = 20)

plot(reg.summary$cp, xlab = "Número de variables", ylab = "Cp", type = 'l'
which.min(reg.summary$cp)
## [1] 7
points(7, reg.summary$cp[7], col = "red", cex = 2, pch = 20)

which.min(reg.summary$bic)
## [1] 4
plot(reg.summary$bic, xlab = "Número de variables", ylab = "BIC", type = 'l'
points(4, reg.summary$bic[4], col = "red", cex = 2, pch = 20)

plot(Grasa.bss, scale = "r2")

plot(Grasa.bss, scale = "adjr2")

plot(Grasa.bss, scale = "Cp")

plot(Grasa.bss, scale = "bic")

Forward y Backward Stepwise Selection
Grasa.fwd <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13, method = "forward")
summary(Grasa.fwd)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13,
## method = "forward")
## 13 Variables (and intercept)
## Forced in Forced out
## Edad FALSE FALSE
## Peso FALSE FALSE
## Altura FALSE FALSE
## Cuello FALSE FALSE
## Pecho FALSE FALSE
## Abdomen FALSE FALSE
## Cadera FALSE FALSE
## Muslo FALSE FALSE
## Rodilla FALSE FALSE
## Tobillo FALSE FALSE
## Biceps FALSE FALSE
## Antebrazo FALSE FALSE
## Muñeca FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: forward
## Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1 ( 1 ) " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 3 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" " " "*" " " "*" " " " " " "
## 6 ( 1 ) "*" "*" " " "*" " " "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " "
## 8 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 10 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## Tobillo Biceps Antebrazo Muñeca
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) " " " " "*" "*"
## 5 ( 1 ) " " " " "*" "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## 9 ( 1 ) " " "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*"
which.max(summary(Grasa.fwd)$adjr2)
## [1] 9
which.min(summary(Grasa.fwd)$cp)
## [1] 7
which.min(summary(Grasa.fwd)$bic)
## [1] 4
Grasa.bwd <- regsubsets(Grasa ~ ., data = Grasa.data, nvmax = 13, method = "backward")
summary(Grasa.bwd)
## Subset selection object
## Call: regsubsets.formula(Grasa ~ ., data = Grasa.data, nvmax = 13,
## method = "backward")
## 13 Variables (and intercept)
## Forced in Forced out
## Edad FALSE FALSE
## Peso FALSE FALSE
## Altura FALSE FALSE
## Cuello FALSE FALSE
## Pecho FALSE FALSE
## Abdomen FALSE FALSE
## Cadera FALSE FALSE
## Muslo FALSE FALSE
## Rodilla FALSE FALSE
## Tobillo FALSE FALSE
## Biceps FALSE FALSE
## Antebrazo FALSE FALSE
## Muñeca FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: backward
## Edad Peso Altura Cuello Pecho Abdomen Cadera Muslo Rodilla
## 1 ( 1 ) " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 3 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " "*" " "
## 7 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " "
## 8 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 9 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 10 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
## Tobillo Biceps Antebrazo Muñeca
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) " " " " "*" "*"
## 5 ( 1 ) " " " " "*" "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## 9 ( 1 ) " " "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*"
which.max(summary(Grasa.bwd)$adjr2)
## [1] 9
which.min(summary(Grasa.bwd)$cp)
## [1] 7
which.min(summary(Grasa.bwd)$bic)
## [1] 4
coef(Grasa.bss, 7)
## (Intercept) Edad Peso Cuello Abdomen Muslo
## -33.2579912 0.0681658 -0.1194405 -0.4038021 0.9178850 0.2219598
## Antebrazo Muñeca
## 0.5531394 -1.5324011
coef(Grasa.fwd, 7)
## (Intercept) Edad Peso Cuello Abdomen Muslo
## -33.2579912 0.0681658 -0.1194405 -0.4038021 0.9178850 0.2219598
## Antebrazo Muñeca
## 0.5531394 -1.5324011
coef(Grasa.bwd, 7)
## (Intercept) Edad Peso Cuello Abdomen Muslo
## -33.2579912 0.0681658 -0.1194405 -0.4038021 0.9178850 0.2219598
## Antebrazo Muñeca
## 0.5531394 -1.5324011
set.seed(150)
muestra <- sample.split(Grasa.data, SplitRatio = 0.70)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)
Grasa.train.m9 <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Cadera + Muslo + Biceps + Antebrazo + Muñeca, data = Grasa.train
ETTm9 <- mean((Grasa.test[, 1] - predict(Grasa.train.m9, data.frame(Grasa.test)))^2)
ETTm9
## [1] 22.21635
Grasa.train.m7 <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Muslo + Antebrazo + Muñeca, data = Grasa.train
ETTm7 <- mean((Grasa.test[, 1] - predict(Grasa.train.m7, data.frame(Grasa.test)))^2)
ETTm7
## [1] 22.52712
Grasa.train.m4 <- lm(Grasa ~ Peso + Abdomen + Antebrazo + Muñeca, data = Grasa.train
ETTm4 <- mean((Grasa.test[, 1] - predict(Grasa.train.m4, data.frame(Grasa.test)))^2)
ETTm4
## [1] 23.02023
set.seed(80)
muestra <- sample.split(Grasa.data, SplitRatio = 0.70)
Grasa.train <- subset(Grasa.data, muestra == TRUE)
Grasa.test <- subset(Grasa.data, muestra == FALSE)
ETTm9 <- mean((Grasa.test[, 1] - predict(Grasa.train.m9, data.frame(Grasa.test)))^2)
ETTm9
## [1] 16.78038
ETTm7 <- mean((Grasa.test[, 1] - predict(Grasa.train.m7, data.frame(Grasa.test)))^2)
ETTm7
## [1] 16.81096
ETTm4 <- mean((Grasa.test[, 1] - predict(Grasa.train.m4, data.frame(Grasa.test)))^2)
ETTm4
## [1] 16.70344
Validación Cruzada 10
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
set.seed(500)
folds <- createFolds(Grasa.data[,1], k = 10, list = T, returnTrain = FALSE)
ETT <- rep(0, times = 10)
for (i in 1:10)
{
Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
Grasa.train.fit <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Cadera + Muslo + Biceps + Antebrazo + Muñeca, data = Grasa.train)
ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}
ETT
## [1] 18.59544 19.59803 14.34408 17.63973 17.27257 16.66673 30.15685
## [8] 22.02368 18.83509 16.31175
EVC10.m9 <- mean(ETT)
EVC10.m9
## [1] 19.1444
ETT <- rep(0, times = 10)
for (i in 1:10)
{
Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
Grasa.train.fit <- lm(Grasa ~ Edad + Peso + Cuello + Abdomen + Muslo + Antebrazo + Muñeca, data = Grasa.train)
ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}
ETT
## [1] 19.22187 19.12905 14.85992 17.31363 17.11515 17.39918 30.67429
## [8] 19.77790 19.71245 16.84276
EVC10.m7 <- mean(ETT)
EVC10.m7
## [1] 19.20462
ETT <- rep(0, times = 10)
for (i in 1:10)
{
Grasa.train <- Grasa.data[as.numeric(-unlist(folds[i])), ]
Grasa.test <- Grasa.data[as.numeric(unlist(folds[i])), ]
Grasa.train.fit <- lm(Grasa ~ Peso + Abdomen + Antebrazo + Muñeca, data = Grasa.train)
ETT[i] <- mean((Grasa.test[, 1] - predict(Grasa.train.fit, data.frame(Grasa.test)))^2)
}
ETT
## [1] 19.18556 18.30235 15.91159 16.44879 16.46345 18.69414 32.40676
## [8] 17.97621 18.22337 18.58063
EVC10.m4 <- mean(ETT)
EVC10.m4
## [1] 19.21928