Capítulo 6: Transformaciones

Diagramas de dispersión

Mamiferos.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Mamiferos.txt", header = T)
head(Mamiferos.data)
##   SWS  PS   TS PesoCuerpo PesoCerebro Life  GP P SE D
## 1  NA  NA  3.3   6654.000      5712.0 38.6 645 3  5 3
## 2 6.3 2.0  8.3      1.000         6.6  4.5  42 3  1 3
## 3  NA  NA 12.5      3.385        44.5 14.0  60 1  1 1
## 4  NA  NA 16.5      0.920         5.7   NA  25 5  2 3
## 5 2.1 1.8  3.9   2547.000      4603.0 69.0 624 3  5 4
## 6 9.1 0.7  9.8     10.550       179.5 27.0 180 4  4 4
##                     Especie
## 1          African_elephant
## 2 African_giant_pouched_rat
## 3                Arctic_Fox
## 4    Arctic_ground_squirrel
## 5            Asian_elephant
## 6                    Baboon
attach(Mamiferos.data)
plot(PesoCuerpo, PesoCerebro, xlab = "Peso del cuerpo (kg)", ylab = "Peso del cerebro (g)")
text(4400, 5750, "African_elephant", adj = 0)
text(1600, 4300, "Asian_elephant", adj = 0)
text(300, 1400, "Human", adj = 0)

Transformaciones potencia

par(mfrow = c(2, 2), mai = c(0.6, 0.6, 0.1, 0.1), mgp = c(2, 1, 0), cex.lab = 1.0, cex = 0.6)

PesoCuerpo1 <- 1/PesoCuerpo 
PesoCerebro1 <- 1/PesoCerebro 
Mamiferos.m1 <- lm(PesoCerebro1 ~ PesoCuerpo1) 
plot(PesoCuerpo1, PesoCerebro1, xlab = expression(paste("(a)  ",PesoCuerpo^-1)), ylab = expression(PesoCerebro^-1)) 
abline(Mamiferos.m1) 
lines(lowess(PesoCerebro1 ~ PesoCuerpo1, f = 2/3, iter = 1), lty = 2, col = "red")

PesoCuerpo2 <- log(PesoCuerpo) 
PesoCerebro2 <- log(PesoCerebro) 
Mamiferos.m2 <- lm(PesoCerebro2 ~ PesoCuerpo2) 
plot(PesoCuerpo2, PesoCerebro2, xlab = expression(paste("(b)  ",log[e](PesoCuerpo))), ylab = expression(log[e](PesoCerebro))) 
abline(Mamiferos.m2) 
lines(lowess(PesoCerebro2 ~ PesoCuerpo2, f = 2/3, iter = 1), lty = 2, col ="red")

PesoCuerpo3 <- (PesoCuerpo)^(1/3) 
PesoCerebro3 <- (PesoCerebro)^(1/3)
Mamiferos.m3 <- lm(PesoCerebro3 ~ PesoCuerpo3) 
plot(PesoCuerpo3, PesoCerebro3, xlab = expression(paste("(c)  ",PesoCuerpo^0.33)), ylab = expression(PesoCerebro^0.33))
abline(Mamiferos.m3)
lines(lowess(PesoCerebro3 ~ PesoCuerpo3, f = 2/3, iter = 1), lty = 2, col = "red")

PesoCuerpo4 <- (PesoCuerpo)^(1/2)
PesoCerebro4 <- (PesoCerebro)^(1/2) 
Mamiferos.m4 <- lm(PesoCerebro4 ~ PesoCuerpo4) 
plot(PesoCuerpo4, PesoCerebro4, xlab = expression(paste("(d)  ",PesoCuerpo^.5)), ylab = expression(PesoCerebro^.5)) 
abline(Mamiferos.m4) 
lines(lowess(PesoCerebro4 ~ PesoCuerpo4, f = 2/3, iter = 1), lty = 2, col = "red")

Transformando solo la variable predictora

Arboles.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Arboles.txt", header = T)
head(Arboles.data)
##   Plot Tree Species Dbh Altura
## 1    3    5      WC 360    207
## 2    3    8      WC 380    225
## 3    4    1      WC 460    180
## 4    6    2      WC 290    220
## 5    6    3      WC 294    320
## 6    6    4      WC 685    260
attach(Arboles.data)
library(alr3)
## Warning: package 'alr3' was built under R version 3.3.3
## Loading required package: car
nuevo <- seq(min(Dbh), max(Dbh), length = 100) 
plot(Dbh, Altura, xlab = "X = Dhn", ylab = "Y = Altura", cex = 0.7)

Arboles.m1 <- lm(Altura ~ bcPower(Dbh, lambda = 1))           
lines(nuevo, predict(Arboles.m1, data.frame(Dbh = nuevo)), col = "blue")
Arboles.m2 <- lm(Altura ~ bcPower(Dbh, lambda = 0))           
lines(nuevo, predict(Arboles.m2, data.frame(Dbh = nuevo)), col ="red")
Arboles.m3 <- lm(Altura ~ bcPower(Dbh, lambda = -1))           
lines(nuevo, predict(Arboles.m3, data.frame(Dbh = nuevo)), col ="purple")

legend(940, 200, legend = c(1, 0, -1), cex = 0.75, lty = 1, col = c("blue", "red", "purple"), xjust = 1, yjust = 1)

summary(Arboles.m1)
## 
## Call:
## lm(formula = Altura ~ bcPower(Dbh, lambda = 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.693 -29.467   0.713  28.959 115.237 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              111.34002    7.44300   14.96   <2e-16 ***
## bcPower(Dbh, lambda = 1)   0.31885    0.01736   18.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.61 on 137 degrees of freedom
## Multiple R-squared:  0.7113, Adjusted R-squared:  0.7091 
## F-statistic: 337.5 on 1 and 137 DF,  p-value: < 2.2e-16
summary(Arboles.m2)
## 
## Call:
## lm(formula = Altura ~ bcPower(Dbh, lambda = 0))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.485 -20.046   3.652  22.586 104.017 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -463.314     32.438  -14.28   <2e-16 ***
## bcPower(Dbh, lambda = 0)  119.519      5.532   21.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.33 on 137 degrees of freedom
## Multiple R-squared:  0.7731, Adjusted R-squared:  0.7715 
## F-statistic: 466.8 on 1 and 137 DF,  p-value: < 2.2e-16
plot(logb(Dbh,2), Altura, ylab = "Y = Altura", xlab = expression(log[2](Dbh))) 
abline(lm(Altura ~ logb(Dbh, 2)), col = "red")

Transformación de Box y Cox

data(cars)
attach(cars)
plot(speed, dist)

cars.m1 <- lm(dist ~ speed)
library(nortest)
ad.test(resid(cars.m1))
## 
##  Anderson-Darling normality test
## 
## data:  resid(cars.m1)
## A = 0.79406, p-value = 0.0369
#Librería alr3
boxCox(cars.m1, lambda = seq(0, 1, by = 0.1))

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:alr3':
## 
##     forbes
boxCox(dist ~ speed, lambda = seq(0, 1, by = 0.1), data = cars)
## Warning in plot.window(...): "data" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not
## a graphical parameter
## Warning in box(...): "data" is not a graphical parameter
## Warning in title(...): "data" is not a graphical parameter

dist1 <- sqrt(dist)
cars.m2 <- lm(dist1 ~ speed)
ad.test(resid(cars.m2))
## 
##  Anderson-Darling normality test
## 
## data:  resid(cars.m2)
## A = 0.39752, p-value = 0.3551

Transformación y matrices de dispersión

Carreteras.data <- read.table(file = "http://tarwi.lamolina.edu.pe/~clopez/Regresion/Carreteras.txt", header = T)
attach(Carreteras.data)
pairs(Tasa ~ Longitud + TDP + Volumen + Borde + Señales)

Carreteras.m1 <- lm(Tasa ~ Longitud + TDP + Volumen + Borde + Señales)
summary(Carreteras.m1)
## 
## Call:
## lm(formula = Tasa ~ Longitud + TDP + Volumen + Borde + Señales)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.203 -1.122 -0.078  0.796  3.777 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.9264883  1.2617697   6.282 4.22e-07 ***
## Longitud    -0.0764574  0.0355275  -2.152  0.03880 *  
## TDP         -0.0007746  0.0143184  -0.054  0.95719    
## Volumen     -0.1859878  0.1182841  -1.572  0.12540    
## Borde       -0.2418428  0.0859140  -2.815  0.00816 ** 
## Señales      1.0109635  0.4143986   2.440  0.02024 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.39 on 33 degrees of freedom
## Multiple R-squared:  0.5746, Adjusted R-squared:  0.5101 
## F-statistic: 8.914 on 5 and 33 DF,  p-value: 1.958e-05

Selección automática de las transformaciones

Señales1 <- (Señales*Longitud + 1)/Longitud
Transf <- powerTransform(cbind(Longitud, TDP, Volumen, Borde, Señales1) ~ 1)
summary(Transf)
## bcPower Transformations to Multinormality 
##          Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## Longitud    0.1437   0.2127          -0.2732           0.5607
## TDP         0.0509   0.1206          -0.1854           0.2872
## Volumen    -0.7028   0.6177          -1.9134           0.5078
## Borde       1.3456   0.3630           0.6341           2.0570
## Señales1   -0.2408   0.1496          -0.5341           0.0525
## 
## Likelihood ratio tests about transformation parameters
##                                      LRT df         pval
## LR test, lambda = (0 0 0 0 0)  23.324467  5 0.0002926014
## LR test, lambda = (1 1 1 1 1) 132.857421  5 0.0000000000
## LR test, lambda = (0 0 0 1 0)   6.088599  5 0.2976930877
pairs(Tasa ~ log2(Longitud) + log2(TDP) + log2(Volumen) + Borde + log2(Señales1))

Carreteras.m2 <- lm(Tasa ~ log2(Longitud) + log2(TDP) + log2(Volumen) + Borde + log2(Señales1))
summary(Carreteras.m2)
## 
## Call:
## lm(formula = Tasa ~ log2(Longitud) + log2(TDP) + log2(Volumen) + 
##     Borde + log2(Señales1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1607 -1.0318 -0.0185  0.6674  3.2916 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    13.39210    2.19083   6.113 6.93e-07 ***
## log2(Longitud) -0.80907    0.30481  -2.654  0.01213 *  
## log2(TDP)      -0.13838    0.18381  -0.753  0.45686    
## log2(Volumen)  -1.20875    0.72174  -1.675  0.10343    
## Borde          -0.23129    0.08356  -2.768  0.00918 ** 
## log2(Señales1)  0.35568    0.16177   2.199  0.03501 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.267 on 33 degrees of freedom
## Multiple R-squared:  0.6467, Adjusted R-squared:  0.5931 
## F-statistic: 12.08 on 5 and 33 DF,  p-value: 1.074e-06
AIC(Carreteras.m1)
## [1] 143.8508
AIC(Carreteras.m2)
## [1] 136.6083

Transformando la variable respuesta

invTranPlot(Tasa, predict(Carreteras.m2), ylab = "Valores estimados")

##       lambda      RSS
## 1  0.1197427 32.32520
## 2 -1.0000000 35.70143
## 3  0.0000000 32.36532
## 4  1.0000000 34.24662
boxCox(Carreteras.m2)

Carreteras.m3 <- lm(log2(Tasa) ~ log2(Longitud) + log2(TDP) + log2(Volumen) + Borde + log2(Señales1))
summary(Carreteras.m3)
## 
## Call:
## lm(formula = log2(Tasa) ~ log2(Longitud) + log2(TDP) + log2(Volumen) + 
##     Borde + log2(Señales1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68076 -0.24836 -0.08724  0.30347  0.86489 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.02094    0.71522   7.020 4.96e-08 ***
## log2(Longitud) -0.28734    0.09951  -2.888  0.00680 ** 
## log2(TDP)      -0.06182    0.06001  -1.030  0.31040    
## log2(Volumen)  -0.37603    0.23562  -1.596  0.12004    
## Borde          -0.07761    0.02728  -2.845  0.00757 ** 
## log2(Señales1)  0.12547    0.05281   2.376  0.02347 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4136 on 33 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.6166 
## F-statistic: 13.22 on 5 and 33 DF,  p-value: 4.209e-07
AIC(Carreteras.m3)
## [1] 49.2915