Capítulo 8: Árboles
Árboles de clasificación
# Fit a classification tree predicting High (whether Sales > 8) from every
# other variable in the Carseats data.
library(tree)
## Warning: package 'tree' was built under R version 3.1.2
library(ISLR)
# NOTE(review): the response is wrapped in factor() because current versions
# of tree() require a factor (not character) response for classification;
# older versions coerced silently. attach() is avoided — the column is
# referenced explicitly instead of being put on the search path.
High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))
Carseats <- data.frame(Carseats, High)
# "High ~ . - Sales" excludes Sales itself, since High is derived from it.
tree.carseats <- tree(High ~ . - Sales, Carseats)
summary(tree.carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.458 = 171 / 373
## Misclassification error rate: 0.09 = 36 / 400
# Draw the unpruned tree and label its splits (pretty = 0 prints factor
# level names in full); printing the fitted object then lists every node
# with its split rule, size, deviance and class probabilities.
plot(tree.carseats)
text(tree.carseats, pretty = 0)
tree.carseats
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 400 500 No ( 0.59 0.41 )
## 2) ShelveLoc: Bad,Medium 315 400 No ( 0.69 0.31 )
## 4) Price < 92.5 46 60 Yes ( 0.30 0.70 )
## 8) Income < 57 10 10 No ( 0.70 0.30 )
## 16) CompPrice < 110.5 5 0 No ( 1.00 0.00 ) *
## 17) CompPrice > 110.5 5 7 Yes ( 0.40 0.60 ) *
## 9) Income > 57 36 40 Yes ( 0.19 0.81 )
## 18) Population < 207.5 16 20 Yes ( 0.38 0.62 ) *
## 19) Population > 207.5 20 8 Yes ( 0.05 0.95 ) *
## 5) Price > 92.5 269 300 No ( 0.75 0.25 )
## 10) Advertising < 13.5 224 200 No ( 0.82 0.18 )
## 20) CompPrice < 124.5 96 40 No ( 0.94 0.06 )
## 40) Price < 106.5 38 30 No ( 0.84 0.16 )
## 80) Population < 177 12 20 No ( 0.58 0.42 )
## 160) Income < 60.5 6 0 No ( 1.00 0.00 ) *
## 161) Income > 60.5 6 5 Yes ( 0.17 0.83 ) *
## 81) Population > 177 26 8 No ( 0.96 0.04 ) *
## 41) Price > 106.5 58 0 No ( 1.00 0.00 ) *
## 21) CompPrice > 124.5 128 200 No ( 0.73 0.27 )
## 42) Price < 122.5 51 70 Yes ( 0.49 0.51 )
## 84) ShelveLoc: Bad 11 7 No ( 0.91 0.09 ) *
## 85) ShelveLoc: Medium 40 50 Yes ( 0.38 0.62 )
## 170) Price < 109.5 16 7 Yes ( 0.06 0.94 ) *
## 171) Price > 109.5 24 30 No ( 0.58 0.42 )
## 342) Age < 49.5 13 20 Yes ( 0.31 0.69 ) *
## 343) Age > 49.5 11 7 No ( 0.91 0.09 ) *
## 43) Price > 122.5 77 60 No ( 0.88 0.12 )
## 86) CompPrice < 147.5 58 20 No ( 0.97 0.03 ) *
## 87) CompPrice > 147.5 19 30 No ( 0.63 0.37 )
## 174) Price < 147 12 20 Yes ( 0.42 0.58 )
## 348) CompPrice < 152.5 7 6 Yes ( 0.14 0.86 ) *
## 349) CompPrice > 152.5 5 5 No ( 0.80 0.20 ) *
## 175) Price > 147 7 0 No ( 1.00 0.00 ) *
## 11) Advertising > 13.5 45 60 Yes ( 0.44 0.56 )
## 22) Age < 54.5 25 30 Yes ( 0.20 0.80 )
## 44) CompPrice < 130.5 14 20 Yes ( 0.36 0.64 )
## 88) Income < 100 9 10 No ( 0.56 0.44 ) *
## 89) Income > 100 5 0 Yes ( 0.00 1.00 ) *
## 45) CompPrice > 130.5 11 0 Yes ( 0.00 1.00 ) *
## 23) Age > 54.5 20 20 No ( 0.75 0.25 )
## 46) CompPrice < 122.5 10 0 No ( 1.00 0.00 ) *
## 47) CompPrice > 122.5 10 10 No ( 0.50 0.50 )
## 94) Price < 125 5 0 Yes ( 0.00 1.00 ) *
## 95) Price > 125 5 0 No ( 1.00 0.00 ) *
## 3) ShelveLoc: Good 85 90 Yes ( 0.22 0.78 )
## 6) Price < 135 68 50 Yes ( 0.12 0.88 )
## 12) US: No 17 20 Yes ( 0.35 0.65 )
## 24) Price < 109 8 0 Yes ( 0.00 1.00 ) *
## 25) Price > 109 9 10 No ( 0.67 0.33 ) *
## 13) US: Yes 51 20 Yes ( 0.04 0.96 ) *
## 7) Price > 135 17 20 No ( 0.65 0.35 )
## 14) Income < 46 6 0 No ( 1.00 0.00 ) *
## 15) Income > 46 11 20 Yes ( 0.45 0.55 ) *
# Honest assessment: split into 200 training / 200 test observations,
# refit the tree on the training half, and classify the held-out half.
set.seed(2)
train <- sample(seq_len(nrow(Carseats)), 200)
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]
tree.carseats <- tree(High ~ . - Sales, Carseats, subset = train)
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 86 27
## Yes 30 57
# Test accuracy computed from the predictions themselves, rather than
# hard-coding the confusion-matrix counts as (86+57)/200.
mean(tree.pred == High.test)
## [1] 0.715
# Cross-validate the cost-complexity pruning sequence; FUN = prune.misclass
# makes the misclassification rate (not the default deviance) guide pruning.
set.seed(3)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
## [1] "size" "dev" "k" "method"
cv.carseats
## $size
## [1] 19 17 14 13 9 7 3 2 1
##
## $dev
## [1] 55 55 53 52 50 56 69 65 80
##
## $k
## [1] -Inf 0.0000 0.6667 1.0000 1.7500 2.0000 4.2500 5.0000 23.0000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# CV error against tree size and against the penalty k, side by side.
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
# Prune to the size chosen by cross-validation (9 terminal nodes, the
# minimum of $dev above) and re-evaluate on the held-out test set.
prune.carseats <- prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 94 24
## Yes 22 60
# Accuracy from the predictions rather than hard-coded counts (94+60)/200.
mean(tree.pred == High.test)
## [1] 0.77
# A larger pruned tree (15 terminal nodes) gives lower test accuracy,
# consistent with the cross-validation curve.
prune.carseats <- prune.misclass(tree.carseats, best = 15)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
tree.pred <- predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 86 22
## Yes 30 62
# Accuracy from the predictions rather than hard-coded counts (86+62)/200.
mean(tree.pred == High.test)
## [1] 0.74
Árboles de regresión
# Regression tree on the Boston housing data: predict medv from all other
# variables, fitting on a random half of the observations.
library(MASS)
set.seed(1)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
tree.boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
##
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "lstat" "rm" "dis"
## Number of terminal nodes: 8
## Residual mean deviance: 12.6 = 3100 / 245
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.100 -2.040 -0.054 0.000 1.960 12.600
plot(tree.boston)
text(tree.boston, pretty = 0)
# Cross-validate the pruning sequence (default criterion: deviance) and
# plot CV error against tree size.
cv.boston <- cv.tree(tree.boston)
plot(cv.boston$size, cv.boston$dev, type = "b")
# Compare the full tree with prunings of size 4..8 in a 2x3 grid. The five
# copy-pasted prune/plot/title triples are collapsed into a loop; only the
# size-5 tree is bound to a name, because it is re-plotted below.
par(mfrow = c(2, 3))
plot(tree.boston)
title("Arbol completo")
for (b in 4:8) {
  pruned <- prune.tree(tree.boston, best = b)
  plot(pruned)
  title(paste0("best=", b))
}
par(mfrow = c(1, 1))
prune.boston5 <- prune.tree(tree.boston, best = 5)
plot(prune.boston5)
text(prune.boston5, pretty = 0)
# Test-set MSE of the UNPRUNED tree on the held-out half of Boston.
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]
plot(boston.test, yhat)
mean((yhat - boston.test)^2)
## [1] 25.05
Bagging y Random Forest
# Bagging is a random forest in which mtry equals the number of predictors,
# so every split considers all of them.
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.2
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
# Derive the predictor count from the data instead of hard-coding 13
# (Boston has 13 columns besides the response medv).
p <- ncol(Boston) - 1
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = p, importance = TRUE)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 11.03
## % Var explained: 86.65
# Test-set MSE of the bagged model; abline(0, 1) marks perfect prediction.
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
mean((yhat.bag - boston.test)^2)
## [1] 13.47
# Far fewer trees (25 instead of the default 500): nearly the same error.
bag.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                           mtry = p, ntree = 25)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 13.43
# Random forest proper: decorrelate the trees by trying only 6 of the
# predictors at each split, which lowers the test MSE further.
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 6, importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 11.48
# Variable importance: %IncMSE (permutation-based) and IncNodePurity
# (total decrease in node impurity from splits on each variable).
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 12.548 1094.65
## zn 1.375 64.40
## indus 9.304 1086.09
## chas 2.519 76.37
## nox 12.836 1008.74
## rm 31.646 6705.03
## age 9.970 575.14
## dis 12.774 1351.02
## rad 3.912 93.78
## tax 7.624 453.19
## ptratio 12.008 919.07
## black 7.376 358.97
## lstat 27.667 6927.98
varImpPlot(rf.boston)