Clasificación

Regresión logística

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.1.1
attach(Default)
head(Default)
##   default student balance income
## 1      No      No   729.5  44362
## 2      No     Yes   817.2  12106
## 3      No      No  1073.5  31767
## 4      No      No   529.3  35704
## 5      No      No   785.7  38463
## 6      No     Yes   919.6   7492
plot(balance,income,type="n")
points(balance[default=="No"],income[default=="No"],col="blue")
points(balance[default=="Yes"],income[default=="Yes"],col="red")

plot of chunk unnamed-chunk-1

par(mfrow=c(1,2))
boxplot(balance ~ default, xlab="Default", ylab="Balance", col=c("blue","red"))
boxplot(income ~ default, xlab="Default", ylab="Income", col=c("blue","red"))

plot of chunk unnamed-chunk-1

m1 <- glm(default ~ balance, family=binomial(link=logit))
summary(m1)
## 
## Call:
## glm(formula = default ~ balance, family = binomial(link = logit))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.270  -0.146  -0.059  -0.022   3.759  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.65133    0.36116   -29.5   <2e-16 ***
## balance       0.00550    0.00022    24.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600
## 
## Number of Fisher Scoring iterations: 8
predict(m1, data.frame(balance=1000), type="response")
##        1 
## 0.005752
predict(m1, data.frame(balance=2000), type="response")
##      1 
## 0.5858
m2 <- glm(default ~ student, family=binomial(link=logit))
summary(m2)
## 
## Call:
## glm(formula = default ~ student, family = binomial(link = logit))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.297  -0.297  -0.243  -0.243   2.659  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.5041     0.0707  -49.55  < 2e-16 ***
## studentYes    0.4049     0.1150    3.52  0.00043 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2913
## 
## Number of Fisher Scoring iterations: 6
predict(m2, data.frame(student="Yes"), type="response")
##       1 
## 0.04314
predict(m2, data.frame(student="No"), type="response")
##      1 
## 0.0292
m3 <- glm(default ~ balance + income + student, family=binomial(link=logit))
summary(m3)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(link = logit))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.469  -0.142  -0.056  -0.020   3.738  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.09e+01   4.92e-01  -22.08   <2e-16 ***
## balance      5.74e-03   2.32e-04   24.74   <2e-16 ***
## income       3.03e-06   8.20e-06    0.37   0.7115    
## studentYes  -6.47e-01   2.36e-01   -2.74   0.0062 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1580
## 
## Number of Fisher Scoring iterations: 8
m4 <- glm(default ~ balance + student, family=binomial(link=logit))
m4.probs <- predict(m4, type="response")
m4.pred <- ifelse(m4.probs>=0.5, "Yes", "No")
table(m4.pred, default)
##        default
## m4.pred   No  Yes
##     No  9628  228
##     Yes   39  105

Smarket data

data(Smarket)
attach(Smarket)
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010  1.191  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055  1.296  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624  1.411 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192  1.276  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381  1.206  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959  1.349  1.392        Up
log.m1 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume ,family=binomial(link=logit))
log.m1.probs <- predict(log.m1, type="response") 
log.m1.pred <- ifelse(log.m1.probs>0.5, "Up", "Down")
table(log.m1.pred,Direction)
##            Direction
## log.m1.pred Down  Up
##        Down  145 141
##        Up    457 507
mean(log.m1.pred==Direction)
## [1] 0.5216

Data de entrenamiento y de prueba

entrenamiento <- Year < 2005
log.m2 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family=binomial, subset=entrenamiento)
log.m2.probs <- predict(log.m2, newdata=Smarket[!entrenamiento,], type="response") 
log.m2.pred <- ifelse(log.m2.probs>0.5, "Up", "Down")
Direction.2005 <- Smarket$Direction[!entrenamiento]
table(log.m2.pred, Direction.2005)
##            Direction.2005
## log.m2.pred Down Up
##        Down   77 97
##        Up     34 44
mean(log.m2.pred==Direction.2005)
## [1] 0.4802

Analisis Discriminante Lineal

library(MASS)
lda.m1 <- lda(Direction ~ Lag1 + Lag2, subset=Year<2005)
lda.m1
## Call:
## lda(Direction ~ Lag1 + Lag2, subset = Year < 2005)
## 
## Prior probabilities of groups:
##  Down    Up 
## 0.492 0.508 
## 
## Group means:
##          Lag1     Lag2
## Down  0.04279  0.03389
## Up   -0.03955 -0.03133
## 
## Coefficients of linear discriminants:
##          LD1
## Lag1 -0.6420
## Lag2 -0.5135
plot(lda.m1)

plot of chunk unnamed-chunk-4

Smarket.2005 <- subset(Smarket, Year==2005)
lda.m1.pred <- predict(lda.m1, Smarket.2005)
table(lda.m1.pred$class, Smarket.2005$Direction)
##       
##        Down  Up
##   Down   35  35
##   Up     76 106
mean(lda.m1.pred$class==Smarket.2005$Direction)
## [1] 0.5595

Analisis Discriminante Cuadratico

library(MASS)
qda.m1 <- qda(Direction ~ Lag1 + Lag2, subset=Year<2005)
qda.m1
## Call:
## qda(Direction ~ Lag1 + Lag2, subset = Year < 2005)
## 
## Prior probabilities of groups:
##  Down    Up 
## 0.492 0.508 
## 
## Group means:
##          Lag1     Lag2
## Down  0.04279  0.03389
## Up   -0.03955 -0.03133
qda.m1.pred <- predict(qda.m1, Smarket.2005)
table(qda.m1.pred$class, Smarket.2005$Direction)
##       
##        Down  Up
##   Down   30  20
##   Up     81 121
mean(qda.m1.pred$class==Smarket.2005$Direction)
## [1] 0.5992