rm(list = ls()) # clear variables
graphics.off() # close graphs
library(MASS) # library with many useful statistical functions
library(leaps) # may need to install (click Tools/Install Packages and type leaps). Includes functions for Model selection algorithms (regsubsets)
data<-read.csv("statedata.csv")
pairs(data)

round(cor(data, use="everything"),2)
## Life.Exp Population Income Illiteracy Murder HS.Grad Frost Area
## Life.Exp 1.00 -0.07 0.34 -0.59 -0.78 0.58 0.26 -0.11
## Population -0.07 1.00 0.21 0.11 0.34 -0.10 -0.33 0.02
## Income 0.34 0.21 1.00 -0.44 -0.23 0.62 0.23 0.36
## Illiteracy -0.59 0.11 -0.44 1.00 0.70 -0.66 -0.67 0.08
## Murder -0.78 0.34 -0.23 0.70 1.00 -0.49 -0.54 0.23
## HS.Grad 0.58 -0.10 0.62 -0.66 -0.49 1.00 0.37 0.33
## Frost 0.26 -0.33 0.23 -0.67 -0.54 0.37 1.00 0.06
## Area -0.11 0.02 0.36 0.08 0.23 0.33 0.06 1.00
lm.full=lm(Life.Exp~.,data)
summary(lm.full)
##
## Call:
## lm(formula = Life.Exp ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+01 1.748e+00 40.586 < 2e-16 ***
## Population 5.180e-05 2.919e-05 1.775 0.0832 .
## Income -2.180e-05 2.444e-04 -0.089 0.9293
## Illiteracy 3.382e-02 3.663e-01 0.092 0.9269
## Murder -3.011e-01 4.662e-02 -6.459 8.68e-08 ***
## HS.Grad 4.893e-02 2.332e-02 2.098 0.0420 *
## Frost -5.735e-03 3.143e-03 -1.825 0.0752 .
## Area -7.383e-08 1.668e-06 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 2.534e-10
lm.red=lm(Life.Exp~.-HS.Grad-Murder, data)
summary(lm.red)
##
## Call:
## lm(formula = Life.Exp ~ . - HS.Grad - Murder, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.60365 -0.70832 -0.05274 0.77479 2.69988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.195e+01 1.845e+00 38.994 < 2e-16 ***
## Population -3.413e-05 3.923e-05 -0.870 0.388982
## Income 3.607e-04 3.339e-04 1.080 0.285940
## Illiteracy -1.474e+00 4.035e-01 -3.652 0.000688 ***
## Frost -6.632e-03 4.427e-03 -1.498 0.141299
## Area -1.540e-06 2.109e-06 -0.730 0.469205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.097 on 44 degrees of freedom
## Multiple R-squared: 0.4009, Adjusted R-squared: 0.3328
## F-statistic: 5.888 on 5 and 44 DF, p-value: 0.0003004
anova(lm.red,lm.full)
## Analysis of Variance Table
##
## Model 1: Life.Exp ~ (Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area) - HS.Grad - Murder
## Model 2: Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 44 52.904
## 2 42 23.297 2 29.607 26.688 3.312e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
names(lm.full)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
residuals.state=lm.full$residuals #pairnoume ta ypoloipa apo to lm.full
qqnorm(residuals.state)
qqline(residuals.state)

plot(lm.full$fitted.values,residuals.state)

bss=regsubsets(Life.Exp~., data, nvmax=7)
bss.summary=summary(bss)
summary(bss)
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, nvmax = 7)
## 7 Variables (and intercept)
## Forced in Forced out
## Population FALSE FALSE
## Income FALSE FALSE
## Illiteracy FALSE FALSE
## Murder FALSE FALSE
## HS.Grad FALSE FALSE
## Frost FALSE FALSE
## Area FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
## Population Income Illiteracy Murder HS.Grad Frost Area
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " "
## 3 ( 1 ) " " " " " " "*" "*" "*" " "
## 4 ( 1 ) "*" " " " " "*" "*" "*" " "
## 5 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
names(bss.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow=c(2,2))
plot(bss.summary$rss, xlab="Number of Variables", ylab="SSres", type="l")
K0=which.min(bss.summary$rss)
points(K0, bss.summary$rss[K0], col="red",cex=2,pch=20)
plot(bss.summary$adjr2,xlab="Number of Variables", ylab="Adjusted RSq", type="l")
K1=which.max(bss.summary$adjr2)
points(K1, bss.summary$adjr2[K1], col="red",cex=2,pch=20)
plot(bss.summary$cp,xlab="Number of Variables", ylab="C_p", type="l")
K2=which.min(bss.summary$cp)
points(K2, bss.summary$cp[K2], col="red",cex=2,pch=20)
plot(bss.summary$bic,xlab="Number of Variables", ylab="BIC", type="l")
K3=which.min(bss.summary$bic)
points(K3, bss.summary$bic[K3], col="red",cex=2,pch=20)

par(mfrow=c(1,3))
plot(bss, scale = "Cp")
title("Cp")
plot(bss, scale = "adjr2")
title("AdjR2")
plot(bss, scale = "bic")
title("BIC")

coef(bss,4)
## (Intercept) Population Murder HS.Grad Frost
## 7.102713e+01 5.013998e-05 -3.001488e-01 4.658225e-02 -5.943290e-03
fs=regsubsets(Life.Exp~.,data, method="forward",nvmax=7)
fs.summary=summary(fs)
summary(fs)
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, method = "forward", nvmax = 7)
## 7 Variables (and intercept)
## Forced in Forced out
## Population FALSE FALSE
## Income FALSE FALSE
## Illiteracy FALSE FALSE
## Murder FALSE FALSE
## HS.Grad FALSE FALSE
## Frost FALSE FALSE
## Area FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: forward
## Population Income Illiteracy Murder HS.Grad Frost Area
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " "
## 3 ( 1 ) " " " " " " "*" "*" "*" " "
## 4 ( 1 ) "*" " " " " "*" "*" "*" " "
## 5 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
names(fs.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow=c(2,2))
plot(fs.summary$rss, xlab="Number of Variables", ylab="SSE", type="l")
K0=which.min(fs.summary$rss)
points(K0, fs.summary$rss[K0], col="red",cex=2,pch=20)
plot(fs.summary$adjr2,xlab="Number of Variables", ylab="Adjusted RSq", type="l")
K1=which.max(fs.summary$adjr2)
points(K1, fs.summary$adjr2[K1], col="red",cex=2,pch=20)
plot(fs.summary$cp,xlab="Number of Variables", ylab="C_p", type="l")
K2=which.min(fs.summary$cp)
points(K2, fs.summary$cp[K2], col="red",cex=2,pch=20)
plot(fs.summary$bic,xlab="Number of Variables", ylab="BIC", type="l")
K3=which.min(fs.summary$bic)
points(K3, fs.summary$bic[K3], col="red",cex=2,pch=20)

par(mfrow=c(1,3))
plot(fs, scale = "Cp")
title("Cp")
plot(fs, scale = "adjr2")
title("AdjR2")
plot(fs, scale = "bic")
title("BIC")

coef(fs,4)
## (Intercept) Population Murder HS.Grad Frost
## 7.102713e+01 5.013998e-05 -3.001488e-01 4.658225e-02 -5.943290e-03
predict.regsubsets=function(object, newdata, id, ...){ #input einai ena
# antikeimeno geniko, edw tha einai to bss.cv,
# ta dedomena sta opoia tha kanei predict kai enas deiktis
# o opoios tha leei me pio montelo na kanei predict, edw to M_id
form=as.formula(object$call[[2]]) # pairnei apo to object,
# to opoio edw tha einai to bss.cv ti formula pou to parigage
mat=model.matrix(form, newdata) # xrisi tis pio panw
# formulas gia na dimiourgisei ton pinaka X
# me ta newdata ws dedomena (gia oles tis ep metavlites)
coefi=coef(object, id=id) # epilegei tis metavlites
# pou vriskontai sto montelo M_id
xvars=names(coefi) # vriskei ta onomata twn metavlitwn
# gia na ta xrisimopoiisei ws deiktes pio katw
return(mat[,xvars]%*%coefi) # kanei ton pollaplasiasmo gia tin provlepsi
}
K=5
q=7
set.seed(5)
folds=sample(1:K, nrow(data), replace=TRUE)
folds
## [1] 2 3 1 3 1 1 5 3 3 2 5 4 2 5 3 1 4 3 2 5 2 2 3 1 2 4 5 3 2 1 2 2 4 4 4 4 5 3
## [39] 4 5 3 5 3 3 3 2 4 1 2 4
table(folds)
## folds
## 1 2 3 4 5
## 7 12 13 10 8
cv.errors=matrix(NA,K,q,dimnames=list(NULL, paste(1:q)))
for(k in 1:K){
bss.cv=regsubsets(Life.Exp~.,data[folds!=k,],nvmax=7)
for(p in 1:q){
cv.pred=predict(bss.cv, data[folds==k,],id=p)
cv.errors[k,p]=mean((data$Life.Exp[folds==k]-cv.pred)^2)
}
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7
## 0.8154044 1.0544277 0.9683880 0.7974961 0.8987317 0.9606597 0.9838639
p.cv=which.min(mean.cv.errors)
plot(mean.cv.errors,type='b')
points(p.cv, mean.cv.errors[p.cv], col="red",cex=2,pch=20)

bss=regsubsets(Life.Exp~.,data,nvmax=7)
summary(bss)
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, nvmax = 7)
## 7 Variables (and intercept)
## Forced in Forced out
## Population FALSE FALSE
## Income FALSE FALSE
## Illiteracy FALSE FALSE
## Murder FALSE FALSE
## HS.Grad FALSE FALSE
## Frost FALSE FALSE
## Area FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
## Population Income Illiteracy Murder HS.Grad Frost Area
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " "
## 3 ( 1 ) " " " " " " "*" "*" "*" " "
## 4 ( 1 ) "*" " " " " "*" "*" "*" " "
## 5 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
names(bss)
## [1] "np" "nrbar" "d" "rbar" "thetab" "first"
## [7] "last" "vorder" "tol" "rss" "bound" "nvmax"
## [13] "ress" "ir" "nbest" "lopt" "il" "ier"
## [19] "xnames" "method" "force.in" "force.out" "sserr" "intercept"
## [25] "lindep" "nullrss" "nn" "call"
coef(bss,4)
## (Intercept) Population Murder HS.Grad Frost
## 7.102713e+01 5.013998e-05 -3.001488e-01 4.658225e-02 -5.943290e-03