rm(list = ls()) # clear variables
graphics.off()  # close graphs
library(MASS) # library with many useful statistical functions
library(leaps) # may need to install (click Tools/Install Packages and type leaps). Includes functions for Model selection algorithms (regsubsets)
data<-read.csv("statedata.csv")
pairs(data) 

round(cor(data, use="everything"),2) 
##            Life.Exp Population Income Illiteracy Murder HS.Grad Frost  Area
## Life.Exp       1.00      -0.07   0.34      -0.59  -0.78    0.58  0.26 -0.11
## Population    -0.07       1.00   0.21       0.11   0.34   -0.10 -0.33  0.02
## Income         0.34       0.21   1.00      -0.44  -0.23    0.62  0.23  0.36
## Illiteracy    -0.59       0.11  -0.44       1.00   0.70   -0.66 -0.67  0.08
## Murder        -0.78       0.34  -0.23       0.70   1.00   -0.49 -0.54  0.23
## HS.Grad        0.58      -0.10   0.62      -0.66  -0.49    1.00  0.37  0.33
## Frost          0.26      -0.33   0.23      -0.67  -0.54    0.37  1.00  0.06
## Area          -0.11       0.02   0.36       0.08   0.23    0.33  0.06  1.00
lm.full=lm(Life.Exp~.,data)
summary(lm.full)
## 
## Call:
## lm(formula = Life.Exp ~ ., data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48895 -0.51232 -0.02747  0.57002  1.49447 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.094e+01  1.748e+00  40.586  < 2e-16 ***
## Population   5.180e-05  2.919e-05   1.775   0.0832 .  
## Income      -2.180e-05  2.444e-04  -0.089   0.9293    
## Illiteracy   3.382e-02  3.663e-01   0.092   0.9269    
## Murder      -3.011e-01  4.662e-02  -6.459 8.68e-08 ***
## HS.Grad      4.893e-02  2.332e-02   2.098   0.0420 *  
## Frost       -5.735e-03  3.143e-03  -1.825   0.0752 .  
## Area        -7.383e-08  1.668e-06  -0.044   0.9649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.6922 
## F-statistic: 16.74 on 7 and 42 DF,  p-value: 2.534e-10
lm.red=lm(Life.Exp~.-HS.Grad-Murder, data)
summary(lm.red)
## 
## Call:
## lm(formula = Life.Exp ~ . - HS.Grad - Murder, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.60365 -0.70832 -0.05274  0.77479  2.69988 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.195e+01  1.845e+00  38.994  < 2e-16 ***
## Population  -3.413e-05  3.923e-05  -0.870 0.388982    
## Income       3.607e-04  3.339e-04   1.080 0.285940    
## Illiteracy  -1.474e+00  4.035e-01  -3.652 0.000688 ***
## Frost       -6.632e-03  4.427e-03  -1.498 0.141299    
## Area        -1.540e-06  2.109e-06  -0.730 0.469205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.097 on 44 degrees of freedom
## Multiple R-squared:  0.4009, Adjusted R-squared:  0.3328 
## F-statistic: 5.888 on 5 and 44 DF,  p-value: 0.0003004
anova(lm.red,lm.full)
## Analysis of Variance Table
## 
## Model 1: Life.Exp ~ (Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost + Area) - HS.Grad - Murder
## Model 2: Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost + Area
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     44 52.904                                  
## 2     42 23.297  2    29.607 26.688 3.312e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
names(lm.full)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
residuals.state=lm.full$residuals #pairnoume ta ypoloipa apo to lm.full
qqnorm(residuals.state)
qqline(residuals.state) 

plot(lm.full$fitted.values,residuals.state) 

bss=regsubsets(Life.Exp~., data, nvmax=7)
bss.summary=summary(bss) 
summary(bss)
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, nvmax = 7)
## 7 Variables  (and intercept)
##            Forced in Forced out
## Population     FALSE      FALSE
## Income         FALSE      FALSE
## Illiteracy     FALSE      FALSE
## Murder         FALSE      FALSE
## HS.Grad        FALSE      FALSE
## Frost          FALSE      FALSE
## Area           FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
##          Population Income Illiteracy Murder HS.Grad Frost Area
## 1  ( 1 ) " "        " "    " "        "*"    " "     " "   " " 
## 2  ( 1 ) " "        " "    " "        "*"    "*"     " "   " " 
## 3  ( 1 ) " "        " "    " "        "*"    "*"     "*"   " " 
## 4  ( 1 ) "*"        " "    " "        "*"    "*"     "*"   " " 
## 5  ( 1 ) "*"        "*"    " "        "*"    "*"     "*"   " " 
## 6  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   " " 
## 7  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   "*"
names(bss.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
par(mfrow=c(2,2)) 

plot(bss.summary$rss, xlab="Number of Variables", ylab="SSres", type="l") 
K0=which.min(bss.summary$rss) 
points(K0, bss.summary$rss[K0], col="red",cex=2,pch=20)

plot(bss.summary$adjr2,xlab="Number of Variables", ylab="Adjusted RSq", type="l") 
K1=which.max(bss.summary$adjr2) 
points(K1, bss.summary$adjr2[K1], col="red",cex=2,pch=20)

plot(bss.summary$cp,xlab="Number of Variables", ylab="C_p", type="l")
K2=which.min(bss.summary$cp) 
points(K2, bss.summary$cp[K2], col="red",cex=2,pch=20)

plot(bss.summary$bic,xlab="Number of Variables", ylab="BIC", type="l") 
K3=which.min(bss.summary$bic) 
points(K3, bss.summary$bic[K3], col="red",cex=2,pch=20)

par(mfrow=c(1,3))
plot(bss, scale = "Cp")
title("Cp")
plot(bss, scale = "adjr2")
title("AdjR2")
plot(bss, scale = "bic")
title("BIC")

coef(bss,4)
##   (Intercept)    Population        Murder       HS.Grad         Frost 
##  7.102713e+01  5.013998e-05 -3.001488e-01  4.658225e-02 -5.943290e-03
fs=regsubsets(Life.Exp~.,data, method="forward",nvmax=7) 
fs.summary=summary(fs) 
summary(fs)
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, method = "forward", nvmax = 7)
## 7 Variables  (and intercept)
##            Forced in Forced out
## Population     FALSE      FALSE
## Income         FALSE      FALSE
## Illiteracy     FALSE      FALSE
## Murder         FALSE      FALSE
## HS.Grad        FALSE      FALSE
## Frost          FALSE      FALSE
## Area           FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: forward
##          Population Income Illiteracy Murder HS.Grad Frost Area
## 1  ( 1 ) " "        " "    " "        "*"    " "     " "   " " 
## 2  ( 1 ) " "        " "    " "        "*"    "*"     " "   " " 
## 3  ( 1 ) " "        " "    " "        "*"    "*"     "*"   " " 
## 4  ( 1 ) "*"        " "    " "        "*"    "*"     "*"   " " 
## 5  ( 1 ) "*"        "*"    " "        "*"    "*"     "*"   " " 
## 6  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   " " 
## 7  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   "*"
names(fs.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
par(mfrow=c(2,2))
plot(fs.summary$rss, xlab="Number of Variables", ylab="SSE", type="l")
K0=which.min(fs.summary$rss)
points(K0, fs.summary$rss[K0], col="red",cex=2,pch=20)
plot(fs.summary$adjr2,xlab="Number of Variables", ylab="Adjusted RSq", type="l")
K1=which.max(fs.summary$adjr2)
points(K1, fs.summary$adjr2[K1], col="red",cex=2,pch=20)
plot(fs.summary$cp,xlab="Number of Variables", ylab="C_p", type="l")
K2=which.min(fs.summary$cp)
points(K2, fs.summary$cp[K2], col="red",cex=2,pch=20)
plot(fs.summary$bic,xlab="Number of Variables", ylab="BIC", type="l")
K3=which.min(fs.summary$bic)
points(K3, fs.summary$bic[K3], col="red",cex=2,pch=20)

par(mfrow=c(1,3))
plot(fs, scale = "Cp")
title("Cp")
plot(fs, scale = "adjr2")
title("AdjR2")
plot(fs, scale = "bic")
title("BIC")

coef(fs,4)
##   (Intercept)    Population        Murder       HS.Grad         Frost 
##  7.102713e+01  5.013998e-05 -3.001488e-01  4.658225e-02 -5.943290e-03
predict.regsubsets=function(object, newdata, id, ...){ #input einai ena
  # antikeimeno geniko, edw tha einai to bss.cv,
  # ta dedomena sta opoia tha kanei predict kai enas deiktis 
  # o opoios tha leei me pio montelo na kanei predict, edw to M_id
  form=as.formula(object$call[[2]]) # pairnei apo to object, 
  # to opoio edw tha einai to bss.cv ti formula pou to parigage
  mat=model.matrix(form, newdata) # xrisi tis pio panw
  # formulas gia na dimiourgisei ton pinaka X 
  # me ta newdata ws dedomena (gia oles tis ep metavlites)
  coefi=coef(object, id=id) # epilegei tis metavlites
  # pou vriskontai sto montelo M_id
  xvars=names(coefi) # vriskei ta onomata twn metavlitwn
  # gia na ta xrisimopoiisei ws deiktes pio katw
  return(mat[,xvars]%*%coefi) # kanei ton pollaplasiasmo gia tin provlepsi
}
K=5 
q=7 
set.seed(5) 
folds=sample(1:K, nrow(data), replace=TRUE) 
folds
##  [1] 2 3 1 3 1 1 5 3 3 2 5 4 2 5 3 1 4 3 2 5 2 2 3 1 2 4 5 3 2 1 2 2 4 4 4 4 5 3
## [39] 4 5 3 5 3 3 3 2 4 1 2 4
table(folds)
## folds
##  1  2  3  4  5 
##  7 12 13 10  8
cv.errors=matrix(NA,K,q,dimnames=list(NULL, paste(1:q))) 
for(k in 1:K){
  bss.cv=regsubsets(Life.Exp~.,data[folds!=k,],nvmax=7) 

  for(p in 1:q){
    cv.pred=predict(bss.cv, data[folds==k,],id=p) 
    cv.errors[k,p]=mean((data$Life.Exp[folds==k]-cv.pred)^2)
  }
}
mean.cv.errors=apply(cv.errors,2,mean) 
mean.cv.errors
##         1         2         3         4         5         6         7 
## 0.8154044 1.0544277 0.9683880 0.7974961 0.8987317 0.9606597 0.9838639
p.cv=which.min(mean.cv.errors)
plot(mean.cv.errors,type='b')
points(p.cv, mean.cv.errors[p.cv], col="red",cex=2,pch=20)

bss=regsubsets(Life.Exp~.,data,nvmax=7) 
summary(bss) 
## Subset selection object
## Call: regsubsets.formula(Life.Exp ~ ., data, nvmax = 7)
## 7 Variables  (and intercept)
##            Forced in Forced out
## Population     FALSE      FALSE
## Income         FALSE      FALSE
## Illiteracy     FALSE      FALSE
## Murder         FALSE      FALSE
## HS.Grad        FALSE      FALSE
## Frost          FALSE      FALSE
## Area           FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
##          Population Income Illiteracy Murder HS.Grad Frost Area
## 1  ( 1 ) " "        " "    " "        "*"    " "     " "   " " 
## 2  ( 1 ) " "        " "    " "        "*"    "*"     " "   " " 
## 3  ( 1 ) " "        " "    " "        "*"    "*"     "*"   " " 
## 4  ( 1 ) "*"        " "    " "        "*"    "*"     "*"   " " 
## 5  ( 1 ) "*"        "*"    " "        "*"    "*"     "*"   " " 
## 6  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   " " 
## 7  ( 1 ) "*"        "*"    "*"        "*"    "*"     "*"   "*"
names(bss)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"    "first"    
##  [7] "last"      "vorder"    "tol"       "rss"       "bound"     "nvmax"    
## [13] "ress"      "ir"        "nbest"     "lopt"      "il"        "ier"      
## [19] "xnames"    "method"    "force.in"  "force.out" "sserr"     "intercept"
## [25] "lindep"    "nullrss"   "nn"        "call"
coef(bss,4)
##   (Intercept)    Population        Murder       HS.Grad         Frost 
##  7.102713e+01  5.013998e-05 -3.001488e-01  4.658225e-02 -5.943290e-03