# Many of the same type of variable selection analyses that we have seen in SAS can be # performed in R. One analysis, similar to the backward, stepwise, and forward methods from SAS # uses the step function, illustrated below. # A more exhaustive analysis like with the SAS 'selection = rsquare' option is available # from the regsubsets function in the leaps package, shown below. Autos1 <- read.table("c:/temp/autos.txt",header=T) names(Autos1) # define power, crossproduct, and other terms Autos1$LMPG <- log(Autos1$MPG) Autos1$HP2 <- (Autos1$HP)^2 Autos1$SP2 <- (Autos1$SP)^2 Autos1$WT2 <- (Autos1$WT)^2 Autos1$VOL2 <- (Autos1$VOL)^2 Autos1$HPSP <- (Autos1$HP)*(Autos1$SP) Autos1$HPVOL <- (Autos1$HP)*(Autos1$VOL) Autos1$HPWT <- (Autos1$HP)*(Autos1$WT) Autos1$SPVOL <- (Autos1$SP)*(Autos1$VOL) Autos1$SPWT <- (Autos1$SP)*(Autos1$WT) Autos1$WTVOL <- (Autos1$WT)*(Autos1$VOL) Autos1$lowvol <- as.numeric(Autos1$VOL <= 51) Autos1$hihp <- as.numeric(Autos1$HP >= 200) # look at the big model with all terms summary(lm(LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 +HPSP +HPVOL +HPWT + +SPVOL +SPWT +WTVOL +lowvol +hihp, data = Autos1)) # equivalent way to get the same model with less typing summary(lm(LMPG ~ . -MAKE.MODEL -MPG, data = Autos1)) # stepwise-type analyses based on the AIC criterion using the step function step(lm(LMPG ~ 1,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 +HPSP +HPVOL +HPWT + +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="forward") step(lm(LMPG ~ 1,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 +HPSP +HPVOL +HPWT + +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="both") step(lm(LMPG ~ . -MAKE.MODEL -MPG,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 + +HPSP +HPVOL +HPWT +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="backward") # Here we use the k = option in step to obtain analyses based on the BIC criterion # the option k = log(dim(Autos1)[[1]]) is k = log(n), where n is the sample size # this option gives BIC values instead of AIC values, # although note that this is the same as the SBC option in SAS, not the SAS BIC option step(lm(LMPG ~ 1,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 +HPSP +HPVOL +HPWT + +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="forward", k = log(dim(Autos1)[[1]])) step(lm(LMPG ~ . -MAKE.MODEL -MPG,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 + +HPSP +HPVOL +HPWT +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="backward", k = log(dim(Autos1)[[1]])) step(lm(LMPG ~ . -MAKE.MODEL -MPG,data = Autos1),LMPG ~ WT +HP +SP +VOL +HP2 +SP2 +WT2 +VOL2 + +HPSP +HPVOL +HPWT +SPVOL +SPWT +WTVOL +lowvol +hihp, direction="both", k = log(dim(Autos1)[[1]])) # Here we use the leaps package and its function regsubsets to perform an all subsets analysis library(leaps) summary(regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16)) plot(regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16)) plot(regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16), scale="adjr2") plot(regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16), scale="bic") summary(regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16, method = "adjr2")) Auto.SubFit <- regsubsets(log(LMPG) ~ . -MAKE.MODEL -MPG, data=Autos1, nbest=5, nvmax = 16) # the subsets function in the car package provides alternative plots for regsubsets objects, # the min.size and max.size options let the user focus on subsets of all models, like those # having between 6 and 12 terms as shown below. library(car) subsets(Auto.SubFit,statistic="bic") subsets(Auto.SubFit,statistic="bic",cex.subsets=.8,min.size=6,max.size=12)