* ---------------------------------------------------------------------
  Stamey prostate data - comparison of linear regression methods.
  Response is lpsa. The eight predictors are
  lcavol lweight age lbph svi lcp gleason pgg45.
  Steps - import, standardize the training rows, then fit OLS,
  best subsets, ridge, LASSO, PLS and principal component regression.
  --------------------------------------------------------------------- ;

* Import the comma-delimited file starting at record 2
  (presumably record 1 is a header row - confirm against the file).
  obs and train are read as character, everything else as numeric ;
data stamey;
    infile "c:\temp\stameyprostate2.csv" dlm = ',' firstobs = 2;
    input obs $
          lcavol lweight age lbph svi lcp gleason pgg45
          lpsa
          train $;
run;

proc print data = stamey;
run;

* Pairwise correlations of the predictors, rows flagged train = TRUE only ;
proc corr data = stamey (where = (train = "TRUE"));
    var lcavol lweight age lbph svi lcp gleason pgg45;
run;

* ---------------- Ordinary least squares regression ---------------- ;
ods html;
ods graphics on;

* Keep only the rows flagged train = TRUE and rescale each predictor to
  mean 0 and standard deviation 1. lpsa is not in the VAR list, so it is
  left on its original scale ;
proc standard data = stamey (where = (train = "TRUE"))
              mean = 0 std = 1
              out  = zstamey;
    var lcavol lweight age lbph svi lcp gleason pgg45;
run;

* Note from the original author - svi is binary (it is standardized anyway) ;
proc print data = zstamey;
run;

* Full model on the standardized predictors ;
proc reg data = zstamey;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45;
run;

* ---------------------- Best subset selection ---------------------- ;
* Up to 10 best models per subset size, with fit statistics saved to
  reszstamey for plotting ;
proc reg data = zstamey outest = reszstamey;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / selection = rsquare
            sse cp aic sbc
            best = 10
            vif;
run;

* Fit statistics against _P_ for all saved models ;
proc plot data = reszstamey;
    plot (_RSQ_ _RMSE_ _SBC_ _AIC_) * _P_;
run;

* Same plots restricted to models with _P_ of at least 3 ;
proc plot data = reszstamey (where = (_P_ >= 3));
    plot (_RSQ_ _RMSE_ _SBC_ _AIC_) * _P_;
run;

* Two-predictor model with variance inflation factors ;
proc reg data = zstamey;
    model lpsa = lcavol lweight / vif;
run;

* ------------------------- Ridge regression ------------------------ ;
* Ridge parameter grid from 0 to .2 in steps of .005. Estimates and VIFs
  for each grid value are written to data set b ;
proc reg data = zstamey
         outest = b outvif
         ridge  = 0 to .2 by .005;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45 / vif;
run;

proc print data = b;
run;

* ------------------------- LASSO regression ------------------------ ;
proc glmselect data = zstamey;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / selection = LASSO (stop = 5);
run;

* ---------------- Partial least squares regression ----------------- ;
* Two extracted factors ;
proc pls data = zstamey method = pls nfac = 2;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / intercept solution;
run;

* Further exploring the PLS method.
  It can be seen that the coefficients of the PLS solution for one factor
  are proportional to the correlations the predictors have with y ;
proc corr data = zstamey cov;
    var lpsa lcavol lweight age lbph svi lcp gleason pgg45;
run;

* Build a single score z1 as a fixed weighted sum of the predictors.
  NOTE(review) - the weights look like the predictor correlations with
  lpsa copied by hand from the PROC CORR output above. Confirm, and
  refresh them if the input data changes ;
data zstamey2;
    set zstamey;
    z1 =   .733 * lcavol
         + .485 * lweight
         + .228 * age
         + .263 * lbph
         + .557 * svi
         + .489 * lcp
         + .342 * gleason
         + .448 * pgg45;
run;

* Regress lpsa on the hand-built score ;
proc reg data = zstamey2;
    model lpsa = z1;
run;

* One-factor PLS fit, for comparison with the z1 regression ;
proc pls data = zstamey method = pls nfac = 1;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / intercept solution;
run;

* ---------------- Principal component regression ------------------- ;
* Seven components on the standardized training data ;
proc pls data = zstamey method = pcr nfac = 7;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / intercept solution;
run;

* Same fit on the raw stamey data set, without the INTERCEPT option.
  NOTE(review) - this step uses every row of stamey, not just the rows
  flagged train = TRUE. Confirm that is intended ;
proc pls data = stamey method = pcr nfac = 7;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / solution;
run;

* Further exploring PC regression ;
proc princomp data = zstamey;
    var lcavol lweight age lbph svi lcp gleason pgg45;
run;

* Build a single score pc1 as a fixed weighted sum of the predictors.
  NOTE(review) - the weights look like the first eigenvector copied by
  hand from the PROC PRINCOMP output above. Confirm, and refresh them if
  the input data changes ;
data zstamey3;
    set zstamey;
    pc1 =   .436 * lcavol
          + .168 * lweight
          + .241 * age
          + .030 * lbph
          + .396 * svi
          + .460 * lcp
          + .394 * gleason
          + .446 * pgg45;
run;

* Regress lpsa on the hand-built component score ;
proc reg data = zstamey3;
    model lpsa = pc1;
run;

* One-component PCR fit, for comparison with the pc1 regression ;
proc pls data = zstamey method = pcr nfac = 1;
    model lpsa = lcavol lweight age lbph svi lcp gleason pgg45
          / intercept solution;
run;

ods graphics off;
ods html close;