#Biostatistics 401 #CHOOSING AN OPTIMAL LINEAR MODEL K=read.table("KNNLCh9SurgicalUnit.txt") K attach(K) options(digits=6) #FITTING FULL LINEAR MODEL FM=lm(Y~X1+X2+X3+X4+X5+factor(X6)) #SETTING UP MATRIX ALGEBRA OBJECTS AND HAT MATRIX n=length(Y) I=diag(n) J=matrix(nrow=n,ncol=n,1) X=model.matrix(FM) p=ncol(X) H=X%*%solve(t(X)%*%X)%*%t(X) #HAT MATRIX #CALCULATING SUMS OF SQUARES #USING MATRIX ALGEBRA IN R SSR=t(Y)%*%(H-((1/n)*J))%*%Y #Sum of Squares Regression SSE=t(Y)%*%(I-H)%*%Y #Sum of Squares Error SSTO=t(Y)%*%(I-(1/n)*J)%*%Y #Sum of Squares Total SSTO rbind(SSR,SSE,SSTO) #COMPARING WITH anova() OUTPUT anova(FM) #CALCULATING USEFUL INFORMATION FOR MODEL SELECTION #SUM OF SQUARES ERROR SSE[1] #from matrix calculation above converted to scalar summary(FM)$sigma^2*summary(FM)$df[2] #extracted from summary() #COEFFICIENT OF MULTIPLE DETERMINATION Rsq=1-(SSE[1]/SSTO[1]) Rsq summary(FM)$r.squared #extracted from summary() #ADJUSTED COEFFICIENT OF MULTIPLE DETERMINATION MSE=SSE[1]/(n-p) MSTO=SSTO[1]/(n-1) Rsqa=1-(MSE/MSTO) Rsqa summary(FM)$adj.r.squared #extracted from summary() #MALLOW'S Cp Cp=SSE[1]/MSE -(n-2*p) Cp require(wle) #DOWNLOAD {wle} PACKAGE FROM CRAN WEBSITE mle.cp(FM) #DON'T KNOW MUCH ABOUT THIS ONE, BUT INTERESTING! #AKAIKE'S INFORMATION CRITERION AIC AIC=n*log(SSE[1])-n*log(n)+2*p AIC extractAIC(FM) #REPORTS: (equivalent df, AIC) see ?extractAIC #SCHWARTZ'S BAYESIAN CRITERION SBC SBC=n*log(SSE[1])-n*log(n)+log(n)*p SBC extractAIC(FM,k=log(n)) #PRESS CRITERION e=residuals(FM) d=e/(1-diag(H)) #KNNL Eq. 10.21a diag(H) #MAIN DIAGONAL OF H AS A VECTOR hatvalues(FM) #ALTERNATE CALCULATION USING BUILT-IN FUNCTION & FM PRESS=sum(d^2) PRESS #MARGINAL TESTING OF EACH INDEPENDENT VARIABLE drop1(FM) require(car) Anova(FM) #Marginal extra sums of squares ANOVA table #STEPWISE REGRESSION USING AIC step(FM,direction="backward")