### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression                                              ###
### Winter Term 2021/2022                                                    ###
###                                                                          ###
### EXERCISE #4: Regression model diagnostics                                ###
### -------------------------------------------------------------------------###

### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL                                               ###
### -------------------------------------------------------------------------###

library("mffSM")
library("splines")   # B-splines modelling
library("MASS")      # function stdres for standardized residuals

### Working data
data(Dris, package = "mffSM")
help("Dris")

head(Dris)
summary(Dris)

### A simple linear model fitted in the last session
aLogMg <- lm(yield ~ log2(Mg), data = Dris)
summary(aLogMg)

### Basic model checking
###
### EXPLANATION OF THE THREE BASIC RESIDUAL PLOTS
### (see Section 3.2 of the lecture notes)
###
### Raw residuals U_1, ..., U_n satisfy
###   E(U_i | X) = 0,   var(U_i | X) = sigma^2 * m_{i,i},
###   U_i ~ NORMAL in a normal LM,
###   but with different variances for different i.
###
### Standardized residuals V_1, ..., V_n are defined as
###   V_i = U_i / sqrt(MS_e * m_{i,i})
### and satisfy
###   E(V_i | X) = 0,   var(V_i | X) = 1   (in a normal model).
###
### The V_i's are not normally distributed (not even in a normal model),
### but normality is commonly checked by applying normality diagnostic
### tools to the standardized residuals.

### Using the functionality of the standard R function plot.lm
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
plot(aLogMg, which = 1:3, ask = FALSE)
par(mfrow = c(1, 1))

### Equivalently, with the plotLM() function from package mffSM
plotLM(aLogMg)

### QUESTIONS
### Can we reconstruct these plots?

### First plot - fitted values vs. residuals
plot(aLogMg, which = 1)

### Manual computation
points(fitted(aLogMg), residuals(aLogMg), lwd = 2, col = "blue4", bg = "skyblue", pch = 21)
lines(lowess(fitted(aLogMg), residuals(aLogMg)), col = 2, lwd = 3)
### lowess is a nonparametric robust regression estimator;
### more on this function in NMST434: Modern Statistical Methods

### Second plot - QQ plot of the standardized residuals
plot(aLogMg, which = 2, ask = FALSE)

### Standardized residuals
H = qr.Q(aLogMg[["qr"]]) %*% t(qr.Q(aLogMg[["qr"]]))
M = diag(nrow(H)) - H
stdres = residuals(aLogMg) / sqrt(deviance(aLogMg) / aLogMg$df.residual * diag(M))
qq = qqnorm(stdres, plot.it = FALSE)
points(qq$x, qq$y, lwd = 2, col = "blue4", bg = "skyblue", pch = 21)
all.equal(qq$y, stdres)
qqline(stdres, lwd = 2, lty = 3)

### Or using the function stdres from package MASS
all.equal(stdres(aLogMg), stdres)

### Third plot - fitted values vs. sqrt(|standardized residuals|)
plot(aLogMg, which = 3, ask = FALSE)
points(fitted(aLogMg), sqrt(abs(stdres)), lwd = 2, col = "blue4", bg = "skyblue", pch = 21)
lines(lowess(fitted(aLogMg), sqrt(abs(stdres))), col = 2, lwd = 3)

### Overall, all three diagnostic plots
plotLM(aLogMg)

### Is there possibly something wrong with model aLogMg?

### Note that there are infinitely many ways to check that E(U_i | X) = 0.
### (A scatterplot of fitted values vs. residuals is only one of them;
### another common possibility is to plot the covariate values vs. the
### residuals.)
plot(log2(Dris$Mg), residuals(aLogMg), xlab = expression(log[2](Mg)),
     ylab = "Residuals", main = "Residuals vs. covariate",
     pch = 21, col = "blue4", bg = "skyblue")
lines(lowess(residuals(aLogMg) ~ log2(Dris$Mg)), col = "red")

### Is the following surprising?
abline(mean(aLogMg$residuals), 0, col = "green", lwd = 2)

### This shouldn't surprise us anymore
sum(aLogMg$residuals)
sum(aLogMg$residuals * log2(Dris$Mg))

### ======================================================================== ###
### Three types of confidence intervals                                      ###
### ======================================================================== ###

### Suppose for the moment that the model aLogMg is a USEFUL model
summary(aLogMg)

### 1. Pointwise confidence intervals (region/band) AROUND the regression line
###    (a sequence of confidence intervals for E(Y|x) is constructed as x
###    varies along a grid of values)

### What is the purpose of the predict command?
help(predict)
help(predict.lm)

### What is the difference between the outputs of "predict" and "fitted"?
### Only the first 10 values are printed here
data.frame(Predict = predict(aLogMg), Fitted = fitted(aLogMg))[1:10, ]
all.equal(predict(aLogMg), fitted(aLogMg))

### Point estimate of E(Y | Mg = 10)
predict(aLogMg, newdata = data.frame(Mg = 10))
(point.fit <- aLogMg$coef[1] + aLogMg$coef[2] * log2(10))

### Now, with the standard error
predict(aLogMg, newdata = data.frame(Mg = 10), se.fit = TRUE)

### Calculation of "se.fit"
lvec <- c(1, log2(10))
(se.fit <- c(sqrt(t(lvec) %*% vcov(aLogMg) %*% lvec)))
all.equal(predict(aLogMg, newdata = data.frame(Mg = 10), se.fit = TRUE)$se.fit, se.fit)

# Recall - residual degrees of freedom, i.e. $df.residual
aLogMg$df.residual
# sqrt(MS_e) = estimate of sigma, i.e. the residual standard error
sqrt(deviance(aLogMg) / aLogMg$df.residual)
summary(aLogMg)$sigma

## Or, using a function from mffSM
LSest(aLogMg, L = c(1, log2(10)))

### And now with the confidence interval for E(Y|x) (using "predict")
predict(aLogMg, newdata = data.frame(Mg = 10), interval = "confidence", level = 0.95)

### Manual computation of the confidence interval
crossprod(c(1, log2(10)), coef(aLogMg))   ## What is this?
as.numeric(crossprod(c(1, log2(10)), coef(aLogMg))) +
  c(-1, 1) * se.fit * qt(0.975, df = aLogMg[["df.residual"]])

### We can calculate a sequence of confidence intervals for E(Y|x)
### for different values of x
predict(aLogMg, newdata = data.frame(Mg = 10:20), interval = "confidence", level = 0.95)
predict(aLogMg, newdata = data.frame(Mg = 10:20), interval = "confidence", level = 0.95,
        se.fit = TRUE)

### A denser grid: preparation of a grid of points for the pointwise confidence
### band around the regression line
x.Mg <- seq(min(Dris[, "Mg"]) - 1, max(Dris[, "Mg"]) + 1, length = 101)

### Calculate the confidence intervals
ciEY.aLogMg <- predict(aLogMg, data.frame(Mg = x.Mg), interval = "confidence", level = 0.95)

### Observations together with the estimated regression line
### and a sequence of confidence intervals for E(Y|x) visualized
### as a band --> confidence region (band) AROUND the regression line
plot(yield ~ log2(Mg), pch = 23, col = "orange", bg = "beige",
     xlab = "Log2(Mg)", ylab = "Yield", data = Dris)
lines(log2(x.Mg), ciEY.aLogMg[, "fit"], lwd = 2, col = "red3")
lines(log2(x.Mg), ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(log2(x.Mg), ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)

### Where is the band narrowest?
lines(rep(mean(log2(Dris$Mg)), 2), c(min(Dris$yield), max(Dris$yield)))
lines(c(min(log2(Dris$Mg)), max(log2(Dris$Mg))), rep(mean(Dris$yield), 2))

### We can also plot everything on the original scale of Mg
plot(yield ~ Mg, pch = 23, col = "orange", bg = "beige",
     xlab = "Mg", ylab = "Yield", data = Dris)
lines(x.Mg, ciEY.aLogMg[, "fit"], lwd = 2, col = "red3")
lines(x.Mg, ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(x.Mg, ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)

### What about now?
lines(rep(mean(Dris$Mg), 2), c(min(Dris$yield), max(Dris$yield)))
lines(c(min(Dris$Mg), max(Dris$Mg)), rep(mean(Dris$yield), 2))

### 2. Confidence region (band) FOR the (whole) regression line
###    (confidence bands FOR the regression line will be discussed in detail
###    later in the lecture).
###
### The code towards the band FOR the regression line should for now be taken
### as purely illustrative. At this point it is only important to understand
### the difference between "AROUND" and "FOR".
ciEY2.aLogMg <- predict(aLogMg, data.frame(Mg = x.Mg), se.fit = TRUE)
forEY.aLogMg <- data.frame(
  fit = ciEY2.aLogMg[["fit"]],
  lwr = ciEY2.aLogMg[["fit"]] -
        ciEY2.aLogMg[["se.fit"]] * sqrt(qf(0.95, 2, ciEY2.aLogMg[["df"]]) * 2),
  upr = ciEY2.aLogMg[["fit"]] +
        ciEY2.aLogMg[["se.fit"]] * sqrt(qf(0.95, 2, ciEY2.aLogMg[["df"]]) * 2))
head(forEY.aLogMg)

### The computation of the confidence band is based on the confidence ellipse
### for the regression parameters (beta_0, beta_1)
### (confidenceEllipse is provided, e.g., by package car)
confidenceEllipse(aLogMg)

### x-axis: Log(Mg)
plot(yield ~ log2(Mg), pch = 23, col = "orange", bg = "beige",
     xlab = "Log(Mg)", ylab = "Yield", data = Dris)
lines(log2(x.Mg), ciEY.aLogMg[, "fit"], lwd = 2, col = "darkgreen")
### Band AROUND
lines(log2(x.Mg), ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(log2(x.Mg), ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)
### Band FOR
lines(log2(x.Mg), forEY.aLogMg[, "lwr"], lwd = 2, col = "blue2", lty = 4)
lines(log2(x.Mg), forEY.aLogMg[, "upr"], lwd = 2, col = "blue2", lty = 4)
legend(4, 8.5, legend = c("For", "Around"), col = c("blue2", "red3"),
       lty = c(4, 2), lwd = 2)

### x-axis: Mg
plot(yield ~ Mg, pch = 23, col = "orange", bg = "beige",
     xlab = "Mg", ylab = "Yield", data = Dris)
lines(x.Mg, ciEY.aLogMg[, "fit"], lwd = 2, col = "darkgreen")
### Band AROUND
lines(x.Mg, ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(x.Mg, ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)
### Band FOR
lines(x.Mg, forEY.aLogMg[, "lwr"], lwd = 2, col = "blue2", lty = 4)
lines(x.Mg, forEY.aLogMg[, "upr"], lwd = 2, col = "blue2", lty = 4)
legend(18, 8.5, legend = c("For", "Around"), col = c("blue2", "red3"),
       lty = c(4, 2), lwd = 2)

### 3. Prediction interval (for a given x)
###    (an interval which covers, with probability (1 - alpha),
###    a NEW value of the response obtained for X set to x)
### The expression behind the prediction interval will be derived in one of
### the later lectures. For now, concentrate on its correct interpretation
### and on understanding how it differs from a confidence interval for E(Y|x).

### Calculation in R
predict(aLogMg, newdata = data.frame(Mg = 10), interval = "prediction")
## Compare with the confidence interval for E(Y|x)
predict(aLogMg, newdata = data.frame(Mg = 10), interval = "confidence")

### Manual computation of the prediction interval
s = summary(aLogMg)$sigma
se.fit2 = sqrt(s^2 + se.fit^2)
as.numeric(crossprod(c(1, log2(10)), coef(aLogMg))) +
  c(-1, 1) * se.fit2 * qt(0.975, df = aLogMg[["df.residual"]])

### Prediction band - a sequence of prediction intervals for a dense grid of
### x-values
pred.aLogMg <- predict(aLogMg, data.frame(Mg = x.Mg), interval = "prediction", level = 0.95)
head(pred.aLogMg)

### x-axis: Log(Mg)
plot(yield ~ log2(Mg), pch = 23, col = "orange", bg = "beige",
     xlab = "Log(Mg)", ylab = "Yield", data = Dris)
lines(log2(x.Mg), ciEY.aLogMg[, "fit"], lwd = 2, col = "red3")
### Band AROUND the regression line
lines(log2(x.Mg), ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(log2(x.Mg), ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)
### Prediction band
lines(log2(x.Mg), pred.aLogMg[, "lwr"], lwd = 2, col = "blue2", lty = 4)
lines(log2(x.Mg), pred.aLogMg[, "upr"], lwd = 2, col = "blue2", lty = 4)
legend("topleft", legend = c("Prediction", "Around the regression line"),
       col = c("blue2", "red3"), lty = c(4, 2), lwd = 2)

### x-axis: Mg
plot(yield ~ Mg, pch = 23, col = "orange", bg = "beige",
     xlab = "Mg", ylab = "Yield", data = Dris)
lines(x.Mg, ciEY.aLogMg[, "fit"], lwd = 2, col = "red3")
### Band AROUND the regression line
lines(x.Mg, ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(x.Mg, ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)
### Prediction band
lines(x.Mg, pred.aLogMg[, "lwr"], lwd = 2, col = "blue2", lty = 4)
lines(x.Mg, pred.aLogMg[, "upr"], lwd = 2, col = "blue2", lty = 4)
legend("bottomright", legend = c("Prediction", "Around the regression line"),
       col = c("blue2", "red3"), lty = c(4, 2), lwd = 2)

###
### Polynomials and splines as regressors
###

### EXAMPLE
### A data frame giving a series of measurements of head acceleration
### in a simulated motorcycle accident, used to test crash helmets.
data(Motorcycle, package = "mffSM")
help(Motorcycle)

head(Motorcycle)
dim(Motorcycle)
summary(Motorcycle)

### We are interested in how the head acceleration [haccel] depends on the
### time after the accident in milliseconds [time].
BTY <- "o"
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "orange", col = "red3",
     xlab = "Time [ms]", ylab = "Head acceleration [g]")
fit0 = lm(haccel ~ time, data = Motorcycle)
abline(fit0)

### The diagnostic plots now clearly show that model fit0
### is hardly appropriate
plotLM(fit0)

### Quite a flexible shape is observed in the plot =>
### we use POLYNOMIALS for fitting the model
### (still staying within the linear regression framework)

### Fitting polynomials of various degrees
### (raw polynomials, i.e. the "usual" ones)
fit6 <- lm(haccel ~ poly(time, 6, raw = TRUE), data = Motorcycle)
fit12 <- lm(haccel ~ poly(time, 12, raw = TRUE), data = Motorcycle)
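### A quick submodel comparison (a sketch added for illustration): with raw
### polynomials, the degree-6 model is nested within the degree-12 model, so
### the F-test of the six additional terms can be obtained with anova().
anova(fit6, fit12)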
### try this with 70 instead of 60
xgrid <- seq(0, 60, length = 500)

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "orange", col = "red3",
     xlab = "Time [ms]", ylab = "Head acceleration [g]")
lines(xgrid, predict(fit6, newdata = data.frame(time = xgrid)), lwd = 2, col = "blue")
lines(xgrid, predict(fit12, newdata = data.frame(time = xgrid)), lwd = 2, col = "darkgreen")
legend(37, -70, lty = rep(1, 2), col = c("blue", "darkgreen"),
       legend = paste("p=", c(6, 12), sep = ""), lwd = c(2, 2), cex = 1.2)

### Using SPLINES

### Choice of knots
knots <- c(0, 11, 12, 13, 20, 30, 32, 34, 40, 50, 60)
(inner <- knots[-c(1, length(knots))])   # inner knots
(bound <- knots[c(1, length(knots))])    # boundary knots

### B-splines of degree 3
DEGREE <- 3   # piecewise cubic
Bgrid <- bs(xgrid, knots = inner, Boundary.knots = bound, degree = DEGREE, intercept = TRUE)
dim(Bgrid)    # 13-dimensional linear space of piecewise polynomials

COL <- rainbow_hcl(ncol(Bgrid), c = 80)   # rainbow_hcl() comes from package colorspace
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(xgrid, Bgrid[, 1], type = "l", col = COL[1], lwd = 2, xlab = "x", ylab = "B(x)",
     xlim = range(knots), ylim = range(Bgrid, na.rm = TRUE))
for (j in 2:ncol(Bgrid)) lines(xgrid, Bgrid[, j], col = COL[j], lwd = 2)
points(knots, rep(0, length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)

### An animation
for (j in 1:ncol(Bgrid)) {
  plot(xgrid, Bgrid[, j], col = COL[j], ylim = c(0, 1), lwd = 2, type = "l",
       xlab = "x", ylab = "B(x)")
  points(knots, rep(0, length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)
  Sys.sleep(.3)
}

### Fitting the model with splines
### http://www.chebfun.org/examples/approx/BSplineConv.html
m1 <- lm(haccel ~ bs(time, knots = inner, Boundary.knots = bound,
                     degree = DEGREE, intercept = TRUE) - 1, data = Motorcycle)
names(m1$coefficients) <- paste("B", 1:13)
summary(m1)

### Note that, as the intercept is not formally included in m1 (because of
### the -1 in the model formula), neither the 'Multiple R-squared' nor the
### 'F-statistic' reported in the output is a meaningful number.

# R-squared from the model
summary(m1)$r.squared
# manual computation: two ways
1 - deviance(m1) / sum(Motorcycle$haccel^2)
sum(fitted(m1)^2) / sum(Motorcycle$haccel^2)
# why are all these numbers the same?

# interpretation of the F-test from summary(m1) as a test of a submodel
summary(m1)$fstatistic
# the usual F-test (against the intercept-only model) is the following
m2 <- lm(haccel ~ 1, data = Motorcycle)
anova(m2, m1)
# in a model fitted without intercept, summary() reports this one instead
(m0 <- lm(haccel ~ -1, data = Motorcycle))
anova(m0, m1)

### Nevertheless, as the intercept is in the linear span of the columns of X,
### it makes sense to calculate the (usual) multiple R-squared.
### One can do it 'manually' as
var(fitted(m1)) / var(Motorcycle$haccel)
cor(Motorcycle$haccel, fitted(m1))^2
1 - deviance(m1) / deviance(m2)

### Alternatively, one can use B-splines (the basis without intercept)
### together with an intercept in the model to get meaningful numbers
### in the output
m0 <- lm(haccel ~ bs(time, knots = inner, Boundary.knots = bound,
                     degree = DEGREE, intercept = FALSE), data = Motorcycle)
names(m0$coefficients) <- c("(Intercept)", paste("B", 1:12))
summary(m0)
anova(m2, m0)

### The fitted values given by the models m0 and m1 are the same
summary(fitted(m0) - fitted(m1))

### Predicted values
pm1 <- predict(m1, newdata = data.frame(time = xgrid))
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = "Time [ms]", ylab = "Head acceleration [g]")
lines(xgrid, pm1, col = "red", lwd = 3)
points(knots, rep(min(Motorcycle[, "haccel"]), length(knots)),
       pch = 21, bg = "blue", col = "skyblue", cex = 1.5)
lines(xgrid, predict(fit12, newdata = data.frame(time = xgrid)), lwd = 2, col = "darkgreen")
legend("topright", col = c("red", "darkgreen"), legend = c("splines", "p=12"),
       lwd = c(3, 2), cex = 1.2)

### Add a prediction interval
ci.pred.spline <- predict(m1, newdata = data.frame(time = xgrid),
                          interval = "prediction", level = 0.95)
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = "Time [ms]", ylab = "Head acceleration [g]", ylim = range(ci.pred.spline))
lines(xgrid, pm1, col = "blue", lwd = 2)
lines(xgrid, ci.pred.spline[, 2], col = "blue", lwd = 2, lty = "dashed")
lines(xgrid, ci.pred.spline[, 3], col = "blue", lwd = 2, lty = "dashed")

### Do you expect any problem with this prediction interval?
### Think about the homoscedasticity assumption.
### Is this problem obvious from any of the following figures?
plotLM(m1)

### And what about the following figures?

### Residuals vs. covariate plot
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(residuals(m1) ~ Motorcycle[, "time"], pch = 21, col = "blue4", bg = "skyblue",
     xlab = "Time", ylab = "Residuals")
lines(lowess(residuals(m1) ~ Motorcycle[, "time"]), col = "red")

### Scale vs. covariate plot
plot(abs(rstandard(m1)) ~ Motorcycle[, "time"], pch = 21, col = "blue4", bg = "skyblue",
     xlab = "Time", ylab = "|standardized residuals|")
lines(lowess(abs(rstandard(m1)) ~ Motorcycle[, "time"]), col = "red")

### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK                                                 ###
### -------------------------------------------------------------------------###

### plotLM essentially just calls plot.lm and displays all plots in a single figure
plotLM

### QQ-plot of the standardized residuals in model aLogMg
plot(aLogMg, which = 2, ask = FALSE)

### For comparison, the QQ plot of the raw residuals
qqnorm(residuals(aLogMg))
qqline(residuals(aLogMg))
### Is such a diagnostic plot useful?

### Confidence intervals for E(Y|x) again
plot(yield ~ log2(Mg), pch = 23, col = "orange", bg = "beige",
     xlab = "Log2(Mg)", ylab = "Yield", data = Dris)
lines(log2(x.Mg), ciEY.aLogMg[, "fit"], lwd = 2, col = "red3")
lines(log2(x.Mg), ciEY.aLogMg[, "lwr"], lwd = 2, col = "red3", lty = 2)
lines(log2(x.Mg), ciEY.aLogMg[, "upr"], lwd = 2, col = "red3", lty = 2)

### At which point x is the confidence interval narrowest?
f = Vectorize(function(x) sqrt(t(c(1, x)) %*% vcov(aLogMg) %*% c(1, x)))
t = seq(3, 4, length = 101)
plot(t, f(t), type = "l", main = "Estimated std. dev. of the predicted E(Y|x)",
     ylab = "std. dev.", xlab = "log2(Mg)", lwd = 2, col = 2)
abline(v = mean(log2(Dris$Mg)), lty = 2)
legend("bottomleft", "average lMg", lty = 2)
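### A quick numerical check (a sketch): provided the grid t covers the sample
### mean of log2(Mg), the grid point minimizing the estimated standard error
### should (up to the grid resolution) coincide with that mean.
t[which.min(f(t))]
mean(log2(Dris$Mg))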
### Verify by computation that the confidence interval is narrowest at the
### average of the x-values. Is this true only in simple regression models,
### i.e. models of the form E(Y|x) = beta_0 + beta_1*x?
### Can we say something analogous for more general regression models?

### Fitting orthogonal polynomials to the Motorcycle data

### Fitting polynomials of various degrees
fit6 <- lm(haccel ~ poly(time, 6, raw = TRUE), data = Motorcycle)    ### raw polynomials, i.e. the "usual" ones
fit6.orto <- lm(haccel ~ poly(time, 6), data = Motorcycle)           ### orthogonal polynomials
fit12 <- lm(haccel ~ poly(time, 12, raw = TRUE), data = Motorcycle)  # raw
fit12.orto <- lm(haccel ~ poly(time, 12), data = Motorcycle)         # orthogonal

xgrid <- seq(0, 60, length = 500)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "orange", col = "red3",
     xlab = "Time [ms]", ylab = "Head acceleration [g]")
lines(xgrid, predict(fit6, newdata = data.frame(time = xgrid)), lwd = 2, col = "blue")
# lines(xgrid, predict(fit6.orto, newdata = data.frame(time = xgrid)), lwd = 3, col = 2, lty = "dotted")
lines(xgrid, predict(fit12, newdata = data.frame(time = xgrid)), lwd = 2, col = "darkgreen")
lines(xgrid, predict(fit12.orto, newdata = data.frame(time = xgrid)), lwd = 2, col = 2, lty = "dotted")
legend(37, -70, lty = rep(1, 2), col = c("blue", "darkgreen"),
       legend = paste("p=", c(6, 12), sep = ""), lwd = c(2, 2), cex = 1.2)

### The fitted lines using orthogonal polynomials appear to be the same as
### those for the usual raw polynomials. Is this always true? What is
### different in fitting orthogonal polynomials, as opposed to the raw ones?
### Which model do you prefer, and why?

### QUESTION:
### What is the form of the model matrix used for the two fits in the
### previous example?
fit6x = lm(haccel ~ poly(time, 6, raw = TRUE), data = Motorcycle, x = TRUE)
unname(head(fit6x$x))

### the first row of the matrix X
unname(fit6x$x[1, ])
Motorcycle$time[1]^c(0:6)
poly(Motorcycle$time, 6, raw = TRUE)[1, ]

### Difference between raw and orthogonal polynomials

### Difference in fitted values
summary(predict(fit12.orto, newdata = data.frame(time = xgrid)) -
        predict(fit12, newdata = data.frame(time = xgrid)))

#### Difference in regression coefficients
coef12 = cbind(fit12$coef, fit12.orto$coef)
rownames(coef12) = paste("degree ", 0:12)
colnames(coef12) = c("Raw", "Orthogonal")
signif(coef12, digits = 3)

### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL                                             ###
### -------------------------------------------------------------------------###

### plot.lm also produces additional diagnostic plots
### These will be discussed (much) later in the course
plot(aLogMg, which = 4)
plot(aLogMg, which = 5)
plot(aLogMg, which = 6)

### Neither the residuals nor the standardized residuals are, in general,
### independent or identically distributed. This is a consequence of the
### matrix M not being diagonal. For the purpose of diagnostics this is
### usually ignored, but it should be kept in mind.
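### A quick numerical illustration (a sketch): the diagonal elements
### m_{i,i} = 1 - h_{i,i} of M, and hence the conditional variances
### var(U_i | X) = sigma^2 * m_{i,i} of the raw residuals, differ across
### observations; hatvalues() returns the diagonal elements h_{i,i} of H.
summary(1 - hatvalues(aLogMg))
summary(summary(aLogMg)$sigma^2 * (1 - hatvalues(aLogMg)))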
### H matrix: projection onto the columns of X
### M matrix: projection onto the orthogonal complement of the columns of X
H = qr.Q(aLogMg[["qr"]]) %*% t(qr.Q(aLogMg[["qr"]]))
M = diag(nrow(H)) - H

### hat(Y) = H * Y
### U      = M * Y
### Y      = hat(Y) + U
all.equal(unname(fitted(aLogMg)), c(H %*% Dris$yield))
all.equal(unname(residuals(aLogMg)), c(M %*% Dris$yield))
all.equal(unname(fitted(aLogMg) + residuals(aLogMg)), Dris$yield)

# H and M are symmetric
all.equal(H, t(H))
all.equal(M, t(M))

# H and M are idempotent
all.equal(H %*% H, H)
all.equal(M %*% M, M)

# but they are not diagonal
H[1:5, 1:5]
M[1:5, 1:5]

# visualisation of the two matrices
image(H)
image(M)

### Motorcycle data
### Try different choices of knots and different degrees of smoothness of the
### splines
knots <- c(0, 10, 14.5, 21, 31, 40, 50, 60)
inner <- knots[-c(1, length(knots))]   # inner knots
bound <- knots[c(1, length(knots))]    # boundary knots
DEGREE <- 1   # 1 for piecewise linear functions

### B-splines
Bgrid2 <- bs(xgrid, knots = inner, Boundary.knots = bound, degree = DEGREE, intercept = TRUE)

COL <- rainbow_hcl(ncol(Bgrid2), c = 80)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(xgrid, Bgrid2[, 1], type = "l", col = COL[1], lwd = 2, xlab = "x", ylab = "B(x)",
     xlim = range(knots), ylim = range(Bgrid2, na.rm = TRUE))
for (j in 2:ncol(Bgrid2)) lines(xgrid, Bgrid2[, j], col = COL[j], lwd = 2)
points(knots, rep(0, length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)

### Fitting the model with splines
B2 <- bs(Motorcycle[, "time"], knots = inner, Boundary.knots = bound,
         degree = DEGREE, intercept = TRUE)
m2 <- lm(haccel ~ B2 - 1, data = Motorcycle)
summary(m2)

### Predicted values
pm2 <- as.numeric(Bgrid2 %*% coef(m2))
plot(haccel ~ time, data = Motorcycle, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = "Time [ms]", ylab = "Head acceleration [g]")
lines(xgrid, pm2, col = "red3", lwd = 2)
points(knots, rep(min(Motorcycle[, "haccel"]), length(knots)),
       pch = 21, bg = "blue", col = "skyblue", cex = 1.5)
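### A small check (a sketch): because the knots and boundary knots are fixed
### explicitly, the same degree-1 spline model can also be fitted with bs()
### placed directly inside the model formula (as was done for m1 above), in
### which case predict() rebuilds the basis for new data itself. The model
### name m2.formula is introduced here only for this illustration; its fitted
### curve should agree with pm2 computed above from Bgrid2 %*% coef(m2).
m2.formula <- lm(haccel ~ bs(time, knots = inner, Boundary.knots = bound,
                             degree = DEGREE, intercept = TRUE) - 1,
                 data = Motorcycle)
summary(as.numeric(predict(m2.formula, newdata = data.frame(time = xgrid))) - pm2)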