library(qgam); library(mgcViz); library(gamair);
data(co2s)
with(co2s, plot(c.month, co2, type="l"))
b <- qgam(co2~s(c.month, bs="cr", k=100), data=co2s, qu = 0.5, err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5..........done
co2plot <- function(co2s, b) {
  ## Predict over months 1:543, i.e. extending beyond the observed data
  fv <- predict(b, data.frame(c.month = 1:543, month = c(rep(1:12, 45), 1:3)), se = TRUE)
  ul <- fv$fit + 2 * fv$se.fit  ## approximate upper limit
  ll <- fv$fit - 2 * fv$se.fit  ## approximate lower limit
  with(co2s, plot(c.month, co2, pch = 19, cex = .3, col = 2,
                  ylim = range(c(ul, ll)), xlim = c(0, 550)))
  lines(1:543, fv$fit)
  lines(1:543, ul, lty = 2)
  lines(1:543, ll, lty = 2)
}
co2plot(co2s,b) ## nonsense predictions - extrapolation artefact
b1 <- qgam(co2~s(c.month,bs="cr",k=50)+s(month,bs="cc"),data=co2s, qu = 0.5,
argGam = list(knots=list(month=c(1,13))), err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5.........done
co2plot(co2s,b1)
This is much better: the short-term seasonal effect has now been separated from the long-term smooth trend, which allows longer-range extrapolation of the slow trend.
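To see this in practice, we can push the prediction a few years beyond month 543 (a minimal sketch; the 36-month horizon below is an arbitrary choice):
# Predict 36 further months; month is reconstructed so that c.month = 1 corresponds to month 1
newd <- data.frame(c.month = 544:579)
newd$month <- ((newd$c.month - 1) %% 12) + 1
fvNew <- predict(b1, newd, se = TRUE)
ul <- fvNew$fit + 2 * fvNew$se.fit
ll <- fvNew$fit - 2 * fvNew$se.fit
plot(newd$c.month, fvNew$fit, type = "l", ylim = range(c(ul, ll)),
     xlab = "c.month", ylab = "co2")
lines(newd$c.month, ul, lty = 2)
lines(newd$c.month, ll, lty = 2)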
We convert the fitted model to a gamViz object. We need to set nsim = 0 because we don't know how to simulate responses from a qgam model:
b1 <- getViz(b1, nsim = 0)
print(plot(b1), pages = 1) # Plotting smooth effects
qq.gam(b1)
Residuals look fairly normal, but this is only because we are fitting quantile 0.5. In fact, if we fit quantile 0.8:
b2 <- qgam(co2~s(c.month,bs="cr",k=50)+s(month,bs="cc"),data=co2s, qu = 0.8,
argGam = list(knots=list(month=c(1,13))), err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.8.............done
qq.gam( getViz(b2, nsim = 0) )
This does not look good at all. In general, we can’t expect the residuals from quantile regression to be normally distributed.
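A more meaningful sanity check (a minimal sketch, assuming we simply use the response residuals) is to verify that roughly a fraction qu of the observations falls below the fitted quantile:
# Roughly a fraction qu of the residuals should be negative
mean(residuals(b1, type = "response") < 0) # b1 fits qu = 0.5, so expect roughly 0.5
mean(residuals(b2, type = "response") < 0) # b2 fits qu = 0.8, so expect roughly 0.8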
library(qgam)
data("UKload")
qu <- 0.5
form <- NetDemand~s(wM,k=20,bs='cr') + s(wM_s95,k=20,bs='cr') +
s(Posan, k=50, bs = "cr") + Dow + s(Trend,k=4,bs='cr') +
NetDemand.48 + Holy
library(parallel)
cl <- makeCluster(2)
clusterEvalQ(cl, {library(RhpcBLASctl); blas_set_num_threads(1)}) #(optional)
## [[1]]
## NULL
##
## [[2]]
## NULL
sigSeq <- seq(4, 8, length.out = 16)
closs <- tuneLearn(form = form, data = UKload,
lsig = sigSeq, qu = qu, control = list("K" = 20), cluster = cl, err=0.1)
stopCluster(cl)
check( closs )
We generally expect the EDF to decrease with \(\log \sigma\); however, the decrease need not be monotonic.
fit <- qgam(form = form, data = UKload, qu = qu, err = 0.1, lsig = closs$lsig)
fit <- getViz(fit, nsim = 0)
print(plot(fit), pages = 1)
summary(fit)
##
## Family: elf
## Link function: identity
##
## Formula:
## NetDemand ~ s(wM, k = 20, bs = "cr") + s(wM_s95, k = 20, bs = "cr") +
## s(Posan, k = 50, bs = "cr") + Dow + s(Trend, k = 4, bs = "cr") +
## NetDemand.48 + Holy
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.096e+04 5.362e+02 39.10 <2e-16 ***
## Dowjeudi 3.603e+03 9.580e+01 37.61 <2e-16 ***
## Dowlundi 6.148e+03 6.002e+01 102.44 <2e-16 ***
## Dowmardi 3.654e+03 9.935e+01 36.78 <2e-16 ***
## Dowmercredi 3.711e+03 9.487e+01 39.12 <2e-16 ***
## Dowsamedi -1.663e+03 9.242e+01 -18.00 <2e-16 ***
## Dowvendredi 3.334e+03 9.522e+01 35.02 <2e-16 ***
## NetDemand.48 4.216e-01 1.438e-02 29.31 <2e-16 ***
## Holy1 -3.608e+03 3.007e+02 -12.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(wM) 7.163 8.795 350.73 < 2e-16 ***
## s(wM_s95) 3.645 4.824 19.45 0.00136 **
## s(Posan) 24.224 29.639 494.20 < 2e-16 ***
## s(Trend) 2.940 2.997 453.83 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.946 Deviance explained = 86.4%
## -REML = 7048.3 Scale est. = 1 n = 2008
The effect of Posan is fairly wiggly and drops sharply in the Christmas period.
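To look at that drop more closely we can zoom into the end of the year (a quick sketch; the xlim range is an arbitrary choice and 3 is the index of s(Posan) among the smooths):
plot(sm(fit, 3)) + l_fitLine() + l_ciLine() + xlim(0.8, 1)
Next we re-fit the model using an adaptive basis for s(Posan):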
form <- NetDemand~s(wM,k=20,bs='cr') + s(wM_s95,k=20,bs='cr') +
s(Posan, bs='ad', k=50) + Dow + s(Trend,k=4) +
NetDemand.48 + Holy
fit <- qgam(form = form, data = UKload, qu = qu, err = 0.1, lsig = closs$lsig)
fit <- getViz(fit, nsim = 0)
print(plot(fit), pages = 1)
summary(fit)
##
## Family: elf
## Link function: identity
##
## Formula:
## NetDemand ~ s(wM, k = 20, bs = "cr") + s(wM_s95, k = 20, bs = "cr") +
## s(Posan, bs = "ad", k = 50) + Dow + s(Trend, k = 4) + NetDemand.48 +
## Holy
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.088e+04 5.375e+02 38.84 <2e-16 ***
## Dowjeudi 3.593e+03 9.636e+01 37.29 <2e-16 ***
## Dowlundi 6.152e+03 5.968e+01 103.08 <2e-16 ***
## Dowmardi 3.651e+03 9.947e+01 36.70 <2e-16 ***
## Dowmercredi 3.703e+03 9.589e+01 38.62 <2e-16 ***
## Dowsamedi -1.674e+03 9.271e+01 -18.05 <2e-16 ***
## Dowvendredi 3.324e+03 9.559e+01 34.78 <2e-16 ***
## NetDemand.48 4.238e-01 1.442e-02 29.39 <2e-16 ***
## Holy1 -3.453e+03 2.943e+02 -11.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(wM) 7.204 8.841 357.19 < 2e-16 ***
## s(wM_s95) 3.593 4.758 19.18 0.00146 **
## s(Posan) 16.546 19.964 499.81 < 2e-16 ***
## s(Trend) 2.942 2.997 451.26 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.946 Deviance explained = 86.4%
## -REML = 7035.6 Scale est. = 1 n = 2008
Now the effect of Posan is smoother along most of the year, but it drops around Christmas even more than before. This is because many businesses are shut during this period. An adaptive basis ensures that we use many degrees of freedom where they are needed (the winter holiday) and few where the effect is smooth. Alternatively, we could have added a factor for the winter period, as sketched below (although one might point out that we have already included a dummy variable indicating bank holidays).
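A minimal sketch of that alternative (not fitted here; the Xmas variable and the Posan cut-offs below are arbitrary choices marking late December and early January):
# Hypothetical dummy variable for the Christmas / New Year period
UKload$Xmas <- factor(UKload$Posan > 0.96 | UKload$Posan < 0.02)
formXmas <- update(form, . ~ . + Xmas)
# fitXmas <- qgam(formXmas, data = UKload, qu = qu, err = 0.1)
We now fit several quantiles at once with mqgam():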
nqu <- 10
qus <- seq(0.05, 0.95, length.out = nqu)
fitM <- mqgam(form = form, data = UKload, err = 0.15, lsig = closs$lsig, qu = qus)
df <- sapply(qus, function(.q) qdo(fitM, .q, pen.edf))
matplot(qus, t(df)[ , c(1:3, 8)], type = 'l', ylab = "EDF")
legend("topleft", lty = 1:4, col = 1:4, c("wM", "wM_95", "Posan", "Trend"))
We expect the effective degrees of freedom to drop in the tails, simply because the smooths corresponding to extreme quantiles are based on very few data points.
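We can summarise this with the total smooth EDF of each quantile fit (a quick sketch reusing the df matrix computed above):
# Total smooth EDF per quantile, obtained by summing the penalty-specific EDFs
round(rbind(qu = qus, EDF = colSums(df)), 2)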
We start by getting a gamViz object for each quantile:
fits <- qdo(fitM, qus, getViz, nsim = 0)
Then, for each of the four smooth effects, we extract 10 plots (one per quantile):
pl <- list()
for( ii in 1:4 ){
smo <- lapply(fits, sm, ii)
pl[[ii]] <- lapply(smo, plot, n = 200)
}
Then, for each effect, we take the first plot and we add to it the lines corresponding to all quantiles:
EdfColors <- c("#FE5815", "#FFA02F", "#C4D600", "#509E2F", "#005BBB", "#001A70")
EdfColors <- colorRampPalette(EdfColors[1:4])(length(qus))
basic <- list()
for( ii in 1:4 ){ # Loop over effects
basic[[ii]] <- pl[[ii]][[1]]
for( kk in 1:nqu ){ # Loop over nqu quantiles
basic[[ii]] <- basic[[ii]] + geom_line(data = pl[[ii]][[kk]]$data$fit,
colour = EdfColors[kk], na.rm = T)
}
}
# To plot using grid.arrange, we need to extract the ggplot objects
library(gridExtra)
grid.arrange(grobs = lapply(basic, "[[", "ggObj"))
Green corresponds to high quantiles, orange to low ones. Notice that where the effects of low and high quantiles diverge, the conditional variance of the response is increasing (other things being equal). Along wM_s95 we can also see that at low temperatures the load is skewed to the right (again, ceteris paribus). In the plot for Posan, notice how the Christmas effect changes depending on the quantile. We can zoom in to have a better look:
basic[[3]] + xlim(0.8, 1)
The lowest quantiles are affected much more strongly: they go down and then bounce back. We couldn't get such insights from a Gaussian GAM!
We extract the gam model corresponding to the fifth quantile (\(\tau = 0.45\)):
fit3 <- qdo(fitM, qus[5], getViz, nsim = 0)
We produce a QQ-plot and some checks along some of the covariates:
pl <- list()
pl[[1]] <- qq.gam(fit3)
pl[[2]] <- check1D(fit3, x = "wM") + l_densCheck() + l_points() + theme(legend.position="none")
pl[[3]] <- check1D(fit3, x = "wM_s95") + l_densCheck() + l_points() + theme(legend.position="none")
pl[[4]] <- check1D(fit3, x = "Posan") + l_densCheck() + l_points() + theme(legend.position="none")
grid.arrange(grobs = lapply(pl, "[[", 1))
The plots above are not informative at all, because in quantile regression there is no reason to expect the residuals to be normal! Instead, a sensible thing to do is to look at the fraction of residuals falling below the fitted quantile, which should be close to 0.45, given that we are fitting quantile \(\tau = 0.45\). Hence a more sensible check is:
fracBelow <- function(.x){
  ## Fraction of residuals falling below the fitted quantile
  sum(.x < 0) / length(.x)
}
pl <- list()
pl[[1]] <- check1D(fit3, x = "wM_s95") + l_gridCheck1D(gridFun = fracBelow, level = 0) +
geom_hline(yintercept = qus[5], linetype = 2)
pl[[2]] <- check1D(fit3, x = "Posan") + l_gridCheck1D(gridFun = fracBelow, level = 0) +
geom_hline(yintercept = qus[5], linetype = 2)
grid.arrange(grobs = lapply(pl, "[[", 1))
Nice, but without confidence bands it is impossible to assess whether these departures from 0.45 are significant. To address this, notice that within each bin the number of residuals falling below the fitted quantile is approximately Binomial(n, 0.45), so binomial quantiles provide approximate confidence bands. We add them via further layers:
funCreator <- function(.qu, .lev, .type){
  ## Returns a function giving the lower or upper limit of an approximate
  ## (1 - .lev) confidence interval for the fraction of residuals falling
  ## below the fitted quantile, based on a Binomial(n, .qu) approximation.
  ciFun <- function(.x){
    .n <- length(.x)
    if( .type == "lower" ){ return( qbinom(.lev/2, .n, .qu) / .n ) }
    if( .type == "upper" ){ return( qbinom(.lev/2, .n, .qu, lower.tail = FALSE) / .n ) }
  }
  return( ciFun )
}
lowCI <- funCreator(.qu = qus[5], .lev = 0.05, .type = "lower")
upCI <- funCreator(.qu = qus[5], .lev = 0.05, .type = "upper")
pl <- list()
pl[[1]] <- check1D(fit3, x = "wM_s95") + l_gridCheck1D(gridFun = fracBelow, level = 0, shape = 3, size = 2) +
l_gridCheck1D(gridFun = lowCI, level = 0, colour = 2) +
l_gridCheck1D(gridFun = upCI, level = 0, colour = 3) +
geom_hline(yintercept = qus[5], linetype = 2)
pl[[2]] <- check1D(fit3, x = "Posan") + l_gridCheck1D(gridFun = fracBelow, level = 0, shape = 3, size = 2) +
l_gridCheck1D(gridFun = lowCI, level = 0, colour = 2) +
l_gridCheck1D(gridFun = upCI, level = 0, colour = 3) +
geom_hline(yintercept = qus[5], linetype = 2)
grid.arrange(grobs = lapply(pl, "[[", 1), ncol = 2)
This looks good: most departures fall within the 95% confidence bands.