1 C02 modelling

  1. Load packages
library(qgam); library(mgcViz); library(gamair); 
data(co2s)
  1. Plot data
with(co2s, plot(c.month, co2, type="l"))

  1. Fit model for median
b <- qgam(co2~s(c.month, bs="cr", k=100), data=co2s, qu = 0.5, err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.5..........done
  1. Use it for prediction
co2plot <- function(co2s,b) {
  fv <- predict(b,data.frame(c.month=1:543,month=c(rep(1:12,45),1:3)),se=TRUE)
  ul <- fv$fit + 2*fv$se
  ll <- fv$fit - 2*fv$se
  with(co2s,plot(c.month,co2,pch=19,cex=.3,col=2,
                 ylim=range(c(ul,ll)),xlim=c(0,550)))
  lines(1:543,fv$fit)
  lines(1:543,ul,lty=2)
  lines(1:543,ll,lty=2)
}

co2plot(co2s,b) ## nonsense predictions - extrapolation artefact

  1. Fit a better model
b1 <- qgam(co2~s(c.month,bs="cr",k=50)+s(month,bs="cc"),data=co2s, qu = 0.5,
           argGam = list(knots=list(month=c(1,13))), err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.5.........done
  1. Predict again
co2plot(co2s,b1)

This is much better, as the short-term seasonal effect has been separated from long terms smooth terms, allowing longer range extrapolation of slow long range trend.

  1. mgcViz plotting and model checking

We convert to an gamViz object. We need to set nsim = 0 because we don’t know how to simulate from a gam model:

b1 <- getViz(b1, nsim = 0)
print(plot(b1), pages = 1) # Plotting smooth effects

qq.gam(b1)

Residuals look fairly normal, but this is only because we are fitting quantile 0.5. In fact, if we fit quantile 0.8:

b2 <- qgam(co2~s(c.month,bs="cr",k=50)+s(month,bs="cc"),data=co2s, qu = 0.8,
           argGam = list(knots=list(month=c(1,13))), err = 0.1)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.8.............done
qq.gam( getViz(b2, nsim = 0) )

This does not look good at all. In general, we can’t expect the residuals from quantile regression to be normally distributed.

2 Electricity load forecasting

  1. Load data and create model formula
library(qgam)
data("UKload")
qu <- 0.5
form <- NetDemand~s(wM,k=20,bs='cr') + s(wM_s95,k=20,bs='cr') + 
        s(Posan, k=50, bs = "cr") + Dow + s(Trend,k=4,bs='cr') + 
        NetDemand.48 + Holy
  1. Tune learning rate in parallel
library(parallel)
cl <- makeCluster(2)
clusterEvalQ(cl, {library(RhpcBLASctl); blas_set_num_threads(1)}) #(optional)
## [[1]]
## NULL
## 
## [[2]]
## NULL
sigSeq <- seq(4, 8, length.out = 16)
closs <- tuneLearn(form = form, data = UKload, 
                   lsig = sigSeq, qu = qu, control = list("K" = 20), cluster = cl, err=0.1)
stopCluster(cl)
  1. Check loss and edf
check( closs )

We generally expect the EDF to decrease with \(\log \sigma\), however the decrease does not need be monotonic.

  1. Fit model and plot smooths
fit <- qgam(form = form, data = UKload, qu = qu, err = 0.1, lsig = closs$lsig)
fit <- getViz(fit, nsim = 0)
print(plot(fit), pages = 1)

summary(fit)
## 
## Family: elf 
## Link function: identity 
## 
## Formula:
## NetDemand ~ s(wM, k = 20, bs = "cr") + s(wM_s95, k = 20, bs = "cr") + 
##     s(Posan, k = 50, bs = "cr") + Dow + s(Trend, k = 4, bs = "cr") + 
##     NetDemand.48 + Holy
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.096e+04  5.362e+02   39.10   <2e-16 ***
## Dowjeudi      3.603e+03  9.580e+01   37.61   <2e-16 ***
## Dowlundi      6.148e+03  6.002e+01  102.44   <2e-16 ***
## Dowmardi      3.654e+03  9.935e+01   36.78   <2e-16 ***
## Dowmercredi   3.711e+03  9.487e+01   39.12   <2e-16 ***
## Dowsamedi    -1.663e+03  9.242e+01  -18.00   <2e-16 ***
## Dowvendredi   3.334e+03  9.522e+01   35.02   <2e-16 ***
## NetDemand.48  4.216e-01  1.438e-02   29.31   <2e-16 ***
## Holy1        -3.608e+03  3.007e+02  -12.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df Chi.sq p-value    
## s(wM)      7.163  8.795 350.73 < 2e-16 ***
## s(wM_s95)  3.645  4.824  19.45 0.00136 ** 
## s(Posan)  24.224 29.639 494.20 < 2e-16 ***
## s(Trend)   2.940  2.997 453.83 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.946   Deviance explained = 86.4%
## -REML = 7048.3  Scale est. = 1         n = 2008

The effect of Posan if fairly wiggly and drops sharply in the Christmas period.

  1. Modify model formula and refit
form <- NetDemand~s(wM,k=20,bs='cr') + s(wM_s95,k=20,bs='cr') + 
        s(Posan, bs='ad', k=50) + Dow + s(Trend,k=4) + 
        NetDemand.48 + Holy

fit <- qgam(form = form, data = UKload, qu = qu, err = 0.1, lsig = closs$lsig)
fit <- getViz(fit, nsim = 0)
print(plot(fit), pages = 1)

summary(fit)
## 
## Family: elf 
## Link function: identity 
## 
## Formula:
## NetDemand ~ s(wM, k = 20, bs = "cr") + s(wM_s95, k = 20, bs = "cr") + 
##     s(Posan, bs = "ad", k = 50) + Dow + s(Trend, k = 4) + NetDemand.48 + 
##     Holy
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.088e+04  5.375e+02   38.84   <2e-16 ***
## Dowjeudi      3.593e+03  9.636e+01   37.29   <2e-16 ***
## Dowlundi      6.152e+03  5.968e+01  103.08   <2e-16 ***
## Dowmardi      3.651e+03  9.947e+01   36.70   <2e-16 ***
## Dowmercredi   3.703e+03  9.589e+01   38.62   <2e-16 ***
## Dowsamedi    -1.674e+03  9.271e+01  -18.05   <2e-16 ***
## Dowvendredi   3.324e+03  9.559e+01   34.78   <2e-16 ***
## NetDemand.48  4.238e-01  1.442e-02   29.39   <2e-16 ***
## Holy1        -3.453e+03  2.943e+02  -11.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df Chi.sq p-value    
## s(wM)      7.204  8.841 357.19 < 2e-16 ***
## s(wM_s95)  3.593  4.758  19.18 0.00146 ** 
## s(Posan)  16.546 19.964 499.81 < 2e-16 ***
## s(Trend)   2.942  2.997 451.26 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.946   Deviance explained = 86.4%
## -REML = 7035.6  Scale est. = 1         n = 2008

Now the effect of Posan is smoother along most of the year, but it drops around Christmas even more than before. This is because many businesses are shut during this period. An adaptive basis makes so that we use lots of degrees of freedom where they are needed (winter holiday) and few where the effect is smooth. Alternatively, we could have added a factor for the winter period (although one might point out that we have already included a dummy variable indicating bank holidays).

  1. mqgam fit and plotting EDF for each quantile
nqu <- 10
qus <- seq(0.05, 0.95, length.out = nqu)
fitM <- mqgam(form = form, data = UKload, err = 0.15, lsig = closs$lsig, qu = qus)

df = sapply(qus, function(.q) qdo(fitM, .q, pen.edf))
matplot(qus, t(df)[ , c(1:3, 8)], type = 'l', ylab = "EDF")
legend("topleft", lty = 1:4, col = 1:4, c("wM", "wM_95", "Posan", "Trend"))

We expect the effective degrees for freedom to drop in the tails, simply because the smooths corresponding to extreme quantiles are based on very few data points.

  1. Plotting the smooth effects of all quantiles

We start by getting a getViz object for each quantile:

fits <- qdo(fitM, qus, getViz, nsim = 0)

Then, for each of the four smooth effects, we extract 10 plots (one per quantile):

pl <- list()
for( ii in 1:4 ){
 smo <- lapply(fits, sm, ii)
 pl[[ii]] <- lapply(smo, plot, n = 200)
}

Then, for each effect, we take the first plot and we add to it the lines corresponding to all quantiles:

EdfColors <- c("#FE5815", "#FFA02F", "#C4D600", "#509E2F", "#005BBB", "#001A70")
EdfColors <- colorRampPalette(EdfColors[1:4])(length(qus))

basic <- list()
for( ii in 1:4 ){ # Loop over effects
  basic[[ii]] <- pl[[ii]][[1]]
  for( kk in 1:nqu ){ # Loop over nqu quantiles
    basic[[ii]] <- basic[[ii]] + geom_line(data = pl[[ii]][[kk]]$data$fit, 
                                           colour = EdfColors[kk], na.rm = T)
  }
}

# To plot using grid.arrange, we need to extract the ggplot objects
library(gridExtra)
grid.arrange(grobs = lapply(basic, "[[", "ggObj"))

Green for high quantiles, orange for low. Notice that when the effects of low and high quantiles diverge, the conditional variance of the response is increasing (other things being equal). Along wM_s95 we can also see that at low temperatures the load is skewed to the right (again, ceteris paribus). Looking at the plot for Posan, look at how the Chrismas effect changes depending on the quantile. We can zoom in to have a better look

basic[[3]] + xlim(0.8, 1)

The lowest quantiles, are much strongly effected: they go down and then bounce back. We couldn’t get such insights with a Gaussian GAM!

  1. Model checking

We extract the gam model corresponding to the fifth quantile (\(\tau = 0.45\)):

fit3 <- qdo(fitM, qus[5], getViz, nsim = 0)

We produce a QQ-plot and some checks along some of the covariates:

pl <- list()
pl[[1]] <- qq.gam(fit3)
pl[[2]] <- check1D(fit3, x = "wM") + l_densCheck() + l_points() + theme(legend.position="none")
pl[[3]] <- check1D(fit3, x = "wM_s95") + l_densCheck() + l_points() + theme(legend.position="none")
pl[[4]] <- check1D(fit3, x = "Posan") + l_densCheck() + l_points() + theme(legend.position="none")

grid.arrange(grobs = lapply(pl, "[[", 1))

Plots above are not informative at all, because there is in quantile regression there is no reason to expect the residuals to be normal! Instead a sensible thing to do is looking at the fraction of residuals falling below the fitted quantile, which should be close to 0.45 given that we are fitting quantile \(\tau = 0.45\). Hence a more sensible check is:

fracBelow <- function(.x){
 sum(.x < 0) / length(.x)
}

pl <- list()
pl[[1]] <- check1D(fit3, x = "wM_s95") + l_gridCheck1D(gridFun = fracBelow, level = 0) +
                                         geom_hline(yintercept = qus[5], linetype = 2)
pl[[2]] <- check1D(fit3, x = "Posan") + l_gridCheck1D(gridFun = fracBelow, level = 0) +
                                         geom_hline(yintercept = qus[5], linetype = 2)

grid.arrange(grobs = lapply(pl, "[[", 1))

Nice, but without confidence bands it is impossible to assess whether these departures from 0.45 are significant or not. To solve this we need to add further layers:

funCreator <- function(.qu, .lev, .type){

   ciFun <- function(.x){

     .n <- length(.x)

     if( .type == "lower" ){ return( qbinom(.lev/2, .n, .qu) / .n ) }

     if( .type == "upper" ){ return( qbinom(.lev/2, .n, .qu, lower.tail = FALSE) / .n ) }

   }

}

lowCI <- funCreator(.qu = qus[5], .lev = 0.05, .type = "lower")
upCI  <- funCreator(.qu = qus[5], .lev = 0.05, .type = "upper")

pl <- list()
pl[[1]] <- check1D(fit3, x = "wM_s95") + l_gridCheck1D(gridFun = fracBelow, level = 0, shape = 3, size = 2) +
                                         l_gridCheck1D(gridFun = lowCI, level = 0, colour = 2) +
                                         l_gridCheck1D(gridFun = upCI, level = 0, colour = 3) +
                                         geom_hline(yintercept = qus[5], linetype = 2)

pl[[2]] <- check1D(fit3, x = "Posan") + l_gridCheck1D(gridFun = fracBelow, level = 0, shape = 3, size = 2) +
                                        l_gridCheck1D(gridFun = lowCI, level = 0, colour = 2) +
                                        l_gridCheck1D(gridFun = upCI, level = 0, colour = 3) +
                                        geom_hline(yintercept = qus[5], linetype = 2)

grid.arrange(grobs = lapply(pl, "[[", 1), ncol = 2)

Looks good, most departures are within 95 percent confidence bands.