1 Introduction

The main fitting functions are:

2 Fitting one or several quantiles

Here we are fitting a regression model with an adaptive spline basis to quantile 0.8 of the motorcycle dataset.

library(qgam); library(MASS); library(mgcViz)

set.seed(6436)
fit <- qgam(accel~s(times, k=20, bs="ad"), # <- adaptive basis 
            data = mcycle, 
            err = 0.05,                    # <- smoothing pinball loss
            qu = 0.8)                      # <- chosen quantile
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.8..............done
# "fit" inherits from mgcv::gamObject 

# Convert to "gamViz": nsim = 0 because no simulation method available!
fit <- getViz(fit, nsim = 0)

plot(sm(fit, 1), trans = function(.x){ .x + fit$coefficients[1] }) +
  geom_point(data = mcycle, mapping = aes(x = times, y = accel), colour = "grey") +
  l_fitLine() + l_ciLine(colour = 2)

We might want to fit several quantiles at once. This can be done with mqgam.

quSeq <- seq(0.2, 0.8, length.out = 5)
set.seed(6436)
fit <- mqgam(accel~s(times, k=20, bs="ad"), 
             data = mcycle, 
             err = 0.05,
             qu = quSeq, 
             control = list("tol" = 0.01)) # <- sloppy tolerance to speed-up calibration 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.5......done 
## qu = 0.35.......done 
## qu = 0.65......done 
## qu = 0.2.......done 
## qu = 0.8.........done
# Fit contains a list of gamObjects + other stuff

The output of mqgam can be manipulated using the qdo function. Using qdo we can print out the summary for each quantile, for instance:

# Summary for quantile 0.2
qdo(fit, qu = 0.2, summary)
## 
## Family: elf 
## Link function: identity 
## 
## Formula:
## accel ~ s(times, k = 20, bs = "ad")
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -43.256      1.874  -23.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df Chi.sq p-value    
## s(times) 8.306  9.526  966.4  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.749   Deviance explained = 71.1%
## -REML = 498.42  Scale est. = 1         n = 133

We can convert a specific quantile fit to a gamViz object and then plot it:

o <- qdo(fit, qu = 0.2, getViz, nsim = 0)
plot(o)

We can look at the residuals:

qdo(fit, 0.2, check.gam)[[1]] 
## 
## Method: REML   Optimizer: outer newton
## full convergence after 14 iterations.
## Gradient range [7.805869e-06,0.0007285186]
## (score 498.4152 & scale 1).
## Hessian positive definite, eigenvalue range [7.805955e-06,1.53885].
## Model rank =  20 / 20 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##             k'   edf k-index p-value
## s(times) 19.00  8.31    0.78    0.17

Does not look good because the model is mispecified: it is based on the pinball loss, not on a probabilistic model of the response. Makes more sense checking fraction of observations falling above or below fit (see exercises).

3 Convergence checking

Let’s simulate some data:

library(qgam)
set.seed(15560)
n <- 1000
x <- rnorm(n, 0, 1); z <- rnorm(n)
X <- cbind(1, x, x^2, z, x*z)
beta <- c(0, 1, 1, 1, 0.5)
y <- drop(X %*% beta) + rnorm(n) 
dataf <- data.frame(cbind(y, x, z))
names(dataf) <- c("y", "x", "z")

We fit qgam model:

fit <- qgam(y ~ s(x) + z + I(x*z), 
            qu = 0.99, 
            err = 0.05,  # <- default
            data = dataf)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.99.................done

Look at calibration loss:

check(fit$calibr, sel = 2)

Being overly ambitious with err is dangerous:

fit <- qgam(y ~ s(x) + z + I(x*z), 
            qu = 0.99, 
            err = 0.0001,  # <- default is 0.05
            lsig = -1,     # <- hand chosen log learning rate
            data = dataf)
## Error in gam.fit4(x, y, sp, Eb, UrS = UrS, weights = weights, start = start, : inner loop 3; can't correct step size

Function qgam and mqgam internally call tuneLearnFast for calibration. We can do it manually with tuneLearn:

dat <- gamSim(1, n=400, dist="normal", scale=2)
## Gu & Wahba 4 term additive model
cal <- tuneLearn(y ~ s(x0)+s(x1)+s(x2)+s(x3), 
                 data = dat,
                 qu = 0.8, 
                 lsig =  seq(-2, 2, by = 0.2), 
                 control = list("progress" = "none"))
check(cal)

4 Dealing with heteroscedasticity

Let us simulate some data from an heteroscedastic model.

set.seed(651)
n <- 5000
x <- seq(-4, 3, length.out = n)
X <- cbind(1, x, x^2)
beta <- c(0, 1, 1)
sigma =  1.2 + sin(2*x)
f <- drop(X %*% beta)
dat <- f + rnorm(n, 0, sigma)
dataf <- data.frame(cbind(dat, x))
names(dataf) <- c("y", "x")
   
qus <- seq(0.05, 0.95, length.out = 10)
plot(x, dat, col = "grey", ylab = "y")
for(iq in qus){ lines(x, qnorm(iq, f, sigma)) }

We now fit ten quantiles between 0.05 and 0.95.

lsig <- c(-0.96, -0.83, -0.69, -0.63, -0.76, -0.76, -0.89, -0.85, -0.99, -1.06)
fit <- mqgam(y~s(x, k = 30, bs = "cr"), 
             data = dataf,
             lsig = lsig,
             qu = qus, err = 0.05)
             
qus <- seq(0.05, 0.95, length.out = 10)
plot(x, dat, col = "grey", ylab = "y")
for(iq in qus){ 
 lines(x, qnorm(iq, f, sigma), col = 2)
 lines(x, qdo(fit, iq, predict))
}
legend("top", c("truth", "fitted"), col = 2:1, lty = rep(1, 2))

Let’s look at intervals for quantile 0.95.

plot(x, dat, col = "grey", ylab = "y")
tmp <- qdo(fit, 0.95, predict, se = TRUE)
lines(x, tmp$fit)
lines(x, tmp$fit + 3 * tmp$se.fit, col = 2)
lines(x, tmp$fit - 3 * tmp$se.fit, col = 2)

We can do better by letting the learning rate vary with the covariate.

fit <- qgam(list(y~s(x, k = 30, bs = "cr"), ~ s(x, k = 30, bs = "cr")), 
            data = dataf, qu = 0.95, err = 0.05, lsig = -1.16)

plot(x, dat, col = "grey", ylab = "y")
tmp <- predict(fit, se = TRUE)
lines(x, tmp$fit[ , 1])
lines(x, tmp$fit[ , 1] + 3 * tmp$se.fit[ , 1], col = 2)
lines(x, tmp$fit[ , 1] - 3 * tmp$se.fit[ , 1], col = 2)

Now the credible intervals correctly represent the underlying uncertainty.