diagnostic graphics

Packages

library(tidyverse)
theme_set(theme_bw())
library(car)
library(broom)
library(broom.mixed)
library(magrittr)
## modeling
library(lme4)
library(MCMCglmm)
library(glmmTMB)
library(coda) ## Bayesian methods (trace plots etc.)
library(lattice) ## built-in
library(cowplot)
library(datasauRus)
library(nullabor) ## visual inference

Anscombe’s quartet

Anscombe’s quartet (2)

“Datasaurus”

Matejka and Fitzmaurice (2017)

Diagnostics: goals and ideas

Diagnostics: principles

always do diagnostics after fitting model

marginal distributions

quantile plots

m0 <- lm(y~x,dd)
a0 <- (augment(m0) ## broom: 'tidy' predictions/resids/etc
  %>% select(x,y,.resid)
  %>% pivot_longer(everything(), names_to="type",values_to="value")
  %>% mutate(type=factor(type,levels=c("x","y",".resid")))
)

(ggplot(a0,aes(sample=value))
  + stat_qq()
  + facet_wrap(~type)
  + stat_qq_line(colour="red")
)

model diagnosis

look for mis-specification (in order!):

influential points/groups (leverage/outliers/etc.)

upstream problems affect downstream diagnostics

bias

bias: residuals vs fitted plot

m1 <- lm(price~carat,diamonds)
a1 <- augment(m1,data=diamonds) ## include original data
ggplot(a1,aes(.fitted,.resid)) +
    geom_point(alpha=0.1)+geom_smooth()

bias 2: add faceting/colouring

ggplot(a1,aes(.fitted,.resid,colour=cut)) +
    facet_wrap(~clarity) +
    geom_point(alpha=0.4)+geom_smooth(se=FALSE)

useful to use dynamic graphics ggmap::gglocator (may need devtools::install_github("dkahle/ggmap"))

bias: solutions

heteroscedasticity

heteroscedasticity diagnostics

heteroscedasticity solutions

outliers

influence plots

ii <- car::influencePlot(m2)

outliers: solutions

distributional assumptions

histograms

quantile plots

distributional solutions

correlation

rarely tested! can’t detect without some kind of structure in data

binary data

Fit:

library(lme4)
data(Contraception,package="mlmRev")
Contraception <- Contraception %>%
    mutate(ch=factor(livch != 0, labels = c("N", "Y")))
m3 <- glmer(use ~ age * ch + I(age^2) + urban + (1 | urban:district),
            data=Contraception, family=binomial)

plot

a3 <- augment(m3,data=Contraception,type.residuals="response")
gg_bin1 <- (ggplot(a3,aes(.fitted,.resid))+
            geom_point()+ geom_smooth(method="loess"))
print(gg_bin1)

grouping

get_mid <- function(x) {
    cc <- as.character(x)
    lo <- as.numeric(gsub("[\\(\\[]([[:digit:].-]+).*","\\1",cc))
    hi <- as.numeric(gsub(".*,([[:digit:].-]+)[])]","\\1",cc))
    return((lo+hi)/2)
}
(a3
    %>% mutate(.fit_cut=cut_number(.fitted,20))
    %>% group_by(.fit_cut)
    %>% summarise(.resid=mean(.resid))
    %>% ungroup
    %>% mutate(.fitted=get_mid(.fit_cut))
) -> a3_sum

plot with grouping

gg_bin1+geom_point(data=a3_sum,colour="blue")

ggplot(a3,aes(.fitted,.resid,colour=livch,shape=urban,linetype=urban))+
            geom_point()+ geom_smooth(se=FALSE)+
    scale_colour_brewer(palette="Dark2")

keep trying …

ggplot(a3,aes(age,.resid,colour=urban))+
    geom_point()+
    geom_smooth(method="loess")+
    facet_wrap(~livch)

ggplot(a3,aes(age,.resid,colour=urban))+
    geom_point()+
    geom_smooth(method="gam",formula =y ~ s(x, k=25)) +
    facet_wrap(~livch)

DHARma

rr <- DHARMa::simulateResiduals(m3)
plot(rr)

likelihood profiles

use \(\\sqrt{-2 \\log (L-L\_0)}\) (\(\\sf V\)-shaped), signed square root (straight line/symmetry)

## `geom_smooth()` using formula 'y ~ x'

MCMC

lattice::xyplot(m4$Sol,aspect="fill",layout=c(2,3))

more MCMC diagnostics


complex models

Wickham et al. (2010); Gelman (2004); Buja et al. (2009)

simdat <- (simulate(m2,8)
    %>% data.frame(speed=cars$speed)
    %>% gather(sample,dist,-speed))
ddsim <- (cars
    %>% select(dist,speed)
    %>% mutate(sample="true")
    %>% bind_rows(simdat))
ddsimplot <- ggplot(ddsim,aes(speed,dist))+geom_point()+
    facet_wrap(~sample)

references

Buja, A., D. Cook, H. Hofmann, M. Lawrence, E.-K. Lee, D. F. Swayne, and H. Wickham. 2009. “Statistical Inference for Exploratory Data Analysis and Model Diagnostics.” Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences 367 (1906): 4361–83. https://doi.org/10.1098/rsta.2009.0120.

Gelman, Andrew. 2004. “Exploratory Data Analysis for Complex Models.” Journal of Computational and Graphical Statistics 13 (4): 755–79. https://doi.org/10.1198/106186004X11435.

Matejka, Justin, and George Fitzmaurice. 2017. “The Datasaurus Dozen - Same Stats, Different Graphs: Generating Datasets with Varied Appearance and Identical Statistics Through Simulated Annealing.” In ACM Sigchi Conference on Human Factors in Computing Systems. https://doi.org/10.1145/3025453.3025912.

Quinn, Gerry P., and Michael J. Keough. 2002. Experimental Design and Data Analysis for Biologists. Cambridge, England: Cambridge University Press.

Säilynoja, Teemu, Paul-Christian Bürkner, and Aki Vehtari. 2021. “Graphical Test for Discrete Uniformity and Its Applications in Goodness of Fit Evaluation and Multiple Sample Comparison.” arXiv:2103.10522 \[Stat\], March. http://arxiv.org/abs/2103.10522.

Talts, Sean, Michael Betancourt, Daniel Simpson, Aki Vehtari, and Andrew Gelman. 2020. “Validating Bayesian Inference Algorithms with Simulation-Based Calibration.” arXiv:1804.06788 \[Stat\], October. http://arxiv.org/abs/1804.06788.

Tibshirani, Rob. 1987. “Estimating Optimal Transformations for Regression.” Journal of the American Statistical Association 83: 394.

Wickham, H., D. Cook, H. Hofmann, and Andreas Buja. 2010. “Graphical Inference for Infovis.” IEEE Transactions on Visualization and Computer Graphics 16 (6): 973–79. https://doi.org/10.1109/TVCG.2010.161.

Zeileis, Achim. 2006. “Object-Oriented Computation of Sandwich Estimators.” Journal of Statistical Software 16 (9): 1–16. http://www.jstatsoft.org/v16/i09/.