library(tidyverse)
theme_set(theme_bw())
library(car)
library(broom)
library(broom.mixed)
library(magrittr)
## modeling
library(lme4)
library(MCMCglmm)
library(glmmTMB)
library(coda) ## Bayesian methods (trace plots etc.)
library(lattice) ## built-in
library(cowplot)
library(datasauRus)
library(nullabor) ## visual inference
Matejka and Fitzmaurice (2017)
## Simple linear regression of y on x; `dd` must already exist in the session.
m0 <- lm(y ~ x, data = dd)

## broom::augment() gives 'tidy' predictions/residuals/etc.; keep the
## predictor, the response and the raw residuals, and stack them into long
## format so each one can be drawn in its own panel.
a0 <- augment(m0) %>%
  select(x, y, .resid) %>%
  pivot_longer(everything(), names_to = "type", values_to = "value") %>%
  mutate(type = factor(type, levels = c("x", "y", ".resid")))

## Normal Q-Q plot of each variable, with a red reference line.
ggplot(a0, aes(sample = value)) +
  stat_qq() +
  stat_qq_line(colour = "red") +
  facet_wrap(~type)
look for mis-specification (in order!):
influential points/groups (leverage/outliers/etc.)
upstream problems affect downstream diagnostics
## Linear model of diamond price on carat.
m1 <- lm(price ~ carat, data = diamonds)

## Passing `data` keeps the original columns alongside .fitted/.resid, so
## residuals can be coloured and faceted by other variables below.
a1 <- augment(m1, data = diamonds)

## Residuals vs. fitted values, with a smooth to show any trend.
ggplot(a1, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.1) +
  geom_smooth()

## Same plot, split by clarity (panels) and cut (colour).
ggplot(a1, aes(x = .fitted, y = .resid, colour = cut)) +
  geom_point(alpha = 0.4) +
  geom_smooth(se = FALSE) +
  facet_wrap(~clarity)
useful to use dynamic graphics: `ggmap::gglocator`
(may need `devtools::install_github("dkahle/ggmap")`)
(`acepack::avas`, Tibshirani (1987))
use standardized residuals
(adjust variance for position)
## Scale-location plot for a linear model of stopping distance on speed:
## sqrt(|standardized residual|) against the fitted values, with a smooth.
## The model fit and the plot are separate statements; they must not share
## a line without a separator, otherwise the file does not parse.
m2 <- lm(dist ~ speed, data = cars)
ggplot(augment(m2), aes(.fitted, sqrt(abs(.std.resid)))) +
  geom_point() +
  geom_smooth()
(`sandwich` package: Zeileis (2006))
ii <- car::influencePlot(m2)
`stat_qq()`, `stat_qq_line()`; `plot.lm(.,which=3)`; `qqnorm()`
`car::qqPlot` (adds confidence envelope)
(`avas`, Box-Cox (`MASS::boxcox`), Yeo-Johnson etc. [`?car::bcPower`])
rarely tested! can’t detect without some kind of structure in data
`gls()` on residuals
`size=abs(.resid)`, `colour=sign(.resid)` (or colour ramp)
Fit:
library(lme4)

## Contraception data set from the mlmRev package.
data(Contraception, package = "mlmRev")

## ch: two-level factor, "Y" when livch is anything other than 0.
Contraception <- mutate(
  Contraception,
  ch = factor(livch != 0, labels = c("N", "Y"))
)

## Binomial GLMM with a random intercept for each urban:district combination.
m3 <- glmer(
  use ~ age * ch + I(age^2) + urban + (1 | urban:district),
  data = Contraception,
  family = binomial
)

## Response-scale residuals, with the original columns kept for later plots.
a3 <- augment(m3, data = Contraception, type.residuals = "response")

## Residuals vs. fitted values with a loess smooth; stored so that further
## layers can be added to it later.
gg_bin1 <- ggplot(a3, aes(.fitted, .resid)) +
  geom_point() +
  geom_smooth(method = "loess")
print(gg_bin1)
## Midpoint of each interval label of the kind produced by cut()/cut_number().
##
## x: factor or character vector of labels such as "(0.1,0.2]" or "[-1,0)".
## Returns a numeric vector, the same length as x, holding (lower + upper) / 2.
##
## The bounds are recovered by stripping the enclosing bracket characters and
## splitting at the comma, then converting with as.numeric(). This accepts
## anything as.numeric() can read, including scientific notation such as
## "(1e-04,3e-04]", which a digits/dot/minus-only pattern would mis-parse.
get_mid <- function(x) {
  labels <- as.character(x)
  ## drop the opening "(" or "[" and the closing ")" or "]"
  bounds <- gsub("^[[(]|[])]$", "", labels)
  lower <- as.numeric(sub(",.*$", "", bounds))
  upper <- as.numeric(sub("^.*,", "", bounds))
  (lower + upper) / 2
}
## Bin the fitted values into 20 groups with (approximately) equal numbers of
## observations, average the residuals within each bin, and place each bin at
## the midpoint of its interval.
a3_sum <- a3 %>%
  mutate(.fit_cut = cut_number(.fitted, 20)) %>%
  group_by(.fit_cut) %>%
  summarise(.resid = mean(.resid)) %>%
  ungroup() %>%
  mutate(.fitted = get_mid(.fit_cut))

## Overlay the binned mean residuals (blue) on the raw residual plot.
gg_bin1 + geom_point(data = a3_sum, colour = "blue")
## Residuals vs. fitted values, distinguishing livch (colour) and urban
## (point shape and line type); smooths drawn without confidence bands.
ggplot(
  a3,
  aes(x = .fitted, y = .resid, colour = livch, shape = urban, linetype = urban)
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  scale_colour_brewer(palette = "Dark2")

## Residuals vs. age, one panel per livch level, coloured by urban.
ggplot(a3, aes(x = age, y = .resid, colour = urban)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~livch)
loess too bumpy?
## Residuals vs. age with a smoother loess fit: family = "symmetric" is passed
## through to the loess fit and span = 1 widens the smoothing window.
## String literals must use plain ASCII double quotes; typographic quotes
## are not valid R string delimiters.
ggplot(a3, aes(age, .resid, colour = urban)) +
  geom_point() +
  geom_smooth(
    method = "loess",
    method.args = list(family = "symmetric"),
    span = 1
  ) +
  facet_wrap(~livch)
## method="gam"?
## (The question mark belongs to the caption above. Left in front of ggplot()
## it would turn the whole expression into a call to the help operator `?`
## instead of drawing the plot.)
## Residuals vs. age with a GAM smooth (basis dimension k = 25), one panel
## per livch level.
ggplot(a3, aes(age, .resid, colour = urban)) +
  geom_point() +
  geom_smooth(method = "gam", formula = y ~ s(x, k = 25)) +
  facet_wrap(~livch)
## Simulation-based residuals for the GLMM fit, followed by DHARMa's default
## diagnostic plot for the result.
rr <- DHARMa::simulateResiduals(fittedModel = m3)
plot(rr)
use \(\sqrt{-2 \log (L-L_0)}\) (\(\sf V\)-shaped), signed square root (straight line/symmetry)
## `geom_smooth()` using formula 'y ~ x'
## Lattice plot of the `Sol` component of m4, panels arranged with
## layout = c(2, 3) and stretched to fill the device.
## NOTE(review): m4 is not defined in this part of the file; presumably an
## MCMCglmm fit whose `Sol` element holds the sampled chains -- confirm.
lattice::xyplot(m4$Sol, layout = c(2, 3), aspect = "fill")
Wickham et al. (2010); Gelman (2004); Buja et al. (2009)
## Visual-inference "line-up": the observed cars data shown among 8 data sets
## simulated from the fitted model m2.

## simulate() returns one column of simulated responses per replicate; attach
## the observed speeds and stack the replicates into long format.
## pivot_longer() replaces the superseded gather(); the resulting rows are
## ordered by observation rather than by replicate, which does not affect the
## faceted plot below.
simdat <- simulate(m2, 8) %>%
  data.frame(speed = cars$speed) %>%
  pivot_longer(-speed, names_to = "sample", values_to = "dist")

## Observed data, labelled "true", followed by the simulated replicates.
ddsim <- cars %>%
  select(dist, speed) %>%
  mutate(sample = "true") %>%
  bind_rows(simdat)

## One scatterplot panel per data set (observed + simulated).
ddsimplot <- ggplot(ddsim, aes(speed, dist)) +
  geom_point() +
  facet_wrap(~sample)
Buja, A., D. Cook, H. Hofmann, M. Lawrence, E.-K. Lee, D. F. Swayne, and H. Wickham. 2009. “Statistical Inference for Exploratory Data Analysis and Model Diagnostics.” Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences 367 (1906): 4361–83. https://doi.org/10.1098/rsta.2009.0120.
Gelman, Andrew. 2004. “Exploratory Data Analysis for Complex Models.” Journal of Computational and Graphical Statistics 13 (4): 755–79. https://doi.org/10.1198/106186004X11435.
Matejka, Justin, and George Fitzmaurice. 2017. “The Datasaurus Dozen - Same Stats, Different Graphs: Generating Datasets with Varied Appearance and Identical Statistics Through Simulated Annealing.” In ACM Sigchi Conference on Human Factors in Computing Systems. https://doi.org/10.1145/3025453.3025912.
Quinn, Gerry P., and Michael J. Keough. 2002. Experimental Design and Data Analysis for Biologists. Cambridge, England: Cambridge University Press.
Säilynoja, Teemu, Paul-Christian Bürkner, and Aki Vehtari. 2021. “Graphical Test for Discrete Uniformity and Its Applications in Goodness of Fit Evaluation and Multiple Sample Comparison.” arXiv:2103.10522 \[Stat\], March. http://arxiv.org/abs/2103.10522.
Talts, Sean, Michael Betancourt, Daniel Simpson, Aki Vehtari, and Andrew Gelman. 2020. “Validating Bayesian Inference Algorithms with Simulation-Based Calibration.” arXiv:1804.06788 \[Stat\], October. http://arxiv.org/abs/1804.06788.
Tibshirani, Rob. 1987. “Estimating Optimal Transformations for Regression.” Journal of the American Statistical Association 83: 394.
Wickham, H., D. Cook, H. Hofmann, and Andreas Buja. 2010. “Graphical Inference for Infovis.” IEEE Transactions on Visualization and Computer Graphics 16 (6): 973–79. https://doi.org/10.1109/TVCG.2010.161.
Zeileis, Achim. 2006. “Object-Oriented Computation of Sandwich Estimators.” Journal of Statistical Software 16 (9): 1–16. http://www.jstatsoft.org/v16/i09/.