1 Preamble

1.1 Install Libraries

Code
#install.packages("remotes")
#remotes::install_github("DevPsyLab/petersenlab")

1.2 Load Libraries
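
The following is a sketch of the libraries the examples below rely on, inferred from the functions called without a package prefix (other packages are referenced via pkg::); treat the exact list as an assumption:

Code
library(petersenlab) # installed in the Preamble
library(tidyverse)   # %>%, select(), ggplot()
library(rms)         # ols(), robcov(), lrm(), orm(), Glm()
library(robustbase)  # lmrob()
library(brms)        # brm()
library(mice)        # mice(), pool()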

2 Import Data

Code
mydata <- read.csv("https://osf.io/8syp5/download")

3 Data Preparation

Code
mydata$countVariable <- as.integer(mydata$bpi_antisocialT2Sum) # integer (count) version of the outcome
mydata$orderedVariable <- factor(mydata$countVariable, ordered = TRUE) # ordered-factor version of the outcome

mydata$female <- NA # dummy code sex (0 = male; 1 = female)
mydata$female[which(mydata$sex == "male")] <- 0
mydata$female[which(mydata$sex == "female")] <- 1

4 Overview of Multiple Regression

For an overview of multiple regression, see: https://isaactpetersen.github.io/Fantasy-Football-Analytics-Textbook/multiple-regression.html

5 Linear Regression

5.1 Linear regression model

Code
multipleRegressionModel <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

summary(multipleRegressionModel)

Call:
lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              1.19830    0.05983  20.029  < 2e-16 ***
bpi_antisocialT1Sum      0.46553    0.01858  25.049  < 2e-16 ***
bpi_anxiousDepressedSum  0.16075    0.02916   5.513 3.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.979 on 2871 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.262, Adjusted R-squared:  0.2615 
F-statistic: 509.6 on 2 and 2871 DF,  p-value: < 2.2e-16
Code
confint(multipleRegressionModel)
                            2.5 %    97.5 %
(Intercept)             1.0809881 1.3156128
bpi_antisocialT1Sum     0.4290884 0.5019688
bpi_anxiousDepressedSum 0.1035825 0.2179258
Code
print(effectsize::standardize_parameters(multipleRegressionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |        95% CI
----------------------------------------------------
(Intercept)             |   1.62e-16 | [-0.03, 0.03]
bpi antisocialT1Sum     |       0.46 | [ 0.42, 0.49]
bpi anxiousDepressedSum |       0.10 | [ 0.06, 0.14]

5.1.1 Remove missing data

Code
# Option 1: let lm() drop incomplete cases
multipleRegressionModelNoMissing <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.omit)

# Option 2 (equivalent): remove incomplete cases from the data first
multipleRegressionModelNoMissing <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata %>% select(bpi_antisocialT2Sum, bpi_antisocialT1Sum, bpi_anxiousDepressedSum) %>% na.omit)

5.2 Linear regression model on correlation/covariance matrix (for pairwise deletion)

Code
multipleRegressionModelPairwise <- psych::setCor(
  y = "bpi_antisocialT2Sum",
  x = c("bpi_antisocialT1Sum","bpi_anxiousDepressedSum"),
  data = cov(mydata[,c("bpi_antisocialT2Sum","bpi_antisocialT1Sum","bpi_anxiousDepressedSum")], use = "pairwise.complete.obs"),
  n.obs = nrow(mydata)) # note: nrow(mydata) counts all rows, including incomplete ones, so standard errors may be optimistic

Code
summary(multipleRegressionModelPairwise)

Multiple Regression from matrix input 
psych::setCor(y = "bpi_antisocialT2Sum", x = c("bpi_antisocialT1Sum", 
    "bpi_anxiousDepressedSum"), data = cov(mydata[, c("bpi_antisocialT2Sum", 
    "bpi_antisocialT1Sum", "bpi_anxiousDepressedSum")], use = "pairwise.complete.obs"), 
    n.obs = nrow(mydata))

Multiple Regression from matrix input 

Beta weights 
                        bpi_antisocialT2Sum
bpi_antisocialT1Sum                    0.46
bpi_anxiousDepressedSum                0.09

Multiple R 
bpi_antisocialT2Sum 
               0.51 

Multiple R2 
bpi_antisocialT2Sum 
               0.26 

Cohen's set correlation R2 
[1] 0.26

Squared Canonical Correlations
NULL
Code
multipleRegressionModelPairwise[c("coefficients","se","Probability","R2","shrunkenR2")]
$coefficients
                        bpi_antisocialT2Sum
bpi_antisocialT1Sum               0.4560961
bpi_anxiousDepressedSum           0.0899642

$se
                        bpi_antisocialT2Sum
bpi_antisocialT1Sum             0.009253718
bpi_anxiousDepressedSum         0.009253718

$Probability
                        bpi_antisocialT2Sum
bpi_antisocialT1Sum            0.000000e+00
bpi_anxiousDepressedSum        2.959352e-22

$R2
bpi_antisocialT2Sum 
           0.256918 

$shrunkenR2
bpi_antisocialT2Sum 
           0.256789 

5.3 Linear regression model with robust covariance matrix (rms)

Code
rmsMultipleRegressionModel <- robcov(ols(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE))

rmsMultipleRegressionModel
Frequencies of Missing Values Due to Each Variable
    bpi_antisocialT2Sum     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                   7501                    7613                    7616 

Linear Regression Model

ols(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, x = TRUE, y = TRUE)

                Model Likelihood    Discrimination    
                      Ratio Test           Indexes    
Obs    2874    LR chi2    873.09    R2       0.262    
sigma 1.9786   d.f.            2    R2 adj   0.261    
d.f.   2871    Pr(> chi2) 0.0000    g        1.278    

Residuals

    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

                        Coef   S.E.   t     Pr(>|t|)
Intercept               1.1983 0.0622 19.26 <0.0001 
bpi_antisocialT1Sum     0.4655 0.0229 20.30 <0.0001 
bpi_anxiousDepressedSum 0.1608 0.0330  4.87 <0.0001 
Code
confint(rmsMultipleRegressionModel)
                             2.5 %    97.5 %
Intercept               1.07631713 1.3202837
bpi_antisocialT1Sum     0.42056957 0.5104877
bpi_anxiousDepressedSum 0.09606644 0.2254418
Code
print(effectsize::standardize_parameters(rmsMultipleRegressionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |        95% CI
----------------------------------------------------
Intercept               |   1.62e-16 | [-0.03, 0.03]
bpi antisocialT1Sum     |       0.46 | [ 0.42, 0.49]
bpi anxiousDepressedSum |       0.10 | [ 0.06, 0.14]

5.4 Robust linear regression (MM-type iteratively reweighted least squares regression)

Code
robustLinearRegression <- lmrob(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

summary(robustLinearRegression)

Call:
lmrob(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, na.action = na.exclude)
 \--> method = "MM"
Residuals:
     Min       1Q   Median       3Q      Max 
-8.43518 -1.06680 -0.06707  1.14090 13.05599 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              0.94401    0.05406  17.464  < 2e-16 ***
bpi_antisocialT1Sum      0.49135    0.02237  21.966  < 2e-16 ***
bpi_anxiousDepressedSum  0.15766    0.03102   5.083 3.96e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Robust residual standard error: 1.628 
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.326, Adjusted R-squared:  0.3255 
Convergence in 14 IRWLS iterations

Robustness weights: 
 12 observations c(52,347,354,517,709,766,768,979,1618,2402,2403,2404)
     are outliers with |weight| = 0 ( < 3.5e-05); 
 283 weights are ~= 1. The remaining 2579 ones are summarized as
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0001311 0.8599000 0.9470000 0.8814000 0.9816000 0.9990000 
Algorithmic parameters: 
       tuning.chi                bb        tuning.psi        refine.tol 
        1.548e+00         5.000e-01         4.685e+00         1.000e-07 
          rel.tol         scale.tol         solve.tol          zero.tol 
        1.000e-07         1.000e-10         1.000e-07         1.000e-10 
      eps.outlier             eps.x warn.limit.reject warn.limit.meanrw 
        3.479e-05         2.365e-11         5.000e-01         5.000e-01 
     nResample         max.it         groups        n.group       best.r.s 
           500             50              5            400              2 
      k.fast.s          k.max    maxit.scale      trace.lev            mts 
             1            200            200              0           1000 
    compute.rd fast.s.large.n 
             0           2000 
                  psi           subsampling                   cov 
           "bisquare"         "nonsingular"         ".vcov.avar1" 
compute.outlier.stats 
                 "SM" 
seed : int(0) 
Code
confint(robustLinearRegression)
                             2.5 %    97.5 %
(Intercept)             0.83801566 1.0499981
bpi_antisocialT1Sum     0.44749135 0.5352118
bpi_anxiousDepressedSum 0.09683728 0.2184779
Code
print(effectsize::standardize_parameters(robustLinearRegression), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |         95% CI
-----------------------------------------------------
(Intercept)             |      -0.08 | [-0.11, -0.05]
bpi antisocialT1Sum     |       0.48 | [ 0.44,  0.52]
bpi anxiousDepressedSum |       0.10 | [ 0.06,  0.14]

5.5 Least trimmed squares regression (for removing outliers)

Code
ltsRegression <- robustbase::ltsReg(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

summary(ltsRegression)

Call:
ltsReg.formula(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
    bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)

Residuals (from reweighted LS):
    Min      1Q  Median      3Q     Max 
-3.9305 -0.9091  0.0000  0.9997  3.9036 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
Intercept                0.74932    0.04795  15.626  < 2e-16 ***
bpi_antisocialT1Sum      0.54338    0.01524  35.647  < 2e-16 ***
bpi_anxiousDepressedSum  0.13826    0.02343   5.901 4.06e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.534 on 2710 degrees of freedom
Multiple R-Squared: 0.4121, Adjusted R-squared: 0.4116 
F-statistic: 949.7 on 2 and 2710 DF,  p-value: < 2.2e-16 
Code
confint(robustLinearRegression) # note: these repeat the CIs for the MM-type model above, not the LTS model
                             2.5 %    97.5 %
(Intercept)             0.83801566 1.0499981
bpi_antisocialT1Sum     0.44749135 0.5352118
bpi_anxiousDepressedSum 0.09683728 0.2184779

5.6 Bayesian linear regression

Code
bayesianRegularizedRegression <- brms::brm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  chains = 4,
  iter = 2000,
  seed = 52242)
Code
summary(bayesianRegularizedRegression)
 Family: gaussian 
  Links: mu = identity 
Formula: bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
   Data: mydata (Number of observations: 2874) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept                   1.20      0.06     1.08     1.32 1.00     4472
bpi_antisocialT1Sum         0.47      0.02     0.43     0.50 1.00     3350
bpi_anxiousDepressedSum     0.16      0.03     0.10     0.22 1.00     3184
                        Tail_ESS
Intercept                   3070
bpi_antisocialT1Sum         3292
bpi_anxiousDepressedSum     2958

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     1.98      0.03     1.93     2.03 1.00     4104     3454

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Code
print(effectsize::standardize_parameters(bayesianRegularizedRegression, method = "basic"), digits = 2)
# Standardization method: basic

Component   |               Parameter | Std. Median |       95% CI
------------------------------------------------------------------
conditional |             (Intercept) |        0.00 | [0.00, 0.00]
conditional |     bpi_antisocialT1Sum |        0.46 | [0.42, 0.49]
conditional | bpi_anxiousDepressedSum |        0.10 | [0.07, 0.14]
sigma       |                   sigma |        1.98 | [1.93, 2.03]

6 Generalized Linear Regression

6.1 Generalized regression model

In this example, we predict a count variable that follows a Poisson distribution. A different distribution could be specified via the family argument.

Code
generalizedRegressionModel <- glm(
  countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = "poisson",
  na.action = na.exclude)

summary(generalizedRegressionModel)

Call:
glm(formula = countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    family = "poisson", data = mydata, na.action = na.exclude)

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             0.459999   0.019650  23.410  < 2e-16 ***
bpi_antisocialT1Sum     0.141577   0.004891  28.948  < 2e-16 ***
bpi_anxiousDepressedSum 0.047567   0.008036   5.919 3.23e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 5849.4  on 2873  degrees of freedom
Residual deviance: 4562.6  on 2871  degrees of freedom
  (8656 observations deleted due to missingness)
AIC: 11482

Number of Fisher Scoring iterations: 5
Code
confint(generalizedRegressionModel)
                             2.5 %     97.5 %
(Intercept)             0.42136094 0.49838751
bpi_antisocialT1Sum     0.13196544 0.15113708
bpi_anxiousDepressedSum 0.03177425 0.06327445
Code
print(effectsize::standardize_parameters(generalizedRegressionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |       95% CI
---------------------------------------------------
(Intercept)             |       0.91 | [0.89, 0.94]
bpi antisocialT1Sum     |       0.32 | [0.30, 0.34]
bpi anxiousDepressedSum |       0.07 | [0.05, 0.09]

- Response is unstandardized.

6.2 Generalized regression model (rms)

In this example, we predict a count variable that follows a Poisson distribution. A different distribution could be specified via the family argument.

Code
rmsGeneralizedRegressionModel <- rms::Glm(
  countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE,
  family = "poisson")

rmsGeneralizedRegressionModel
Frequencies of Missing Values Due to Each Variable
          countVariable     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                   7501                    7613                    7616 

General Linear Model

rms::Glm(formula = countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    family = "poisson", data = mydata, x = TRUE, y = TRUE)

                       Model Likelihood    
                             Ratio Test    
          Obs 2874    LR chi2    1286.72    
Residual d.f. 2871    d.f.             2    
          g 0.387    Pr(> chi2) <0.0001    

                        Coef   S.E.   Wald Z Pr(>|Z|)
Intercept               0.4600 0.0196 23.41  <0.0001 
bpi_antisocialT1Sum     0.1416 0.0049 28.95  <0.0001 
bpi_anxiousDepressedSum 0.0476 0.0080  5.92  <0.0001 
Code
confint(rmsGeneralizedRegressionModel)
Error in `summary.rms()`:
! adjustment values not defined here or with datadist for bpi_antisocialT1Sum bpi_anxiousDepressedSum
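
The error arises because summary-based intervals in rms require a datadist object. A minimal sketch of the usual fix (an assumption about the intended usage):

Code
# Define a datadist so rms can compute effect estimates with 95% CIs
dd <- rms::datadist(mydata)
options(datadist = "dd")
summary(rmsGeneralizedRegressionModel)
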
Code
print(effectsize::standardize_parameters(rmsGeneralizedRegressionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |       95% CI
---------------------------------------------------
Intercept               |       0.91 | [0.89, 0.94]
bpi antisocialT1Sum     |       0.32 | [0.30, 0.34]
bpi anxiousDepressedSum |       0.07 | [0.05, 0.09]

- Response is unstandardized.

6.3 Bayesian generalized linear model

In this example, we predict a count variable that follows a Poisson distribution. A different distribution could be specified via the family argument. For example, we could use Gamma regression (family = Gamma) when the response variable is continuous and positive and the coefficient of variation, rather than the variance, is constant.
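
A minimal sketch of the Gamma alternative (positiveOutcome is a hypothetical positive, continuous variable used only for illustration):

Code
# Hedged sketch: Gamma regression for a positive, continuous response
gammaRegression <- brms::brm(
  positiveOutcome ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = Gamma(link = "log"),
  chains = 4,
  seed = 52242,
  iter = 2000)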

Code
bayesianGeneralizedLinearRegression <- brm(
  countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = poisson,
  chains = 4,
  seed = 52242,
  iter = 2000)
Code
summary(bayesianGeneralizedLinearRegression)
 Family: poisson 
  Links: mu = log 
Formula: countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
   Data: mydata (Number of observations: 2874) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept                   0.46      0.02     0.42     0.50 1.00     3095
bpi_antisocialT1Sum         0.14      0.00     0.13     0.15 1.00     2776
bpi_anxiousDepressedSum     0.05      0.01     0.03     0.06 1.00     2620
                        Tail_ESS
Intercept                   2987
bpi_antisocialT1Sum         2762
bpi_anxiousDepressedSum     2689

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Code
print(effectsize::standardize_parameters(bayesianGeneralizedLinearRegression, method = "basic"), digits = 2)
# Standardization method: basic

Parameter               | Std. Median |       95% CI
----------------------------------------------------
(Intercept)             |        0.00 | [0.00, 0.00]
bpi_antisocialT1Sum     |        0.32 | [0.30, 0.34]
bpi_anxiousDepressedSum |        0.07 | [0.05, 0.09]

- Response is unstandardized.

6.4 Robust generalized regression

Code
robustGeneralizedRegression <- robustbase::glmrob(
  countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = "poisson",
  na.action = na.exclude)

summary(robustGeneralizedRegression)

Call:  robustbase::glmrob(formula = countVariable ~ bpi_antisocialT1Sum +      bpi_anxiousDepressedSum, family = "poisson", data = mydata,      na.action = na.exclude) 


Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             0.365934   0.020794  17.598  < 2e-16 ***
bpi_antisocialT1Sum     0.156275   0.005046  30.969  < 2e-16 ***
bpi_anxiousDepressedSum 0.050526   0.008332   6.064 1.32e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Robustness weights w.r * w.x: 
 2273 weights are ~= 1. The remaining 601 ones are summarized as
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1286  0.5550  0.7574  0.7183  0.8874  0.9982 

Number of observations: 2874 
Fitted by method 'Mqle'  (in 5 iterations)

(Dispersion parameter for poisson family taken to be 1)

No deviance values available 
Algorithmic parameters: 
   acc    tcc 
0.0001 1.3450 
maxit 
   50 
test.acc 
  "coef" 
Code
confint(robustGeneralizedRegression)
Error in `glm.control()`:
! unused arguments (acc = 1e-04, test.acc = "coef", tcc = 1.345)
Code
print(effectsize::standardize_parameters(robustGeneralizedRegression), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |       95% CI
---------------------------------------------------
(Intercept)             |       0.87 | [0.84, 0.89]
bpi antisocialT1Sum     |       0.35 | [0.33, 0.38]
bpi anxiousDepressedSum |       0.07 | [0.05, 0.10]

- Response is unstandardized.

6.5 Ordinal regression model

Code
ordinalRegressionModel1 <- MASS::polr(
  orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata)

ordinalRegressionModel2 <- rms::lrm(
  orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE)

ordinalRegressionModel3 <- rms::orm(
  orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE)

summary(ordinalRegressionModel1)
Call:
MASS::polr(formula = orderedVariable ~ bpi_antisocialT1Sum + 
    bpi_anxiousDepressedSum, data = mydata)

Coefficients:
                         Value Std. Error t value
bpi_antisocialT1Sum     0.4404    0.01862  23.647
bpi_anxiousDepressedSum 0.1427    0.02615   5.457

Intercepts:
      Value   Std. Error t value
0|1   -0.6001  0.0630    -9.5185
1|2    0.6766  0.0585    11.5656
2|3    1.5801  0.0634    24.9191
3|4    2.4110  0.0720    33.4816
4|5    3.1617  0.0825    38.3006
5|6    3.8560  0.0949    40.6120
6|7    4.5367  0.1106    41.0316
7|8    5.1667  0.1291    40.0147
8|9    5.8129  0.1545    37.6263
9|10   6.5364  0.1962    33.3193
10|11  7.1835  0.2513    28.5820
11|12  7.8469  0.3331    23.5566
12|14  8.7818  0.5113    17.1741

Residual Deviance: 11088.49 
AIC: 11118.49 
(8656 observations deleted due to missingness)
Code
confint(ordinalRegressionModel1)
                             2.5 %    97.5 %
bpi_antisocialT1Sum     0.40403039 0.4770422
bpi_anxiousDepressedSum 0.09149919 0.1940080
Code
ordinalRegressionModel2
Frequencies of Missing Values Due to Each Variable
        orderedVariable     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                   7501                    7613                    7616 

Logistic Regression Model

rms::lrm(formula = orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, x = TRUE, y = TRUE)


Frequencies of Responses

  0   1   2   3   4   5   6   7   8   9  10  11  12  14 
455 597 527 444 313 205 133  80  52  33  16   9   6   4 

                       Model Likelihood       Discrimination    Rank Discrim.    
                             Ratio Test              Indexes          Indexes    
Obs          2874    LR chi2     881.26       R2       0.268    C       0.705    
max |deriv| 6e-06    d.f.             2      R2(2,2874) 0.264    Dxy     0.410    
                     Pr(> chi2) <0.0001    R2(2,2803.4) 0.269    gamma   0.426    
                                              Brier    0.195    tau-a   0.350    

                        Coef   S.E.   Wald Z Pr(>|Z|)
bpi_antisocialT1Sum     0.4404 0.0186 23.65  <0.0001 
bpi_anxiousDepressedSum 0.1427 0.0261  5.46  <0.0001 
Code
print(effectsize::standardize_parameters(ordinalRegressionModel2), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |         95% CI
-----------------------------------------------------
y>=1                    |       2.01 | [ 1.90,  2.12]
y>=2                    |       0.73 | [ 0.65,  0.81]
y>=3                    |      -0.17 | [-0.25, -0.09]
y>=4                    |      -1.00 | [-1.09, -0.91]
y>=5                    |      -1.75 | [-1.86, -1.65]
y>=6                    |      -2.45 | [-2.58, -2.32]
y>=7                    |      -3.13 | [-3.29, -2.97]
y>=8                    |      -3.76 | [-3.96, -3.56]
y>=9                    |      -4.40 | [-4.66, -4.14]
y>=10                   |      -5.13 | [-5.48, -4.78]
y>=11                   |      -5.78 | [-6.24, -5.31]
y>=12                   |      -6.44 | [-7.07, -5.81]
y>=14                   |      -7.37 | [-8.36, -6.39]
bpi antisocialT1Sum     |       0.99 | [ 0.91,  1.08]
bpi anxiousDepressedSum |       0.21 | [ 0.13,  0.28]

- Response is unstandardized.
Code
ordinalRegressionModel3
Frequencies of Missing Values Due to Each Variable
        orderedVariable     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                   7501                    7613                    7616 

Logistic (Proportional Odds) Ordinal Regression Model

rms::orm(formula = orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, x = TRUE, y = TRUE)


Frequencies of Responses

  0   1   2   3   4   5   6   7   8   9  10  11  12  14 
455 597 527 444 313 205 133  80  52  33  16   9   6   4 

                        Model Likelihood               Discrimination    Rank Discrim.    
                              Ratio Test                      Indexes          Indexes    
Obs           2874    LR chi2     881.26    R2                  0.268    rho     0.506    
ESS         2803.4    d.f.             2    R2(2,2874)          0.264    Dxy     0.410    
Distinct Y      14    Pr(> chi2) <0.0001    R2(2,2803.4)        0.269                     
Median Y         3    Score chi2  916.79    |Pr(Y>=median)-0.5| 0.188                     
max |deriv|  2e-12    Pr(> chi2) <0.0001                                                  

                        Coef   S.E.   Wald Z Pr(>|Z|)
bpi_antisocialT1Sum     0.4404 0.0186 23.65  <0.0001 
bpi_anxiousDepressedSum 0.1427 0.0261  5.46  <0.0001 
Code
confint(ordinalRegressionModel3)
                              2.5 %     97.5 %
y>=1                             NA         NA
y>=2                    -0.79129972 -0.5619711
y>=3                             NA         NA
y>=4                             NA         NA
y>=5                             NA         NA
y>=6                             NA         NA
y>=7                             NA         NA
y>=8                             NA         NA
y>=9                             NA         NA
y>=10                            NA         NA
y>=11                            NA         NA
y>=12                            NA         NA
y>=14                            NA         NA
bpi_antisocialT1Sum      0.40390078  0.4769035
bpi_anxiousDepressedSum  0.09143921  0.1939303
Code
print(effectsize::standardize_parameters(ordinalRegressionModel3), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |       95% CI
---------------------------------------------------
y>=2                    |       0.73 | [0.65, 0.81]
bpi antisocialT1Sum     |       0.99 | [0.91, 1.08]
bpi anxiousDepressedSum |       0.21 | [0.13, 0.28]

- Response is unstandardized.

6.6 Bayesian ordinal regression model

Code
bayesianOrdinalRegression <- brm(
  orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = cumulative(),
  chains = 4,
  seed = 52242,
  iter = 2000)
Code
summary(bayesianOrdinalRegression)
 Family: cumulative 
  Links: mu = logit 
Formula: orderedVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
   Data: mydata (Number of observations: 2874) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept[1]               -0.60      0.06    -0.73    -0.48 1.00     4659
Intercept[2]                0.67      0.06     0.55     0.79 1.00     6134
Intercept[3]                1.57      0.06     1.45     1.70 1.00     5916
Intercept[4]                2.40      0.07     2.26     2.54 1.00     5305
Intercept[5]                3.15      0.08     2.99     3.31 1.00     4846
Intercept[6]                3.84      0.09     3.66     4.03 1.00     4741
Intercept[7]                4.52      0.11     4.31     4.74 1.00     4796
Intercept[8]                5.15      0.13     4.90     5.40 1.00     4908
Intercept[9]                5.79      0.15     5.50     6.11 1.00     5172
Intercept[10]               6.51      0.20     6.14     6.90 1.00     5196
Intercept[11]               7.17      0.25     6.69     7.69 1.00     5867
Intercept[12]               7.87      0.34     7.25     8.56 1.00     5503
Intercept[13]               8.91      0.53     7.99    10.06 1.00     5868
bpi_antisocialT1Sum         0.44      0.02     0.40     0.48 1.00     4321
bpi_anxiousDepressedSum     0.14      0.03     0.09     0.19 1.00     4514
                        Tail_ESS
Intercept[1]                3538
Intercept[2]                3487
Intercept[3]                3551
Intercept[4]                3532
Intercept[5]                3866
Intercept[6]                3599
Intercept[7]                3222
Intercept[8]                3125
Intercept[9]                2991
Intercept[10]               3098
Intercept[11]               3043
Intercept[12]               2906
Intercept[13]               2873
bpi_antisocialT1Sum         2945
bpi_anxiousDepressedSum     3156

Further Distributional Parameters:
     Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
disc     1.00      0.00     1.00     1.00   NA       NA       NA

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Code
print(effectsize::standardize_parameters(bayesianOrdinalRegression, method = "basic"), digits = 2)
# Standardization method: basic

Parameter               | Std. Median |         95% CI
------------------------------------------------------
Intercept[1]            |        0.00 | [ 0.00,  0.00]
Intercept[2]            |        1.51 | [ 1.25,  1.78]
Intercept[3]            |        2.26 | [ 2.08,  2.45]
Intercept[4]            |        2.40 | [ 2.26,  2.54]
Intercept[5]            |        3.15 | [ 2.99,  3.31]
Intercept[6]            |        3.84 | [ 3.66,  4.03]
Intercept[7]            |        4.52 | [ 4.31,  4.74]
Intercept[8]            |        5.14 | [ 4.90,  5.40]
Intercept[9]            |        5.78 | [ 5.50,  6.11]
Intercept[10]           |        6.50 | [ 6.14,  6.90]
Intercept[11]           |        7.15 | [ 6.69,  7.69]
Intercept[12]           |        7.84 | [ 7.25,  8.56]
Intercept[13]           |        8.87 | [ 7.99, 10.06]
bpi_antisocialT1Sum     |        0.44 | [ 0.40,  0.48]
bpi_anxiousDepressedSum |        0.14 | [ 0.09,  0.19]

- Response is unstandardized.

6.7 Bayesian count regression model

Code
bayesianCountRegression <- brm(
  countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = "poisson",
  chains = 4,
  seed = 52242,
  iter = 2000)
Code
summary(bayesianCountRegression)
 Family: poisson 
  Links: mu = log 
Formula: countVariable ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
   Data: mydata (Number of observations: 2874) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept                   0.46      0.02     0.42     0.50 1.00     3095
bpi_antisocialT1Sum         0.14      0.00     0.13     0.15 1.00     2776
bpi_anxiousDepressedSum     0.05      0.01     0.03     0.06 1.00     2620
                        Tail_ESS
Intercept                   2987
bpi_antisocialT1Sum         2762
bpi_anxiousDepressedSum     2689

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Code
print(effectsize::standardize_parameters(bayesianCountRegression, method = "basic"), digits = 2)
# Standardization method: basic

Parameter               | Std. Median |       95% CI
----------------------------------------------------
(Intercept)             |        0.00 | [0.00, 0.00]
bpi_antisocialT1Sum     |        0.32 | [0.30, 0.34]
bpi_anxiousDepressedSum |        0.07 | [0.05, 0.09]

- Response is unstandardized.

6.8 Logistic regression model (rms)

Code
logisticRegressionModel <- rms::robcov(rms::lrm(
  female ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE))

logisticRegressionModel
Frequencies of Missing Values Due to Each Variable
                 female     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                      2                    7613                    7616 

Logistic Regression Model

rms::lrm(formula = female ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, x = TRUE, y = TRUE)

                       Model Likelihood       Discrimination    Rank Discrim.    
                             Ratio Test              Indexes          Indexes    
Obs          3914    LR chi2      66.63       R2       0.023    C       0.571    
 0           1965    d.f.             2      R2(2,3914) 0.016    Dxy     0.142    
 1           1949    Pr(> chi2) <0.0001    R2(2,2935.5) 0.022    gamma   0.147    
max |deriv| 1e-07                             Brier    0.246    tau-a   0.071    

                        Coef    S.E.   Wald Z Pr(>|Z|)
Intercept                0.3002 0.0529  5.67  <0.0001 
bpi_antisocialT1Sum     -0.1244 0.0166 -7.48  <0.0001 
bpi_anxiousDepressedSum  0.0382 0.0253  1.51  0.1314  
Code
confint(logisticRegressionModel)
Error in `summary.rms()`:
! adjustment values not defined here or with datadist for bpi_antisocialT1Sum bpi_anxiousDepressedSum
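
As with the rms::Glm example in Section 6.2, defining a datadist first (dd <- rms::datadist(mydata); options(datadist = "dd")) would enable summary-based intervals here.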
Code
print(effectsize::standardize_parameters(logisticRegressionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |         95% CI
-----------------------------------------------------
Intercept               |  -9.88e-03 | [-0.07,  0.05]
bpi antisocialT1Sum     |      -0.29 | [-0.36, -0.21]
bpi anxiousDepressedSum |       0.06 | [-0.02,  0.13]

- Response is unstandardized.

6.9 Bayesian logistic regression model

Code
bayesianLogisticRegression <- brm(
  female ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = bernoulli,
  chains = 4,
  seed = 52242,
  iter = 2000)
Code
summary(bayesianLogisticRegression)
 Family: bernoulli 
  Links: mu = logit 
Formula: female ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
   Data: mydata (Number of observations: 3914) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept                   0.30      0.05     0.20     0.41 1.00     4125
bpi_antisocialT1Sum        -0.13      0.02    -0.16    -0.09 1.00     2943
bpi_anxiousDepressedSum     0.04      0.03    -0.01     0.09 1.00     3084
                        Tail_ESS
Intercept                   3269
bpi_antisocialT1Sum         2857
bpi_anxiousDepressedSum     2730

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Code
print(effectsize::standardize_parameters(bayesianLogisticRegression, method = "basic"), digits = 2)
# Standardization method: basic

Parameter               | Std. Median |         95% CI
------------------------------------------------------
(Intercept)             |        0.00 | [ 0.00,  0.00]
bpi_antisocialT1Sum     |       -0.29 | [-0.36, -0.22]
bpi_anxiousDepressedSum |        0.06 | [-0.02,  0.13]

- Response is unstandardized.

7 Hierarchical Linear Regression
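
Hierarchical regression compares nested models to test whether adding one or more predictors significantly improves model fit.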

Code
model1 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum,
  data = mydata %>% select(bpi_antisocialT2Sum, bpi_antisocialT1Sum, bpi_anxiousDepressedSum) %>% na.omit,
  na.action = na.exclude)

model2 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata %>% select(bpi_antisocialT2Sum, bpi_antisocialT1Sum, bpi_anxiousDepressedSum) %>% na.omit,
  na.action = na.exclude)

summary(model2)$adj.r.squared
[1] 0.2614694
Code
anova(model1, model2)
Code
summary(model2)$adj.r.squared - summary(model1)$adj.r.squared
[1] 0.007559301

8 Moderated Multiple Regression

When testing for the possibility of moderation/interaction, it is first important to think about what shape the interaction might take. If you expect a bilinear interaction (i.e., the slope of the regression line between X and Y changes as a linear function of a third variable, Z; in model form, Y = b0 + b1X + b2Z + b3XZ, so the simple slope of X is b1 + b3Z), it is appropriate to treat the moderator as continuous in moderated multiple regression. However, if you expect the interaction to take a different form (e.g., a threshold effect), you may need to consider other approaches, such as categorical moderators, polynomial interactions, spline functions, generalized additive models (GAMs), or machine learning; a GAM sketch appears at the end of this section.

For information on the (low) power to detect interactions and how to improve it, see: https://isaactpetersen.github.io/Principles-Psychological-Assessment/reliability.html#sec-productTerm-reliability

The examples below use the interactions package in R. https://cran.r-project.org/web/packages/interactions/vignettes/interactions.html (archived at https://perma.cc/P34H-7BH3)

Code
states <- as.data.frame(state.x77)
states$HS.Grad <- states$`HS Grad`

8.1 Mean Center Predictors

Make sure to mean-center or orthogonalize predictors before computing the interaction term.

Code
states$Illiteracy_centered <- scale(states$Illiteracy, scale = FALSE)
states$Murder_centered <- scale(states$Murder, scale = FALSE)

8.2 Model

Code
interactionModel <- lm(
  Income ~ Illiteracy_centered + Murder_centered + Illiteracy_centered:Murder_centered + HS.Grad,
  data = states)

summary(interactionModel)

Call:
lm(formula = Income ~ Illiteracy_centered + Murder_centered + 
    Illiteracy_centered:Murder_centered + HS.Grad, data = states)

Residuals:
    Min      1Q  Median      3Q     Max 
-916.27 -244.42   28.42  228.14 1221.16 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          2421.46     594.36   4.074 0.000185 ***
Illiteracy_centered                    37.12     192.56   0.193 0.848005    
Murder_centered                        17.07      25.52   0.669 0.507085    
HS.Grad                                40.76      10.92   3.733 0.000530 ***
Illiteracy_centered:Murder_centered   -97.04      35.86  -2.706 0.009584 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 459.5 on 45 degrees of freedom
Multiple R-squared:  0.4864,    Adjusted R-squared:  0.4407 
F-statistic: 10.65 on 4 and 45 DF,  p-value: 3.689e-06
Code
print(effectsize::standardize_parameters(interactionModel), digits = 2)
# Standardization method: refit

Parameter                             | Std. Coef. |         95% CI
-------------------------------------------------------------------
(Intercept)                           |       0.24 | [-0.04,  0.53]
Illiteracy centered                   |       0.04 | [-0.35,  0.42]
Murder centered                       |       0.10 | [-0.21,  0.41]
HS Grad                               |       0.54 | [ 0.25,  0.82]
Illiteracy centered × Murder centered |      -0.36 | [-0.62, -0.09]

8.3 Plots

Code
interactions::interact_plot(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered)

Code
interactions::interact_plot(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  plot.points = TRUE)

Code
interactions::interact_plot(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  interval = TRUE)

Code
interactions::johnson_neyman(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  alpha = .05)
JOHNSON-NEYMAN INTERVAL

When Murder_centered is OUTSIDE the interval [-8.13, 4.37], the slope of
Illiteracy_centered is p < .05.

Note: The range of observed values of Murder_centered is [-5.98, 7.72]

8.4 Simple Slopes Analysis

Code
interactions::sim_slopes(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  johnson_neyman = FALSE)
SIMPLE SLOPES ANALYSIS

Slope of Illiteracy_centered when Murder_centered = -3.69154e+00 (- 1 SD): 

    Est.     S.E.   t val.      p
-------- -------- -------- ------
  395.34   274.84     1.44   0.16

Slope of Illiteracy_centered when Murder_centered = -1.24345e-16 (Mean): 

   Est.     S.E.   t val.      p
------- -------- -------- ------
  37.12   192.56     0.19   0.85

Slope of Illiteracy_centered when Murder_centered =  3.69154e+00 (+ 1 SD): 

     Est.     S.E.   t val.      p
--------- -------- -------- ------
  -321.10   183.49    -1.75   0.09
Code
interactions::sim_slopes(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  modx.values = c(0, 5, 10),
  johnson_neyman = FALSE)
SIMPLE SLOPES ANALYSIS

Slope of Illiteracy_centered when Murder_centered =  0.00: 

   Est.     S.E.   t val.      p
------- -------- -------- ------
  37.12   192.56     0.19   0.85

Slope of Illiteracy_centered when Murder_centered =  5.00: 

     Est.     S.E.   t val.      p
--------- -------- -------- ------
  -448.07   202.17    -2.22   0.03

Slope of Illiteracy_centered when Murder_centered = 10.00: 

     Est.     S.E.   t val.      p
--------- -------- -------- ------
  -933.27   330.10    -2.83   0.01

8.5 Johnson-Neyman intervals

Johnson-Neyman intervals indicate all of the values of the moderator for which the slope of the predictor is statistically significant.

Code
interactions::sim_slopes(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  johnson_neyman = TRUE)
JOHNSON-NEYMAN INTERVAL

When Murder_centered is OUTSIDE the interval [-8.13, 4.37], the slope of
Illiteracy_centered is p < .05.

Note: The range of observed values of Murder_centered is [-5.98, 7.72]

SIMPLE SLOPES ANALYSIS

Slope of Illiteracy_centered when Murder_centered = -3.69154e+00 (- 1 SD): 

    Est.     S.E.   t val.      p
-------- -------- -------- ------
  395.34   274.84     1.44   0.16

Slope of Illiteracy_centered when Murder_centered = -1.24345e-16 (Mean): 

   Est.     S.E.   t val.      p
------- -------- -------- ------
  37.12   192.56     0.19   0.85

Slope of Illiteracy_centered when Murder_centered =  3.69154e+00 (+ 1 SD): 

     Est.     S.E.   t val.      p
--------- -------- -------- ------
  -321.10   183.49    -1.75   0.09
Code
interactions::probe_interaction(
  interactionModel,
  pred = Illiteracy_centered,
  modx = Murder_centered,
  cond.int = TRUE,
  interval = TRUE,
  jnplot = TRUE)
JOHNSON-NEYMAN INTERVAL

When Murder_centered is OUTSIDE the interval [-8.13, 4.37], the slope of
Illiteracy_centered is p < .05.

Note: The range of observed values of Murder_centered is [-5.98, 7.72]

SIMPLE SLOPES ANALYSIS

When Murder_centered = -3.69154e+00 (- 1 SD): 

                                        Est.     S.E.   t val.      p
---------------------------------- --------- -------- -------- ------
Slope of Illiteracy_centered          395.34   274.84     1.44   0.16
Conditional intercept                4523.23   134.99    33.51   0.00

When Murder_centered = -1.24345e-16 (Mean): 

                                        Est.     S.E.   t val.      p
---------------------------------- --------- -------- -------- ------
Slope of Illiteracy_centered           37.12   192.56     0.19   0.85
Conditional intercept                4586.22    85.52    53.63   0.00

When Murder_centered =  3.69154e+00 (+ 1 SD): 

                                        Est.     S.E.   t val.      p
---------------------------------- --------- -------- -------- ------
Slope of Illiteracy_centered         -321.10   183.49    -1.75   0.09
Conditional intercept                4649.22   118.97    39.08   0.00
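
As noted at the start of this section, when an interaction is not bilinear, a generalized additive model (GAM) is one alternative. A minimal sketch using the mgcv package (an assumption; mgcv is not used elsewhere on this page):

Code
# Fit a tensor-product smooth over the two predictors; smooth terms do not
# require mean-centered variables
library(mgcv)

gamModel <- gam(
  Income ~ te(Illiteracy, Murder) + HS.Grad,
  data = states)

summary(gamModel)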

9 Comparing Correlations

Fisher’s r-to-z test.

http://comparingcorrelations.org (archived at https://perma.cc/X3EU-24GL)

Independent groups (two different groups):

Code
cocor::cocor.indep.groups(
  r1.jk = .7,
  r2.hm = .6,
  n1 = 305,
  n2 = 210,
  data.name = c("group1", "group2"),
  var.labels = c("age", "intelligence", "age", "intelligence"))

  Results of a comparison of two correlations based on independent groups

Comparison between r1.jk (age, intelligence) = 0.7 and r2.hm (age, intelligence) = 0.6
Difference: r1.jk - r2.hm = 0.1
Data: group1: j = age, k = intelligence; group2: h = age, m = intelligence
Group sizes: n1 = 305, n2 = 210
Null hypothesis: r1.jk is equal to r2.hm
Alternative hypothesis: r1.jk is not equal to r2.hm (two-sided)
Alpha: 0.05

fisher1925: Fisher's z (1925)
  z = 1.9300, p-value = 0.0536
  Null hypothesis retained

zou2007: Zou's (2007) confidence interval
  95% confidence interval for r1.jk - r2.hm: -0.0014 0.2082
  Null hypothesis retained (Interval includes 0)
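
For intuition, the fisher1925 statistic above can be reproduced by hand with base R (a minimal sketch):

Code
# Fisher z-transform each correlation, then compare with a z test
z1 <- atanh(.7)
z2 <- atanh(.6)
se <- sqrt(1/(305 - 3) + 1/(210 - 3)) # SE of the difference between z1 and z2
(z1 - z2)/se # ≈ 1.93, matching fisher1925 above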

Dependent groups (same group), overlapping correlation (shares a variable in common; in this case, the variable age appears in both correlations):

Code
cocor::cocor.dep.groups.overlap(
  r.jk = .2, # Correlation (age, intelligence)
  r.jh = .5, # Correlation (age, shoe size)
  r.kh = .1, # Correlation (intelligence, shoe size)
  n = 315,
  var.labels = c("age", "intelligence", "shoe size"))

  Results of a comparison of two overlapping correlations based on dependent groups

Comparison between r.jk (age, intelligence) = 0.2 and r.jh (age, shoe size) = 0.5
Difference: r.jk - r.jh = -0.3
Related correlation: r.kh = 0.1
Data: j = age, k = intelligence, h = shoe size
Group size: n = 315
Null hypothesis: r.jk is equal to r.jh
Alternative hypothesis: r.jk is not equal to r.jh (two-sided)
Alpha: 0.05

pearson1898: Pearson and Filon's z (1898)
  z = -4.4807, p-value = 0.0000
  Null hypothesis rejected

hotelling1940: Hotelling's t (1940)
  t = -4.6314, df = 312, p-value = 0.0000
  Null hypothesis rejected

williams1959: Williams' t (1959)
  t = -4.4950, df = 312, p-value = 0.0000
  Null hypothesis rejected

olkin1967: Olkin's z (1967)
  z = -4.4807, p-value = 0.0000
  Null hypothesis rejected

dunn1969: Dunn and Clark's z (1969)
  z = -4.4412, p-value = 0.0000
  Null hypothesis rejected

hendrickson1970: Hendrickson, Stanley, and Hills' (1970) modification of Williams' t (1959)
  t = -4.6313, df = 312, p-value = 0.0000
  Null hypothesis rejected

steiger1980: Steiger's (1980) modification of Dunn and Clark's z (1969) using average correlations
  z = -4.4152, p-value = 0.0000
  Null hypothesis rejected

meng1992: Meng, Rosenthal, and Rubin's z (1992)
  z = -4.3899, p-value = 0.0000
  Null hypothesis rejected
  95% confidence interval for r.jk - r.jh: -0.5013 -0.1918
  Null hypothesis rejected (Interval does not include 0)

hittner2003: Hittner, May, and Silver's (2003) modification of Dunn and Clark's z (1969) using a backtransformed average Fisher's (1921) Z procedure
  z = -4.4077, p-value = 0.0000
  Null hypothesis rejected

zou2007: Zou's (2007) confidence interval
  95% confidence interval for r.jk - r.jh: -0.4307 -0.1675
  Null hypothesis rejected (Interval does not include 0)

Dependent groups (same group), non-overlapping correlation (does not share a variable in common):

Code
cocor::cocor.dep.groups.nonoverlap(
  r.jk = .2, # Correlation (age, intelligence)
  r.hm = .7, # Correlation (body mass index, shoe size)
  r.jh = .4, # Correlation (age, body mass index)
  r.jm = .5, # Correlation (age, shoe size)
  r.kh = .1, # Correlation (intelligence, body mass index)
  r.km = .3, # Correlation (intelligence, shoe size)
  n = 232,
  var.labels = c("age", "intelligence", "body mass index", "shoe size"))

  Results of a comparison of two nonoverlapping correlations based on dependent groups

Comparison between r.jk (age, intelligence) = 0.2 and r.hm (body mass index, shoe size) = 0.7
Difference: r.jk - r.hm = -0.5
Related correlations: r.jh = 0.4, r.jm = 0.5, r.kh = 0.1, r.km = 0.3
Data: j = age, k = intelligence, h = body mass index, m = shoe size
Group size: n = 232
Null hypothesis: r.jk is equal to r.hm
Alternative hypothesis: r.jk is not equal to r.hm (two-sided)
Alpha: 0.05

pearson1898: Pearson and Filon's z (1898)
  z = -7.1697, p-value = 0.0000
  Null hypothesis rejected

dunn1969: Dunn and Clark's z (1969)
  z = -7.3134, p-value = 0.0000
  Null hypothesis rejected

steiger1980: Steiger's (1980) modification of Dunn and Clark's z (1969) using average correlations
  z = -7.3010, p-value = 0.0000
  Null hypothesis rejected

raghunathan1996: Raghunathan, Rosenthal, and Rubin's (1996) modification of Pearson and Filon's z (1898)
  z = -7.3134, p-value = 0.0000
  Null hypothesis rejected

silver2004: Silver, Hittner, and May's (2004) modification of Dunn and Clark's z (1969) using a backtransformed average Fisher's (1921) Z procedure
  z = -7.2737, p-value = 0.0000
  Null hypothesis rejected

zou2007: Zou's (2007) confidence interval
  95% confidence interval for r.jk - r.hm: -0.6375 -0.3629
  Null hypothesis rejected (Interval does not include 0)

10 Approaches to Handling Missingness

10.1 Listwise deletion

Listwise deletion deletes every row in the data file that has a missing value for one of the model variables.

Code
listwiseDeletionModel <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

summary(listwiseDeletionModel)

Call:
lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              1.19830    0.05983  20.029  < 2e-16 ***
bpi_antisocialT1Sum      0.46553    0.01858  25.049  < 2e-16 ***
bpi_anxiousDepressedSum  0.16075    0.02916   5.513 3.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.979 on 2871 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.262, Adjusted R-squared:  0.2615 
F-statistic: 509.6 on 2 and 2871 DF,  p-value: < 2.2e-16
Code
confint(listwiseDeletionModel)
                            2.5 %    97.5 %
(Intercept)             1.0809881 1.3156128
bpi_antisocialT1Sum     0.4290884 0.5019688
bpi_anxiousDepressedSum 0.1035825 0.2179258
Code
print(effectsize::standardize_parameters(listwiseDeletionModel), digits = 2)
# Standardization method: refit

Parameter               | Std. Coef. |        95% CI
----------------------------------------------------
(Intercept)             |   1.62e-16 | [-0.03, 0.03]
bpi antisocialT1Sum     |       0.46 | [ 0.42, 0.49]
bpi anxiousDepressedSum |       0.10 | [ 0.06, 0.14]

10.2 Pairwise deletion

Adapted from: https://stefvanbuuren.name/fimd/sec-simplesolutions.html#sec:pairwise (archived at https://perma.cc/EGU6-3M3Q)

Code
modelData <- mydata[,c("bpi_antisocialT2Sum","bpi_antisocialT1Sum","bpi_anxiousDepressedSum")]
varMeans <- colMeans(modelData, na.rm = TRUE)
varCovariances <- cov(modelData, use = "pairwise")

pairwiseRegression_syntax <- '
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum
  bpi_antisocialT2Sum ~~ bpi_antisocialT2Sum
  bpi_antisocialT2Sum ~ 1
'

pairwiseRegression_fit <- lavaan::lavaan(
  pairwiseRegression_syntax,
  sample.mean = varMeans,
  sample.cov = varCovariances,
  sample.nobs = sum(complete.cases(modelData))
)

summary(
  pairwiseRegression_fit,
  standardized = TRUE,
  rsquare = TRUE)
lavaan 0.6-21 ended normally after 1 iteration

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                         4

  Number of observations                          2874

Model Test User Model:
                                                      
  Test statistic                                 0.000
  Degrees of freedom                                 0

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Expected
  Information saturated (h1) model          Structured

Regressions:
                        Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  bpi_antisocialT2Sum ~                                                      
    bpi_antsclT1Sm         0.441    0.018   24.611    0.000    0.441    0.456
    bp_nxsDprssdSm         0.137    0.028    4.854    0.000    0.137    0.090

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .bpi_antsclT2Sm    1.175    0.058   20.125    0.000    1.175    0.523

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .bpi_antsclT2Sm    3.755    0.099   37.908    0.000    3.755    0.743

R-Square:
                   Estimate
    bpi_antsclT2Sm    0.257

10.3 Full-information maximum likelihood (FIML)

Code
fimlRegression_syntax <- '
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum
  bpi_antisocialT2Sum ~~ bpi_antisocialT2Sum
  bpi_antisocialT2Sum ~ 1
'

fimlRegression_fit <- lavaan::lavaan(
  fimlRegression_syntax,
  data = mydata,
  missing = "ML",
)

summary(
  fimlRegression_fit,
  standardized = TRUE,
  rsquare = TRUE)
lavaan 0.6-21 ended normally after 11 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                         4

                                                  Used       Total
  Number of observations                          3914       11530
  Number of missing patterns                         2            

Model Test User Model:
                                                      
  Test statistic                                 0.000
  Degrees of freedom                                 0

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Observed
  Observed information based on                Hessian

Regressions:
                        Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  bpi_antisocialT2Sum ~                                                      
    bpi_antsclT1Sm         0.466    0.019   25.062    0.000    0.466    0.466
    bp_nxsDprssdSm         0.161    0.029    5.516    0.000    0.161    0.102

Intercepts:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .bpi_antsclT2Sm    1.198    0.060   20.039    0.000    1.198    0.516

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .bpi_antsclT2Sm    3.911    0.103   37.908    0.000    3.911    0.725

R-Square:
                   Estimate
    bpi_antsclT2Sm    0.275

10.4 Multiple imputation

Code
modelData_imputed <- mice::mice(
  modelData,
  m = 5,
  method = "pmm") # predictive mean matching; can choose among many methods

 iter imp variable
  1   1  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  1   2  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  1   3  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  1   4  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  1   5  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  2   1  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  2   2  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  2   3  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  2   4  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  2   5  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  3   1  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  3   2  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  3   3  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  3   4  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  3   5  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  4   1  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  4   2  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  4   3  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  4   4  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  4   5  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  5   1  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  5   2  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  5   3  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  5   4  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
  5   5  bpi_antisocialT2Sum  bpi_antisocialT1Sum  bpi_anxiousDepressedSum
Code
imputedData_fit <- with(
  modelData_imputed,
  lm(bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum))

imputedData_pooledEstimates <- pool(imputedData_fit)
summary(imputedData_pooledEstimates)

11 Squared Semi-Partial Correlation (sr²)

Code
rockchalk::getDeltaRsquare(multipleRegressionModel)
The deltaR-square values: the change in the R-square
      observed when a single term is removed.
Same as the square of the 'semi-partial correlation coefficient'
                        deltaRsquare
bpi_antisocialT1Sum      0.161297120
bpi_anxiousDepressedSum  0.007813728
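
These values can be reproduced by hand as the drop in R² when each predictor is removed, holding the estimation sample constant (a minimal sketch; completeData is a name introduced here):

Code
# Complete cases on all three model variables
completeData <- mydata %>%
  select(bpi_antisocialT2Sum, bpi_antisocialT1Sum, bpi_anxiousDepressedSum) %>%
  na.omit

fullR2 <- summary(lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = completeData))$r.squared

# sr^2 = full-model R^2 minus the R^2 of the model without the predictor
fullR2 - summary(lm(bpi_antisocialT2Sum ~ bpi_anxiousDepressedSum, data = completeData))$r.squared # bpi_antisocialT1Sum: ≈ 0.161
fullR2 - summary(lm(bpi_antisocialT2Sum ~ bpi_antisocialT1Sum, data = completeData))$r.squared # bpi_anxiousDepressedSum: ≈ 0.008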

12 Partial Regression Plot

A partial regression plot (aka added-variable plot) examines the association between a focal predictor and the outcome after controlling for the covariates.

Code
car::avPlots(
  multipleRegressionModel,
  id = FALSE)

Manually:

Code
# DV residuals (outcome residualized on covariates)
ry <- resid(lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum, 
  data = mydata, na.action = na.exclude))

# Predictor residuals (focal predictor residualized on covariates)
rx <- resid(lm(
  bpi_anxiousDepressedSum ~ bpi_antisocialT1Sum,
  data = mydata, na.action = na.exclude))

# Create the partial regression plot
df_plot <- data.frame(rx, ry)

ggplot(
  data = df_plot,
  mapping = aes(x = rx, y = ry)) +
  geom_point(alpha = 0.6) +
  geom_smooth(
    method = "lm",
    color = "blue") +
  labs(
    x = "Residualized bpi_antisocialT1Sum",
    y = "Residualized bpi_antisocialT2Sum",
    title = "Partial Regression Plot\nbpi_antisocialT2Sum ~ bpi_antisocialT1Sum controlling for bpi_anxiousDepressedSum"
  )

13 Dominance Analysis

Code
parameters::dominance_analysis(multipleRegressionModel)
# Dominance Analysis Results

Model R2 Value:  0.262 

General Dominance Statistics

Parameter               | General Dominance | Percent | Ranks |                  Subset
---------------------------------------------------------------------------------------
(Intercept)             |                   |         |       |                constant
bpi_antisocialT1Sum     |             0.208 |   0.793 |     1 |     bpi_antisocialT1Sum
bpi_anxiousDepressedSum |             0.054 |   0.207 |     2 | bpi_anxiousDepressedSum

Conditional Dominance Statistics

Subset                  | IVs: 1 | IVs: 2
-----------------------------------------
bpi_antisocialT1Sum     |  0.254 |  0.161
bpi_anxiousDepressedSum |  0.101 |  0.008

Complete Dominance Designations

Subset                  | < bpi_antisocialT1Sum | < bpi_anxiousDepressedSum
---------------------------------------------------------------------------
bpi_antisocialT1Sum     |                       |                     FALSE
bpi_anxiousDepressedSum |                  TRUE |                          
Code
allPossibleSubsetsRegression <- yhat::aps(
  dataMatrix = mydata,
  dv = "bpi_antisocialT2Sum",
  ivlist = list("bpi_antisocialT1Sum", "bpi_anxiousDepressedSum"))

yhat::dominance(allPossibleSubsetsRegression)
$DA
                                            k        R2 bpi_antisocialT1Sum.Inc
bpi_antisocialT1Sum                         1 0.2541698                      NA
bpi_anxiousDepressedSum                     1 0.1006864               0.1612971
bpi_antisocialT1Sum,bpi_anxiousDepressedSum 2 0.2619835                      NA
                                            bpi_anxiousDepressedSum.Inc
bpi_antisocialT1Sum                                         0.007813728
bpi_anxiousDepressedSum                                              NA
bpi_antisocialT1Sum,bpi_anxiousDepressedSum                          NA

$CD
     bpi_antisocialT1Sum bpi_anxiousDepressedSum
CD:0           0.2541698             0.100686422
CD:1           0.1612971             0.007813728

$GD
    bpi_antisocialT1Sum bpi_anxiousDepressedSum 
             0.20773347              0.05425008 

14 Commonality Analysis

Code
yhat::calc.yhat(multipleRegressionModel)
$PredictorMetrics
                            b  Beta     r    rs   rs2 Unique Common  CD:0  CD:1
bpi_antisocialT1Sum     0.466 0.456 0.504 0.985 0.970  0.161  0.093 0.254 0.161
bpi_anxiousDepressedSum 0.161 0.100 0.317 0.620 0.384  0.008  0.093 0.101 0.008
Total                      NA    NA    NA    NA 1.354  0.169  0.186 0.355 0.169
                        GenDom Pratt   RLW
bpi_antisocialT1Sum      0.208 0.230 0.208
bpi_anxiousDepressedSum  0.054 0.032 0.054
Total                    0.262 0.262 0.262

$OrderedPredictorMetrics
                        b Beta r rs rs2 Unique Common CD:0 CD:1 GenDom Pratt
bpi_antisocialT1Sum     1    1 1  1   1      1      1    1    1      1     1
bpi_anxiousDepressedSum 2    2 2  2   2      2      2    2    2      2     2
                        RLW
bpi_antisocialT1Sum       1
bpi_anxiousDepressedSum   2

$PairedDominanceMetrics
                                            Comp Cond Gen
bpi_antisocialT1Sum>bpi_anxiousDepressedSum    1    1   1

$APSRelatedMetrics
                                            Commonality  % Total    R2
bpi_antisocialT1Sum                               0.161    0.616 0.254
bpi_anxiousDepressedSum                           0.008    0.030 0.101
bpi_antisocialT1Sum,bpi_anxiousDepressedSum       0.093    0.354 0.262
Total                                             0.262    1.000    NA
                                            bpi_antisocialT1Sum.Inc
bpi_antisocialT1Sum                                              NA
bpi_anxiousDepressedSum                                       0.161
bpi_antisocialT1Sum,bpi_anxiousDepressedSum                      NA
Total                                                            NA
                                            bpi_anxiousDepressedSum.Inc
bpi_antisocialT1Sum                                               0.008
bpi_anxiousDepressedSum                                              NA
bpi_antisocialT1Sum,bpi_anxiousDepressedSum                          NA
Total                                                                NA
Code
allPossibleSubsetsRegression <- yhat::aps(
  dataMatrix = mydata,
  dv = "bpi_antisocialT2Sum",
  ivlist = list("bpi_antisocialT1Sum", "bpi_anxiousDepressedSum"))

yhat::commonality(allPossibleSubsetsRegression)
                                            Coefficient    % Total
bpi_antisocialT1Sum                         0.161297120 0.61567654
bpi_anxiousDepressedSum                     0.007813728 0.02982526
bpi_antisocialT1Sum,bpi_anxiousDepressedSum 0.092872694 0.35449820
Code
commonalityAnalysis <- yhat::commonalityCoefficients(
  dataMatrix = mydata,
  dv = "bpi_antisocialT2Sum",
  ivlist = list("bpi_antisocialT1Sum", "bpi_anxiousDepressedSum"))

commonalityAnalysis
$CC
                                                           Coefficient
Unique to bpi_antisocialT1Sum                                   0.1613
Unique to bpi_anxiousDepressedSum                               0.0078
Common to bpi_antisocialT1Sum, and bpi_anxiousDepressedSum      0.0929
Total                                                           0.2620
                                                               % Total
Unique to bpi_antisocialT1Sum                                    61.57
Unique to bpi_anxiousDepressedSum                                 2.98
Common to bpi_antisocialT1Sum, and bpi_anxiousDepressedSum       35.45
Total                                                           100.00

$CCTotalbyVar
                        Unique Common  Total
bpi_antisocialT1Sum     0.1613 0.0929 0.2542
bpi_anxiousDepressedSum 0.0078 0.0929 0.1007

15 Model Building Steps

  1. Examine extent and type of missing data, consider how to handle missing values (multiple imputation, FIML, pairwise deletion, listwise deletion)
  2. Examine descriptive statistics, consider variable transformations
  3. Examine a scatterplot matrix to see whether associations between the predictors and the outcome are linear or nonlinear
  4. Model building: theory and empiricism (stepwise regression, likelihood ratio tests, k-fold cross validation to check for over-fitting compared to simpler model)
  5. Test assumptions
    • Examine whether predictors have linear association with outcome (Residual Plots, Marginal Model Plots, Added-Variable Plots—check for non-horizontal lines)
    • Examine whether residuals have constant variance across levels of outcome and predictors (Residual Plots, Spread-Level Plots—check for fan-shaped plot or other increasing/decreasing structure)
    • Examine whether predictors show multicollinearity (VIF)
    • Examine whether residuals are normally distributed (QQ plot and density plot)
    • Examine influence of influential observations (high outlyingness and leverage) on parameter estimates by iteratively removing influential observations and refitting (Added-Variable Plots)
  6. Handle violated assumptions, select final set of predictors/outcomes and transformation of each
  7. Use k-fold cross validation to identify the best estimation procedure (OLS, MM, or LTS) on the final model
  8. Use identified estimation procedure to fit final model and determine the best parameter point estimates
  9. Calculate bootstrapped estimates to determine the confidence intervals around the parameter estimates of the final model

16 Bootstrapped Estimates

To determine the confidence intervals of parameter estimates

16.1 Linear Regression

Code
multipleRegressionModelBootstrapped <- Boot(multipleRegressionModelNoMissing, R = 1000)
summary(multipleRegressionModelBootstrapped)
Code
confint(multipleRegressionModelBootstrapped, level = .95, type = "bca")
Bootstrap percent confidence intervals

                             2.5 %    97.5 %
(Intercept)             1.07742194 1.3158099
bpi_antisocialT1Sum     0.42422867 0.5131381
bpi_anxiousDepressedSum 0.09321455 0.2290622
Code
hist(multipleRegressionModelBootstrapped)

16.2 Generalized Regression

Code
generalizedRegressionModelBootstrapped <- Boot(multipleRegressionModelNoMissing, R = 1000)
summary(generalizedRegressionModelBootstrapped)
Code
confint(generalizedRegressionModelBootstrapped, level = .95, type = "bca")
Bootstrap percent confidence intervals

                            2.5 %    97.5 %
(Intercept)             1.0741944 1.3250446
bpi_antisocialT1Sum     0.4191542 0.5124379
bpi_anxiousDepressedSum 0.1001762 0.2299066
Code
hist(generalizedRegressionModelBootstrapped)

17 Cross Validation

To examine degree of prediction error and over-fitting to determine best model

https://stats.stackexchange.com/questions/103459/how-do-i-know-which-method-of-cross-validation-is-best (archived at https://perma.cc/38BL-KLRJ)

17.1 K-fold cross validation

Code
kFolds <- 10
replications <- 20

folds <- cvFolds(nrow(mydata), K = kFolds, R = replications)

fitLm <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = "na.exclude")

fitLmrob <- lmrob(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = "na.exclude")

fitLts <- ltsReg(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = "na.exclude")

cvFitLm <- cvLm(
  fitLm,
  K = kFolds,
  R = replications)

cvFitLmrob <- cvLmrob(
  fitLmrob,
  K = kFolds,
  R = replications)

cvFitLts <- cvLts(
  fitLts,
  K = kFolds,
  R = replications)

cvFits <- cvSelect(
  OLS = cvFitLm,
  MM = cvFitLmrob,
  LTS = cvFitLts)
cvFits

10-fold CV results:
  Fit       CV
1 OLS 1.980824
2  MM 1.026541
3 LTS 1.007301

Best model:
   CV 
"LTS" 
Code
bwplot(
  cvFits,
  xlab = "Root Mean Square Error",
  xlim = c(0, max(cvFits$cv$CV) + 0.2))

18 Examining Model Fits

18.1 Effect Plots

Code
allEffects(multipleRegressionModel)
 model: bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum

 bpi_antisocialT1Sum effect
bpi_antisocialT1Sum
       0      3.2      6.5      9.8       13 
1.394591 2.884283 4.420527 5.956772 7.446463 

 bpi_anxiousDepressedSum effect
bpi_anxiousDepressedSum
       0        2        4        6        8 
2.502636 2.824145 3.145653 3.467161 3.788669 
Code
plot(allEffects(multipleRegressionModel))

18.2 Confidence Ellipses

Code
confidenceEllipse(
  multipleRegressionModel,
  levels = c(0.5, .95))

18.3 Data Ellipse

Code
mydata_nomissing <- na.omit(mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")])
dataEllipse(
  mydata_nomissing$bpi_antisocialT1Sum,
  mydata_nomissing$bpi_antisocialT2Sum,
  levels = c(0.5, .95))

19 Diagnostics

19.1 Assumptions

19.1.1 1. Linear relation between predictors and outcome

19.1.1.1 Ways to Test

19.1.1.1.1 Before Model Fitting
  • scatterplot matrix
  • distance correlation
19.1.1.1.1.1 Scatterplot Matrix
Code
scatterplotMatrix(
  ~ bpi_antisocialT2Sum + bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata)

19.1.1.1.1.2 Distance correlation

The distance correlation is an index of the degree of linear and nonlinear association between two variables.

Code
correlation(
  mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")],
  method = "distance",
  p_adjust = "none")
19.1.1.1.2 After Model Fitting

Check for nonlinearities (a non-horizontal line) in:

  • Residual Plots (Section 19.2.2)
  • Marginal Model Plots (Section 19.2.3)
  • Added-Variable Plots (Section 19.2.4)

19.1.1.2 Ways to Handle

  • Transform outcome/predictor variables (Box-Cox transformations)
  • Semi-parametric regression models: Generalized additive models (GAM)
  • Non-parametric regression models: Nearest-Neighbor Kernel Regression
19.1.1.2.1 Semi-parametric or non-parametric regression models

http://www.lisa.stat.vt.edu/?q=node/7517 (archived at https://web.archive.org/web/20180113065042/http://www.lisa.stat.vt.edu/?q=node/7517)

Note: using semi-parametric or non-parametric models increases fit in the context of nonlinearity at the expense of added complexity. Make sure to avoid fitting an overly complex model (e.g., use k-fold cross validation). Often, the simpler (generalized) linear model is preferable to semi-parametric or non-parametric approaches.

19.1.1.2.1.1 Semi-parametric: Generalized Additive Models

http://documents.software.dell.com/Statistics/Textbook/Generalized-Additive-Models (archived at https://web.archive.org/web/20170213041653/http://documents.software.dell.com/Statistics/Textbook/Generalized-Additive-Models)

Code
generalizedAdditiveModel <- gam(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  family = gaussian(),
  data = mydata,
  na.action = na.exclude)

summary(generalizedAdditiveModel)

Family: gaussian 
Link function: identity 

Formula:
bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum

Parametric coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              1.19830    0.05983  20.029  < 2e-16 ***
bpi_antisocialT1Sum      0.46553    0.01858  25.049  < 2e-16 ***
bpi_anxiousDepressedSum  0.16075    0.02916   5.513 3.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


R-sq.(adj) =  0.261   Deviance explained = 26.2%
GCV = 3.9191  Scale est. = 3.915     n = 2874
Code
confint(generalizedAdditiveModel)
Error in `glm.control()`:
! unused arguments (nthreads = 1, ncv.threads = 1, irls.reg = 0, mgcv.tol = 1e-07, mgcv.half = 15, rank.tol = 1.49011611938477e-08, nlm = list(7, 1e-06, 2, 1e-04, 200, FALSE), optim = list(1e+07), newton = list(1e-06, 5, 2, 30, FALSE), idLinksBases = TRUE, scalePenalty = TRUE, efs.lspmax = 15, efs.tol = 0.1, keepData = FALSE, scale.est = "fletcher", edge.correct = FALSE)
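
A GAM whose terms are all linear and parametric (as above) reduces to ordinary linear regression. To let the model estimate nonlinear associations, wrap the predictors in penalized smooth terms. A minimal sketch (the s() smooths below are standard mgcv syntax, but their use here is illustrative rather than part of the original analysis):

Code
generalizedAdditiveModelSmooth <- gam(
  bpi_antisocialT2Sum ~ s(bpi_antisocialT1Sum) + s(bpi_anxiousDepressedSum),
  family = gaussian(),
  data = mydata,
  na.action = na.exclude)

summary(generalizedAdditiveModelSmooth)
plot(generalizedAdditiveModelSmooth, pages = 1) # estimated smooths with confidence bands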
19.1.1.2.1.2 Non-parametric: Nearest-Neighbor Kernel Regression
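
As a minimal sketch, base R's loess() stands in here for kernel regression: it is a local polynomial smoother in which each fitted value is computed from the span fraction of nearest-neighbor observations (the span of 0.75 below is simply the loess() default, not a recommendation):

Code
mydata_complete <- na.omit(mydata[, c("bpi_antisocialT1Sum", "bpi_antisocialT2Sum")])

kernelRegressionModel <- loess(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum,
  data = mydata_complete,
  span = 0.75) # fraction of nearest neighbors entering each local fit

# Scatterplot with the local-regression curve overlaid
scatter.smooth(
  mydata_complete$bpi_antisocialT1Sum,
  mydata_complete$bpi_antisocialT2Sum,
  span = 0.75)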

19.1.2 2. Exogeneity

Exogeneity means that the predictors are uncorrelated with the model's error term (e.g., no omitted variable influences both a predictor and the outcome); without exogeneity, the coefficient estimates are biased and cannot be interpreted causally.

19.1.2.1 Ways to Test

  • Durbin-Wu-Hausman test of endogeneity
19.1.2.1.1 Durbin-Wu-Hausman test of endogeneity

The instrumental variables (2SLS) estimator is implemented in the R package AER as command:

Code
ivreg(y ~ x1 + x2 + w1 + w2 | z1 + z2 + z3 + w1 + w2)

where x1 and x2 are endogenous regressors, w1 and w2 are exogenous regressors, and z1 to z3 are excluded instruments.

Durbin-Wu-Hausman test:

Code
hsng2 <- read.dta("https://www.stata-press.com/data/r11/hsng2.dta") #archived at https://perma.cc/7P2Q-ARKR
Code
fiv <- ivreg(
  rent ~ hsngval + pcturban | pcturban + faminc + reg2 + reg3 + reg4,
  data = hsng2) #Housing values are likely endogenous and are therefore instrumented by median family income (faminc) and 3 regional dummies (reg2, reg3, reg4)

summary(
  fiv,
  diagnostics = TRUE)

Call:
ivreg(formula = rent ~ hsngval + pcturban | pcturban + faminc + 
    reg2 + reg3 + reg4, data = hsng2)

Residuals:
     Min       1Q   Median       3Q      Max 
-84.1948 -11.6023  -0.5239   8.6583  73.6130 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.207e+02  1.571e+01   7.685 7.55e-10 ***
hsngval     2.240e-03  3.388e-04   6.612 3.17e-08 ***
pcturban    8.152e-02  3.082e-01   0.265    0.793    

Diagnostic tests:
                 df1 df2 statistic  p-value    
Weak instruments   4  44     13.30  3.5e-07 ***
Wu-Hausman         1  46     15.91 0.000236 ***
Sargan             3  NA     11.29 0.010268 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 22.86 on 47 degrees of freedom
Multiple R-Squared: 0.5989, Adjusted R-squared: 0.5818 
Wald test: 42.66 on 2 and 47 DF,  p-value: 2.731e-11 

The Eicker-Huber-White covariance estimator, which is robust to heteroscedastic error terms, is obtained after estimation by passing vcov = sandwich to coeftest().

First-stage results are obtained by estimating the first-stage regression explicitly. For example:

Code
first <- lm(
  hsngval ~ pcturban + faminc + reg2 + reg3 + reg4,
  data = hsng2)

summary(first)

Call:
lm(formula = hsngval ~ pcturban + faminc + reg2 + reg3 + reg4, 
    data = hsng2)

Residuals:
   Min     1Q Median     3Q    Max 
-10504  -5223  -1162   2939  46756 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.867e+04  1.200e+04  -1.557 0.126736    
pcturban     1.822e+02  1.150e+02   1.584 0.120289    
faminc       2.731e+00  6.819e-01   4.006 0.000235 ***
reg2        -5.095e+03  4.122e+03  -1.236 0.223007    
reg3        -1.778e+03  4.073e+03  -0.437 0.664552    
reg4         1.341e+04  4.048e+03   3.314 0.001849 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9253 on 44 degrees of freedom
Multiple R-squared:  0.6908,    Adjusted R-squared:  0.6557 
F-statistic: 19.66 on 5 and 44 DF,  p-value: 3.032e-10

In the case of a single endogenous variable (K = 1), the F-statistic for assessing weak instruments is obtained by jointly testing the excluded instruments after estimating the first stage with, for example:

Code
waldtest(
  first,
  . ~ . - faminc - reg2 - reg3 - reg4)

or in case of heteroscedastic errors:

Code
waldtest(
  first,
  . ~ . - faminc - reg2 - reg3- reg4,
  vcov = sandwich)

19.1.2.2 Ways to Handle

  • Conduct an experiment/RCT with random assignment
  • Instrumental variables

19.1.3 3. Homoscedasticity of residuals

Homoscedasticity of the residuals means that the variance of the residuals does not differ as a function of the outcome/predictors (i.e., the residuals show constant variance as a function of outcome/predictors).

19.1.3.1 Ways to Test

  • Plot residuals vs. outcome and predictor variables (Residual Plots)
  • Plot residuals vs. fitted values (Residual Plots)
  • Time Series data: Plot residuals vs. time
  • Spread-level plot
  • Breusch-Pagan test: bptest() function from lmtest package
  • Goldfeld-Quandt Test
19.1.3.1.1 Residuals vs. outcome and predictor variables

Plot residuals, or perhaps their absolute values, versus the outcome and predictor variables (Residual Plots). Examine whether residual variance is constant at all levels of other variables or whether it increases/decreases as a function of another variable (or shows some other structure, e.g., small variance at low and high levels of a predictor and high variance in the middle). Note that this is different from whether the residuals show non-linearities—i.e., a non-horizontal line, which would indicate a nonlinear association between variables (see Assumption #1, above). Rather, here we are examining whether the variance changes as a function of another variable (e.g., a fan-shaped Residual Plot)

19.1.3.1.2 Spread-level plot

Examines whether spread (e.g., variance) depends on level (e.g., mean)—a plot of the log of the absolute Studentized residuals against the log of the fitted values

Code
spreadLevelPlot(multipleRegressionModel)


Suggested power transformation:  0.5304728 
19.1.3.1.3 Breusch-Pagan test

https://www.rdocumentation.org/packages/lmtest/versions/0.9-40/topics/bptest (archived at https://perma.cc/K4WC-7TVW)

Code
bptest(multipleRegressionModel)

    studentized Breusch-Pagan test

data:  multipleRegressionModel
BP = 94.638, df = 2, p-value < 2.2e-16
19.1.3.1.4 Goldfeld-Quandt Test
Code
gqtest(multipleRegressionModel)

    Goldfeld-Quandt test

data:  multipleRegressionModel
GQ = 1.0755, df1 = 1434, df2 = 1434, p-value = 0.08417
alternative hypothesis: variance increases from segment 1 to 2
19.1.3.1.5 Test of dependence of spread on level
Code
ncvTest(multipleRegressionModel)
Non-constant Variance Score Test 
Variance formula: ~ fitted.values 
Chisquare = 232.1196, Df = 1, p = < 2.22e-16
19.1.3.1.6 Test of dependence of spread on predictors
Code
ncvTest(
  multipleRegressionModel,
  ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum)
Non-constant Variance Score Test 
Variance formula: ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum 
Chisquare = 233.2878, Df = 2, p = < 2.22e-16

19.1.3.2 Ways to Handle

  • If residual variance increases as a function of the fitted values, consider a poisson regression model (especially for count data), for which the variance does increase with the mean
  • If residual variance differs as a function of model predictors, consider adding interactions/product terms (the effect of one variable depends on the level of another variable)
  • Try transforming the outcome variable to be normally distributed (log-transform if the errors seem consistent in percentage rather than absolute terms)
  • If the error variance is proportional to a variable \(z\), then fit the model using Weighted Least Squares (WLS), with the weights given by \(1/z\)
  • Weighted least squares (WLS) using the “weights” argument of the lm() function (see the sketch after this list); to get weights, see: https://stats.stackexchange.com/a/100410/20338 (archived at https://perma.cc/C6BY-G9MS)
  • Huber-White standard errors (a.k.a. “Sandwich” estimates) from a heteroscedasticity-corrected covariance matrix
    • coeftest() function from the lmtest package, with sandwich estimates from the sandwich package or hccm() estimates from the car package
    • robcov() function from the rms package
  • Time series data: ARCH (auto-regressive conditional heteroscedasticity) models
  • Time series data: seasonal patterns can be addressed by applying log transformation to outcome variable
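
As a minimal WLS sketch (the two-step scheme below, which models the absolute residuals as a function of the fitted values from an initial OLS fit, is one common heuristic for obtaining weights, not the only option):

Code
# Step 1: initial OLS fit
olsFit <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

# Step 2: model the residual spread as a function of the fitted values
spreadFit <- lm(
  abs(residuals(olsFit)) ~ fitted(olsFit),
  na.action = na.exclude)

# Step 3: refit with weights inversely proportional to the estimated variance
# (assumes the predicted spread is positive over the observed range)
mydata$wlsWeight <- 1 / fitted(spreadFit)^2

wlsFit <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  weights = wlsWeight,
  na.action = na.exclude)

summary(wlsFit)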
19.1.3.2.1 Huber-White standard errors

Comparing the model-based covariance matrix from vcov() with the sandwich estimate, the heteroscedasticity-corrected standard errors (the square roots of the diagonal) are larger:

Code
vcov(multipleRegressionModel)
                          (Intercept) bpi_antisocialT1Sum
(Intercept)              0.0035795220       -0.0006533375
bpi_antisocialT1Sum     -0.0006533375        0.0003453814
bpi_anxiousDepressedSum -0.0003167540       -0.0002574524
                        bpi_anxiousDepressedSum
(Intercept)                       -0.0003167540
bpi_antisocialT1Sum               -0.0002574524
bpi_anxiousDepressedSum            0.0008501565
Code
hccm(multipleRegressionModel)
Error in `V %*% t(X) %*% apply(X, 2, "*", (e^2) / factor)`:
! non-conformable arguments
Code
summary(multipleRegressionModel)

Call:
lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              1.19830    0.05983  20.029  < 2e-16 ***
bpi_antisocialT1Sum      0.46553    0.01858  25.049  < 2e-16 ***
bpi_anxiousDepressedSum  0.16075    0.02916   5.513 3.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.979 on 2871 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.262, Adjusted R-squared:  0.2615 
F-statistic: 509.6 on 2 and 2871 DF,  p-value: < 2.2e-16
Code
coeftest(multipleRegressionModel, vcov = sandwich)

t test of coefficients:

                        Estimate Std. Error t value  Pr(>|t|)    
(Intercept)             1.198300   0.062211 19.2618 < 2.2e-16 ***
bpi_antisocialT1Sum     0.465529   0.022929 20.3030 < 2.2e-16 ***
bpi_anxiousDepressedSum 0.160754   0.032991  4.8727  1.16e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
coeftest(multipleRegressionModel, vcov = hccm)
Error in `V %*% t(X) %*% apply(X, 2, "*", (e^2) / factor)`:
! non-conformable arguments
Code
robcov(ols(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  x = TRUE,
  y = TRUE))
Frequencies of Missing Values Due to Each Variable
    bpi_antisocialT2Sum     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
                   7501                    7613                    7616 

Linear Regression Model

ols(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, x = TRUE, y = TRUE)

                Model Likelihood    Discrimination    
                      Ratio Test           Indexes    
Obs    2874    LR chi2    873.09    R2       0.262    
sigma1.9786    d.f.            2    R2 adj   0.261    
d.f.   2871    Pr(> chi2) 0.0000    g        1.278    

Residuals

    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

                        Coef   S.E.   t     Pr(>|t|)
Intercept               1.1983 0.0622 19.26 <0.0001 
bpi_antisocialT1Sum     0.4655 0.0229 20.30 <0.0001 
bpi_anxiousDepressedSum 0.1608 0.0330  4.87 <0.0001 

Use “cluster” variable to account for nested data within-subject:

Code
robcov(ols(
  t_ext ~ m_ext + age,
  data = mydata,
  x = TRUE,
  y = TRUE),
  cluster = mydata$tcid) #account for nested data within subject

19.1.4 4. Errors are independent

Independent errors means that the errors are uncorrelated with each other.

19.1.4.1 Ways to Test

  • Intraclass correlation coefficient (ICC) from mixed model
  • Plot residuals vs. predictors (Residual Plots)
  • Time Series data: Residual time series plot (residuals vs. row number)
  • Time Series data: Table or plot of residual autocorrelations
  • Time Series data: Durbin-Watson statistic for test of significant residual autocorrelation at lag 1 (see the sketch after this list)
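
For example, the Durbin-Watson test of lag-1 residual autocorrelation (durbinWatsonTest() is from the car package, used throughout this document), plus a sketch of the ICC check; the random-intercept model below assumes the lme4 and performance packages and borrows tcid (the clustering variable from the robcov() example above) as the grouping variable:

Code
# Durbin-Watson test of lag-1 autocorrelation among the residuals
durbinWatsonTest(multipleRegressionModel)

# ICC from a random-intercept model (sketch; assumes lme4 and performance)
multilevelModel <- lme4::lmer(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum + (1 | tcid),
  data = mydata)

performance::icc(multilevelModel) # ICC near 0 suggests errors are independent across clusters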

19.1.4.2 Ways to Handle

  • Mixed-effects (multilevel) models that model the clustering directly
  • Cluster-robust standard errors (see the robcov() example with a cluster variable, above)

19.1.5 5. No multicollinearity

Multicollinearity occurs when the predictors are highly correlated with one another (i.e., nearly linearly dependent).

19.1.5.1 Ways to Test

  • Variance Inflation Factor (VIF)
  • Generalized Variance Inflation Factor (GVIF)—when models have related regressors (multiple polynomial terms or contrasts from same predictor)
  • Correlation
  • Tolerance
  • Condition Index
19.1.5.1.1 Variance Inflation Factor (VIF)

\[ \text{VIF} = 1/\text{Tolerance} \]

If the variance inflation factor of a predictor variable were 5.27 (\(\sqrt{5.27} \approx 2.3\)), this means that the standard error for the coefficient of that predictor variable is 2.3 times as large (i.e., confidence interval is 2.3 times wider) as it would be if that predictor variable were uncorrelated with the other predictor variables.

VIF = 1: Not correlated
1 < VIF < 5: Moderately correlated
VIF > 5–10: Highly correlated (multicollinearity present)

Code
vif(multipleRegressionModel)
    bpi_antisocialT1Sum bpi_anxiousDepressedSum 
               1.291545                1.291545 
19.1.5.1.2 Generalized Variance Inflation Factor (GVIF)

Useful when models have related regressors (multiple polynomial terms or contrasts from same predictor)
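
car::vif() reports the generalized VIF (and \(GVIF^{1/(2 \times Df)}\)) automatically when a model term has more than one degree of freedom. A sketch using an illustrative quadratic polynomial term (not part of the original model):

Code
multipleRegressionModel_poly <- lm(
  bpi_antisocialT2Sum ~ poly(bpi_antisocialT1Sum, 2, raw = TRUE) + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

vif(multipleRegressionModel_poly) # GVIF and GVIF^(1/(2*Df)) for the 2-df polynomial term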

19.1.5.1.3 Correlation

Examine the correlations among all independent variables; as a rule of thumb, the correlation coefficients should be smaller than .80.

Code
cor(
  mydata[,c("bpi_antisocialT1Sum","bpi_anxiousDepressedSum")],
  use = "pairwise.complete.obs")
                        bpi_antisocialT1Sum bpi_anxiousDepressedSum
bpi_antisocialT1Sum                1.000000                0.497279
bpi_anxiousDepressedSum            0.497279                1.000000
19.1.5.1.4 Tolerance

The tolerance is an index of the influence of one independent variable on all other independent variables.

\[ \text{tolerance} = 1/\text{VIF} \]

T < 0.2: there might be multicollinearity in the data
T < 0.01: there certainly is multicollinearity in the data
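
Because tolerance is the reciprocal of the VIF, it can be computed directly from the vif() output shown above:

Code
1 / vif(multipleRegressionModel)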

19.1.5.1.5 Condition Index

The condition index is calculated from an eigendecomposition (principal components analysis) of the predictor matrix. Values of 10–30 indicate mediocre multicollinearity in the regression variables. Values > 30 indicate strong multicollinearity.

For how to interpret, see here: https://stats.stackexchange.com/a/87610/20338 (archived at https://perma.cc/Y4J8-MY7Q)

Code
ols_eigen_cindex(multipleRegressionModel)

19.1.5.2 Ways to Handle

  • Remove highly correlated (i.e., redundant) predictors
  • Average the correlated predictors
  • Principal Component Analysis for data reduction (see the sketch after this list)
  • Standardize predictors
  • Center the data (subtract the mean)
  • Singular-value decomposition of the model matrix or the mean-centered model matrix
  • Conduct a factor analysis and rotate the factors to ensure independence of the factors
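
As a sketch of the data-reduction option, principal components of the standardized predictors can replace the correlated regressors (how many components to retain is left open here):

Code
# PCA on the standardized predictors (complete cases only)
predictorPCA <- prcomp(
  na.omit(mydata[, c("bpi_antisocialT1Sum", "bpi_anxiousDepressedSum")]),
  scale. = TRUE)

summary(predictorPCA) # proportion of variance explained by each component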

19.1.6 6. Errors are normally distributed

19.1.6.1 Ways to Test

  • QQ plot of residuals (Section 19.2.1.1)
  • PP plot of residuals (Section 19.2.1.2)
  • Density plot of studentized residuals compared to a normal distribution (Section 19.2.1.3)

19.1.6.2 Ways to Handle

  • Apply a transformation to the predictor or outcome variable
  • Exclude outliers
  • Robust regression
    • Best when no outliers: MM-type regression estimator
    • Most resistant to outliers: Least trimmed squares (LTS)—but not good as a standalone estimator, better for identifying outliers
      • ltsReg() function of robustbase package (best)
    • Best when single predictor: Theil-Sen estimator
      • mblm(, repeated = FALSE) function of mblm package
    • Robust correlation
      • Spearman’s rho: cor(, method = "spearman")
      • Percentage bend correlation
      • Minimum volume ellipsoid
      • Minimum covariance determinant
      • Winsorized correlation
      • Biweight midcorrelation
      • Xi (\(\xi\)) correlation
    • Not great options:
      • Quantile (L1) regression: rq() function of quantreg package
19.1.6.2.1 Transformations of Outcome Variable
19.1.6.2.1.1 Box-Cox Transformation

Useful if the outcome is strictly positive (or add a constant to outcome to make it strictly positive)

lambda = -1: inverse transformation
lambda = -0.5: 1/sqrt(Y)
lambda = 0: log transformation
lambda = 0.5: square root
lambda = 0.333: cube root
lambda = 1: no transformation
lambda = 2: squared
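
These lambda values index the Box-Cox family, defined for strictly positive \(y\) as:

\[ y^{(\lambda)} = \begin{cases} \dfrac{y^\lambda - 1}{\lambda}, & \lambda \neq 0 \\ \log(y), & \lambda = 0 \end{cases} \]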

Raw distribution

Code
plot(density(na.omit(mydata$bpi_antisocialT2Sum)))

Add constant to outcome to make it strictly positive

Code
strictlyPositiveDV <- lm(
  bpi_antisocialT2Sum + 1 ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

Identify best power transformation (lambda)

Consider rounding the power to a common value (square root = .5; cube root = .333; squared = 2)

Code
boxCox(strictlyPositiveDV)

Code
powerTransform(strictlyPositiveDV) 
Estimated transformation parameter 
      Y1 
0.234032 
Code
transformedDV <- powerTransform(strictlyPositiveDV)

summary(transformedDV)
bcPower Transformation to Normality 
   Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Y1     0.234        0.23       0.1825       0.2856

Likelihood ratio test that transformation parameter is equal to 0
 (log transformation)
                           LRT df       pval
LR test, lambda = (0) 78.04052  1 < 2.22e-16

Likelihood ratio test that no transformation is needed
                           LRT df       pval
LR test, lambda = (1) 853.8675  1 < 2.22e-16

Transform the DV

Code
mydata$bpi_antisocialT2SumTransformed <- bcPower(mydata$bpi_antisocialT2Sum + 1, coef(transformedDV))

plot(density(na.omit(mydata$bpi_antisocialT2SumTransformed)))

Compare residuals from model with and without transformation

Model without transformation

Code
summary(modelWithoutTransformation <- lm(
  bpi_antisocialT2Sum + 1 ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude))

Call:
lm(formula = bpi_antisocialT2Sum + 1 ~ bpi_antisocialT1Sum + 
    bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3755 -1.2337 -0.2212  0.9911 12.8017 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              2.19830    0.05983  36.743  < 2e-16 ***
bpi_antisocialT1Sum      0.46553    0.01858  25.049  < 2e-16 ***
bpi_anxiousDepressedSum  0.16075    0.02916   5.513 3.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.979 on 2871 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.262, Adjusted R-squared:  0.2615 
F-statistic: 509.6 on 2 and 2871 DF,  p-value: < 2.2e-16
Code
plot(density(na.omit(studres(modelWithoutTransformation))))

Model with transformation

Code
summary(modelWithTransformation <- lm(
  bpi_antisocialT2SumTransformed ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude))

Call:
lm(formula = bpi_antisocialT2SumTransformed ~ bpi_antisocialT1Sum + 
    bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.3429 -0.4868  0.0096  0.4922  2.9799 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             0.800373   0.022024  36.342  < 2e-16 ***
bpi_antisocialT1Sum     0.165476   0.006841  24.189  < 2e-16 ***
bpi_anxiousDepressedSum 0.055910   0.010733   5.209 2.03e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7283 on 2871 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.2477,    Adjusted R-squared:  0.2472 
F-statistic: 472.7 on 2 and 2871 DF,  p-value: < 2.2e-16
Code
plot(density(na.omit(studres(modelWithTransformation))))

Constructed Variable Test & Plot

A significant p-value indicates a strong need to transform the variable:

Code
multipleRegressionModel_constructedVariable <- update(
  multipleRegressionModel,
  . ~ . + boxCoxVariable(bpi_antisocialT1Sum + 1))
summary(multipleRegressionModel_constructedVariable)$coef["boxCoxVariable(bpi_antisocialT1Sum + 1)", , drop = FALSE]
                                           Estimate Std. Error   t value
boxCoxVariable(bpi_antisocialT1Sum + 1) -0.06064732 0.04683415 -1.294938
                                         Pr(>|t|)
boxCoxVariable(bpi_antisocialT1Sum + 1) 0.1954457

The plot allows us to see whether the need for transformation is spread throughout the data or depends on just a small fraction of observations:

Code
avPlots(multipleRegressionModel_constructedVariable, "boxCoxVariable(bpi_antisocialT1Sum + 1)")

Inverse Response Plot

The black line is the best-fitting power transformation:

Code
inverseResponsePlot(strictlyPositiveDV, lambda = c(-1,-0.5,0,1/3,.5,1,2))

19.1.6.2.1.2 Yeo-Johnson Transformations

Useful if the outcome is not strictly positive.

Code
yjPower(DV, lambda)
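
A concrete sketch (the lambda below is illustrative; in practice it would be estimated, e.g., with powerTransform() using family = "yjPower"):

Code
mydata$bpi_antisocialT2SumYJ <- yjPower(
  mydata$bpi_antisocialT2Sum,
  lambda = 0.5) # illustrative lambda, not an estimated value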
19.1.6.2.2 Transformations of Predictor Variable
19.1.6.2.2.1 Component-Plus-Residual Plots (Partial Residual Plots)

Linear model:

Code
crPlots(multipleRegressionModelNoMissing, order = 1)

Quadratic model:

Code
crPlots(multipleRegressionModelNoMissing, order = 2)

Code
multipleRegressionModel_quadratic <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + I(bpi_antisocialT1Sum^2) + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

summary(multipleRegressionModel_quadratic)

Call:
lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + I(bpi_antisocialT1Sum^2) + 
    bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.6238 -1.2610 -0.2484  0.9923 12.9008 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)               1.099237   0.076509  14.367  < 2e-16 ***
bpi_antisocialT1Sum       0.547680   0.043723  12.526  < 2e-16 ***
I(bpi_antisocialT1Sum^2) -0.010221   0.004925  -2.075    0.038 *  
bpi_anxiousDepressedSum   0.161734   0.029144   5.549 3.13e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.977 on 2870 degrees of freedom
  (8656 observations deleted due to missingness)
Multiple R-squared:  0.2631,    Adjusted R-squared:  0.2623 
F-statistic: 341.5 on 3 and 2870 DF,  p-value: < 2.2e-16
Code
anova(
  multipleRegressionModel_quadratic,
  multipleRegressionModel)
Code
crPlots(multipleRegressionModel_quadratic, order = 1)

19.1.6.2.2.2 CERES Plot (Combining conditional Expectations and RESiduals)

Useful when nonlinear associations among the predictors are very strong (component-plus-residual plots may appear nonlinear even though the true partial regression is linear, a phenomenon called leakage)

Code
ceresPlots(multipleRegressionModel)

Code
ceresPlots(multipleRegressionModel_quadratic)
Error in `plot.window()`:
! need finite 'ylim' values

19.1.6.2.2.3 Box-Tidwell Method for Choosing Predictor Transformations

The predictors must be strictly positive (or add a constant to make them strictly positive)

Code
boxTidwell(
  bpi_antisocialT2Sum ~ I(bpi_antisocialT1Sum + 1) + I(bpi_anxiousDepressedSum + 1),
  other.x = NULL, #list variables not to be transformed in other.x
  data = mydata,
  na.action = na.exclude)
                               MLE of lambda Score Statistic (t) Pr(>|t|)
I(bpi_antisocialT1Sum + 1)           0.90820             -1.0591   0.2897
I(bpi_anxiousDepressedSum + 1)       0.36689             -1.0650   0.2870

iterations =  4 

Score test for null hypothesis that all lambdas = 1:
F = 1.4056, df = 2 and 2869, Pr(>F) = 0.2454
19.1.6.2.2.4 Constructed-Variables Plot
Code
multipleRegressionModel_cv <- lm(
  bpi_antisocialT2Sum ~ I(bpi_antisocialT1Sum + 1) + I(bpi_anxiousDepressedSum + 1) + I(bpi_antisocialT1Sum * log(bpi_antisocialT1Sum)) + I(bpi_anxiousDepressedSum * log(bpi_anxiousDepressedSum)),
  data = mydata,
  na.action = na.exclude)

summary(multipleRegressionModel_cv)$coef["I(bpi_antisocialT1Sum * log(bpi_antisocialT1Sum))", , drop = FALSE]
                                                    Estimate Std. Error
I(bpi_antisocialT1Sum * log(bpi_antisocialT1Sum)) -0.1691003 0.06887107
                                                    t value   Pr(>|t|)
I(bpi_antisocialT1Sum * log(bpi_antisocialT1Sum)) -2.455317 0.01418393
Code
summary(multipleRegressionModel_cv)$coef["I(bpi_anxiousDepressedSum * log(bpi_anxiousDepressedSum))", , drop = FALSE]
                                                            Estimate Std. Error
I(bpi_anxiousDepressedSum * log(bpi_anxiousDepressedSum)) 0.00828867  0.1384918
                                                             t value  Pr(>|t|)
I(bpi_anxiousDepressedSum * log(bpi_anxiousDepressedSum)) 0.05984953 0.9522831
Code
avPlots(multipleRegressionModel_cv, "I(bpi_antisocialT1Sum * log(bpi_antisocialT1Sum))")

Code
avPlots(multipleRegressionModel_cv, "I(bpi_anxiousDepressedSum * log(bpi_anxiousDepressedSum))")

19.1.6.2.3 Robust models


19.1.6.2.3.1 Robust correlation

Spearman’s rho

Spearman’s rho is a non-parametric correlation.

Code
cor.test(
  mydata$bpi_antisocialT1Sum,
  mydata$bpi_antisocialT2Sum) #Pearson r, correlation that is sensitive to outliers

    Pearson's product-moment correlation

data:  mydata$bpi_antisocialT1Sum and mydata$bpi_antisocialT2Sum
t = 31.274, df = 2873, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.4761758 0.5307381
sample estimates:
      cor 
0.5039595 
Code
cor.test(
  mydata$bpi_antisocialT1Sum,
  mydata$bpi_antisocialT2Sum, method = "spearman") #Spearman's rho, a rank correlation that is less sensitive to outliers

    Spearman's rank correlation rho

data:  mydata$bpi_antisocialT1Sum and mydata$bpi_antisocialT2Sum
S = 1997347186, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.4956973 

Minimum volume ellipsoid

Code
cov.mve(
  na.omit(mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")]),
  cor = TRUE)
$center
bpi_antisocialT1Sum bpi_antisocialT2Sum 
           2.056773            1.952882 

$cov
                    bpi_antisocialT1Sum bpi_antisocialT2Sum
bpi_antisocialT1Sum           2.1044887           0.8162201
bpi_antisocialT2Sum           0.8162201           2.1211668

$msg
[1] "147 singular samples of size 3 out of 1500"

$crit
[1] 3.79572

$best
[1]  1  3  8  9 10 12

$cor
                    bpi_antisocialT1Sum bpi_antisocialT2Sum
bpi_antisocialT1Sum           1.0000000           0.3863195
bpi_antisocialT2Sum           0.3863195           1.0000000

$n.obs
[1] 2875

Minimum covariance determinant

Code
cov.mcd(
  na.omit(mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")]),
  cor = TRUE)
$center
bpi_antisocialT1Sum bpi_antisocialT2Sum 
           2.181181            2.094954 

$cov
                    bpi_antisocialT1Sum bpi_antisocialT2Sum
bpi_antisocialT1Sum            2.379036            1.096437
bpi_antisocialT2Sum            1.096437            2.482409

$msg
[1] "170 singular samples of size 3 out of 1500"

$crit
[1] -0.3839279

$best
[1]  1  3  8  9 10 12

$cor
                    bpi_antisocialT1Sum bpi_antisocialT2Sum
bpi_antisocialT1Sum           1.0000000           0.4511764
bpi_antisocialT2Sum           0.4511764           1.0000000

$n.obs
[1] 2875

Winsorized correlation

Code
correlation(
  mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")],
  winsorize = 0.2,
  p_adjust = "none")

Percentage bend correlation

Code
correlation(
  mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")],
  method = "percentage",
  p_adjust = "none")

Biweight midcorrelation

Code
correlation(
  mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")],
  method = "biweight",
  p_adjust = "none")

Xi (\(\xi\))

Xi (\(\xi\)) is an index of the degree of dependence between two variables.

Chatterjee, S. (2021). A new coefficient of correlation. Journal of the American Statistical Association, 116(536), 2009-2022. https://doi.org/10.1080/01621459.2020.1758115

Code
set.seed(52242) # for reproducibility

XICOR::xicor(
  mydata$bpi_antisocialT1Sum,
  mydata$bpi_antisocialT2Sum,
  method = "permutation",
  pvalue = TRUE
)
$xi
[1] 0.1751217

$sd
[1] 0.012695

$pval
[1] 0
19.1.6.2.3.2 Robust regression with a single predictor

Theil-Sen estimator

The Theil-Sen single median estimator is robust to outliers; missing values must be removed first

Code
mydata_subset <- na.omit(mydata[,c("bpi_antisocialT1Sum","bpi_antisocialT2Sum")])[1:400,]
mblm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum,
  dataframe = mydata_subset,
  repeated = FALSE)

Call:
mblm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum, dataframe = mydata_subset, 
    repeated = FALSE)

Coefficients:
        (Intercept)  bpi_antisocialT1Sum  
                1.0                  0.6  
19.1.6.2.3.3 Robust multiple regression

Best when no outliers: MM-type regression estimator

Robust linear regression

MM-type regression estimator (best):

Code
lmrob(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

Call:
lmrob(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,     data = mydata, na.action = na.exclude)
 \--> method = "MM"
Coefficients:
            (Intercept)      bpi_antisocialT1Sum  bpi_anxiousDepressedSum  
                 0.9440                   0.4914                   0.1577  

Iteratively reweighted least squares (IRLS): http://www.ats.ucla.edu/stat/r/dae/rreg.htm (archived at https://web.archive.org/web/20161119025907/http://www.ats.ucla.edu/stat/r/dae/rreg.htm)

Code
rlm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)
Call:
rlm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    data = mydata, na.action = na.exclude)
Converged in 6 iterations

Coefficients:
            (Intercept)     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
              0.9838033               0.4873118               0.1620020 

Degrees of freedom: 2874 total; 2871 residual
  (8656 observations deleted due to missingness)
Scale estimate: 1.62 

Robust generalized regression

Code
glmrob(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  family = "poisson",
  na.action = "na.exclude")

Call:  glmrob(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum +      bpi_anxiousDepressedSum, family = "poisson", data = mydata,      na.action = "na.exclude") 

Coefficients:
            (Intercept)      bpi_antisocialT1Sum  bpi_anxiousDepressedSum  
                0.37281                  0.15589                  0.05026  

Number of observations: 2874 
Fitted by method  'Mqle' 

Most resistant to outliers: Least trimmed squares (LTS)—but not good as a standalone estimator, better for identifying outliers

Code
ltsReg(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)

Call:
ltsReg.formula(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum +     bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)

Coefficients:
              Intercept      bpi_antisocialT1Sum  bpi_anxiousDepressedSum  
                 0.7813                   0.5145                   0.1318  

Scale estimate 1.75 

Not great options:

Quantile (L1) regression: rq() function of quantreg package

https://data.library.virginia.edu/getting-started-with-quantile-regression/ (archived at https://perma.cc/FSV4-5DCR)

Code
quantiles <- 1:9/10

quantileRegressionModel <- rq(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  tau = quantiles,
  na.action = na.exclude)

quantileRegressionModel
Call:
rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

Coefficients:
                           tau= 0.1  tau= 0.2   tau= 0.3 tau= 0.4 tau= 0.5
(Intercept)             -0.19607843 0.0000000 0.09090909      0.5     0.88
bpi_antisocialT1Sum      0.19607843 0.3333333 0.45454545      0.5     0.52
bpi_anxiousDepressedSum  0.09803922 0.1111111 0.13636364      0.1     0.12
                         tau= 0.6 tau= 0.7 tau= 0.8  tau= 0.9
(Intercept)             1.0000000   1.3750   2.1250 3.1000000
bpi_antisocialT1Sum     0.5913978   0.6250   0.6250 0.6428571
bpi_anxiousDepressedSum 0.2043011   0.1875   0.1875 0.2285714

Degrees of freedom: 11530 total; 11527 residual
Code
summary(quantileRegressionModel)

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.1

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)             -0.19608  0.05341   -3.67101  0.00025
bpi_antisocialT1Sum      0.19608  0.03492    5.61452  0.00000
bpi_anxiousDepressedSum  0.09804  0.03536    2.77276  0.00559

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.2

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              0.00000  0.04196    0.00000  1.00000
bpi_antisocialT1Sum      0.33333  0.02103   15.85299  0.00000
bpi_anxiousDepressedSum  0.11111  0.02669    4.16376  0.00003

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.3

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              0.09091  0.05830    1.55945  0.11900
bpi_antisocialT1Sum      0.45455  0.02584   17.59380  0.00000
bpi_anxiousDepressedSum  0.13636  0.03601    3.78723  0.00016

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.4

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              0.50000  0.06410    7.80035  0.00000
bpi_antisocialT1Sum      0.50000  0.02374   21.06425  0.00000
bpi_anxiousDepressedSum  0.10000  0.03372    2.96584  0.00304

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.5

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              0.88000  0.06584   13.36547  0.00000
bpi_antisocialT1Sum      0.52000  0.02491   20.87561  0.00000
bpi_anxiousDepressedSum  0.12000  0.03771    3.18245  0.00148

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.6

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              1.00000  0.04246   23.55356  0.00000
bpi_antisocialT1Sum      0.59140  0.02146   27.55553  0.00000
bpi_anxiousDepressedSum  0.20430  0.03156    6.47422  0.00000

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.7

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              1.37500  0.07716   17.82011  0.00000
bpi_antisocialT1Sum      0.62500  0.02397   26.07659  0.00000
bpi_anxiousDepressedSum  0.18750  0.03760    4.98623  0.00000

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.8

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              2.12500  0.09498   22.37224  0.00000
bpi_antisocialT1Sum      0.62500  0.03524   17.73509  0.00000
bpi_anxiousDepressedSum  0.18750  0.05040    3.72020  0.00020

Call: rq(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum, 
    tau = quantiles, data = mydata, na.action = na.exclude)

tau: [1] 0.9

Coefficients:
                        Value    Std. Error t value  Pr(>|t|)
(Intercept)              3.10000  0.11730   26.42749  0.00000
bpi_antisocialT1Sum      0.64286  0.04797   13.40136  0.00000
bpi_anxiousDepressedSum  0.22857  0.06439    3.54960  0.00039

Plot to examine the association between the predictor and the outcome at different quantiles of the outcome:

Code
ggplot(
  mydata,
  aes(bpi_antisocialT1Sum, bpi_antisocialT2Sum)) + 
  geom_point() + 
  geom_quantile(
    quantiles = quantiles,
    aes(color = factor(after_stat(quantile))))

Below is a plot that examines the quantile regression coefficients (and associated confidence intervals) across quantiles of the outcome distribution. The x-axis is the quantile (tau). The y-axis is the coefficient estimate. Each black dot is the coefficient for the given quantile. The red lines are the least squares estimate of the coefficient and its confidence interval.

Code
plot(summary(quantileRegressionModel))

Code
plot(
  summary(quantileRegressionModel),
  parm = "bpi_antisocialT1Sum")

19.2 Examining Model Assumptions

19.2.1 Distribution of Residuals

19.2.1.1 QQ plots

https://www.ucd.ie/ecomodel/Resources/QQplots_WebVersion.html (archived at https://perma.cc/9UC3-3WRX)

Code
qqPlot(multipleRegressionModel, main = "QQ Plot", id = TRUE)

[1] 8955 8956
Code
qqnorm(resid(multipleRegressionModel))

Code
qqnorm(resid(rmsMultipleRegressionModel))
qqnorm(resid(robustLinearRegression))

Code
qqnorm(resid(ltsRegression))

Code
qqnorm(resid(generalizedRegressionModel))

Code
qqnorm(resid(rmsGeneralizedRegressionModel))
Error in `residuals.Glm()`:
! argument "type" is missing, with no default
Code
qqnorm(resid(robustGeneralizedRegression))

Code
qqnorm(resid(multilevelRegressionModel))
Error in `h()`:
! error in evaluating the argument 'object' in selecting a method for function 'resid': object 'multilevelRegressionModel' not found

19.2.1.2 PP plots

Code
ppPlot(multipleRegressionModel)

Code
ppPlot(rmsMultipleRegressionModel)
ppPlot(robustLinearRegression)

Code
ppPlot(ltsRegression)

Code
ppPlot(generalizedRegressionModel)

Code
ppPlot(rmsGeneralizedRegressionModel)
Error in `residuals.Glm()`:
! argument "type" is missing, with no default
Code
ppPlot(robustGeneralizedRegression)

19.2.1.3 Density Plot of Residuals

Code
studentizedResiduals <- na.omit(rstudent(multipleRegressionModel))
plot(
  density(studentizedResiduals),
  col="red")
xfit <- seq(
  min(studentizedResiduals, na.rm = TRUE),
  max(studentizedResiduals, na.rm = TRUE),
  length = 40)
lines(
  xfit,
  dnorm(xfit),
  col="gray") #compare to normal distribution

19.2.2 Residual Plots

Residual plots are plots of the residuals versus observed/fitted values.

Includes plots of a) model residuals versus observed values on predictors and b) model residuals versus model fitted values.

Note: the lack-of-fit tests require a model fit without na.action = na.exclude (hence the use of multipleRegressionModelNoMissing below)

Tests include:

  • a lack-of-fit (curvature) test for each numeric predictor: the t-test for the squared term of that predictor added to the model; a non-significant test indicates no lack of fit for that predictor
  • Tukey’s test for nonadditivity: adds the squares of the fitted values to the model and refits (evaluates adequacy of model fit)
Code
residualPlots(
  multipleRegressionModelNoMissing,
  id = TRUE)

                        Test stat Pr(>|Test stat|)  
bpi_antisocialT1Sum       -2.0755          0.03803 *
bpi_anxiousDepressedSum   -1.2769          0.20175  
Tukey test                -2.3351          0.01954 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

19.2.3 Marginal Model Plots

Marginal model plots are plots of the outcome versus predictors/fitted values.

Includes plots of a) observed outcome values versus values on predictors (ignoring the other predictors) and b) observed outcome values versus model fitted values.

Code
marginalModelPlots(
  multipleRegressionModel,
  sd = TRUE,
  id = TRUE)

19.2.4 Added-Variable Plots

Added-variable plots are plots of the partial association between the outcome and each predictor, controlling for all other predictors.

Useful for identifying jointly influential observations and studying the impact of observations on the regression coefficients.

y-axis: residuals from regressing the outcome on all predictors excluding the predictor of interest

x-axis: residuals from regressing the predictor of interest on all other predictors

Code
avPlots(
  multipleRegressionModel,
  id = TRUE)

19.2.4.1 Refit model removing jointly influential observations

Code
multipleRegressionModel_removeJointlyInfluentialObs <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(6318,6023,4022,4040))

avPlots(
  multipleRegressionModel_removeJointlyInfluentialObs,
  id = TRUE)

Code
compareCoefs(
  multipleRegressionModel,
  multipleRegressionModel_removeJointlyInfluentialObs)
Calls:
1: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)
2: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(6318, 6023, 4022, 4040),
   na.action = na.exclude)

                        Model 1 Model 2
(Intercept)              1.1983  1.1949
SE                       0.0598  0.0599
                                       
bpi_antisocialT1Sum      0.4655  0.4649
SE                       0.0186  0.0188
                                       
bpi_anxiousDepressedSum  0.1608  0.1665
SE                       0.0292  0.0295
                                       

19.2.5 Outlier test

Locates the largest Studentized residuals in absolute value and computes the Bonferroni-corrected p-values based on a t-test for linear models.

Test of outlyingness, i.e., how likely one would have a residual of a given magnitude in a normal distribution with the same sample size

Note: it does not test how extreme an observation is relative to its distribution (i.e., leverage)

Code
outlierTest(multipleRegressionModel)
     rstudent unadjusted p-value Bonferroni p
8955 6.519574         8.2999e-11   2.3854e-07
8956 6.519574         8.2999e-11   2.3854e-07
8957 6.182195         7.2185e-10   2.0746e-06
2560 4.914941         9.3800e-07   2.6958e-03
1385 4.573975         4.9884e-06   1.4337e-02

19.2.6 Observations with high leverage

Identifies observations with high leverage (i.e., high hat values)

Hat values are an index of leverage (observations that are far from the center of the regressor space have greater influence on OLS regression coefficients)

Code
hist(hatvalues(multipleRegressionModel))

Code
plot(hatvalues(multipleRegressionModel))

Code
influenceIndexPlot(
  multipleRegressionModel,
  id = TRUE)

Code
influencePlot(
  multipleRegressionModel,
  id = TRUE) # circle size is proportional to Cook's Distance

Code
leveragePlots(
  multipleRegressionModel,
  id = TRUE)

19.2.7 Observations with high influence (on OLS regression coefficients)

https://cran.r-project.org/web/packages/olsrr/vignettes/influence_measures.html (archived at https://perma.cc/T2TS-PZZ4)

Code
head(influence.measures(multipleRegressionModel)$infmat)
        dfb.1_     dfb.b_T1     dfb.b_DS       dffit     cov.r       cook.d
1  0.000000000  0.000000000  0.000000000          NA 1.0000000           NA
2  0.000000000  0.000000000  0.000000000          NA 1.0000000           NA
3 -0.006546394  0.008357248 -0.006308529 -0.01004886 1.0024362 0.0000336708
4  0.042862749 -0.025185789 -0.007782877  0.04286275 0.9998621 0.0006121902
5  0.000000000  0.000000000  0.000000000          NA 1.0000000           NA
6 -0.018327367  0.010769006  0.003327823 -0.01832737 1.0015775 0.0001119888
           hat
1 0.0000000000
2 0.0000000000
3 0.0014593072
4 0.0009143185
5 0.0000000000
6 0.0009143185
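
Calling summary() on the same object prints only the observations that are flagged as potentially influential on at least one measure:

Code
summary(influence.measures(multipleRegressionModel))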

19.2.8 DFBETA

Code
head(dfbeta(multipleRegressionModel))
    (Intercept) bpi_antisocialT1Sum bpi_anxiousDepressedSum
1  0.0000000000        0.0000000000            0.000000e+00
2  0.0000000000        0.0000000000            0.000000e+00
3 -0.0003917284        0.0001553400           -1.839704e-04
4  0.0025639901       -0.0004679817           -2.268890e-04
5  0.0000000000        0.0000000000            0.000000e+00
6 -0.0010966309        0.0002001580            9.704151e-05
Code
hist(dfbeta(multipleRegressionModel))

Code
dfbetasPlots(
  multipleRegressionModel,
  id.n = 3) # dfbetasPlots() takes id.n rather than a single id argument;
            # passing id = TRUE is ambiguous ("argument 2 matches multiple
            # formal arguments") and throws an error
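
DFBETA indexes how much each regression coefficient changes when a given observation is deleted; DFBETAS is the same quantity standardized by the coefficient's standard error. A minimal sketch, assuming the common size-adjusted cutoff of |DFBETAS| > 2/sqrt(n):

Code
dfbetasValues <- dfbetas(multipleRegressionModel)
numberOfObservations <- nrow(model.frame(multipleRegressionModel))

# observations whose deletion shifts any coefficient by more than the cutoff
head(which(apply(
  abs(dfbetasValues) > 2 / sqrt(numberOfObservations),
  1,
  any)))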

19.2.9 DFFITS

Code
head(dffits(multipleRegressionModel)[order(dffits(multipleRegressionModel), decreasing = TRUE)])
     8472      4468      1917      8955      8956      1986 
0.2664075 0.2428065 0.2237760 0.1972271 0.1972271 0.1923034 
Code
hist(dffits(multipleRegressionModel))

Code
plot(dffits(multipleRegressionModel))

Code
plot(dffits(multipleRegressionModel)[order(dffits(multipleRegressionModel), decreasing = TRUE)])
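
DFFITS indexes how much an observation's fitted value changes, in standard-error units, when that observation is deleted. A minimal sketch, assuming the common size-adjusted cutoff of |DFFITS| > 2 * sqrt((k + 1)/n):

Code
dffitsValues <- dffits(multipleRegressionModel)

numberOfCoefficients <- length(coef(multipleRegressionModel))
numberOfObservations <- nrow(model.frame(multipleRegressionModel))

dffitsCutoff <- 2 * sqrt(numberOfCoefficients / numberOfObservations)

head(sort(
  abs(dffitsValues)[which(abs(dffitsValues) > dffitsCutoff)],
  decreasing = TRUE))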

19.2.10 Cook’s Distance

Observations that are both outlying (have a high residual from the regression line) and have high leverage (are far from the center of the regressor space) have high influence on the OLS regression coefficients. An observation will have less influence if it lies on the regression line (not an outlier, i.e., has a low residual) or if it has low leverage (i.e., has a value near the center of a predictor’s distribution).

Code
head(cooks.distance(multipleRegressionModel)[order(cooks.distance(multipleRegressionModel), decreasing = TRUE)])
      2765       1371       2767       8472       6170       6023 
0.05488391 0.03624388 0.03624388 0.02358305 0.02323282 0.02315496 
Code
hist(cooks.distance(multipleRegressionModel))

Code
plot(cooks.distance(multipleRegressionModel))

Code
plot(cooks.distance(multipleRegressionModel)[order(cooks.distance(multipleRegressionModel), decreasing = TRUE)])
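
A minimal sketch for flagging influential observations, assuming the common rough cutoff of Cook's distance greater than 4/n:

Code
cooksDistances <- cooks.distance(multipleRegressionModel)
numberOfObservations <- nrow(model.frame(multipleRegressionModel))

head(sort(
  cooksDistances[which(cooksDistances > 4 / numberOfObservations)],
  decreasing = TRUE))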

19.2.10.1 Refit model removing values with high Cook's distance

Code
multipleRegressionModel_2 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371))

multipleRegressionModel_3 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371,2767))

multipleRegressionModel_4 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371,2767,8472))

multipleRegressionModel_5 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371,2767,8472,6170))

multipleRegressionModel_6 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371,2767,8472,6170,6023))

multipleRegressionModel_7 <- lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude,
  subset = -c(2765,1371,2767,8472,6170,6023,2766))
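
The same sequence of models can be fit more compactly in a loop; below is a sketch that reproduces multipleRegressionModel_2 through multipleRegressionModel_7, assuming the same row indices as above:

Code
dropOrder <- c(2765, 1371, 2767, 8472, 6170, 6023, 2766)

# element k - 1 of the list drops the first k rows in dropOrder
refittedModels <- lapply(2:length(dropOrder), function(k) {
  lm(
    bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
    data = mydata,
    na.action = na.exclude,
    subset = -dropOrder[1:k])
})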

19.2.10.1.1 Examine how much regression coefficients change when excluding influential observations

Code
compareCoefs(
  multipleRegressionModel,
  multipleRegressionModel_2,
  multipleRegressionModel_3,
  multipleRegressionModel_4,
  multipleRegressionModel_5,
  multipleRegressionModel_6,
  multipleRegressionModel_7)
Calls:
1: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, na.action = na.exclude)
2: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371), na.action =
   na.exclude)
3: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371, 2767), 
  na.action = na.exclude)
4: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371, 2767, 8472),
   na.action = na.exclude)
5: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371, 2767, 8472, 
  6170), na.action = na.exclude)
6: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371, 2767, 8472, 
  6170, 6023), na.action = na.exclude)
7: lm(formula = bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + 
  bpi_anxiousDepressedSum, data = mydata, subset = -c(2765, 1371, 2767, 8472, 
  6170, 6023, 2766), na.action = na.exclude)

                        Model 1 Model 2 Model 3 Model 4 Model 5 Model 6 Model 7
(Intercept)              1.1983  1.1704  1.1577  1.1676  1.1620  1.1537  1.1439
SE                       0.0598  0.0597  0.0596  0.0596  0.0596  0.0595  0.0595
                                                                               
bpi_antisocialT1Sum      0.4655  0.4739  0.4779  0.4748  0.4744  0.4793  0.4836
SE                       0.0186  0.0185  0.0185  0.0185  0.0185  0.0185  0.0185
                                                                               
bpi_anxiousDepressedSum  0.1608  0.1691  0.1726  0.1699  0.1770  0.1742  0.1743
SE                       0.0292  0.0290  0.0290  0.0289  0.0290  0.0290  0.0289
                                                                               

19.2.10.2 Examine how much regression coefficients change when using least trimmed squares (LTS), which is resistant to outliers

Code
coef(lm(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)) # ordinary least squares (OLS) estimates
            (Intercept)     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
              1.1983004               0.4655286               0.1607541 
Code
coef(ltsReg(
  bpi_antisocialT2Sum ~ bpi_antisocialT1Sum + bpi_anxiousDepressedSum,
  data = mydata,
  na.action = na.exclude)) # least trimmed squares (LTS) estimates (robustbase)
              Intercept     bpi_antisocialT1Sum bpi_anxiousDepressedSum 
              0.7439567               0.5432644               0.1524929 

19.2.11 Resources

Book: An R Companion to Applied Regression

https://people.duke.edu/~rnau/testing.htm (archived at https://perma.cc/9CXH-GBSN)

https://www.statmethods.net/stats/rdiagnostics.html (archived at https://perma.cc/7JX8-K6BY)

https://socserv.socsci.mcmaster.ca/jfox/Courses/Brazil-2009/index.html (archived at https://perma.cc/4A2P-9S49)

https://en.wikipedia.org/wiki/Ordinary_least_squares#Assumptions (archived at https://perma.cc/Q4NS-X8TJ)

20 Session Info

Code
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] yhat_2.0-5         rockchalk_1.8.157  effectsize_1.0.1   cocor_1.1-4       
 [5] XICOR_0.4.1        mice_3.19.0        regtools_1.7.0     gtools_3.9.5      
 [9] FNN_1.1.4.1        lavaan_0.6-21      interactions_1.2.0 correlation_0.8.8 
[13] effects_4.2-4      mblm_0.12.1        quantreg_6.1       SparseM_1.84-2    
[17] olsrr_0.6.1        foreign_0.8-90     AER_1.2-15         survival_3.8-3    
[21] sandwich_3.1-1     lmtest_0.9-40      zoo_1.8-15         mgcv_1.9-3        
[25] nlme_3.1-168       car_3.1-3          carData_3.0-5      cvTools_0.3.3     
[29] lattice_0.22-7     brms_2.23.0        Rcpp_1.1.0         robustbase_0.99-6 
[33] rms_8.1-0          Hmisc_5.2-4        psych_2.5.6        lubridate_1.9.4   
[37] forcats_1.0.1      stringr_1.6.0      dplyr_1.1.4        purrr_1.2.0       
[41] readr_2.1.6        tidyr_1.3.2        tibble_3.3.0       ggplot2_4.0.1     
[45] tidyverse_2.0.0    MASS_7.3-65        petersenlab_1.2.2 

loaded via a namespace (and not attached):
  [1] domir_1.2.0           splines_4.5.2         polspline_1.1.25     
  [4] R.oo_1.27.1           datawizard_1.3.0      rpart_4.1.24         
  [7] lifecycle_1.0.4       Rdpack_2.6.4          StanHeaders_2.32.10  
 [10] processx_3.8.6        globals_0.18.0        insight_1.4.4        
 [13] backports_1.5.0       survey_4.4-8          magrittr_2.0.4       
 [16] openxlsx_4.2.8.1      rmarkdown_2.30        plotrix_3.8-13       
 [19] yaml_2.3.12           otel_0.2.0            text2vec_0.6.6       
 [22] zip_2.3.3             pkgbuild_1.4.8        DBI_1.2.3            
 [25] minqa_1.2.8           RColorBrewer_1.1-3    multcomp_1.4-29      
 [28] abind_1.4-8           quadprog_1.5-8        nnet_7.3-20          
 [31] TH.data_1.1-5         tensorA_0.36.2.1      psychTools_2.5.7.22  
 [34] jtools_2.3.0          float_0.3-3           inline_0.3.21        
 [37] listenv_0.10.0        nortest_1.0-4         performance_0.15.3   
 [40] MatrixModels_0.5-4    goftest_1.2-3         bridgesampling_1.2-1 
 [43] parallelly_1.46.0     codetools_0.2-20      tidyselect_1.2.1     
 [46] shape_1.4.6.1         bayesplot_1.15.0      farver_2.1.2         
 [49] lme4_1.1-38           broom.mixed_0.2.9.6   matrixStats_1.5.0    
 [52] stats4_4.5.2          base64enc_0.1-3       jsonlite_2.0.0       
 [55] rsparse_0.5.3         mitml_0.4-5           Formula_1.2-5        
 [58] iterators_1.0.14      foreach_1.5.2         tools_4.5.2          
 [61] miscTools_0.6-28      yacca_1.4-2           glue_1.8.0           
 [64] mnormt_2.1.1          gridExtra_2.3         pan_1.9              
 [67] xfun_0.55             distributional_0.5.0  rtf_0.4-14.1         
 [70] rje_1.12.1            loo_2.9.0             withr_3.0.2          
 [73] fastmap_1.2.0         mitools_2.4           boot_1.3-32          
 [76] callr_3.7.6           digest_0.6.39         estimability_1.5.1   
 [79] timechange_0.3.0      R6_2.6.1              colorspace_2.1-2     
 [82] mix_1.0-13            R.methodsS3_1.8.2     RhpcBLASctl_0.23-42  
 [85] generics_0.1.4        data.table_1.18.0     htmlwidgets_1.6.4    
 [88] parameters_0.28.3     pkgconfig_2.0.3       gtable_0.3.6         
 [91] S7_0.2.1              furrr_0.3.1           htmltools_0.5.9      
 [94] scales_1.4.0          posterior_1.6.1       reformulas_0.4.3     
 [97] lgr_0.5.0             knitr_1.51            rstudioapi_0.17.1    
[100] tzdb_0.5.0            reshape2_1.4.5        coda_0.19-4.1        
[103] checkmate_2.3.3       nloptr_2.2.1          parallel_4.5.2       
[106] mlapi_0.1.1           pillar_1.11.1         grid_4.5.2           
[109] vctrs_0.6.5           jomo_2.7-6            xtable_1.8-4         
[112] cluster_2.1.8.1       htmlTable_2.4.3       evaluate_1.0.5       
[115] pbivnorm_0.6.0        kutils_1.73           mvtnorm_1.3-3        
[118] cli_3.6.5             compiler_4.5.2        rlang_1.1.6          
[121] crayon_1.5.3          rstantools_2.5.0      labeling_0.4.3       
[124] ps_1.9.1              plyr_1.8.9            rstan_2.32.7         
[127] stringi_1.8.7         pander_0.6.6          QuickJSR_1.8.1       
[130] viridisLite_0.4.2     glmnet_4.1-10         Brobdingnag_1.2-9    
[133] bayestestR_0.17.0     Matrix_1.7-4          hms_1.1.4            
[136] future_1.68.0         rbibutils_2.4         broom_1.0.11         
[139] RcppParallel_5.1.11-1 DEoptimR_1.1-4       
