Code
#install.packages("remotes")
#remotes::install_github("DevPsyLab/petersenlab")Exploratory data analysis is important for understanding your data, checking for data issues/errors, and checking assumptions for different statistical models.
LOOK AT YOUR DATA—this is one of the most overlooked steps in data analysis!
#install.packages("remotes")
#remotes::install_github("DevPsyLab/petersenlab")set.seed(52242)
n <- 1000
ID <- rep(1:100, each = 10)
predictor <- rbeta(n, 1.5, 5) * 100
outcome <- predictor + rnorm(n, mean = 0, sd = 20) + 50
predictorOverplot <- sample(1:50, n, replace = TRUE)
outcomeOverplot <- predictorOverplot + sample(1:75, n, replace = TRUE)
categorical1 <- sample(1:5, size = n, replace = TRUE)
categorical2 <- sample(1:5, size = n, replace = TRUE)
mydata <- data.frame(
ID = ID,
predictor = predictor,
outcome = outcome,
predictorOverplot = predictorOverplot,
outcomeOverplot = outcomeOverplot,
categorical1 = categorical1,
categorical2 = categorical2)
mydata[sample(1:n, size = 10), "predictor"] <- NA
mydata[sample(1:n, size = 10), "outcome"] <- NA
mydata[sample(1:n, size = 10), "predictorOverplot"] <- NA
mydata[sample(1:n, size = 10), "outcomeOverplot"] <- NA
mydata[sample(1:n, size = 30), "categorical1"] <- NA
mydata[sample(1:n, size = 70), "categorical2"] <- NAround(data.frame(psych::describe(mydata)), 2) ID predictor outcome predictorOverplot
50.50 22.64 73.36 25.54
outcomeOverplot categorical1 categorical2
63.53 2.96 2.95
ID predictor outcome predictorOverplot
50.50 22.64 73.36 25.54
outcomeOverplot categorical1 categorical2
63.53 2.96 2.95
ID predictor outcome predictorOverplot
50.50 19.61 73.17 26.00
outcomeOverplot categorical1 categorical2
63.00 3.00 3.00
ID predictor outcome predictorOverplot
50.50 22.64 73.36 35.00
outcomeOverplot categorical1 categorical2
63.33 2.00 4.00
Compute all of these measures of central tendency:
mydata %>%
summarise(across(everything(),
.fns = list(mean = ~ mean(., na.rm = TRUE),
median = ~ median(., na.rm = TRUE),
mode = ~ Mode(., multipleModes = "mean")),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)Compute all of these measures of dispersion:
mydata %>%
summarise(across(everything(),
.fns = list(SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)Consider transforming data if skewness > |0.8| or if kurtosis > |3.0|.
Add summary statistics to the bottom of correlation matrices in papers:
cor.table(mydata, type = "manuscript")summaryTable <- mydata %>%
summarise(across(everything(),
.fns = list(n = ~ length(na.omit(.)),
missingness = ~ mean(is.na(.)) * 100,
M = ~ mean(., na.rm = TRUE),
SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
summaryTableTransposed <- summaryTable[-1] %>%
t() %>%
as.data.frame() %>%
setNames(summaryTable$variable) %>%
round(., digits = 2)
summaryTableTransposedSee here for resources for creating figures in R.
hist(mydata$outcome)ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_histogram(color = 1)ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_histogram(aes(y = after_stat(density)), color = 1) +
geom_density() +
geom_rug()ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_density()boxplot(mydata$outcome, horizontal = TRUE)ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_boxplot()ggplot2
ggplot(mydata, aes(x = "", y = outcome)) +
geom_violin()For more advanced scatterplots, see here. For an overview of correlation analysis, see here.
cor(mydata, use = "pairwise.complete.obs") ID predictor outcome predictorOverplot
ID 1.000000000 -0.019094589 -0.02501796 0.02984671
predictor -0.019094589 1.000000000 0.63517610 0.04350431
outcome -0.025017962 0.635176098 1.00000000 0.02976820
predictorOverplot 0.029846711 0.043504314 0.02976820 1.00000000
outcomeOverplot 0.003150872 0.043344473 0.06289224 0.56564445
categorical1 0.040442534 -0.002422711 -0.01532232 0.02498938
categorical2 0.008920415 -0.046927177 -0.01868628 0.03880135
outcomeOverplot categorical1 categorical2
ID 0.003150872 0.040442534 0.008920415
predictor 0.043344473 -0.002422711 -0.046927177
outcome 0.062892238 -0.015322318 -0.018686282
predictorOverplot 0.565644447 0.024989385 0.038801346
outcomeOverplot 1.000000000 0.011871644 0.043682632
categorical1 0.011871644 1.000000000 -0.086310192
categorical2 0.043682632 -0.086310192 1.000000000
cor.test( ~ predictor + outcome, data = mydata)
Pearson's product-moment correlation
data: predictor and outcome
t = 25.718, df = 978, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5962709 0.6711047
sample estimates:
cor
0.6351761
cor.table(mydata)cor(mydata, use = "pairwise.complete.obs", method = "spearman") ID predictor outcome predictorOverplot
ID 1.000000000 -0.01181085 -0.02805450 0.02925155
predictor -0.011810853 1.00000000 0.60003760 0.02672008
outcome -0.028054497 0.60003760 1.00000000 0.02460831
predictorOverplot 0.029251546 0.02672008 0.02460831 1.00000000
outcomeOverplot 0.001416626 0.03278463 0.07073642 0.54841297
categorical1 0.040888369 -0.01162260 -0.01906631 0.02350886
categorical2 0.009137467 -0.03763251 -0.02273699 0.04062487
outcomeOverplot categorical1 categorical2
ID 0.001416626 0.04088837 0.009137467
predictor 0.032784628 -0.01162260 -0.037632515
outcome 0.070736421 -0.01906631 -0.022736992
predictorOverplot 0.548412967 0.02350886 0.040624873
outcomeOverplot 1.000000000 0.01505255 0.044184959
categorical1 0.015052553 1.00000000 -0.086290083
categorical2 0.044184959 -0.08629008 1.000000000
cor.test( ~ predictor + outcome, data = mydata, method = "spearman")
Spearman's rank correlation rho
data: predictor and outcome
S = 62740170, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.6000376
cor.table(mydata, correlation = "spearman")Xi (\(\xi\)) is an index of the degree of dependence between two variables, which is useful as an index of nonlinear correlation.
Chatterjee, S. (2021). A new coefficient of correlation. Journal of the American Statistical Association, 116(536), 2009-2022. https://doi.org/10.1080/01621459.2020.1758115
calculateXI(
mydata$predictor,
mydata$outcome)[1] 0.2319062
ggplot2
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)scatterplot <-
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)densityMarginal <- ggMarginal(
scatterplot,
type = "density",
xparams = list(fill = "gray"),
yparams = list(fill = "gray"))print(densityMarginal, newpage = TRUE)ggplot(mydata, aes(x = predictorOverplot, y = outcomeOverplot)) +
geom_point(position = "jitter", alpha = 0.3) +
geom_density2d()smoothScatter(mydata$predictorOverplot, mydata$outcomeOverplot)mydata_nomissing <- na.omit(mydata[,c("predictor","outcome")])
dataEllipse(mydata_nomissing$predictor, mydata_nomissing$outcome, levels = c(0.5, .95))vwReg(outcome ~ predictor, data = mydata)The Shapiro-Wilk test of normality does not accept more than 5000 cases because it will reject the hypothesis that data come from a normal distribution with even slight deviations from normality.
shapiro.test(na.omit(mydata$outcome)) #subset to keep only the first 5000 rows: mydata$outcome[1:5000]
Shapiro-Wilk normality test
data: na.omit(mydata$outcome)
W = 0.9971, p-value = 0.06998
mcar_test(mydata)Examine the associations among variables controlling for a covariate (outcomeOverplot).
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")])partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscript")partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscriptBig")scatterplotMatrix(~ predictor + outcome + predictorOverplot + outcomeOverplot, data = mydata, use = "pairwise.complete.obs")pairs.panels(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")])multipleRegressionModel <- lm(outcome ~ predictor + predictorOverplot,
data = mydata,
na.action = "na.exclude")
allEffects(multipleRegressionModel) model: outcome ~ predictor + predictorOverplot
predictor effect
predictor
0.1 20 40 60 80
49.23335 70.37778 91.62847 112.87915 134.12983
predictorOverplot effect
predictorOverplot
1 10 30 40 50
73.13937 73.15070 73.17586 73.18845 73.20103
plot(allEffects(multipleRegressionModel))multilevelRegressionModel <- lme(outcome ~ predictor + predictorOverplot, random = ~ 1|ID,
method = "ML",
data = mydata,
na.action = "na.exclude")
allEffects(multilevelRegressionModel) model: outcome ~ predictor + predictorOverplot
predictor effect
predictor
0.1 20 40 60 80
49.23335 70.37778 91.62847 112.87915 134.12983
predictorOverplot effect
predictorOverplot
1 10 30 40 50
73.13937 73.15070 73.17586 73.18845 73.20103
plot(allEffects(multilevelRegressionModel))R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] XICOR_0.4.1 ggExtra_0.11.0 mvnormtest_0.1-9-3 naniar_1.1.0
[5] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0 dplyr_1.1.4
[9] purrr_1.2.0 readr_2.1.6 tidyr_1.3.2 tibble_3.3.0
[13] tidyverse_2.0.0 psych_2.5.6 ggplot2_4.0.1 corrplot_0.95
[17] effects_4.2-4 nlme_3.1-168 ellipse_0.5.0 vioplot_0.5.1
[21] zoo_1.8-15 sm_2.2-6.0 car_3.1-3 carData_3.0-5
[25] petersenlab_1.2.2
loaded via a namespace (and not attached):
[1] Rdpack_2.6.4 DBI_1.2.3 mnormt_2.1.1
[4] gridExtra_2.3 rlang_1.1.6 magrittr_2.0.4
[7] otel_0.2.0 compiler_4.5.2 mgcv_1.9-3
[10] vctrs_0.6.5 reshape2_1.4.5 quadprog_1.5-8
[13] pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0
[16] labeling_0.4.3 pbivnorm_0.6.0 promises_1.5.0
[19] rmarkdown_2.30 psychTools_2.5.7.22 tzdb_0.5.0
[22] nloptr_2.2.1 visdat_0.6.0 xfun_0.55
[25] jsonlite_2.0.0 later_1.4.4 parallel_4.5.2
[28] lavaan_0.6-21 cluster_2.1.8.1 R6_2.6.1
[31] stringi_1.8.7 RColorBrewer_1.1-3 boot_1.3-32
[34] rpart_4.1.24 estimability_1.5.1 Rcpp_1.1.0
[37] knitr_1.51 base64enc_0.1-3 httpuv_1.6.16
[40] Matrix_1.7-4 splines_4.5.2 nnet_7.3-20
[43] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.17.1
[46] abind_1.4-8 yaml_2.3.12 miniUI_0.1.2
[49] lattice_0.22-7 plyr_1.8.9 shiny_1.12.1
[52] withr_3.0.2 S7_0.2.1 evaluate_1.0.5
[55] foreign_0.8-90 survival_3.8-3 isoband_0.3.0
[58] survey_4.4-8 norm_1.0-11.1 pillar_1.11.1
[61] KernSmooth_2.23-26 rtf_0.4-14.1 checkmate_2.3.3
[64] stats4_4.5.2 reformulas_0.4.3 insight_1.4.4
[67] generics_0.1.4 mix_1.0-13 hms_1.1.4
[70] scales_1.4.0 minqa_1.2.8 xtable_1.8-4
[73] glue_1.8.0 Hmisc_5.2-4 tools_4.5.2
[76] data.table_1.17.8 lme4_1.1-38 mvtnorm_1.3-3
[79] grid_4.5.2 mitools_2.4 rbibutils_2.4
[82] colorspace_2.1-2 htmlTable_2.4.3 Formula_1.2-5
[85] cli_3.6.5 viridisLite_0.4.2 gtable_0.3.6
[88] R.methodsS3_1.8.2 digest_0.6.39 htmlwidgets_1.6.4
[91] farver_2.1.2 R.oo_1.27.1 htmltools_0.5.9
[94] lifecycle_1.0.4 mime_0.13 MASS_7.3-65
---
title: "Exploratory Data Analysis"
---
# Rationale
Exploratory data analysis is important for understanding your data, checking for data issues/errors, and checking assumptions for different statistical models.
LOOK AT YOUR DATA—this is one of the most overlooked steps in data analysis!
# Preamble
## Install Libraries
```{r}
#install.packages("remotes")
#remotes::install_github("DevPsyLab/petersenlab")
```
## Load Libraries
```{r}
library("petersenlab")
library("car")
library("vioplot")
library("ellipse")
library("nlme")
library("effects")
library("corrplot")
library("ggplot2")
library("psych")
library("tidyverse")
library("purrr")
library("naniar")
library("mvnormtest")
library("ggExtra")
library("XICOR")
```
# Simulate Data
```{r}
set.seed(52242)
n <- 1000
ID <- rep(1:100, each = 10)
predictor <- rbeta(n, 1.5, 5) * 100
outcome <- predictor + rnorm(n, mean = 0, sd = 20) + 50
predictorOverplot <- sample(1:50, n, replace = TRUE)
outcomeOverplot <- predictorOverplot + sample(1:75, n, replace = TRUE)
categorical1 <- sample(1:5, size = n, replace = TRUE)
categorical2 <- sample(1:5, size = n, replace = TRUE)
mydata <- data.frame(
ID = ID,
predictor = predictor,
outcome = outcome,
predictorOverplot = predictorOverplot,
outcomeOverplot = outcomeOverplot,
categorical1 = categorical1,
categorical2 = categorical2)
mydata[sample(1:n, size = 10), "predictor"] <- NA
mydata[sample(1:n, size = 10), "outcome"] <- NA
mydata[sample(1:n, size = 10), "predictorOverplot"] <- NA
mydata[sample(1:n, size = 10), "outcomeOverplot"] <- NA
mydata[sample(1:n, size = 30), "categorical1"] <- NA
mydata[sample(1:n, size = 70), "categorical2"] <- NA
```
# Descriptive statistics
```{r}
round(data.frame(psych::describe(mydata)), 2)
```
## Sample
- Check the sample size (*N*)
- Is the sample size in the data the expected sample size?
Are there cases (participants) that are missing?
Are there cases that should not be there?
- Here is the sample size:
```{r}
length(unique(mydata$ID))
```
- Check the extent of missingness
- How much data are missing in the model variables—including the predictor, outcome, and covariates?
- Here are the proportion of missing data in each variable:
```{r}
map(mydata, ~mean(is.na(.))) %>% t %>% t
```
## Distribution
- Frequencies
- Examine the frequencies of categorical variables:
```{r}
mydata %>%
select(categorical1, categorical2) %>%
sapply(function(x) table(x, useNA = "always")) %>%
t()
```
## Central Tendency
- Mean
```{r}
round(colMeans(mydata, na.rm = TRUE), 2)
round(apply(mydata, 2, function(x) mean(x, na.rm = TRUE)), 2)
mydata %>%
summarise(across(everything(),
.fns = list(mean = ~ mean(., na.rm = TRUE)))) %>%
round(., 2)
```
- Median
```{r}
round(apply(mydata, 2, function(x) median(x, na.rm = TRUE)), 2)
mydata %>%
summarise(across(everything(),
.fns = list(median = ~ median(., na.rm = TRUE)))) %>%
round(., 2)
```
- Mode
```{r}
round(apply(mydata, 2, function(x) Mode(x, multipleModes = "mean")), 2)
mydata %>%
summarise(across(everything(),
.fns = list(mode = ~ Mode(., multipleModes = "mean")))) %>%
round(., 2)
```
Compute all of these measures of central tendency:
```{r}
mydata %>%
summarise(across(everything(),
.fns = list(mean = ~ mean(., na.rm = TRUE),
median = ~ median(., na.rm = TRUE),
mode = ~ Mode(., multipleModes = "mean")),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
```
## Dispersion
- Standard deviation
- Observed minimum and maximum (vis-à-vis possible minimum and maximum)
- Skewness
- Kurtosis
Compute all of these measures of dispersion:
```{r}
mydata %>%
summarise(across(everything(),
.fns = list(SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
```
Consider transforming data if skewness > |0.8| or if kurtosis > |3.0|.
## Summary Statistics {#summaryStats}
Add summary statistics to the bottom of correlation matrices in papers:
```{r}
cor.table(mydata, type = "manuscript")
summaryTable <- mydata %>%
summarise(across(everything(),
.fns = list(n = ~ length(na.omit(.)),
missingness = ~ mean(is.na(.)) * 100,
M = ~ mean(., na.rm = TRUE),
SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
summaryTableTransposed <- summaryTable[-1] %>%
t() %>%
as.data.frame() %>%
setNames(summaryTable$variable) %>%
round(., digits = 2)
summaryTableTransposed
```
## Distribution Plots
See [here](figures.qmd) for resources for creating figures in R.
### Histogram
#### Base R
```{r}
hist(mydata$outcome)
```
#### `ggplot2`
```{r}
ggplot(mydata, aes(x = outcome)) +
geom_histogram(color = 1)
```
### Histogram overlaid with density plot and rug plot
#### Base R
```{r}
hist(mydata$outcome, prob = TRUE)
lines(density(mydata$outcome, na.rm = TRUE))
rug(mydata$outcome)
```
#### `ggplot2`
```{r}
ggplot(mydata, aes(x = outcome)) +
geom_histogram(aes(y = after_stat(density)), color = 1) +
geom_density() +
geom_rug()
```
### Density Plot
#### Base R
```{r}
plot(density(mydata$outcome, na.rm = TRUE))
```
#### `ggplot2`
```{r}
ggplot(mydata, aes(x = outcome)) +
geom_density()
```
### Box and whisker plot (boxplot)
#### Base R
```{r}
boxplot(mydata$outcome, horizontal = TRUE)
```
#### `ggplot2`
```{r}
ggplot(mydata, aes(x = outcome)) +
geom_boxplot()
```
### Violin plot
#### Base R
```{r}
vioplot(na.omit(mydata$outcome), horizontal = TRUE)
```
#### `ggplot2`
```{r}
ggplot(mydata, aes(x = "", y = outcome)) +
geom_violin()
```
# Bivariate Associations
For more advanced scatterplots, see [here](figures.qmd#marginalDistributions).
For an overview of correlation analysis, see [here](https://isaactpetersen.github.io/Fantasy-Football-Analytics-Textbook/correlation.html).
## Correlation Coefficients
### Pearson Correlation
```{r}
cor(mydata, use = "pairwise.complete.obs")
cor.test( ~ predictor + outcome, data = mydata)
cor.table(mydata)
```
### Spearman Correlation
```{r}
cor(mydata, use = "pairwise.complete.obs", method = "spearman")
cor.test( ~ predictor + outcome, data = mydata, method = "spearman")
cor.table(mydata, correlation = "spearman")
```
### Xi ($\xi$)
Xi ($\xi$) is an index of the degree of dependence between two variables, which is useful as an index of nonlinear correlation.
Chatterjee, S. (2021). A new coefficient of correlation. *Journal of the American Statistical Association, 116*(536), 2009-2022. <https://doi.org/10.1080/01621459.2020.1758115>
```{r}
calculateXI(
mydata$predictor,
mydata$outcome)
```
## Scatterplot
### Base R
```{r}
plot(
mydata$predictor,
mydata$outcome)
abline(lm(
outcomeOverplot ~ predictorOverplot,
data = mydata,
na.action = "na.exclude"))
```
### `ggplot2`
```{r}
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
```
## Scatterplot with Marginal Density Plot
```{r}
scatterplot <-
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
```
```{r}
densityMarginal <- ggMarginal(
scatterplot,
type = "density",
xparams = list(fill = "gray"),
yparams = list(fill = "gray"))
```
```{r}
print(densityMarginal, newpage = TRUE)
```
## High Density Scatterplot
```{r}
ggplot(mydata, aes(x = predictorOverplot, y = outcomeOverplot)) +
geom_point(position = "jitter", alpha = 0.3) +
geom_density2d()
smoothScatter(mydata$predictorOverplot, mydata$outcomeOverplot)
```
## Data Ellipse
```{r}
mydata_nomissing <- na.omit(mydata[,c("predictor","outcome")])
dataEllipse(mydata_nomissing$predictor, mydata_nomissing$outcome, levels = c(0.5, .95))
```
## Visually Weighted Regression
```{r}
#| results: false
vwReg(outcome ~ predictor, data = mydata)
```
# Basic inferential statistics
## Tests of Normality
### Shapiro-Wilk test of normality
The Shapiro-Wilk test of normality does not accept more than 5000 cases because it will reject the hypothesis that data come from a normal distribution with even slight deviations from normality.
```{r}
shapiro.test(na.omit(mydata$outcome)) #subset to keep only the first 5000 rows: mydata$outcome[1:5000]
```
### Test of multivariate normality
```{r}
mydata %>%
na.omit %>%
t %>%
mshapiro.test
```
## Tests of systematic missingness (i.e., whether missingness on a variable depends on other variables)
- Generally test:
- Whether data are consistent with a missing completely at random (MCAR) pattern—Little's MCAR Test
- Whether outcome variable(s) differ as a function of any model variables (predictors and covariates) and as a function of any key demographic characteristics (e.g., sex, ethnicity, socioeconomic status)
- Whether focal predictor variable(s) differ as a function of any model variables (including outcome variable) and as a function of any key demographic characteristics
- For instance:
- Whether males are more likely than girls to be missing scores on the dependent variable
- Whether longitudinal attrition is greater in lower socioeconomic status families
- If missingness differs systematically as a function of other variables, you can include that variable as a control variable in models, and/or can include that variable in multiple imputation to inform imputed scores for missing values
### Little's MCAR Test
```{r}
mcar_test(mydata)
```
## Multivariate Associations
### Correlation Matrix
#### Pearson Correlations
```{r}
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")])
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscript")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscriptBig")
```
#### Spearman Correlations
```{r}
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], correlation = "spearman")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscript", correlation = "spearman")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscriptBig", correlation = "spearman")
```
#### Partial Correlations
Examine the associations among variables controlling for a covariate (`outcomeOverplot`).
```{r}
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")])
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscript")
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscriptBig")
```
### Correlogram
```{r}
corrplot(cor(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], use = "pairwise.complete.obs"))
```
### Scatterplot matrix
```{r}
scatterplotMatrix(~ predictor + outcome + predictorOverplot + outcomeOverplot, data = mydata, use = "pairwise.complete.obs")
```
### Pairs panels
```{r}
pairs.panels(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")])
```
## Effect Plots
### Multiple Regression Model
```{r}
multipleRegressionModel <- lm(outcome ~ predictor + predictorOverplot,
data = mydata,
na.action = "na.exclude")
allEffects(multipleRegressionModel)
plot(allEffects(multipleRegressionModel))
```
### Multilevel Regression Model
```{r}
multilevelRegressionModel <- lme(outcome ~ predictor + predictorOverplot, random = ~ 1|ID,
method = "ML",
data = mydata,
na.action = "na.exclude")
allEffects(multilevelRegressionModel)
plot(allEffects(multilevelRegressionModel))
```
# Session Info
```{r}
#| code-fold: true
sessionInfo()
```