Exploratory data analysis is important for understanding your data, checking for data issues/errors, and checking assumptions for different statistical models.
LOOK AT YOUR DATA—this is one of the most overlooked steps in data analysis!
#install.packages("remotes")
#remotes::install_github("DevPsyLab/petersenlab")
library("petersenlab")
library("car")
library("vioplot")
library("ellipse")
library("nlme")
library("effects")
library("corrplot")
library("ggplot2")
library("psych")
library("tidyverse")
library("purrr")
library("naniar")
library("mvnormtest")
library("ggExtra")
library("XICOR")
set.seed(52242)
n <- 1000
ID <- rep(1:100, each = 10)
predictor <- rbeta(n, 1.5, 5) * 100
outcome <- predictor + rnorm(n, mean = 0, sd = 20) + 50
predictorOverplot <- sample(1:50, n, replace = TRUE)
outcomeOverplot <- predictorOverplot + sample(1:75, n, replace = TRUE)
categorical1 <- sample(1:5, size = n, replace = TRUE)
categorical2 <- sample(1:5, size = n, replace = TRUE)
mydata <- data.frame(
ID = ID,
predictor = predictor,
outcome = outcome,
predictorOverplot = predictorOverplot,
outcomeOverplot = outcomeOverplot,
categorical1 = categorical1,
categorical2 = categorical2)
mydata[sample(1:n, size = 10), "predictor"] <- NA
mydata[sample(1:n, size = 10), "outcome"] <- NA
mydata[sample(1:n, size = 10), "predictorOverplot"] <- NA
mydata[sample(1:n, size = 10), "outcomeOverplot"] <- NA
mydata[sample(1:n, size = 30), "categorical1"] <- NA
mydata[sample(1:n, size = 70), "categorical2"] <- NA
round(data.frame(psych::describe(mydata)), 2)
length(unique(mydata$ID))
[1] 100
map(mydata, ~mean(is.na(.))) %>% t %>% t
[,1]
ID 0
predictor 0.01
outcome 0.01
predictorOverplot 0.01
outcomeOverplot 0.01
categorical1 0.03
categorical2 0.07
mydata %>%
select(categorical1, categorical2) %>%
sapply(function(x) table(x, useNA = "always")) %>%
t()
1 2 3 4 5 <NA>
categorical1 196 204 191 199 180 30
categorical2 195 179 193 202 161 70
round(colMeans(mydata, na.rm = TRUE), 2)
ID predictor outcome predictorOverplot
50.50 22.64 73.36 25.54
outcomeOverplot categorical1 categorical2
63.53 2.96 2.95
round(apply(mydata, 2, function(x) mean(x, na.rm = TRUE)), 2)
ID predictor outcome predictorOverplot
50.50 22.64 73.36 25.54
outcomeOverplot categorical1 categorical2
63.53 2.96 2.95
mydata %>%
summarise(across(everything(),
.fns = list(mean = ~ mean(., na.rm = TRUE)))) %>%
round(., 2)
round(apply(mydata, 2, function(x) median(x, na.rm = TRUE)), 2)
ID predictor outcome predictorOverplot
50.50 19.61 73.17 26.00
outcomeOverplot categorical1 categorical2
63.00 3.00 3.00
mydata %>%
summarise(across(everything(),
.fns = list(median = ~ median(., na.rm = TRUE)))) %>%
round(., 2)
round(apply(mydata, 2, function(x) Mode(x, multipleModes = "mean")), 2)
ID predictor outcome predictorOverplot
50.50 22.64 73.36 35.00
outcomeOverplot categorical1 categorical2
63.33 2.00 4.00
mydata %>%
summarise(across(everything(),
.fns = list(mode = ~ Mode(., multipleModes = "mean")))) %>%
round(., 2)
Compute all of these measures of central tendency:
mydata %>%
summarise(across(everything(),
.fns = list(mean = ~ mean(., na.rm = TRUE),
median = ~ median(., na.rm = TRUE),
mode = ~ Mode(., multipleModes = "mean")),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
Compute all of these measures of dispersion:
mydata %>%
summarise(across(everything(),
.fns = list(SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
round(., 2) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
Consider transforming data if skewness > |0.8| or if kurtosis > |3.0|.
Add summary statistics to the bottom of correlation matrices in papers:
cor.table(mydata, type = "manuscript")
summaryTable <- mydata %>%
summarise(across(everything(),
.fns = list(n = ~ length(na.omit(.)),
missingness = ~ mean(is.na(.)) * 100,
M = ~ mean(., na.rm = TRUE),
SD = ~ sd(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
skewness = ~ skew(., na.rm = TRUE),
kurtosis = ~ kurtosi(., na.rm = TRUE)),
.names = "{.col}.{.fn}")) %>%
pivot_longer(cols = everything(),
names_to = c("variable","index"),
names_sep = "\\.") %>%
pivot_wider(names_from = index,
values_from = value)
summaryTableTransposed <- summaryTable[-1] %>%
t() %>%
as.data.frame() %>%
setNames(summaryTable$variable) %>%
round(., digits = 2)
summaryTableTransposed
See here for resources for creating figures in R.
hist(mydata$outcome)
ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_histogram(color = 1)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_bin()`).
hist(mydata$outcome, prob = TRUE)
lines(density(mydata$outcome, na.rm = TRUE))
rug(mydata$outcome)
ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_histogram(aes(y = after_stat(density)), color = 1) +
geom_density() +
geom_rug()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_density()`).
plot(density(mydata$outcome, na.rm = TRUE))
ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_density()
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_density()`).
boxplot(mydata$outcome, horizontal = TRUE)
ggplot2
ggplot(mydata, aes(x = outcome)) +
geom_boxplot()
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_boxplot()`).
vioplot(na.omit(mydata$outcome), horizontal = TRUE)
ggplot2
ggplot(mydata, aes(x = "", y = outcome)) +
geom_violin()
Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_ydensity()`).
For more advanced scatterplots, see here.
cor(mydata, use = "pairwise.complete.obs")
ID predictor outcome predictorOverplot
ID 1.000000000 -0.019094589 -0.02501796 0.02984671
predictor -0.019094589 1.000000000 0.63517610 0.04350431
outcome -0.025017962 0.635176098 1.00000000 0.02976820
predictorOverplot 0.029846711 0.043504314 0.02976820 1.00000000
outcomeOverplot 0.003150872 0.043344473 0.06289224 0.56564445
categorical1 0.040442534 -0.002422711 -0.01532232 0.02498938
categorical2 0.008920415 -0.046927177 -0.01868628 0.03880135
outcomeOverplot categorical1 categorical2
ID 0.003150872 0.040442534 0.008920415
predictor 0.043344473 -0.002422711 -0.046927177
outcome 0.062892238 -0.015322318 -0.018686282
predictorOverplot 0.565644447 0.024989385 0.038801346
outcomeOverplot 1.000000000 0.011871644 0.043682632
categorical1 0.011871644 1.000000000 -0.086310192
categorical2 0.043682632 -0.086310192 1.000000000
cor.test( ~ predictor + outcome, data = mydata)
Pearson's product-moment correlation
data: predictor and outcome
t = 25.718, df = 978, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5962709 0.6711047
sample estimates:
cor
0.6351761
cor.table(mydata)
cor(mydata, use = "pairwise.complete.obs", method = "spearman")
ID predictor outcome predictorOverplot
ID 1.000000000 -0.01181085 -0.02805450 0.02925155
predictor -0.011810853 1.00000000 0.60003760 0.02672008
outcome -0.028054497 0.60003760 1.00000000 0.02460831
predictorOverplot 0.029251546 0.02672008 0.02460831 1.00000000
outcomeOverplot 0.001416626 0.03278463 0.07073642 0.54841297
categorical1 0.040888369 -0.01162260 -0.01906631 0.02350886
categorical2 0.009137467 -0.03763251 -0.02273699 0.04062487
outcomeOverplot categorical1 categorical2
ID 0.001416626 0.04088837 0.009137467
predictor 0.032784628 -0.01162260 -0.037632515
outcome 0.070736421 -0.01906631 -0.022736992
predictorOverplot 0.548412967 0.02350886 0.040624873
outcomeOverplot 1.000000000 0.01505255 0.044184959
categorical1 0.015052553 1.00000000 -0.086290083
categorical2 0.044184959 -0.08629008 1.000000000
cor.test( ~ predictor + outcome, data = mydata, method = "spearman")
Spearman's rank correlation rho
data: predictor and outcome
S = 62740170, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.6000376
cor.table(mydata, correlation = "spearman")
Xi (\(\xi\)) is an index of the degree of dependence between two variables, which is useful as an index of nonlinear correlation.
Chatterjee, S. (2021). A new coefficient of correlation. Journal of the American Statistical Association, 116(536), 2009-2022. https://doi.org/10.1080/01621459.2020.1758115
calculateXI(
mydata$predictor,
mydata$outcome)
[1] 0.2319062
plot(
mydata$predictor,
mydata$outcome)
abline(lm(
outcomeOverplot ~ predictorOverplot,
data = mydata,
na.action = "na.exclude"))
ggplot2
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).
scatterplot <-
ggplot(mydata, aes(x = predictor, y = outcome)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
densityMarginal <- ggMarginal(
scatterplot,
type = "density",
xparams = list(fill = "gray"),
yparams = list(fill = "gray"))
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_smooth()`).
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).
print(densityMarginal, newpage = TRUE)
ggplot(mydata, aes(x = predictorOverplot, y = outcomeOverplot)) +
geom_point(position = "jitter", alpha = 0.3) +
geom_density2d()
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_density2d()`).
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).
smoothScatter(mydata$predictorOverplot, mydata$outcomeOverplot)
mydata_nomissing <- na.omit(mydata[,c("predictor","outcome")])
dataEllipse(mydata_nomissing$predictor, mydata_nomissing$outcome, levels = c(0.5, .95))
vwReg(outcome ~ predictor, data = mydata)
The Shapiro-Wilk test of normality does not accept more than 5000 cases because it will reject the hypothesis that data come from a normal distribution with even slight deviations from normality.
shapiro.test(na.omit(mydata$outcome)) #subset to keep only the first 5000 rows: mydata$outcome[1:5000]
Shapiro-Wilk normality test
data: na.omit(mydata$outcome)
W = 0.9971, p-value = 0.06998
mydata %>%
na.omit %>%
t %>%
mshapiro.test
Shapiro-Wilk normality test
data: Z
W = 0.98542, p-value = 1.339e-07
https://upload.wikimedia.org/wikipedia/commons/7/74/InferentialStatisticalDecisionMakingTrees.pdf (archived at https://perma.cc/L2QR-ALFA)
mcar_test(mydata)
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")])
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscript")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscriptBig")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], correlation = "spearman")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscript", correlation = "spearman")
cor.table(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], type = "manuscriptBig", correlation = "spearman")
Examine the associations among variables controlling for a covariate
(outcomeOverplot
).
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")])
predictor outcome predictorOverplot
1. predictor.r 1.00 .63*** .02
2. sig NA .00 .47
3. n 980 970 971
4. outcome.r .63*** 1.00 -.01
5. sig .00 NA .83
6. n 970 980 970
7. predictorOverplot.r .02 -.01 1.00
8. sig .47 .83 NA
9. n 971 970 980
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscript")
partialcor.table(mydata[,c("predictor","outcome","predictorOverplot")], z = mydata[,c("outcomeOverplot")], type = "manuscriptBig")
corrplot(cor(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")], use = "pairwise.complete.obs"))
scatterplotMatrix(~ predictor + outcome + predictorOverplot + outcomeOverplot, data = mydata, use = "pairwise.complete.obs")
pairs.panels(mydata[,c("predictor","outcome","predictorOverplot","outcomeOverplot")])
multipleRegressionModel <- lm(outcome ~ predictor + predictorOverplot,
data = mydata,
na.action = "na.exclude")
allEffects(multipleRegressionModel)
model: outcome ~ predictor + predictorOverplot
predictor effect
predictor
0.1 20 40 60 80
49.23335 70.37778 91.62847 112.87915 134.12983
predictorOverplot effect
predictorOverplot
1 10 30 40 50
73.13937 73.15070 73.17586 73.18845 73.20103
plot(allEffects(multipleRegressionModel))
multilevelRegressionModel <- lme(outcome ~ predictor + predictorOverplot, random = ~ 1|ID,
method = "ML",
data = mydata,
na.action = "na.exclude")
allEffects(multilevelRegressionModel)
model: outcome ~ predictor + predictorOverplot
predictor effect
predictor
0.1 20 40 60 80
49.23335 70.37778 91.62847 112.87915 134.12983
predictorOverplot effect
predictorOverplot
1 10 30 40 50
73.13937 73.15070 73.17586 73.18845 73.20103
plot(allEffects(multilevelRegressionModel))
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] XICOR_0.4.1 ggExtra_0.10.1 mvnormtest_0.1-9-3 naniar_1.1.0
[5] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
[9] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[13] tidyverse_2.0.0 psych_2.4.12 ggplot2_3.5.1 corrplot_0.95
[17] effects_4.2-2 nlme_3.1-166 ellipse_0.5.0 vioplot_0.5.0
[21] zoo_1.8-12 sm_2.2-6.0 car_3.1-3 carData_3.0-5
[25] petersenlab_1.1.0
loaded via a namespace (and not attached):
[1] Rdpack_2.6.2 DBI_1.2.3 mnormt_2.1.1 gridExtra_2.3
[5] rlang_1.1.4 magrittr_2.0.3 compiler_4.4.2 mgcv_1.9-1
[9] vctrs_0.6.5 reshape2_1.4.4 quadprog_1.5-8 pkgconfig_2.0.3
[13] fastmap_1.2.0 backports_1.5.0 labeling_0.4.3 pbivnorm_0.6.0
[17] promises_1.3.2 rmarkdown_2.29 psychTools_2.4.3 tzdb_0.4.0
[21] nloptr_2.1.1 visdat_0.6.0 xfun_0.50 cachem_1.1.0
[25] jsonlite_1.8.9 later_1.4.1 parallel_4.4.2 lavaan_0.6-19
[29] cluster_2.1.6 R6_2.5.1 bslib_0.8.0 stringi_1.8.4
[33] RColorBrewer_1.1-3 boot_1.3-31 rpart_4.1.23 estimability_1.5.1
[37] jquerylib_0.1.4 Rcpp_1.0.13-1 knitr_1.49 base64enc_0.1-3
[41] httpuv_1.6.15 Matrix_1.7-1 splines_4.4.2 nnet_7.3-19
[45] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.17.1 abind_1.4-8
[49] yaml_2.3.10 miniUI_0.1.1.1 lattice_0.22-6 plyr_1.8.9
[53] shiny_1.10.0 withr_3.0.2 evaluate_1.0.3 foreign_0.8-87
[57] survival_3.7-0 isoband_0.2.7 survey_4.4-2 norm_1.0-11.1
[61] pillar_1.10.1 KernSmooth_2.23-24 rtf_0.4-14.1 checkmate_2.3.2
[65] stats4_4.4.2 reformulas_0.4.0 insight_1.0.1 generics_0.1.3
[69] mix_1.0-13 hms_1.1.3 munsell_0.5.1 scales_1.3.0
[73] minqa_1.2.8 xtable_1.8-4 glue_1.8.0 Hmisc_5.2-2
[77] tools_4.4.2 data.table_1.16.4 lme4_1.1-36 mvtnorm_1.3-3
[81] grid_4.4.2 mitools_2.4 rbibutils_2.3 colorspace_2.1-1
[85] htmlTable_2.4.3 Formula_1.2-5 cli_3.6.3 viridisLite_0.4.2
[89] gtable_0.3.6 R.methodsS3_1.8.2 sass_0.4.9 digest_0.6.37
[93] farver_2.1.2 htmlwidgets_1.6.4 R.oo_1.27.0 htmltools_0.5.8.1
[97] lifecycle_1.0.4 mime_0.12 MASS_7.3-61