--- title: Working with Confounds and Variables output: rmarkdown::html_vignette: toc: yes toc_depth: 2.0 css: albers.css header-includes: - '' params: family: red preset: homage resource_files: - albers.css - albers.js vignette: | %\VignetteIndexEntry{Working with Confounds and Variables} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} if (requireNamespace("ggplot2", quietly = TRUE) && requireNamespace("albersdown", quietly = TRUE)) ggplot2::theme_set(albersdown::theme_albers(family = params$family, preset = params$preset)) knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, fig.width = 7, fig.height = 4 ) ``` ```{r albers-classes, echo=FALSE, results='asis'} cat(sprintf( paste0( '' ), params$family, params$preset )) ``` ```{r load-packages} library(bidser) library(dplyr) library(tidyr) library(tibble) ``` After loading a BIDS dataset and finding your scans, the next step in most fMRI workflows is extracting **confound regressors** and **event tables** for modeling. bidser gives you tidy tibbles at every level -- per-run, per-subject, or dataset-wide -- so you can go straight from BIDS into `lm()`, `lme4`, or whatever modeling framework you prefer. ## Building a test dataset We'll create a small mock dataset with two subjects, two tasks, and realistic fMRIPrep-style confound files so that every code chunk in this vignette runs without network access. ```{r build-mock, include = FALSE} temp_dir <- tempfile("bids_confounds_vignette_") dir.create(temp_dir) # participants.tsv readr::write_tsv( tibble(participant_id = c("sub-01", "sub-02"), age = c(25L, 30L), sex = c("M", "F")), file.path(temp_dir, "participants.tsv") ) # dataset_description.json jsonlite::write_json( list(Name = "ConfoundDemo", BIDSVersion = "1.7.0"), file.path(temp_dir, "dataset_description.json"), auto_unbox = TRUE ) # Create raw + derivative structure set.seed(42) n_vols <- 100 for (sub in c("01", "02")) { for (task in c("rest", "nback")) { # --- raw --- func_dir <- file.path(temp_dir, paste0("sub-", sub), "func") dir.create(func_dir, recursive = TRUE, showWarnings = FALSE) bold_name <- sprintf("sub-%s_task-%s_run-01_bold.nii.gz", sub, task) file.create(file.path(func_dir, bold_name)) events_name <- sprintf("sub-%s_task-%s_run-01_events.tsv", sub, task) n_trials <- if (task == "nback") 40 else 0 if (n_trials > 0) { ev <- tibble( onset = sort(runif(n_trials, 0, 290)), duration = rep(2, n_trials), trial_type = sample(c("0back", "2back"), n_trials, replace = TRUE), response_time = round(rnorm(n_trials, 0.8, 0.15), 3) ) readr::write_tsv(ev, file.path(func_dir, events_name)) } else { readr::write_tsv( tibble(onset = numeric(0), duration = numeric(0), trial_type = character(0)), file.path(func_dir, events_name) ) } # --- derivatives/fmriprep --- deriv_func <- file.path(temp_dir, "derivatives", "fmriprep", paste0("sub-", sub), "func") dir.create(deriv_func, recursive = TRUE, showWarnings = FALSE) preproc_name <- sprintf( "sub-%s_task-%s_run-01_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz", sub, task ) file.create(file.path(deriv_func, preproc_name)) conf_name <- sprintf( "sub-%s_task-%s_run-01_desc-confounds_timeseries.tsv", sub, task ) conf <- tibble( csf = rnorm(n_vols), white_matter = rnorm(n_vols), global_signal = rnorm(n_vols), framewise_displacement = abs(rnorm(n_vols, 0.2, 0.1)), trans_x = cumsum(rnorm(n_vols, 0, 0.02)), trans_y = cumsum(rnorm(n_vols, 0, 0.02)), trans_z = cumsum(rnorm(n_vols, 0, 0.02)), rot_x = cumsum(rnorm(n_vols, 0, 0.001)), rot_y = cumsum(rnorm(n_vols, 0, 0.001)), rot_z = cumsum(rnorm(n_vols, 0, 0.001)), trans_x_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.02)))), trans_y_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.02)))), trans_z_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.02)))), rot_x_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.001)))), rot_y_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.001)))), rot_z_derivative1 = c(NA, diff(cumsum(rnorm(n_vols, 0, 0.001)))), trans_x_power2 = trans_x^2, trans_y_power2 = trans_y^2, trans_z_power2 = trans_z^2, rot_x_power2 = rot_x^2, rot_y_power2 = rot_y^2, rot_z_power2 = rot_z^2, trans_x_derivative1_power2 = trans_x_derivative1^2, trans_y_derivative1_power2 = trans_y_derivative1^2, trans_z_derivative1_power2 = trans_z_derivative1^2, rot_x_derivative1_power2 = rot_x_derivative1^2, rot_y_derivative1_power2 = rot_y_derivative1^2, rot_z_derivative1_power2 = rot_z_derivative1^2, a_comp_cor_00 = rnorm(n_vols), a_comp_cor_01 = rnorm(n_vols), a_comp_cor_02 = rnorm(n_vols), a_comp_cor_03 = rnorm(n_vols), a_comp_cor_04 = rnorm(n_vols), a_comp_cor_05 = rnorm(n_vols), t_comp_cor_00 = rnorm(n_vols), t_comp_cor_01 = rnorm(n_vols), t_comp_cor_02 = rnorm(n_vols), cosine_00 = cos(seq(0, pi, length.out = n_vols)), cosine_01 = cos(seq(0, 2 * pi, length.out = n_vols)), cosine_02 = cos(seq(0, 3 * pi, length.out = n_vols)) ) readr::write_tsv(conf, file.path(deriv_func, conf_name)) } } # Also write dataset_description.json for fmriprep derivative deriv_root <- file.path(temp_dir, "derivatives", "fmriprep") jsonlite::write_json( list(Name = "fMRIPrep", BIDSVersion = "1.7.0", GeneratedBy = list(list(Name = "fMRIPrep", Version = "21.0.0"))), file.path(deriv_root, "dataset_description.json"), auto_unbox = TRUE ) ``` ```{r load-project} proj <- bids_project(temp_dir, fmriprep = TRUE) proj ``` Our demo has `r length(participants(proj))` subjects, each with a resting-state and an n-back task run, plus fMRIPrep derivatives containing confound timeseries. ## Confound sets: predefined recipes Before reading any files, it helps to know what confound sets bidser ships with. These are curated variable lists that match common denoising strategies from the fMRI literature. ```{r list-sets} list_confound_sets() ``` Each set is a named collection of confound variable names. You can inspect what a set contains: ```{r show-motion-sets} # 6 rigid-body motion parameters confound_set("motion6") # Friston 24-parameter expansion (motion + derivatives + squares) length(confound_set("motion24")) ``` The sets compose naturally -- `"36p"` is `motion24` plus tissue signals and their expansions, while `"9p"` is `motion6` plus the three global signals. ## Reading confounds `read_confounds()` reads fMRIPrep confound TSV files and returns tidy tibbles. Pass a confound set name to select variables: ```{r read-motion6} conf_nested <- read_confounds(proj, cvars = confound_set("motion6")) conf_nested ``` The result is a nested tibble -- one row per run, with the actual timeseries tucked inside the `data` list column. This structure makes it easy to iterate over runs in a modeling pipeline: ```{r unnest-motion} conf_nested |> unnest(data) |> select(participant_id, task, run, trans_x, rot_x) |> head() ``` ### Selecting variables by name or wildcard You can also pass a character vector of specific variable names, or use wildcards to match patterns: ```{r wildcard-confounds} # All CompCor components compcor_conf <- read_confounds( proj, subid = "01", task = "rest", cvars = c("a_comp_cor_*", "t_comp_cor_*") ) names(compcor_conf$data[[1]]) ``` ### Flat output for quick modeling When you just want a single table for modeling, use `nest = FALSE`: ```{r flat-confounds} conf_flat <- read_confounds( proj, cvars = confound_set("motion6"), nest = FALSE ) dim(conf_flat) head(conf_flat) ``` Each row is one volume, with subject/task/run identifiers alongside the confound values -- ready to join with your design matrix. fMRIPrep writes leading `NA` values for derivative-type confounds such as `framewise_displacement` and `*_derivative1`. The default preserves those values for backward compatibility. For raw nuisance regressors that need to go directly into a GLM design matrix, set `na_action = "zero"` to use the common fMRIPrep convention of replacing those leading values with zero. ## Confound strategies: PCA reduction For high-dimensional confound sets, you often want to reduce the dimensionality rather than include all regressors directly. A `confound_strategy()` splits variables into two groups: those reduced via PCA and those kept as-is. ```{r show-strategies} list_confound_strategies() ``` The built-in `"pcabasic80"` strategy applies PCA to motion and CompCor variables (retaining 80% of variance) while keeping cosine regressors unchanged: ```{r pca-strategy} strat <- confound_strategy("pcabasic80") conf_pca <- read_confounds(proj, subid = "01", task = "rest", cvars = strat) names(conf_pca$data[[1]]) ``` The motion and CompCor variables have been replaced by a handful of principal components, while the cosine regressors pass through untouched. ### Custom strategies You can build your own strategy by specifying which variables get PCA-reduced and which are kept raw: ```{r custom-strategy} my_strat <- confound_strategy( pca_vars = c(confound_set("motion24"), confound_set("acompcor")), raw_vars = c("framewise_displacement", confound_set("cosine")), npcs = 5 ) conf_custom <- read_confounds(proj, subid = "01", task = "nback", cvars = my_strat) names(conf_custom$data[[1]]) ``` Five PCs from the motion + aCompCor space, plus FD and cosine regressors kept in their original form. ## Reading event files Event files describe the experimental design -- trial onsets, durations, and conditions. `read_events()` returns them as nested tibbles: ```{r read-events} events <- read_events(proj, task = "nback") events ``` Unnest to get a flat trial table: ```{r unnest-events} trials <- events |> unnest(data) |> select(.subid, .task, .run, onset, duration, trial_type, response_time) head(trials) ``` ```{r trial-summary} trials |> group_by(.subid, trial_type) |> summarise( n_trials = n(), mean_rt = mean(response_time, na.rm = TRUE), .groups = "drop" ) ``` If you want everything in one flat table directly, use `load_all_events()`: ```{r load-all-events} all_events <- load_all_events(proj, task = "nback") nrow(all_events) ``` ## The variables table: one tibble per run `variables_table()` pulls together scans, events, and confounds into a single run-level tibble. This is the tidy bridge between BIDS and your analysis code: ```{r variables-table} vars <- variables_table(proj) vars |> select(.subid, .task, .run, any_of(c("n_scans", "n_events", "n_confound_rows"))) ``` Each row is one run. The `scans`, `events`, and `confounds` list columns hold the nested data. You can selectively include only events or only confounds: ```{r variables-events-only} vars_events <- variables_table(proj, task = "nback", include = "events") vars_events |> select(.subid, .task, .run, n_events) ``` This structure makes it straightforward to write a per-run analysis loop: ```{r per-run-loop} # Use the nback-only table which has both events and confounds columns vars_nback <- variables_table(proj, task = "nback") vars_nback |> filter(n_events > 0, n_confound_rows > 0) |> rowwise() |> mutate( n_conditions = length(unique(events$trial_type)), mean_fd = mean(confounds$framewise_displacement, na.rm = TRUE) ) |> ungroup() |> select(.subid, .task, n_conditions, mean_fd) ``` ## Dataset QA with bids_report() `bids_report()` assembles a lightweight QA summary covering project metadata, compliance checks, pipeline discovery, and run-level coverage: ```{r bids-report} report <- bids_report(proj) report ``` The underlying data is fully accessible for custom reporting: ```{r report-data} rdata <- bids_report_data(proj) rdata$summary rdata$run_coverage ``` You can use the coverage table to spot missing data at a glance: ```{r coverage-check} rdata$run_coverage |> mutate( has_scans = n_scans > 0, has_events = if ("n_events" %in% names(rdata$run_coverage)) n_events > 0 else FALSE, has_confounds = n_confound_rows > 0 ) ``` ## Putting it together A typical analysis script combines these pieces into a pipeline. Here is a sketch that extracts design-ready data for each run: ```{r full-pipeline} analysis_data <- variables_table(proj, task = "nback") |> filter(n_events > 0, n_confound_rows > 0) |> rowwise() |> mutate( n_trials = nrow(events), n_vols = nrow(confounds) ) |> ungroup() |> select(.subid, .task, .run, n_trials, n_vols) analysis_data ``` From here you would unnest the events and confounds columns, build your design matrix with `model.matrix()`, and fit your model -- all in native R tibbles with no intermediate file-path bookkeeping. ```{r cleanup, include = FALSE} unlink(temp_dir, recursive = TRUE) ```