Skip to contents

Download a copy of the vignette to follow along here: alluvial_plots.Rmd

Alluvial plots can be generated to visualize how changing the number of clusters influences the distribution of patients according to one (or a few) variables of interest.

First, some data setup just as was done in the previous vignettes.

library(metasnf)

# Generate data_list
# Each inner list describes one input data frame: the data itself, a name for
# it, the domain it is grouped under, and its type ("continuous" or
# "categorical").
data_list <- generate_data_list(
    # Continuous data, gene expression domain
    list(
        data = expression_df,
        name = "genes_1_and_2_exp",
        domain = "gene_expression",
        type = "continuous"
    ),
    # Continuous data, gene methylation domain
    list(
        data = methylation_df,
        name = "genes_1_and_2_meth",
        domain = "gene_methylation",
        type = "continuous"
    ),
    # Categorical data, demographics domain
    list(
        data = gender_df,
        name = "gender",
        domain = "demographics",
        type = "categorical"
    ),
    # Categorical data, clinical domain
    list(
        data = diagnosis_df,
        name = "diagnosis",
        domain = "clinical",
        type = "categorical"
    ),
    # Presumably the column that uniquely identifies each patient in every
    # data frame above -- confirm against the generate_data_list() docs
    uid = "patient_id"
)

# Build a settings matrix with a single row (nrow = 1) for the data_list.
# Supplying `seed` changes the global seed, as the printed message reports.
# NOTE(review): max_k = 40 is presumably an upper bound on a sampled
# hyperparameter -- confirm against the generate_settings_matrix() docs.
settings_matrix <- generate_settings_matrix(
    data_list,
    nrow = 1,
    max_k = 40,
    seed = 42
)
## [1] "The global seed has been changed!"
# Run batch_snf() over the settings matrix. return_similarity_matrices = TRUE
# is set so that the result also contains the "similarity_matrices" element
# that is extracted further down.
batch_snf_results <- batch_snf(
    data_list,
    settings_matrix,
    return_similarity_matrices = TRUE
)
## [1] "Row: 1/1 | Time remaining: 0 seconds"
## [1] "Total time taken: 1 seconds."
# Pull the two components out of the batch_snf() result. `[[` is used rather
# than `$` so that element names are matched exactly (no partial matching).
solutions_matrix <- batch_snf_results[["solutions_matrix"]]
similarity_matrices <- batch_snf_results[["similarity_matrices"]]

# The settings matrix had a single row, so only the first similarity matrix
# is needed
similarity_matrix <- similarity_matrices[[1]]

# Element named "1" of the cluster solutions (presumably the cluster
# assignments produced by settings row 1 -- confirm)
cluster_solution <- get_cluster_solutions(solutions_matrix)[["1"]]

Next, assemble a list of clustering algorithm functions that cover the range of the number of clusters you’d like to visualize. The example below uses spectral_two to spectral_four, which are spectral clustering functions covering 2 clusters to 4 clusters respectively.

# Spectral clustering functions for 2, 3 and 4 clusters, in increasing order
cluster_sequence <- list(spectral_two, spectral_three, spectral_four)

Then, we can either generate an alluvial plot covering our similarity matrix over these clustering algorithms for data in a data_list:

# Alluvial plot built from the similarity matrix and the functions in
# cluster_sequence, with the outcome variables supplied through the data_list.
alluvial_cluster_plot(
    cluster_sequence = cluster_sequence,
    similarity_matrix = similarity_matrix,
    data_list = data_list,
    # The name of the variable of interest
    key_outcome = "gender",
    # How the variable of interest should be displayed
    key_label = "Gender",
    # More variables to plot but not colour by
    extra_outcomes = "diagnosis",
    title = "Gender Across Cluster Counts"
)

Or in an external dataframe:

# Combine the gender and diagnosis tables on their shared patient_id column
extra_data <- dplyr::inner_join(gender_df, diagnosis_df, by = "patient_id")

# Add a subjectkey column: the patient_id prefixed with "subject_"
extra_data <- dplyr::mutate(
    extra_data,
    subjectkey = paste0("subject_", patient_id)
)

# Preview the first rows of the combined table
head(extra_data)
##   patient_id gender       diagnosis  subjectkey
## 1        660 female definite asthma subject_660
## 2        420 female possible asthma subject_420
## 3        252 female definite asthma subject_252
## 4        173 female       no asthma subject_173
## 5        327 female definite asthma subject_327
## 6        245 female definite asthma subject_245
# Same alluvial plot, but the outcome variables are taken from the external
# data frame passed as `data` instead of from a data_list.
alluvial_cluster_plot(
    cluster_sequence = cluster_sequence,
    similarity_matrix = similarity_matrix,
    data = extra_data,
    # The name of the variable of interest
    key_outcome = "gender",
    # How the variable of interest should be displayed
    key_label = "Gender",
    # More variables to plot but not colour by
    extra_outcomes = "diagnosis",
    title = "Gender Across Cluster Counts"
)