
Download a copy of the vignette to follow along here: label_propagation.Rmd

In this vignette, we will walk through label propagation in the metasnf package. The code here is largely taken from the end of the "less simple example" vignette.

The label propagation procedure can be used to predict cluster membership for new, unlabeled observations based on their similarity to previously labeled observations. These unlabeled observations could be a held-out test set from your original sample or an entirely new sample.

The process involves the following steps:

  1. Assign clusters to some group of patients
  2. Calculate the pairwise similarities among all of the already-clustered and to-be-labeled patients
  3. Run the label propagation algorithm to predict cluster membership for the to-be-labeled patients (a generic sketch of this step follows the list)
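
For context on step 3, here is a generic, self-contained sketch of iterative label propagation with clamping. It is meant as an illustration of the idea only, not the exact implementation used by metasnf, and the propagate_labels function below is purely hypothetical.

# Generic label propagation sketch (illustration only, not the metasnf
# implementation). Assumes cluster labels are integers 1..k for labeled rows
# and NA for the to-be-labeled rows of the similarity matrix.
propagate_labels <- function(similarity, labels, n_iter = 1000) {
    labeled <- !is.na(labels)
    k <- max(labels, na.rm = TRUE)
    # One-hot encode the known labels; unlabeled rows start at zero
    y <- matrix(0, nrow = nrow(similarity), ncol = k)
    y[cbind(which(labeled), labels[labeled])] <- 1
    # Row-normalize the similarity matrix into a transition matrix
    p <- similarity / rowSums(similarity)
    for (iter in seq_len(n_iter)) {
        # Spread labels along the similarity graph
        y <- p %*% y
        # Clamp: labeled rows keep their original assignments
        y[labeled, ] <- 0
        y[cbind(which(labeled), labels[labeled])] <- 1
    }
    # Predicted cluster for each row: column with the highest propagated score
    max.col(y)
}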

There is a lot of room for flexibility in how steps 1 and 2 are carried out; SNF is not required at any point in the process. For example, step 1 could be done by assigning clusters in your training set manually or with a simple clustering method like k-means. Step 2 could be done by calculating the Euclidean distances across all the training and testing subjects for a small subset of variables. The variables used to calculate the similarities in step 2 also don't need to be the same ones used to derive the cluster solution in the training set.
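
As a rough sketch of that flexibility, steps 1 and 2 could look like the following, assuming hypothetical numeric matrices train_data and test_data (rows are observations, columns are variables):

# Step 1: assign clusters in the training set with k-means (no SNF involved);
# train_data and test_data are hypothetical placeholder matrices
train_clusters <- kmeans(train_data, centers = 3)$cluster

# Step 2: pairwise similarities across all training and testing observations,
# here converted from Euclidean distances on the stacked data
all_data <- rbind(train_data, test_data)
distances <- as.matrix(dist(all_data, method = "euclidean"))
similarities <- 1 / (1 + distances)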

All that aside, we show here a simple approach: assign the clusters with a call to batch_snf, assemble a data list containing both the training and testing subjects, and feed the results into the label propagation function lp_solutions_matrix.

library(metasnf)

# Start by making a data list containing all our dataframes to more easily
# identify subjects without missing data
all_patient <- generate_data_list(
    list(abcd_cort_t, "cort_t", "neuroimaging", "continuous"),
    list(abcd_cort_sa, "cort_sa", "neuroimaging", "continuous"),
    list(abcd_subc_v, "subc_v", "neuroimaging", "continuous"),
    list(abcd_h_income, "household_income", "demographics", "continuous"),
    list(abcd_pubertal, "pubertal_status", "demographics", "continuous"),
    list(abcd_anxiety, "anxiety", "behaviour", "ordinal"),
    list(abcd_depress, "depressed", "behaviour", "ordinal"),
    uid = "patient"
)

# Get a vector of all the subjects
all_subjects <- get_dl_subjects(all_patient)

# Dataframe assigning 80% of subjects to train and 20% to test
train_test_split <- train_test_assign(
    train_frac = 0.8,
    subjects = all_subjects
)

# Pull out the training and testing subjects
train_subs <- train_test_split$"train"
test_subs <- train_test_split$"test"
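
A quick base R check can confirm that the split behaved as expected: the two groups should be disjoint and should jointly cover every subject.

# Optional sanity check on the split
length(intersect(train_subs, test_subs)) == 0
length(train_subs) + length(test_subs) == length(all_subjects)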

# Partition a training set
train_abcd_cort_t <- abcd_cort_t[abcd_cort_t$"patient" %in% train_subs, ]
train_abcd_cort_sa <- abcd_cort_sa[abcd_cort_sa$"patient" %in% train_subs, ]
train_abcd_subc_v <- abcd_subc_v[abcd_subc_v$"patient" %in% train_subs, ]
train_abcd_h_income <- abcd_h_income[abcd_h_income$"patient" %in% train_subs, ]
train_abcd_pubertal <- abcd_pubertal[abcd_pubertal$"patient" %in% train_subs, ]
train_abcd_anxiety <- abcd_anxiety[abcd_anxiety$"patient" %in% train_subs, ]
train_abcd_depress <- abcd_depress[abcd_depress$"patient" %in% train_subs, ]

# Partition a test set
test_abcd_cort_t <- abcd_cort_t[abcd_cort_t$"patient" %in% test_subs, ]
test_abcd_cort_sa <- abcd_cort_sa[abcd_cort_sa$"patient" %in% test_subs, ]
test_abcd_subc_v <- abcd_subc_v[abcd_subc_v$"patient" %in% test_subs, ]
test_abcd_h_income <- abcd_h_income[abcd_h_income$"patient" %in% test_subs, ]
test_abcd_pubertal <- abcd_pubertal[abcd_pubertal$"patient" %in% test_subs, ]
test_abcd_anxiety <- abcd_anxiety[abcd_anxiety$"patient" %in% test_subs, ]
test_abcd_depress <- abcd_depress[abcd_depress$"patient" %in% test_subs, ]
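
If you prefer, the same partitioning can be written more compactly by looping over a named list of the data frames. This base R sketch is equivalent to the explicit subsetting above (the abcd_dfs, train_dfs, and test_dfs names are ours; the rest of the vignette continues with the explicit variables).

# Equivalent, more compact partitioning with lapply
abcd_dfs <- list(
    cort_t = abcd_cort_t,
    cort_sa = abcd_cort_sa,
    subc_v = abcd_subc_v,
    h_income = abcd_h_income,
    pubertal = abcd_pubertal,
    anxiety = abcd_anxiety,
    depress = abcd_depress
)
train_dfs <- lapply(abcd_dfs, function(df) df[df$"patient" %in% train_subs, ])
test_dfs <- lapply(abcd_dfs, function(df) df[df$"patient" %in% test_subs, ])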

# Find cluster solutions in the training set
train_data_list <- generate_data_list(
    list(train_abcd_cort_t, "cort_t", "neuroimaging", "continuous"),
    list(train_abcd_cort_sa, "cort_sa", "neuroimaging", "continuous"),
    list(train_abcd_subc_v, "subc_v", "neuroimaging", "continuous"),
    list(train_abcd_h_income, "household_income", "demographics", "continuous"),
    list(train_abcd_pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "patient"
)

# We'll pick a solution that has good separation over our target variables
train_target_list <- generate_data_list(
    list(train_abcd_anxiety, "anxiety", "behaviour", "ordinal"),
    list(train_abcd_depress, "depressed", "behaviour", "ordinal"),
    uid = "patient"
)

settings_matrix <- generate_settings_matrix(
    train_data_list,
    nrow = 5,
    seed = 42,
    min_k = 10,
    max_k = 30
)
#> [1] "The global seed has been changed!"

train_solutions_matrix <- batch_snf(
    train_data_list,
    settings_matrix
)
#> [1] "Row: 1/5 | Time remaining: 1 seconds"
#> [1] "Row: 2/5 | Time remaining: 1 seconds"
#> [1] "Row: 3/5 | Time remaining: 0 seconds"
#> [1] "Row: 4/5 | Time remaining: 0 seconds"
#> [1] "Row: 5/5 | Time remaining: 0 seconds"
#> [1] "Total time taken: 1 seconds."

extended_solutions_matrix <- extend_solutions(
    train_solutions_matrix,
    train_target_list
)
#> [1] "Processing row 1 of 5"
#> [1] "Processing row 2 of 5"
#> [1] "Processing row 3 of 5"
#> [1] "Processing row 4 of 5"
#> [1] "Processing row 5 of 5"

# Determine the solution with the lowest minimum p-value
lowest_min_pval <- min(extended_solutions_matrix$"min_pval")
which(extended_solutions_matrix$"min_pval" == lowest_min_pval)
#> [1] 4
top_row <- extended_solutions_matrix[4, ]
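
Hard-coding the row index works for a quick look, but the same selection can be done programmatically so the code keeps working if the settings matrix changes:

# Equivalent programmatic selection of the row with the lowest minimum p-value
best_row_index <- which.min(extended_solutions_matrix$"min_pval")
top_row <- extended_solutions_matrix[best_row_index, ]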

# Propagate that solution to the subjects in the test set; the data list
# below contains both the training and testing subjects
full_data_list <- generate_data_list(
    list(abcd_cort_t, "cort_t", "neuroimaging", "continuous"),
    list(abcd_cort_sa, "cort_sa", "neuroimaging", "continuous"),
    list(abcd_subc_v, "subc_v", "neuroimaging", "continuous"),
    list(abcd_h_income, "household_income", "demographics", "continuous"),
    list(abcd_pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "patient"
)

# Use the solutions matrix from the training subjects and the data list from
# the training and testing subjects to propagate labels to the test subjects
propagated_labels <- lp_solutions_matrix(top_row, full_data_list)
#> [1] "Processing row 1 of 1..."

head(propagated_labels)
#>                 subjectkey group 4
#> 1 subject_NDAR_INV0567T2Y9 train 1
#> 2 subject_NDAR_INV0J4PYA5F train 1
#> 3 subject_NDAR_INV10OMKVLE train 2
#> 4 subject_NDAR_INV15FPCW4O train 1
#> 5 subject_NDAR_INV19NB4RJK train 2
#> 6 subject_NDAR_INV1HLGR738 train 1
tail(propagated_labels)
#>                   subjectkey group 4
#> 95  subject_NDAR_INVJEV61XIU  test 2
#> 96  subject_NDAR_INVJR3S271G  test 2
#> 97  subject_NDAR_INVK9ULDQA2  test 1
#> 98  subject_NDAR_INVKYH529RD  test 1
#> 99  subject_NDAR_INVL045Z1TY  test 2
#> 100 subject_NDAR_INVLDQH8ATK  test 1
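
The group column distinguishes the originally clustered (train) subjects from the newly labeled (test) subjects, so the test-set assignments can be pulled out with a simple subset:

# Keep only the newly labeled test subjects
test_labels <- propagated_labels[propagated_labels$"group" == "test", ]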

You could, if you wanted, see how all of your clustering solutions propagate to the test set, but doing so means reusing your test set and giving up much of the protection against overfitting that this procedure provides.

propagated_labels_all <- lp_solutions_matrix(
    extended_solutions_matrix,
    full_data_list
)
#> [1] "Processing row 1 of 5..."
#> [1] "Processing row 2 of 5..."
#> [1] "Processing row 3 of 5..."
#> [1] "Processing row 4 of 5..."
#> [1] "Processing row 5 of 5..."

head(propagated_labels_all)
#>                 subjectkey group 1 2 3 4  5
#> 1 subject_NDAR_INV0567T2Y9 train 1 1 5 1 10
#> 2 subject_NDAR_INV0J4PYA5F train 2 1 5 1  3
#> 3 subject_NDAR_INV10OMKVLE train 1 1 3 2  5
#> 4 subject_NDAR_INV15FPCW4O train 1 1 4 1  4
#> 5 subject_NDAR_INV19NB4RJK train 1 1 8 2  9
#> 6 subject_NDAR_INV1HLGR738 train 1 2 8 1  9
tail(propagated_labels_all)
#>                   subjectkey group 1 2 3 4 5
#> 95  subject_NDAR_INVJEV61XIU  test 2 2 2 2 2
#> 96  subject_NDAR_INVJR3S271G  test 1 1 4 2 4
#> 97  subject_NDAR_INVK9ULDQA2  test 1 1 1 1 1
#> 98  subject_NDAR_INVKYH529RD  test 1 1 7 1 7
#> 99  subject_NDAR_INVL045Z1TY  test 1 2 6 2 8
#> 100 subject_NDAR_INVLDQH8ATK  test 1 1 6 1 8