Skip to contents

Given a solutions data frame containing clustered observations and a data list containing those clustered observations as well as additional to-be-clustered observations, this function will re-run SNF to generate a similarity matrix of all observations and use the label propagation algorithm to assigned predicted clusters to the unclustered observations.

Usage

label_propagate(partial_sol_df, full_dl, verbose = FALSE)

Arguments

partial_sol_df

A solutions data frame derived from the training set.

full_dl

A data list containing observations from both the training and testing sets.

verbose

If TRUE, output progress to console.

Value

A data frame with one row per observation containing a column for UIDs, a column for whether the observation was in the train (original) or test (held out) set, and one column per row of the solutions data frame indicating the original and propagated clusters.

Examples

## Function to identify obervations with complete data
#uids_with_complete_obs <- get_complete_uids(
#    list(subc_v, income, pubertal, anxiety, depress),
#    uid = "unique_id"
#)
#
## Dataframe assigning 80% of observations to train and 20% to test
#train_test_split <- train_test_assign(
#    train_frac = 0.8,
#    uids = uids_with_complete_obs
#)
#
## Pulling the training and testing observations specifically
#train_obs <- train_test_split$"train"
#test_obs <- train_test_split$"test"
#
## Partition a training set
#train_subc_v <- subc_v[subc_v$"unique_id" %in% train_obs, ]
#train_income <- income[income$"unique_id" %in% train_obs, ]
#train_pubertal <- pubertal[pubertal$"unique_id" %in% train_obs, ]
#train_anxiety <- anxiety[anxiety$"unique_id" %in% train_obs, ]
#train_depress <- depress[depress$"unique_id" %in% train_obs, ]
#
## Partition a test set
#test_subc_v <- subc_v[subc_v$"unique_id" %in% test_obs, ]
#test_income <- income[income$"unique_id" %in% test_obs, ]
#test_pubertal <- pubertal[pubertal$"unique_id" %in% test_obs, ]
#test_anxiety <- anxiety[anxiety$"unique_id" %in% test_obs, ]
#test_depress <- depress[depress$"unique_id" %in% test_obs, ]
#
## Find cluster solutions in the training set
#train_dl <- data_list(
#    list(train_subc_v, "subc_v", "neuroimaging", "continuous"),
#    list(train_income, "household_income", "demographics", "continuous"),
#    list(train_pubertal, "pubertal_status", "demographics", "continuous"),
#    uid = "unique_id"
#)
#
## We'll pick a solution that has good separation over our target features
#train_target_dl <- data_list(
#    list(train_anxiety, "anxiety", "behaviour", "ordinal"),
#    list(train_depress, "depressed", "behaviour", "ordinal"),
#    uid = "unique_id"
#)
#
#sc <- snf_config(
#    train_dl,
#    n_solutions = 5,
#    min_k = 10,
#    max_k = 30
#)
#
#train_sol_df <- batch_snf(
#    train_dl,
#    sc,
#    return_sim_mats = TRUE
#)
#
#ext_sol_df <- extend_solutions(
#    train_sol_df,
#    train_target_dl
#)
#
## Determining solution with the lowest minimum p-value
#lowest_min_pval <- min(ext_sol_df$"min_pval")
#which(ext_sol_df$"min_pval" == lowest_min_pval)
#top_row <- ext_sol_df[1, ]
#
## Propagate that solution to the observations in the test set
## data list below has both training and testing observations
#full_dl <- data_list(
#    list(subc_v, "subc_v", "neuroimaging", "continuous"),
#    list(income, "household_income", "demographics", "continuous"),
#    list(pubertal, "pubertal_status", "demographics", "continuous"),
#    uid = "unique_id"
#)
#
## Use the solutions data frame from the training observations and the data list
## from the training and testing observations to propagate labels to the test observations
#propagated_labels <- label_propagate(top_row, full_dl)
#
#propagated_labels_all <- label_propagate(ext_sol_df, full_dl)
#
#head(propagated_labels_all)
#tail(propagated_labels_all)