Skip to contents

This function generates the major data object that will be processed when iterating through each SNF pipeline defined in the settings_matrix. The data_list is a named and nested list containing input dataframes (data), the name of each input dataframe (for the user's reference), the 'domain' of that dataframe (the broader source of information that the input dataframe is capturing, determined by the user's domain knowledge), and the type of variable stored in the dataframe (continuous, discrete, ordinal, categorical, or mixed).

Usage

generate_data_list(
  ...,
  uid = NULL,
  test_subjects = NULL,
  train_subjects = NULL,
  sort_subjects = TRUE,
  return_missing = FALSE
)

Arguments

...

Any number of lists formatted as (df, "df_name", "df_domain", "df_type") OR any number of lists of lists, where each inner list is formatted as (df, "df_name", "df_domain", "df_type")

uid

(string) the name of the uid column currently used in the data

test_subjects

character vector of test subjects (useful if building a full data list for label propagation)

train_subjects

character vector of train subjects (useful if building a full data list for label propagation)

sort_subjects

If TRUE, the subjects in the data_list will be sorted

return_missing

If TRUE, function returns a list where the first element is the data_list and the second element is a vector of unique IDs of patients who were removed during the complete data filtration step.

Examples

# Example inputs: four small dataframes that all share a `patient_id` column
# identifying the same three patients ("1", "2", "3").

# Two numeric variables per patient.
heart_rate_df <- data.frame(
    patient_id = c("1", "2", "3"),
    var1 = c(0.04, 0.1, 0.3),
    var2 = c(30, 2, 0.3)
)

# Two numeric variables per patient.
personality_test_df <- data.frame(
    patient_id = c("1", "2", "3"),
    var3 = c(900, 1990, 373),
    var4 = c(509, 2209, 83)
)

# Two integer-valued variables per patient (loaded below as "ordinal").
survey_response_df <- data.frame(
    patient_id = c("1", "2", "3"),
    var5 = c(1, 3, 3),
    var6 = c(2, 3, 3)
)

# One character variable per patient (loaded below as "categorical").
city_df <- data.frame(
    patient_id = c("1", "2", "3"),
    var7 = c("toronto", "montreal", "vancouver")
)

# Explicit loading: name each nested list element (data, name, domain, type).
# `uid` gives the name of the ID column used in the input dataframes.
data_list <- generate_data_list(
    list(
        data = heart_rate_df,
        name = "heart_rate",
        domain = "clinical",
        type = "continuous"
    ),
    list(
        data = personality_test_df,
        name = "personality_test",
        domain = "surveys",
        type = "continuous"
    ),
    list(
        data = survey_response_df,
        name = "survey_response",
        domain = "surveys",
        type = "ordinal"
    ),
    list(
        data = city_df,
        name = "city",
        domain = "location",
        type = "categorical"
    ),
    uid = "patient_id"
)

# Compact loading: the same four components supplied as unnamed lists, with
# elements given positionally in the order (data, name, domain, type).
data_list <- generate_data_list(
    list(heart_rate_df, "heart_rate", "clinical", "continuous"),
    list(personality_test_df, "personality_test", "surveys", "continuous"),
    list(survey_response_df, "survey_response", "surveys", "ordinal"),
    list(city_df, "city", "location", "categorical"),
    uid = "patient_id"
)

# Printing data_list summaries: one row per component of the data_list, with
# its name, type, domain, length and width (rendered output shown after `#>`).
summarize_dl(data_list)
#>                              name        type   domain length width
#> heart_rate             heart_rate  continuous clinical      3     3
#> personality_test personality_test  continuous  surveys      3     3
#> survey_response   survey_response     ordinal  surveys      3     3
#> city                         city categorical location      3     2

# Alternative loading: providing a single list of lists instead of passing
# each component list as a separate argument.
list_of_lists <- list(
    list(heart_rate_df, "data1", "domain1", "continuous"),
    list(personality_test_df, "data2", "domain2", "continuous")
)

# The list of lists is passed as one argument alongside `uid`.
dl <- generate_data_list(
    list_of_lists,
    uid = "patient_id"
)