## Create clean data dictionary
##
## Builds a data dictionary from the cleaned study dataset, annotates it
## (variable types, origin, choice lists), exports it, and then produces a
## pseudonymised "SHARE" copy of both the dictionary and the dataset in which
## the withheld variables are flagged and blanked out.

# Activate packages -------------------------------------------------------
pacman::p_load(
  "here",
  "rio",
  "tidyverse",
  "janitor"
)

# NOTE(review): this installs from GitHub on every run, before p_load_gh()
# below loads the same package - confirm that both calls are needed.
remotes::install_github("epicentre-msf/datadict")

# Load/install packages from GitHub
pacman::p_load_gh(
  # data dictionary
  "epicentre-msf/datadict" # create/validate data dictionary
)

# Import data -------------------------------------------------------------
## Import the clean dataset as a tibble
dat <- rio::import(
  here("4_Data", "clean", "study_data_cleaned_under5_2024-03-22.xlsx"),
  setclass = "tbl"
)

# Create data dictionary --------------------------------------------------
kule_vcs_clean_dict <- dict_from_data(dat)

# Data cleaning for data dictionary ---------------------------------------
## Override the variable type for the free-text and coded-list variables;
## every other variable keeps the type produced by dict_from_data()
kule_vcs_clean_dict <- kule_vcs_clean_dict %>%
  mutate(
    type = case_when(
      variable_name %in% c("zone_other", "no_consent_reason") ~ "Free text",
      variable_name %in% c(
        "consent",
        "vaccination_status_simple_msf",
        "vaccination_status_simple_rout"
      ) ~ "Coded list",
      .default = type
    )
  )

## Change origin of variable for the derived variables
derived_vars <- c(
  "consent",
  "age_group",
  "age_group_mon",
  "age_category",
  "vaccination_status",
  "vaccination_status_simple",
  "vaccination_status_msf",
  "vaccination_status_simple_msf",
  "vaccination_status_rout",
  "vaccination_status_simple_rout",
  "weight"
)

## Mark every derived variable in one vectorised assignment (this is also
## safe when the dictionary has zero rows, unlike a loop over 1:nrow())
is_derived <- kule_vcs_clean_dict$variable_name %in% derived_vars
kule_vcs_clean_dict$origin[is_derived] <- "derived"

## Add the choice lists ("value, label | value, label") for coded variables;
## all other variables keep their existing choices.
## NOTE(review): the vaccination_status_simple_* variables map 0 to TRUE and
## 1 to FALSE, the reverse of consent - confirm this is intended.
kule_vcs_clean_dict <- kule_vcs_clean_dict %>%
  mutate(
    choices = case_when(
      variable_name == "consent" ~ "0, FALSE | 1, TRUE",
      variable_name %in% c(
        "vaccination_status_simple_msf",
        "vaccination_status_simple_rout"
      ) ~ "0, TRUE | 1, FALSE",
      variable_name == "vaccination_status" ~
        "0, Card | 1, Unvaccinated | 2, Verbal",
      variable_name == "age_category" ~
        "0, 0-5 months | 1, 6-8 months | 2, 9-12 months | 3, 1 years | 4, 2 years | 5, 3 years | 6, 4 years | 7, 5 years",
      .default = choices
    )
  )

## Export the clean data dictionary
export(
  kule_vcs_clean_dict,
  here("4_Data", "dictionary", "clean", "kule_measles_dict_vcs_2023_clean.xlsx")
)

# Pseudonymisation process ------------------------------------------------
## First set of variables to withhold
vars_withhold <- c(
  "age_months",
  "age_years"
)

## Create a copy of the dictionary and flag the first set as withheld
dict <- kule_vcs_clean_dict
dict$status[dict$variable_name %in% vars_withhold] <- "withheld"

## Keep a running list of every withheld variable so that the dataset columns
## blanked further down always match what is flagged in the dictionary
vars_withheld_all <- vars_withhold

## Assess re-identification risk criterion using all indirect identifiers
vars_indirect <- c(
  "sex",
  "age_group",
  "age_group_mon",
  "age_category",
  "zone_name"
)

test <- datadict::k_anonymity_counts(dat, vars_indirect, threshold = 5)

## Check other combinations of variables to see if more need to be withheld
datadict::k_anonymity_counts(dat, c("age_group"), threshold = 5)
datadict::k_anonymity_counts(dat, c("age_group_mon"), threshold = 5)
datadict::k_anonymity_counts(dat, c("age_category"), threshold = 5)
datadict::k_anonymity_counts(dat, c("zone_name", "age_group"), threshold = 5)

## Zone name and sex variables as well as other age variables make
## individuals more directly identifiable and need to be withheld
vars_withhold <- c(
  "zone_name",
  "age_category",
  "age_group_mon",
  "sex"
)

dict$status[dict$variable_name %in% vars_withhold] <- "withheld"
vars_withheld_all <- c(vars_withheld_all, vars_withhold)

# Remove withheld variables from dataset ----------------------------------
## Overwrite each withheld column with NA (the column itself is kept)
for (withheld_var in vars_withheld_all) {
  dat[[withheld_var]] <- NA
}

## Validate the dictionary, then the dataset against the dictionary
datadict::valid_dict(dict)
datadict::valid_data(dat, dict)

### Export cleaned dictionary and dataset
export(
  dat,
  here("4_Data", "clean", "kule_measles_data_vcs_2023_clean_SHARE.xlsx")
)
export(
  dict,
  here(
    "4_Data", "dictionary", "clean",
    "kule_measles_dict_vcs_2023_clean_SHARE.xlsx"
  )
)