---
title: "Vaccination coverage survey, Goma IDP camps, June 2024"
output: 
  word_document:
    keep_md: true
---




```{r setup, include = FALSE, results='hide', message=FALSE, warning=FALSE}
## hide all code chunks in the output, but show errors
knitr::opts_chunk$set(echo = FALSE, error = TRUE, message= FALSE, warning = FALSE, fig.width = 6*1.25, fig.height = 6)


## set default NA to - in output, define figure width/height
options(knitr.kable.NA = "-")



## Installing required packages for this template
required_packages <- c("knitr",       # create output docs
                       "here",        # find your files
                       "rio",         # for importing data
                       "epitrix",     # clean/shape data
                       "dplyr",       # clean/shape data
                       "tidyr",       # clean/shape data
                       "forcats",     # manipulate and rearrange factors
                       "stringr",     # manipulate texts
                       "ggplot2",     # create plots and charts
                       "sitrep",      # MSF field epi functions
                       "survey",      # for survey functions
                       "srvyr" ,       # dplyr wrapper for survey package
                       "janitor",
                       "apyramid",
                       "lubridate",
                       "kableExtra", # for tables
                       "gt",
                       "gtsummary",
                       "arsenal"
                       )

for (pkg in required_packages) {
  ## install packages if not already present
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
  
  ## load packages to this current session 
  library(pkg, character.only = TRUE)
}


## set default text size to 18 for plots
## give classic black/white axes for plots
ggplot2::theme_set(theme_classic(base_size = 18))
```
```{r run_pop_estimate, message=FALSE, warning = FALSE}
# Source the R script
## Not shareable
source("Pop_estimation_Goma_2024.R")
```

```{r read_nonDHIS_data, warning = FALSE, message = FALSE}

##Individual
study_data_indiv<- rio::import(here::here("data", "Bulengo_Lush_baseline_vcs_V3_FINAL_18_06.xlsx"), which = "g6")
#Household
study_data_hh <- rio::import(here::here("data", "Bulengo_Lush_baseline_vcs_V3_FINAL_18_06.xlsx"), which = "Bulengo_Lush_baseline_vcs_V3")


```



<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// merge_data_levels \\\
--------------------------------------------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

```{r merge_data_levels}


## join the individual and household data to form a complete data set
study_data_raw <- left_join(study_data_hh, study_data_indiv, by = c("_index" = "_parent_index"))

##This is the equivalent of the pseudonimysed data set available in the MSF DSP
##pseudonimysation process was as below

## merge age variables ---------------------------------------------------------
# age_group_levels <- c("0-11 months", "12-23 months", "2-4 years", "5-14 years", "15-29 years", "30-44 years", "45+ years")
# 
# dat <- dat %>% 
#   mutate(
#     age_group = case_when(
#       age_months <12 ~ "0-11 months",
#       age_months>=12 & age_months <24 ~ "12-23 months",
#       age_years >= 2 & age_years <=4 ~ "2-4 years",
#       age_years >= 5 & age_years <=14 ~ "5-14 years",
#       age_years >= 15 & age_years <= 29 ~ "15-29 years",
#       age_years >= 30 & age_years <= 44 ~ "30-44 years",
#       age_years >= 45 ~ "45+ years",
#       ),
#     .after = age_years,
#     age_group = factor(age_group, levels = age_group_levels)
#   )
# 
# 
# # withhold old age variables
# dat$age_months <- NA_real_
# dat$age_years <- NA_real_
# 
# #withhold locations (not relevant for analysis)
# dat$ads <- NA_character_
# dat<-camp <- NA_character_

## Further adjustments to the below code will be required to run it with pseudonimysed data.
## E.g results by camp or aire de sante (ads) will not be available nor population estimates

```



<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// prep_data \\\
--------------------------------------------------------------------------------
This section is to import your data dictionary and standardise variable names. 

The Dharma data dictionary has variable names
in the column_name column. Possible values for each variable are
specified in choice_code and choice_name columns. 
Code has the shortened value and Name has the full-text value.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

```{r prep_data}
## generates MSF standard dictionary for Dharma 
study_data_dict <- msf_dict_survey("Vaccination_short")


## make a copy of your orginal dataset and name it study_data_cleaned
study_data_cleaned <- study_data_raw

## define clean variable names using clean_labels from the epitrix package. 
cleaned_colnames <- epitrix::clean_labels(colnames(study_data_cleaned))

## overwrite variable names with defined clean names
colnames(study_data_cleaned) <- cleaned_colnames 


## In this example, we rename variables several variables
## The formula for this is rename(data, NEW_NAME = OLD_NAME). 

study_data_cleaned <- rename(study_data_cleaned,
                             health_district = "lieu_de_residence")

study_data_cleaned <- study_data_cleaned %>% 
  mutate(health_district = recode (health_district, "communaut" = "resident"))
                          
```


```{r create_age_group}

## Age group variables ----------------------------------------------------------
## This step shows you how to create categorical variables from numeric variables.
## We have some intermediate steps on the way.

## make sure age is an integer 
study_data_cleaned <- study_data_cleaned %>% 
  mutate(age_years = as.integer(age_years), 
         age_months = as.integer(age_months))




#The below shows another option for age grouping
# create an age group variable (combining those over 5 years)
study_data_cleaned <- study_data_cleaned %>%
  mutate(age_group = factor(
    if_else(age_years >= 5,
      "5+",
      as.character(age_years)
    )
  ))

##Create age group 2

# Create a different age_group column using cut function
# Define first break points and labels for age groups
breaks <- c(0, 5, 15, 30, 45, Inf)
labels <- c("0-4", "5-14", "15-29", "30-44", "45+")

study_data_cleaned <- study_data_cleaned %>%
  mutate(age_group2 = cut(age_years, breaks = breaks, labels = labels, include.lowest = TRUE, right = FALSE))

## Create age group by 10 years for pyramind 
study_data_cleaned$age_group_10 <- age_categories(study_data_cleaned$age_years,
                                             lower = 0, 
                                             upper = 60, 
                                             by = 10)


```




<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// factor_vars \\\
--------------------------------------------------------------------------------

This chunk will help you clean factor variables.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

```{r factor_vars}


##DTP vaccine ever differencing card and verbal
study_data_cleaned <- study_data_cleaned %>% 
  mutate(dtp_vaccination_status = case_when(
    is.na(ever_vac)            ~ NA_character_,
    dtp_ever    == "yes" & vac_card_pic== "yes" ~ "Vaccinated - card", 
    dtp_ever    == "yes" & vac_card_pic== "no" ~ "Vaccinated - verbal",
    ever_vac == "no" ~ "Unvaccinated",
    dtp_ever == "no" ~ "Unvaccinated",
    ever_vac == "dont_know" ~ "Unknown",
    dtp_ever == "dont_know" ~ "Unknown")
  )


##DTP vaccine ever, card abd verbal same 
##The below code consider the do not know answer as "Unvaccinated"
study_data_cleaned <- study_data_cleaned %>% 
  mutate(dtp_vaccination_status_simple = case_when(
    is.na(ever_vac)            ~ NA_character_,
    dtp_ever    == "yes" & vac_card_pic== "yes" ~ "Vaccinated", 
    dtp_ever    == "yes" & vac_card_pic== "no" ~ "Vaccinated",
    ever_vac == "no" ~ "Unvaccinated",
    dtp_ever == "no" ~ "Unvaccinated",
    ever_vac == "dont_know" ~ "Unvaccinated",
    dtp_ever == "dont_know" ~ "Unvaccinated")
  )

##DTP vaccine number of doses 
##The below code consider the do not know answer as "Unvaccinated"
study_data_cleaned <- study_data_cleaned %>% 
  mutate(dtp_doses = case_when(
    is.na(ever_vac)            ~ NA_character_,
    dtp_ever    == "yes" & dtp_n== "1" ~ "1 dose", 
    dtp_ever    == "yes" & dtp_n== "2" ~ "2 dose",
    dtp_ever    == "yes" & dtp_n== "3" ~ "3 dose",
    dtp_ever    == "yes" & is.na(dtp_n) ~ NA_character_,
    ever_vac == "no" ~ "Unvaccinated",
    dtp_ever == "no" ~ "Unvaccinated",
    ever_vac == "dont_know" ~ "Unvaccinated",
    dtp_ever == "dont_know" ~ "Unvaccinated")
  )



# create month of arrival

study_data_cleaned <- study_data_cleaned %>%
  mutate(month_hh_arriv = paste(month(hh_arriv_date, label = TRUE, abbr = FALSE), year(hh_arriv_date), sep = "-"))

# Create and debug month_year_string column
study_data_cleaned <- study_data_cleaned %>%
  mutate(
    month_year_string = paste(year(hh_arriv_date), sprintf("%02d", month(hh_arriv_date)), "01", sep = "-")
  )

# Convert month_year_string to Date
study_data_cleaned <- study_data_cleaned %>%
  mutate(
    month_year_date = as.Date(month_year_string, format = "%Y-%m-%d")
  )

# Arrange data and factorize month_hh_arriv
study_data_cleaned <- study_data_cleaned %>%
  arrange(month_year_date) %>%
  mutate(
    month_hh_arriv = factor(month_hh_arriv, levels = unique(month_hh_arriv))
  )  %>%
  filter(!is.na(month_hh_arriv))  # Remove rows with NA in month_hh_arriv



```

```{r remove_unused_data_duplicates}


## store the cases that you drop so you can describe them (e.g. non-consenting)
dropped <- study_data_cleaned %>% 
  filter(consent=="no")
## drop the unused rows from the survey data set  
study_data_cleaned <- study_data_cleaned %>%
  filter(consent!= "no")

# ## drop the unused rows from the survey data set  
# study_data_cleaned <- study_data_cleaned %>%
#   filter(consent)


study_data_cleaned <- study_data_cleaned %>% 
  filter(!is.na(age_group))


```


<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// read_population_data \\\
--------------------------------------------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

```{r read_population_data, warning = FALSE, message = FALSE}


## generate population data by age groups in years for district A
population_data_age_district_a<- gen_population(total_pop = 330050, # set total population 
  groups      = c("0", "1", "2", "3", "4", "5+"), # set groups
  proportions = c(0.04, 0.04, 0.04, 0.04,
                  0.04, 0.8), # set proportions for each group
  strata      = c("male", "female")) %>%           # stratify by gender
  rename(age_group  = groups,                      # rename columns (NEW NAME = OLD NAME)
         sex        = strata,
         population = n) %>% 
  mutate(health_district = "camp")  


## generate population data by age groups in years for district A
population_data_age_district_b<- gen_population(total_pop = 346195, # set total population 
  groups      = c("0", "1", "2", "3", "4", "5+"), # set groups
  proportions = c(0.04, 0.04, 0.04, 0.04,
                  0.04, 0.8), # set proportions for each group
  strata      = c("male", "female")) %>%           # stratify by gender
  rename(age_group  = groups,                      # rename columns (NEW NAME = OLD NAME)
         sex        = strata,
         population = n) %>% 
  mutate(health_district = "resident")  

## bind region population data together to get overall population 
population_data_age <- bind_rows(population_data_age_district_a, 
                                 population_data_age_district_b)



##and create age group by months: create age group variable for under 2 years based on months
#Create months for all



study_data_cleaned <- study_data_cleaned %>%
  mutate(age_months = ifelse(is.na(age_months), age_years*12, age_months))  

# Define first break points and labels for age groups


breaks <- c(0, 4, 12, 24, 59, Inf)
labels <- c("0<4m", "4<11m", "12-23m", "24-59m", "5y+")

study_data_cleaned <- study_data_cleaned %>%
  mutate(age_group_mon = cut(age_months, breaks = breaks, labels = labels, include.lowest = TRUE, right = FALSE))




##Create other age group for exploring differences between resident and camps areas



# Define first break points and labels for age groups
breaks <- c(0, 5, 15, 30, 45, Inf)
labels <- c("0-4", "5-14", "15-29", "30-44", "45+")

study_data_cleaned <- study_data_cleaned %>%
  mutate(age_group2 = cut(age_years, breaks = breaks, labels = labels, include.lowest = TRUE, right = FALSE))

##Create sub set for children under 2--> this what will be used for weights, survey design  and vaccination coverage tables
study_data_target<- study_data_cleaned %>% 
  filter (age_years<2)

# I repeat the above age_group_mon, as otherwise there is a problem with one table

breaks <- c(0, 4, 12, 24)
labels <- c("0<4m", "4<11m", "12-23m")

study_data_target <- study_data_target %>%
  mutate(age_group_mon2 = cut(age_months, breaks = breaks, labels = labels, include.lowest = TRUE, right = FALSE))



```

```{r demo_details}

#calculate hh average size
mean_hh <- round(mean(study_data_cleaned$member_number, na.rm = TRUE), 2)


# 95% CI mean
result <- study_data_cleaned %>%
  summarize(
    mean_hh = mean(member_number, na.rm = TRUE),
    sem_hh = sd(member_number, na.rm = TRUE) / sqrt(n()),
    t_value = qt(0.975, df = n() - 1),
    margin_of_error = t_value * sem_hh,
    ci_lower = mean_hh - margin_of_error,
    ci_upper = mean_hh + margin_of_error
  )

# Format the 95% CI as a single string
mean_hhCI <- paste0(round(result$ci_lower, 2), "-", round(result$ci_upper, 2))

#median hh

median_hh <- median(study_data_cleaned$member_number, na.rm = TRUE)

iqr_hh<-IQR(study_data_cleaned$member_number)


###CALCULATE BY idpS AND RESIDENTS
#create subsets
idps<-study_data_cleaned %>% 
  filter(health_district == "camp")

resid<-study_data_cleaned %>% 
  filter(health_district == "resident")


##IDPs
#calculate hh average size
mean_hh_idp<-round(mean(idps$member_number,  na.rm = TRUE),2)


mean_hh <- round(mean(study_data_cleaned$member_number, na.rm = TRUE), 2)
# 95% CI mean
result <- idps %>%
  summarize(
    mean_hh = mean(member_number, na.rm = TRUE),
    sem_hh = sd(member_number, na.rm = TRUE) / sqrt(n()),
    t_value = qt(0.975, df = n() - 1),
    margin_of_error = t_value * sem_hh,
    ci_lower = mean_hh - margin_of_error,
    ci_upper = mean_hh + margin_of_error
  )

# Format the 95% CI as a single string
mean_hhCI_idp <- paste0(round(result$ci_lower, 2), "-", round(result$ci_upper, 2))


#median and IQR hh

median_hh_idp <- median(idps$member_number, na.rm = TRUE)

iqr_hh_idp<-IQR(idps$member_number)

##RESIDENTS
#calculate hh average size
mean_hh_res<-round(mean(resid$member_number,  na.rm = TRUE),2)

# 95% CI mean
result <- resid %>%
  summarize(
    mean_hh = mean(member_number, na.rm = TRUE),
    sem_hh = sd(member_number, na.rm = TRUE) / sqrt(n()),
    t_value = qt(0.975, df = n() - 1),
    margin_of_error = t_value * sem_hh,
    ci_lower = mean_hh - margin_of_error,
    ci_upper = mean_hh + margin_of_error
  )

# Format the 95% CI as a single string
mean_hhCI_res <- paste0(round(result$ci_lower, 2), "-", round(result$ci_upper, 2))


#median hh

median_hh_res <- median(resid$member_number, na.rm = TRUE)

iqr_hh_resid<-IQR(resid$member_number)


####CALCULATE % OF IDPS IN RESIDENT AREA
idp_per<-round((sum(resid$idp)/sum(resid$member_number))*100,2)



```

```{r survey_weights}
##Create survey weights  for a stratified design------------------------------------------------------------------

## create a variable called "surv_weight_strata" 
## contains weights for each individual - by age group, sex and health district
study_data_target <- add_weights_strata(x = study_data_target, 
                                         p = population_data_age, 
                                         surv_weight = "surv_weight_strata",
                                         surv_weight_ID = "surv_weight_ID_strata",
                                         age_group, sex, health_district)


```


```{r survey_design}

## simple random ---------------------------------------------------------------

survey_design_simple <- study_data_target %>% 
  as_survey_design(ids = 1, # 1 for no cluster ids 
                   weights = NULL, # No weight added
                   strata = NULL # sampling was simple (no strata)
                  )

## stratified ------------------------------------------------------------------

survey_design <- study_data_target %>% 
  as_survey_design(ids = 1, # 1 for no cluster ids 
                   weights = surv_weight_strata, # weight variable created above 
                   strata = health_district # sampling was stratified by district
  )
```


```{r save_cleaned_data}
# rio::export(study_data_cleaned, str_glue("study_data_cleaned_{Sys.Date()}.xlsx"))
```

# Results

## Population estimates

We included a total of `r nrow(study_data_hh)` points in the survey to estimate population. Details on points visited can be seen in annexes.

The calculation of the average shelter size was done using `r nrow(data_pop)`. The resulting total population estimate was `r total_population` (95%CI: `r total_ci`). Details of hut average size and household average size can be seen in annexes.

Among the study population  in the residents areas `r idp_per`% were IDPs.

Population estimates by camp can be seen in the table below


### Population estimates by camp

```{r population_estimates_table, warning=FALSE, message=FALSE, echo=FALSE}
# Print the results_df as a nice table using gt
results_df %>%
  gt() %>%
    fmt_number(
    columns = vars(Population),
    decimals = 0
  ) %>%
  cols_label(
    `95% CI` = "95% Confidence Interval"
  )



```


## Survey inclusion 



```{r inclusion_counts}

## get counts of number households 
num_hh <- study_data_cleaned %>% 
  ## get unique houses by cluster
  distinct(shelt_id, team, date) %>% 
  ## get number of rounds (count how many unique)
  nrow()

```


Among the `r nrow(study_data_hh)` points included there were `r num_hh` households that were interviewed (details on non visited points can be seen in annex). 


Among the `r nrow(dropped)` individuals without consent to participate in the survey, 
the reasons for refusal are described below. 
----- PLEASE ADD HERE ---


## Demographic information


In total we included `r nrow(study_data_cleaned)` indiviudals in the survey analysis. Among them `r nrow(study_data_target)` were 0 to 23 months.
The age break down and a statistical comparison with the source population is shown as a annex. 


```{r median_age_sex_ratios}
## compute the median age 
medage <- median(study_data_cleaned$age_years,  na.rm = TRUE)
## paste the lower and uper quartile together
iqr <- str_c(  # basically copy paste togehter the following
  ## calculate the 25% and 75% of distribution, with missings removed
  quantile(     
    study_data_cleaned$age_years, 
    c(0.25, 0.75), 
    na.rm = TRUE), 
  ## between lower and upper place an en-dash
  collapse = "--")


## compute overall sex ratio 
sex_ratio <- study_data_cleaned %>% 
  count(sex) %>% 
  pivot_wider(names_from = sex, values_from = n) %>% 
  mutate(ratio = round(male/female, digits = 3)) %>%
  pull(ratio)

## compute sex ratios by age group 
sex_ratio_age <- study_data_cleaned %>% 
  count(age_group2, sex) %>% 
  pivot_wider(names_from = sex, values_from = n) %>% 
  mutate(ratio = round(male/female, digits = 3)) %>%
  select(age_group2, ratio)

## sort table by ascending ratio then select the lowest (first)
min_sex_ratio_age <- arrange(sex_ratio_age, ratio) %>% slice(1)



```

Among the `r nrow(study_data_cleaned)` surveyed individuals, there were 
`r fmt_count(study_data_cleaned, sex == "female")` females and 
`r fmt_count(study_data_cleaned, sex == "male")` males (unweighted). The male to
female ratio was `r sex_ratio` in the surveyed population. The lowest male to
female ratio was `r min_sex_ratio_age$ratio`
in the `r min_sex_ratio_age$age_group` year age group (males more affected by conflict?)
The median age of surveyed individuals was `r medage` years (Q1-Q3 of `r iqr`
years). Children under five years of age made up 
`r fmt_count(study_data_cleaned, age_years < 5)`of the surveyed individuals.
The highest number of surveyed indivduals (unweighted) were in the 
`r table(study_data_cleaned$age_group) %>% which.max() %>% names()`
year age group.





### Unweighted age distribution of population by year age group and gender.

```{r describe_by_age_group_and_sex, warning = FALSE, message = FALSE }

tab_linelist(study_data_cleaned, age_group2, 
             strata     = sex, 
             row_total  = TRUE,
             col_total  = TRUE,
             prop_total = TRUE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Female cases (n)" = "female n",
         "% F"                = "female proportion",
         "Male cases (n)"   = "male n",
         "% M"                = "male proportion",
         "Total") %>%
  kable(digits = 1)
```


There were `r fmt_count(study_data_cleaned, is.na(sex))` cases missing information on sex and 
`r fmt_count(study_data_cleaned, is.na(age_group))` missing age group.

### Unweighted age and gender distribution of household population covered by the survey (population pyramid).


```{r age_pyramid, warning=FALSE}


study_data_cleaned %>% 
  ## make sure the variables are factors
  mutate(age_group_10 = factor(age_group_10), 
         sex = factor(sex)) %>%
  age_pyramid(
    age_group = "age_group_10",
    split_by = "sex", 
    proportion = TRUE)+
  labs(y = "Proportion", x = "Age group (years)") +                 # change axis labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )


```



### Unweighted age distribution of study target population by month and gender.


```{r describe_by_age_category_and_sex}

tab_linelist(study_data_target, age_group_mon, 
             strata     = sex, 
             row_total  = TRUE,
             col_total  = TRUE,
             prop_total = TRUE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Female cases (n)" = "female n",
         "%."                = "female proportion",
         "Male cases (n)"   = "male n",
         "%"                = "male proportion",
         "Total") %>%
  kable(digits = 1)
```



### Unweighted age distribution of study target population by month and area.


```{r describe_by_age_category_and_area}

tab_linelist(study_data_target, age_group_mon, 
             strata     = health_district, 
             row_total  = TRUE,
             col_total  = TRUE,
             prop_total = TRUE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Camp (n)" = "camp n",
         "%."                = "camp proportion",
         "Resident (n)"   = "resident n",
         "%"                = "resident proportion",
         "Total") %>%
  kable(digits = 1)
```

## Vaccination coverage
Children younger than two months were excluded from the analysis as first DTP dose is at 6 weeks. 
In all tables for DTP (ever or at least one dose) vaccination coverage children less than two months are excluded.
In all tables for coverage of DTP by number of doses children who are younger than 4 months were are excluded (as the last dose of DPT should be given at 14 weeks)
When no specified otherwise the vaccination coverage tables present results accepting equal validity from self-reported and card-reported vaccination status.
Tables presenting results for vaccination coverage for DTP (ever vaccinated with DTP)refer to the percentage of children who have ever received at least one dose of DTP. 

### Percentage of children that has never had any vaccination

```{r ever_vac}

survey_design %>%  # use the survey object (weighted)
  ## calculate weighted counts and weighted proportions
    tab_survey(ever_vac, deff = FALSE) %>% 
  ## select and rename appropriate columns 
  select("Ever vaccinated-any antigen" = value, 
         "Children (n)" = n, 
         "% (95% CI)" = ci )%>% 
         #"Design effect" = deff) %>% --> I will remove DEFF everywhere as it does not apply
  kable(digits = 1)
```


### Unweighted vaccination coverage for DTP (ever vaccinated with DTP)
We present unweigthed results because the main indicators (DTP vaccine coverage were the same in both locations) 


```{r vaccination_coverage_overall}

survey_design_simple %>%  # use the survey object (weighted)
  ## calculate weighted counts and weighted proportions
  filter(age_months>1) %>% 
  tab_survey(dtp_vaccination_status_simple, deff = FALSE) %>% 
  ## select and rename appropriate columns 
  select("Vaccination status" = value, 
         "Children (n)" = n, 
         "% (95% CI)" = ci )%>% 
         #"Design effect" = deff) %>% --> I will remove DEFF everywhere as it does not apply
  kable(digits = 1)
```



### Weighted vaccination coverage by number of doses

```{r vaccination_coverage_overall_dtp_n}

survey_design_simple %>%  # use the survey object (weighted)
  ## calculate weighted counts and weighted proportions
 filter(age_months>3) %>% 
  tab_survey(dtp_doses, deff = FALSE) %>% 
  ## select and rename appropriate columns 
  select("Vaccination status" = value, 
         "Children (n)" = n, 
         "% (95% CI)" = ci )%>% 
         #"Design effect" = deff) %>% --> I will remove DEFF everywhere as it does not apply
  kable(digits = 1)
```

### Unweighted DTP (ever DTP) vaccination coverage by area (IDP or resident)
Results by area will be presented unweighted as the different areas samples are not relevant and the age group strata are almost identical

```{r vaccination_coverage_strata}

survey_design_simple %>%  # use the survey object (weighted)
   filter(age_months>1) %>% 
  tab_survey(dtp_vaccination_status_simple, strata = health_district) %>% 
  select(-variable) %>%
  rename("Vaccination status" = value) %>% 
  augment_redundant(" (n)"         = " n") %>%  # wrap all "n" in braces (note space before n)
  rename_redundant("% (95% CI)"    = " ci") %>% # relabel all columns containing "ci"
  kable(digits = 1)
```


### Unweighted DTP vaccination coverage by number of doses and area
Including only children 4 to 23 months


```{r dtp_doses_strata}

survey_design_simple %>%  # use the survey object (weighted)
  filter (age_months>3) %>% 
  tab_survey(dtp_doses, strata = health_district) %>% 
  select(-variable) %>%
  rename("Vaccination status" = value) %>% 
  augment_redundant(" (n)"         = " n") %>%  # wrap all "n" in braces (note space before n)
  rename_redundant("% (95% CI)"    = " ci") %>% # relabel all columns containing "ci"
  kable(digits = 1)
```


### Unweighted DTP vaccination coverage by number of doses and age group



```{r dtp_doses_strata_age}

survey_design_simple %>%  # use the survey object (weighted)
  #filter (age_months>3) %>% 
  tab_survey(dtp_doses, strata = age_group_mon2) %>% 
  select(-variable) %>%
  rename("Vaccination status" = value) %>% 
  augment_redundant(" (n)"         = " n") %>%  # wrap all "n" in braces (note space before n)
  rename_redundant("% (95% CI)"    = " ci") %>% # relabel all columns containing "ci"
  kable(digits = 1)
```

### Unweighted DTP (ever) vaccination coverage by sex



```{r vaccination_coverage_sex}

survey_design_simple %>% # use the survey object (weighted)
  ## calculate weighted counts and weighted proportions - stratified by sex
   filter(age_months>1) %>% 
  tab_survey(dtp_vaccination_status_simple, strata = sex, deff = FALSE) %>%
  select(-variable) %>%
  rename("Vaccination status" = value) %>% 
  augment_redundant(" (n)"         = " n") %>%  # wrap all "n" in braces (note space before n)
  rename_redundant("% (95% CI)"    = " ci") %>%       # relabel all columns containing "ci"
                 
  kable(digits = 1)
```


### Unweighted DTP (ever) vaccination coverage by age group 
In the below table we did include children who are less than two months old, because results are anyway stratified by age group and this can help understanding better the data.
We present unweigthed results because the main indicators (DTP vaccine coverage) were the same in both locations.

```{r vaccination_coverage_age_group}
 
survey_design_simple %>% 
    tab_survey(dtp_vaccination_status_simple, strata = age_group_mon2, deff = FALSE, 
             digits = 1, drop = "Missing", transpose = "value") %>%
  rename("Age group (months)" = age_group_mon2) %>% 
  augment_redundant(" (n)"         = " n") %>%  # wrap all "n" in braces (note space before n)
  rename_redundant("% (95% CI)"    = " ci") %>%        # relabel all columns containing "ci"
                 
  kable(digits = 1)
```



## INTERPREATION OF RESULTS
```{r interpretations}


# Target population for resident areas
# AIRES DE SANTE	 
# Aire de Sante	POP 2024
# Buhimba	20040
# Mugunga	49199
  

total_population_res<-(20040+49199)*0.3+(20040+49199)

#create tab vaccine coverage ever DTp (for less than 4 m)



#Create table age groups
poprop<-tab_linelist(study_data_cleaned, age_group_mon, 
             strata     = health_district, 
             row_total  = FALSE,
             col_total  = TRUE,
             prop_total = FALSE)

#extract values from table
less_4m_idp <- poprop %>% filter(value == "0<4m") %>% pull(`camp proportion`)
four_11_idp<- poprop %>% filter(value == "4<11m") %>% pull(`camp proportion`)
twelve_23_idp<- poprop %>% filter(value == "12-23m") %>% pull(`camp proportion`)
twentythree_59_idp<- poprop %>% filter(value == "24-59m") %>% pull(`camp proportion`)


less_4m_res <- poprop %>% filter(value == "0<4m") %>% pull(`resident proportion`)
four_11_res<- poprop %>% filter(value == "4<11m") %>% pull(`resident proportion`)
twelve_23_res<- poprop %>% filter(value == "12-23m") %>% pull(`resident proportion`)
twentythree_59_res<- poprop %>% filter(value == "24-59m") %>% pull(`resident proportion`)

#Calculate populations points estimates
less_4m_idp_p<-less_4m_idp/100*total_population
m4to11_idp_p<-(four_11_idp/100)*total_population
m12to23_idp_p<-twelve_23_idp/100*total_population
m23to59_idp_p<-(twentythree_59_idp/100)*total_population

less_4m_res_p<-less_4m_res/100*total_population_res
m4to11_res_p<-four_11_res/100*total_population_res
m12to23_res_p<-twelve_23_res/100*total_population_res
m23to59_res_p<-twentythree_59_res/100*total_population_res

#Calculate populations points estimates (with 95%CI, for IDPs)
less_4m_idp_h<-less_4m_idp/100*CIb
m4to11_idp_h<-(four_11_idp/100)*CIb
m12to23_idp_h<-twelve_23_idp/100*CIb
m23to59_idp_h<-(twentythree_59_idp/100)*CIb



#Vaccination coverage estimates  (num,bers to be updated or extratced from tables). Using same estimates for IDPs and residents
#Never vaccinated
VCever0to3<-0.618
VCever4to11<-0.234
VCever12to23<-0.199

#Partially vaccinated adding those with 1 dose and those with 2 doses)
VCpart0to3<-0.263+0.105
VCpart4to11<-0.271+0.286
VCpart12to23<-0.078+0.265


  
#Chuldren needing all doses in IPDs
  #I consider higher 95%CIs for pop estimates
all_d0_3_i<-VCever0to3*less_4m_idp_h
all_d4_11_i<-VCever4to11*m4to11_idp_h
all_d12_23_i<-VCever12to23*m12to23_idp_h
all_d23_59_i<-VCever12to23*m23to59_idp_h #Consider same VC as 12 to 23 m

#Children needing some doses in IPD
 #I also consider higher 95% CI and 1 and 2 doses of VC
part_d0_3_i<-VCpart0to3*less_4m_idp_h
part_d4_11_i<-VCpart4to11*m4to11_idp_h
part_d12_23_i<-VCpart12to23*m12to23_idp_h
part_d23_59_i<-VCpart12to23*m23to59_idp_h


  #I consider higher 95%CIs for pop estimates IDPs
tot_all_idp<-round(all_d0_3_i+all_d4_11_i+all_d12_23_i+all_d23_59_i,0)

 #I also consider higher 95% CI and 1 and 2 doses of VC IDPs
tot_part_idp<-round(part_d0_3_i+part_d4_11_i+part_d12_23_i+part_d23_59_i,0)


#Chuldren needing all doses Residents
  #I consider higher 95%CIs for pop estimates
all_d0_3_r<-VCever0to3*less_4m_res_p
all_d4_11_r<-VCever4to11*m4to11_res_p
all_d12_23_r<-VCever12to23*m12to23_res_p
all_d23_59_r<-VCever12to23*m23to59_res_p #Consider same VC as 12 to 23 m

#Children needing some doses residents
 #I also consider higher 95% CI and 1 and 2 doses of VC
part_d0_3_r<-VCpart0to3*less_4m_res_p
part_d4_11_r<-VCpart4to11*m4to11_res_p
part_d12_23_r<-VCpart12to23*m12to23_res_p
part_d23_59_r<-VCpart12to23*m23to59_res_p


  #I consider higher 95%CIs for pop estimates
tot_all_r<-round(all_d0_3_r+all_d4_11_r+all_d12_23_r+all_d23_59_r,0 )

 #I also consider higher 95% CI and 1 and 2 doses of VC
tot_part_r<-round(part_d0_3_r+part_d4_11_r+part_d12_23_r+part_d23_59_r,0)


## totals

tot_all<-tot_all_idp + tot_all_r
tot_part<- round(tot_part_idp+tot_part_r,0)

##Formatting for report output

tot_all_idp <- format(tot_all_idp, scientific = FALSE, big.mark = ",", nsmall = 0)
tot_part_idp <- format(tot_part_idp, scientific = FALSE, big.mark = ",", nsmall = 0)
tot_part_r <- format(tot_part_r, scientific = FALSE, big.mark = ",", nsmall = 0)
tot_all_r<- format(tot_all_r, scientific = FALSE, big.mark = ",", nsmall = 0)
tot_all<-format(tot_all, scientific = FALSE, big.mark = ",", nsmall = 0)
tot_part<- format(tot_part, scientific = FALSE, big.mark = ",", nsmall = 0)

```

For the calculation in needs of vaccines we consider two sets of indicators:  
- Vaccination coverage estimates for the different age groups in the different areas (using the results presented in this report, and using overall coverage for both IDPs and residents, as results were similar)  
- Percentage of population (for IDPs we calculated this using satellite imagery and the survey and for resident areas we use MoH population estimates)  

#### Distribution of target population by age group (all children, inlcuding those vaccinated)
(In IDPs we consider the higher 95%CI of the estimates for population estimates) 
``` {r pop_inter, warning = FALSE, message= FALSE}
## Create table for target populations
# Create a data frame with the given structure
age_groups <- c("0-3 m", "4-11m", "12-23m", "24-59m", "Total")

# Combine the vectors into a data frame
table_data <- data.frame(
  `Age group` = age_groups,
  IDP = c(less_4m_idp_h, m4to11_idp_h, m12to23_idp_h, m23to59_idp_h, sum(less_4m_idp_h, m4to11_idp_h, m12to23_idp_h, m23to59_idp_h)),
  Residents = c(less_4m_res_p, m4to11_res_p, m12to23_res_p, m23to59_res_p, sum(less_4m_res_p, m4to11_res_p, m12to23_res_p, m23to59_res_p)),
  Total = c(
    sum(less_4m_idp_h, less_4m_res_p),
    sum(m4to11_idp_h, m4to11_res_p),
    sum(m12to23_idp_h, m12to23_res_p),
    sum(m23to59_idp_h, m23to59_res_p),
    sum(less_4m_idp_h, m4to11_idp_h, m12to23_idp_h, m23to59_idp_h, less_4m_res_p, m4to11_res_p, m12to23_res_p, m23to59_res_p)
  )
)



kable(table_data)



```

### Calculations of target population considering vaccination status

#### Children needing all doses
The estimated number of children needing ALL doses in IDP camps is: `r tot_all_idp`    
The estimated number of children needing ALL doses in reesident areas is: `r tot_all_r`  
**The total of children needing ALL doses would be  `r tot_all`**

#### Children needing partial doses
The estimated number of children needing SOME doses in IDP camps is: `r tot_part_idp`  
The estimated number of children needing SOME doses in resident areas is: `r tot_part_r`  
**The total of children needing SOME doses would be `r tot_part`** 

#### Assumptions
- We consider that >24



# Annexes



### Unweighted age and gender distribution, stratified by health district, of household population covered by the survey (population pyramid).


```{r age_pyramid_strata, warning=FALSE}



study_data_cleaned %>% 
  ## make sure the variables are factors
  mutate(age_group_10 = factor(age_group_10), 
         sex = factor(sex)) %>%
  age_pyramid(
    age_group = "age_group_10",
    split_by = "sex", 
     stack_by = "health_district",
    proportion = TRUE)+
  labs(y = "Proportion", x = "Age group (years)") +                 # change axis labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )


```


### Unweighted distribution of participants by age groups and area




```{r describe_by_age_category_and_area2}

tab_linelist(study_data_cleaned, age_group_mon, 
             strata     = health_district, 
             row_total  = FALSE,
             col_total  = TRUE,
             prop_total = FALSE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Camp (n)" = "camp n",
         "%."                = "camp proportion",
         "Resident (n)"   = "resident n",
         "%"                = "resident proportion") %>% 
        
  kable(digits = 1)
```

```{r describe_by_age_group_and_area, warning = FALSE, message = FALSE }




tab_linelist(study_data_cleaned, age_group2,
             strata     = health_district,
             row_total  = FALSE,
             col_total  = TRUE,
             prop_total = FALSE) %>%
  ## select and rename column names appropriately
  select("Age group"              = "value",
         "Camp (n)" = "camp n",
         "% camp"                = "camp proportion",
         "Resident (n)"   = "resident n",
         "% Resident"                = "resident proportion") %>% 
  kable(digits = 1)
```





### Weighted DTP ever vaccination coverage; distinguishing between vaccination cards and verbal confirmation 

```{r vaccination_coverage_overall_detail}

survey_design %>% # use the survey object (weighted)
  ## calculate weighted counts and weighted proportions
   filter(age_months>1) %>% 
  tab_survey(dtp_vaccination_status) %>% 
  select("Vaccination status" = value, 
         "Children (n)" = n,  
         "% (95% CI)" = ci) %>% 
  kable(digits = 1)
```



### Details of shelter average size, household average size and propoportion of IDPs living in resident areas

The mean shelter size in the IDP camps was: `r hut_mean`, 95%CI:`r hut_meanCI`

Points that were considered as zero for the calculation of the average shelter size for population estimates:  
- In `r nrow(not_sleep)` of the visited points the structure identified by satellite images and the mapathon was not a shelter used for sleeping or was not a shelter anymore.  
- In `r nrow(empty_shelters)` the structures were or had been used for sleeping but were empty (nobody had slept there the night before). 

Points that were discarded (considered NA) for the average shelter size calculations:  
- There were `r nrow(dropped)` points that were inhabited but the owner refused participation.  
- There were `r nrow(not_know_empty)` points for which the onwers were not found and the team could not establish if somebody had slept in the shelter. 


#### Median and mean household sizes by location


```{r median_mean_hh}
# Create a data frame with the desired structure
table_data <- data.frame(
  Location = c("Residents", "IDPs", "All"),
  Median = c(median_hh_res, median_hh_idp, median_hh),
  Median_IQR = c(iqr_hh_resid, iqr_hh_idp, iqr_hh),
  Mean = c(mean_hh_res, mean_hh_idp, mean_hh),
  Mean_95_CI = c(mean_hhCI_res, mean_hhCI_idp, mean_hhCI)
)

# Use gt to create a styled table
mean_med_hh <- table_data %>%
  gt() %>%
    cols_label(
    Median_IQR = "Median-IQR",
    Mean_95_CI = "Mean-95%CI"
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels(everything())
  )

# Print the gt table
mean_med_hh

```


<!-- The chunks like this will not be run
### Unweighted age distribution of population by year age group and gender, IDP camps. 

```{r describe_by_age_group_and_sex_idp, warning = FALSE, message = FALSE }
idp_pop<-study_data_cleaned %>%
  filter(health_district== "camp")

tab_linelist(idp_pop, age_group2,
             strata     = sex,
             row_total  = TRUE,
             col_total  = TRUE,
             prop_total = TRUE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Female cases (n)" = "female n",
         "% F"                = "female proportion",
         "Male cases (n)"   = "male n",
         "% M"                = "male proportion",
         "Total") %>%
  kable(digits = 1)
```

### Unweighted age distribution of population by year age group and gender, resident areas.

```{r describe_by_age_group_and_sex_res, warning = FALSE, message = FALSE }
idp_pop<-study_data_cleaned %>% 
  filter(health_district== "resident")

tab_linelist(idp_pop, age_group2, 
             strata     = sex, 
             row_total  = TRUE,
             col_total  = TRUE,
             prop_total = TRUE) %>%
  ## select and rename column names appropriately
  select("Age"              = "value",
         "Female cases (n)" = "female n",
         "% F"                = "female proportion",
         "Male cases (n)"   = "male n",
         "% M"                = "male proportion",
         "Total") %>%
  kable(digits = 1)
```

-->


### Distribution of households by month of arrival

```{r hh_month_arrival}
# Create the histogram
ggplot(study_data_cleaned, aes(x = month_hh_arriv)) +
  geom_bar() +
  xlab("Month-Year") +
  ylab("Count") +
  ggtitle("Distribution of Households by month of arrival") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```


### Analysis of potential sampling bias

The p-value for certain groups is less than 0.05, but the percentage of children sampled by age group it´s less than 1% different to what we expected (in all age groups). We consider this to be a very succesful sample despite the low sample size we aimed for.

```{r descriptive_sampling_bias, warning = FALSE}
## counts and props of the study population
ag <- tab_linelist(study_data_cleaned, "age_group") %>%
  mutate(n_total = sum(n)) %>%
  select(-variable) %>%
  rename(age_group = value) %>%
  mutate(age_group = fct_inorder(age_group))

## counts and props of the source population
propcount <- group_by(population_data_age, age_group) %>%
    tally(population) %>%
    mutate(proportion = n / sum(n))

## bind together the columns of two tables, group by age, and perform a 
## binomial test to see if n/total is significantly different from population
## proportion.
  ## suffix here adds to text to the end of columns in each of the two datasets
left_join(ag, propcount, by = "age_group", suffix = c("", "_pop")) %>%
  group_by(age_group) %>%

  ## broom::tidy(binom.test()) makes a data frame out of the binomial test and
  ## will add the variables p.value, parameter, conf.low, conf.high, method, and
  ## alternative. We will only use p.value here. You can include other
  ## columns if you want to report confidence intervals
  mutate(binom = list(broom::tidy(binom.test(n, n_total, proportion_pop)))) %>%
  unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame
  mutate(proportion_pop = proportion_pop * 100) %>%

  ## Adjusting the p-values to correct for false positives 
  ## (because testing multiple age groups). This will only make 
  ## a difference if you have many age categories
 mutate(p.value = p.adjust(p.value, method = "holm")) %>%
  select(age_group, n, proportion, n_pop, proportion_pop, p.value) %>%
                      
  ## Only show p-values over 0.001 (those under report as <0.001)
 mutate(p.value = ifelse(p.value < 0.001, "<0.001", as.character(round(p.value, 3)))) %>%

  ## rename the columns appropriatley
  
  rename(
    "Age group" = age_group,
    "Study population (n)" = n,
    "% study pop" = proportion,
    "Source population (n)" = n_pop,
    "% source pop" = proportion_pop,
    "P-value" = p.value
  ) %>%
  kable(digits = 1)

```