Climate Diarrhea Analysis • DHSHarmonization

library(DHSHarmonization)

climate_diarrhea_analysis

This endpoint function develops the dataset for analyzing the relationship between climate variables and diarrhea incidence, based on Issue #2.

library(targets)
library(here)
#> here() starts at /n/holylabs/cgolden_lab/Lab/projects/DHSHarmonization
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
library(rdhs)
#> Thank you for using rdhs. If you are using rdhs regularly
#> or for automated tasks, please register for your own API key by
#> emailing api@dhsprogram.com. 
#> 
#> More info at <https://api.dhsprogram.com/#/introdevelop.html>
library(janitor)
#> 
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(glue)
# Load the `dhs_data_HR` target from the `_targets/` store under the project
# root into the current environment.
targets::tar_load(dhs_data_HR, store = here::here("_targets/"))

We first load the DHS data from the household recode and filter the relevant variables from the request:

# Number of elements held in `dhs_data_HR`.
length(dhs_data_HR)
#> [1] 7

There are 7 datasets; will they merge automatically?

# Try to row-bind every dataset in one call; if that fails, report the error
# message instead of aborting.
tryCatch(
  rdhs::rbind_labelled(dhs_data_HR),
  error = function(e) {
    message("Error in merging datasets: ", e$message)
  }
)
#> Error in merging datasets: undefined columns selected

No, so we have to filter them manually:

# DHS household-recode variable codes requested for this analysis.
# Each code appears exactly once, in the order it was first requested;
# `%in%` and `dplyr::any_of()` treat repeated entries as a single entry, so
# listing a code more than once has no effect on the selection.
var_req_list <- c(
  "hv001", "hv008", "hv209", "hv013", "hv009",
  "hv201", "hv025", "hv205", "hv225", "hv237",
  "hv235", "hv238", "hv237b", "hv237a", "hv237z",
  "hv237f", "hv237x", "hv237e", "hv237c", "hv246f",
  "hv237d", "hv246b", "hv246d", "hv246c", "hv246",
  "hv246e", "hv232", "hv238a", "hv246g", "hv230a",
  "hv230b", "sh18a", "hv246a", "hv246h", "sh139c",
  "sh139b", "sh139a", "hv233", "hv231", "hv246i",
  "hv246j", "hv246k", "hv230", "sh22a", "sh138"
)
# Build the flat variable dictionary for the HR datasets and turn each
# description into a syntactically clean name.
dhs_dict_HR <- dhs_data_HR %>%
  summarize_dhs_flat_dictionary() %>%
  mutate(description = janitor::make_clean_names(description))

# Show the dictionary rows that correspond to the requested variables.
dplyr::filter(dhs_dict_HR, variable %in% var_req_list)
#> # A tibble: 71 × 3
#> # Groups:   description [71]
#>    description                                  variable     n
#>    <chr>                                        <chr>    <int>
#>  1 cluster_number                               hv001        7
#>  2 date_of_interview_cmc                        hv008        7
#>  3 has_refrigerator                             hv209        7
#>  4 number_of_de_facto_members                   hv013        7
#>  5 number_of_household_members                  hv009        7
#>  6 source_of_drinking_water                     hv201        7
#>  7 type_of_place_of_residence                   hv025        7
#>  8 type_of_toilet_facility                      hv205        7
#>  9 share_toilet_with_other_households           hv225        5
#> 10 anything_done_to_water_to_make_safe_to_drink hv237        4
#> # ℹ 61 more rows

Now we need to filter each dataset for these variables and then merge them:

# For every HR dataset: keep hv000/hv006/hv007/hv008 plus whichever requested
# variables the dataset contains, extract the "dataset" element returned by
# data_and_labels(), and convert it with haven::as_factor(). The per-dataset
# results are then stacked into a single tibble.
dhs_data_filtered_list <- dhs_data_HR %>%
  map(function(hr_df) {
    kept <- dplyr::select(
      hr_df,
      hv000, hv006, hv007, hv008,
      dplyr::any_of(var_req_list)
    )
    labelled <- data_and_labels(kept)
    haven::as_factor(pluck(labelled, "dataset"))
  }) %>%
  bind_rows() %>%
  tibble()

To rename the columns:

# Named character vector for renaming: names are the cleaned descriptions
# (new column names), values are the DHS variable codes (current names).
rename_map <- dhs_dict_HR$variable
names(rename_map) <- dhs_dict_HR$description

# any_of() skips dictionary entries whose variable is absent from the data.
dhs_data_renamed <- dhs_data_filtered_list %>%
  rename(any_of(rename_map))

We should also build a proper interview date from the month and year of interview, and drop the raw date columns:

# Parse "<month> <year>" into a Date with lubridate's my(), then drop the raw
# month, year and CMC interview-date columns.
dhs_data_final <- dhs_data_renamed %>%
  mutate(
    interview_date = my(glue("{month_of_interview} {year_of_interview}"))
  ) %>%
  select(
    -month_of_interview,
    -year_of_interview,
    -date_of_interview_cmc
  )

# Summarise every column of the final dataset.
skimr::skim(dhs_data_final)

Data summary
Name	dhs_data_final
Number of rows	79280
Number of columns	46
_______________________
Column type frequency:
character	1
Date	1
factor	41
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
country_code_and_phase	0	1	3	3	0	6	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
interview_date	0	1	1992-02-01	2021-07-01	2011-03-01	42

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
has_refrigerator	0	1.00	FALSE	4	no: 75696, yes: 3561, mis: 14, 9: 9
source_of_drinking_water	0	1.00	FALSE	41	pub: 11443, unp: 10483, riv: 8538, unp: 8140
type_of_place_of_residence	0	1.00	FALSE	2	rur: 56455, urb: 22825, mis: 0
type_of_toilet_facility	0	1.00	FALSE	33	no : 24699, pit: 19399, pit: 5930, pit: 4536
bathroom_is_used_by_household_only	76318	0.04	FALSE	3	no: 1907, yes: 1018, 9: 37
share_toilet_with_other_households	40543	0.49	FALSE	3	yes: 25534, no: 13178, mis: 25
items_present_soap_or_detergent	55220	0.30	FALSE	3	no: 14078, yes: 9975, mis: 7
items_present_basin	73120	0.08	FALSE	3	no: 3333, yes: 2807, mis: 20
items_present_water_tap	73120	0.08	FALSE	3	no: 3771, yes: 2376, mis: 13
place_for_hand_washing	70860	0.11	FALSE	4	in : 6160, som: 1727, now: 531, mis: 2
source_of_drinking_water_2	70860	0.11	FALSE	15	riv: 3376, pub: 2023, pip: 557, pip: 522
anything_done_to_water_to_make_safe_to_drink	29629	0.63	FALSE	4	no: 29054, yes: 20480, don: 114, mis: 3
location_of_source_for_water	33327	0.58	FALSE	4	els: 39581, in : 6134, in : 223, mis: 15
number_of_households_sharing_toilet	60236	0.24	FALSE	11	2: 6121, 3: 4914, 4: 2774, 5: 1711
water_usually_treated_by_add_bleach_chlorine	29629	0.63	FALSE	4	no: 47771, yes: 1760, don: 114, mis: 6
water_usually_treated_by_boil	29629	0.63	FALSE	4	no: 32172, yes: 17359, don: 114, mis: 6
water_usually_treated_by_dont_know	29629	0.63	FALSE	6	no: 49526, don: 103, don: 11, mis: 6
water_usually_treated_by_let_it_stand_and_settle	29629	0.63	FALSE	4	no: 48563, yes: 968, don: 114, mis: 6
water_usually_treated_by_other	29629	0.63	FALSE	4	no: 49476, don: 114, yes: 55, mis: 6
water_usually_treated_by_solar_disinfection	29629	0.63	FALSE	4	no: 49501, don: 114, yes: 30, mis: 6
water_usually_treated_by_strain_through_a_cloth	29629	0.63	FALSE	4	no: 48999, yes: 532, don: 114, mis: 6
owns_chickens_poultry	29629	0.63	FALSE	78	non: 23307, 10: 2732, 2: 2635, 4: 2304
water_usually_treated_by_use_water_filter	29629	0.63	FALSE	4	no: 49213, yes: 318, don: 114, mis: 6
owns_cows_bulls	29629	0.63	FALSE	51	non: 45830, 2: 1080, 1: 777, 3: 481
owns_goats	29629	0.63	FALSE	44	non: 48035, 2: 282, 3: 172, 4: 171
owns_horses_donkeys_mules	29629	0.63	FALSE	17	non: 49465, 2: 122, 4: 12, 3: 11
owns_livestock_herds_or_farm_animals	29629	0.63	FALSE	3	yes: 31954, no: 17692, mis: 5
owns_sheep	29629	0.63	FALSE	40	non: 48341, 2: 273, 3: 142, 4: 140
owns_cs	29629	0.63	FALSE	62	non: 39286, 2: 2921, 1: 1904, 3: 1167
cattle_own	50139	0.37	FALSE	67	non: 22154, 2: 2031, 1: 882, 4: 821
ducks_geese_turkeys	40913	0.48	FALSE	50	non: 32225, 1: 1471, 2: 1357, 3: 673
household_has_basin	65644	0.17	FALSE	3	no: 8971, yes: 4622, mis: 43
household_has_soap_ash_or_other_cleansing_agent	65644	0.17	FALSE	3	yes: 7152, no: 6459, mis: 25
household_has_water_tap	65644	0.17	FALSE	3	yes: 11278, no: 2346, mis: 12
na_cs_own	58770	0.26	FALSE	27	non: 17155, 1: 1481, 2: 1039, 3: 336
na_owns_cs	79280	0.00	FALSE	0	non: 0, mor: 0, unk: 0, mis: 0
na_owns_cs_2	79280	0.00	FALSE	0	non: 0, mor: 0, unk: 0, mis: 0
usual_place_for_handwashing	61423	0.23	FALSE	4	in : 12329, now: 4213, som: 1307, mis: 8
location_of_toilet_facility	59367	0.25	FALSE	3	in : 11719, els: 7399, in : 795, mis: 0
place_where_household_members_wash_their_hands	58770	0.26	FALSE	5	obs: 12091, obs: 5809, not: 2244, not: 228
presence_of_water_at_hand_washing_place	61380	0.23	FALSE	2	wat: 11789, wat: 6111, mis: 0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
cluster_number	1	228.01	165.41	1	96	193	325	657	▇▇▃▂▂
number_of_de_facto_members	1	4.47	2.47	0	3	4	6	40	▇▁▁▁▁
number_of_household_members	1	4.68	2.48	1	3	4	6	41	▇▁▁▁▁

Looks great! Let’s put this in the function:

#climate_diarrhea_analysis()