/*
Title: Dietary Index Distribution
Author: NCI/Information Management Services
Date: 3/25/2025
*/

/*
This example demonstrates creating a distribution of Healthy Eating Index 2020 (HEI-2020) scores using the NCI method.

NOTE: THIS EXAMPLE MAY HAVE LONG COMPUTATION TIMES; IT IS RECOMMENDED TO RUN THIS PRIOR TO THE BREAKOUT SESSION.
*/

libname indata "./ncimultivar/data";

%include "./ncimultivar/macros/ncimultivar.sas";
%include "./hei2020/hei2020.score.macro.sas";

/*
Since the NCI method can generate distributions of multiple foods and nutrients, it is possible to obtain distributions of dietary index scores.

In this example, a distribution of HEI-2020 scores will be generated from 2005-2010 NHANES data.
A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). 
Two nuisance covariates will be factored in as well: whether the recall was on a weekend (Weekend) and whether the recall is on day 2 (Day2).

The WTDRD1 variable is the weighting for each observation.

Subjects with missing values are removed, and categorical variables are transformed into binary indicators.
*/

**keep only the six strata used in this example and flag second-day recalls;
data input_dataset;
	set indata.nhcvd (where=(SDMVSTRA in (48, 54, 60, 66, 72, 78)));

	**Day2 is 1 when DAY equals 2 and 0 otherwise;
	Day2 = (DAY = 2);
run;

data input_dataset;
	set input_dataset;

	**discard every record that lacks a value for at least one covariate;
	if missing(SMK_REC)  or
		 missing(RIDAGEYR) or
		 missing(RIAGENDR) or
		 missing(Weekend)  or
		 missing(Day2) then delete;

	**one 0-or-1 flag per smoking category, for SMK_REC levels 1, 2 and 3 in that order;
	array smoking_flag {3} Current_Smoker Former_Smoker Never_Smoker;
	do _smk_level = 1 to 3;
		smoking_flag{_smk_level} = (SMK_REC = _smk_level);
	end;
	drop _smk_level;
run;

/*
When creating distributions of a dietary index, a measurement error-corrected distribution of the components must be generated first. 
Then, the dietary index scores should be calculated from the distribution of the components. 
This is analogous to the process for calculating ratios of usual intakes.
The dietary index should not be applied directly to the raw data.

The variables being modeled should be properly disaggregated so that there is no overlap between them. 
For example, total vegetables (V_TOTAL) includes dark green vegetables (V_DRKGR). 
A new variable for non-dark green vegetables (V_NONDRKGR, see below) must be created to model alongside dark green vegetables. 
Failing to disaggregate variables will substantially impact the convergence of the model. 
*/

**create the disaggregated, non-overlapping variables that will be modeled;
data input_dataset;
	set input_dataset;
	
	**vegetables other than dark green;
	**a negative difference is floored at zero, while a missing difference stays missing;
	V_NONDRKGR = V_TOTAL - V_DRKGR;
	if not missing(V_NONDRKGR) and V_NONDRKGR < 0 then V_NONDRKGR = 0;
	
	**whole fruit is citrus, melons and berries plus other fruit, with juice modeled separately;
	F_WHOLEFRT = F_CITMLB + F_OTHER;
	
	**seafood and plant proteins excluding legumes, which are modeled separately as PF_LEGUMES;
	**nuts and seeds (PF_NUTSDS) belong to this HEI-2020 component together with soy and seafood;
	**NOTE(review): assumes the input data carries the FPED column PF_NUTSDS - confirm;
	PF_SEAPLANT = PF_SOY + PF_NUTSDS + PF_SEAFD_HI + PF_SEAFD_LOW;
	
	**remaining protein foods;
	**max(0, x) is avoided here because the MAX function ignores missing arguments and would turn a missing difference into zero;
	PF_NONSEAPLANT = PF_TOTAL - PF_SEAPLANT;
	if not missing(PF_NONSEAPLANT) and PF_NONSEAPLANT < 0 then PF_NONSEAPLANT = 0;
	
	**fatty acids: mono- plus polyunsaturated, and saturated;
	MONOPOLY = TMFAT + TPFAT;
	SATFAT = TSFAT;
	
	SODIUM = TSODI;
	
	KCAL = TKCAL;
run;

/*
The variables can now be transformed and standardized for use in the MCMC algorithm.

After disaggregation, the HEI-2020 contains six episodic variables and nine daily variables.
The Winsorization and Box-Cox survey code would have to be repeated for all of the variables separately.
To reduce this repetitive work, a macro will be used to automate Winsorization and finding Box-Cox lambdas.
This tutorial assumes some familiarity with writing SAS macros.
*/

**Space-delimited list of the variables that boxcox_all flags as is_episodic=Y when Winsorizing;
**This list is also passed as episodic_variables= to the minimum amount, preprocessor and MCMC macro calls later in this script;
%let episodic_variables = PF_LEGUMES F_WHOLEFRT G_WHOLE F_JUICE PF_SEAPLANT V_DRKGR;
**Space-delimited list of the variables that boxcox_all flags as is_episodic=N when Winsorizing;
**This list is also passed as daily_variables= to the same macro calls;
%let daily_variables = ADD_SUGARS D_TOTAL G_REFINED MONOPOLY SATFAT SODIUM PF_NONSEAPLANT V_NONDRKGR KCAL;

/*
Macro: boxcox_all

Winsorizes every listed variable in the dataset input_dataset and then finds a Box-Cox lambda for each one.

Parameters:
  episodic_variables=  Space-delimited list of variables passed to boxcox_survey with is_episodic=Y
                       during Winsorization. May be blank.
  daily_variables=     Space-delimited list of variables passed to boxcox_survey with is_episodic=N
                       during Winsorization. May be blank.

Datasets:
  input_dataset        Read and overwritten in place (the name is hard-coded). After each merge it also
                       holds whatever columns the win_<variable> dataset contributes, including
                       <variable>_win.
  boxcox_lambda_data   Created by stacking the bc_<variable> datasets in list order (episodic first).

This macro relies on boxcox_survey writing win_<variable> (containing SEQN, DAY, <variable> and
<variable>_win) when do_winsorization=Y, and bc_<variable> in the lambda search, as used below.
*/
%macro boxcox_all(episodic_variables=,
                  daily_variables=);

	/* keep all working macro variables out of the calling scopes */
	%local num_episodic num_daily num_total all_variables i variable is_episodic;

	/* a blank list would leave COUNTW without its first argument, so count it as zero explicitly */
	%if %length(&episodic_variables.) = 0 %then %let num_episodic = 0;
	%else %let num_episodic = %sysfunc(countw(&episodic_variables., %str( )));

	%if %length(&daily_variables.) = 0 %then %let num_daily = 0;
	%else %let num_daily = %sysfunc(countw(&daily_variables., %str( )));

	%let num_total = %eval(&num_episodic. + &num_daily.);

	/* with nothing to process, the final SET statement would have no dataset names and would read the most recently created dataset instead */
	%if &num_total. = 0 %then %do;

		%put ERROR: boxcox_all requires at least one variable in episodic_variables= or daily_variables=.;
		%return;
	%end;

	/* episodic variables come first, so positions 1 to num_episodic are the episodic ones */
	%let all_variables = &episodic_variables. &daily_variables.;

	/* Pass 1: Winsorize each variable */
	%do i = 1 %to &num_total.;

		%let variable = %sysfunc(scan(&all_variables., &i., %str( )));

		%if &i. <= &num_episodic. %then %let is_episodic = Y;
		%else %let is_episodic = N;

		%boxcox_survey(input_data=input_dataset,
		               row_subset=%quote(Day2 = 0),
		               variable=&variable.,
		               is_episodic=&is_episodic.,
		               weight=WTDRD1,
		               do_winsorization=Y,
		               id=SEQN,
		               repeat_obs=DAY);

		/* merge in the Winsorization report and overwrite the variable wherever a Winsorized value is supplied */
		proc sort data=input_dataset; by SEQN DAY &variable.; run;
		proc sort data=win_&variable.; by SEQN DAY &variable.; run;
		data input_dataset;
			merge input_dataset (in=in_input)
			      win_&variable.;
			by SEQN DAY &variable.;
			if in_input = 1;

			if not missing(&variable._win) then &variable. = &variable._win;
		run;
	%end;

	/* Pass 2: find the best Box-Cox lambda for each Winsorized variable */
	%do i = 1 %to &num_total.;

		%let variable = %sysfunc(scan(&all_variables., &i., %str( )));

		%boxcox_survey(input_data=input_dataset,
		               row_subset=%quote(Day2 = 0),
		               variable=&variable.,
		               covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend,
		               weight=WTDRD1);
	%end;

	/* stack the per-variable lambda datasets into a single dataset */
	data boxcox_lambda_data;
		set
		%do i = 1 %to &num_total.;
			bc_%sysfunc(scan(&all_variables., &i., %str( )))
		%end;
		;
	run;
%mend boxcox_all;

**Winsorize all modeled variables and build the combined Box-Cox lambda dataset;
%boxcox_all(daily_variables=&daily_variables.,
            episodic_variables=&episodic_variables.);
						
**find the minimum amount of every modeled variable using the rows where Day2 = 0;
%calculate_minimum_amount(input_data=input_dataset,
                          episodic_variables=&episodic_variables.,
                          daily_variables=&daily_variables.,
                          row_subset=%quote(Day2 = 0));
													
**Run the MCMC pre-processor with the Box-Cox lambdas and minimum amounts found above;
**RIDAGEYR is the only continuous covariate, and outputs use the name prefix model;
%nci_multivar_preprocessor(input_data=input_dataset,
                           daily_variables=&daily_variables.,
                           episodic_variables=&episodic_variables.,
                           continuous_covariates=RIDAGEYR,
                           minimum_amount_data=minimum_amount_data,
                           boxcox_lambda_data=boxcox_lambda_data,
                           outname=model);
													 
/*
The MCMC model can now be fit using all of the variables.

Since this model contains six episodic variables and nine daily variables, convergence is expected to be slower than univariate or bivariate models.
As a result, the number of iterations and burn-in have been increased substantially.
To account for slower convergence, the thinning number has been increased to ensure that the iterations used for posterior mean calculation are independent.

This MCMC model takes a significant amount of time to run due to large number of variables and iterations.
*/

**fit the multivariate MCMC model: 9000 iterations, 5000 burn-in, thinning of 4, fixed seed;
%nci_multivar_mcmc(pre_mcmc_data=model,
                   outname=model,
                   id=SEQN,
                   repeat_obs=DAY,
                   weight=WTDRD1,
                   daily_variables=&daily_variables.,
                   episodic_variables=&episodic_variables.,
                   default_covariates=Current_Smoker Former_Smoker std_RIDAGEYR RIAGENDR Day2 Weekend,
                   num_mcmc_iterations=9000,
                   num_burn=5000,
                   num_thin=4,
                   mcmc_seed=9999);
									 
/*
A simulated dataset representing the distribution of usual intakes for each variable can now be generated. 
This is not a prediction of actual usual intake for each subject.
*/

**build the population for the distribution: two rows per subject, one weekday and one weekend;
proc sort data=model_mcmc_in;
	by SEQN;
run;

data distrib_pop;
	set model_mcmc_in;
	by SEQN;
	
	**only the first record of each subject is used;
	if not first.SEQN then delete;
	
	**Day2 is fixed at zero to factor out the effect of Day 2 recalls;
	Day2 = 0;
	
	**Weekend = 0 carries a weight of 4 and Weekend = 1 carries a weight of 3;
	do Weekend = 0 to 1;
		Weekend_Weight = 4 - Weekend;
		output;
	end;
run;

**simulate 200 usual intakes per subject from the fitted model, with Weekend_Weight as the nuisance weight;
%nci_multivar_distrib(multivar_mcmc_model=model,
                      distrib_population=distrib_pop,
                      outname=model_distrib_out,
                      id=SEQN,
                      weight=WTDRD1,
                      nuisance_weight=Weekend_Weight,
                      num_simulated_u=200,
                      distrib_seed=99999);
											
/*
The simulated distribution of usual intakes can now be used to calculate a distribution of HEI-2020 scores.

The HEI-2020 scoring for toddlers (under age 2) is different from the standard HEI-2020 scoring.
This dataset only has subjects age 20-85, so the standard HEI-2020 score will be calculated for all subjects.
*/

**derive the combined usual intake variables that are passed to the HEI-2020 scoring macro;
data model_distrib_out;
	set model_distrib_out;
	
	**vegetable components: legumes are divided by 4 before being added to dark green vegetables;
	usl_V_DRKGRLEG = usl_PF_LEGUMES/4 + usl_V_DRKGR;
	usl_V_TOTALLEG = usl_V_NONDRKGR + usl_V_DRKGRLEG;
	
	**total fruit is juice plus whole fruit;
	usl_F_TOTAL = usl_F_JUICE + usl_F_WHOLEFRT;
	
	**protein components: legumes are added to seafood and plant proteins, then the remaining protein foods are added for the total;
	usl_PF_SEAPLANTLEG = usl_PF_LEGUMES + usl_PF_SEAPLANT;
	usl_PF_ALLPROTLEG = usl_PF_NONSEAPLANT + usl_PF_SEAPLANTLEG;
run;

**score every simulated usual intake record with the HEI-2020 macro and write the result to hei_scores;
%HEI2020(indat=model_distrib_out,
         outdat=hei_scores,
         kcal=usl_KCAL,
         vtotalleg=usl_V_TOTALLEG,
         vdrkgrleg=usl_V_DRKGRLEG,
         f_total=usl_F_TOTAL,
         fwholefrt=usl_F_WHOLEFRT,
         g_whole=usl_G_WHOLE,
         d_total=usl_D_TOTAL,
         pfallprotleg=usl_PF_ALLPROTLEG,
         pfseaplantleg=usl_PF_SEAPLANTLEG,
         monopoly=usl_MONOPOLY,
         satfat=usl_SATFAT,
         g_refined=usl_G_REFINED,
         sodium=usl_SODIUM,
         add_sugars=usl_ADD_SUGARS);
				 
**summarize the total HEI-2020 score: weighted mean plus the requested quantiles;
%nci_multivar_summary(input_data=hei_scores,
                      outname=hei_summary,
                      variables=HEI2020_TOTAL_SCORE,
                      weight=WTDRD1,
                      do_means=Y,
                      do_quantiles=Y,
                      quantiles=5 25 50 75 95);

**print the summary table;
title "HEI-2020 Total Scores";
proc print data=hei_summary;
run;

