/*
Clean and prepare data for the MCMC model

Description:

	Removes missing and negative values from variables, makes indicators for episodic variables, 
	and standardizes variables and covariates
	
Parameters:

	- input_data: A SAS data set.
	- episodic_variables: Space-delimited list of episodic variables (max 28 characters each).
	- episodic_biomarkers: Space-delimited list of episodic biomarkers (max 28 characters each).
	- daily_variables: Space-delimited list of daily variables (max 28 characters each).
	- daily_biomarkers: Space-delimited list of daily biomarkers (max 28 characters each).
	- continuous_covariates: Space-delimited list of continuous covariates to be standardized (max 28 characters each)
	- boxcox_lambda_data: A SAS data set with Box-Cox lambda parameters for each variable. Must only contain the following columns:
													- variable: Name of the variable.
													- tran_lambda: Box-Cox transformation parameter to use for the variable.
	- minimum_amount_data: A SAS data set with minimum consumption amounts for each variable. Must contain only the following columns:
												 	 - variable: Name of the variable.
												 	 - minamount: Minimum amount consumed for the variable.
	- outlib: The SAS library to store output datasets. (default = WORK).
	- outname: The prefix used to name output datasets (max 23 characters).
	
Output:

	The following SAS datasets are created in outlib and prefixed with outname:
	
		- _mcmc_in: A SAS data set with all of the columns of input_data plus any created indicator variables and standardized variables and covariates.
		- _backtran: A SAS data set with the following columns:
								 	 - variable: The name of the variable.
								 	 - tran_lambda: The Box-Cox lambda used to transform the variable.
								 	 - minamount: The minimum allowed usual intake, defined as half of the smallest non-zero intake in the observed data.
								 	 - tran_center: The mean of the Box-Cox transformed variable before standardization.
								 	 - tran_scale: The standard deviation of the Box-Cox transformed variable before standardization divided by sqrt(2).
								 	 - biomarker: Binary indicator of whether the variable is a biomarker assumed to be unbiased on the transformed scale.
								 	 							If 0, a bias correction factor will be added and a 9-point approximation will be used for backtransformation.
						 										If 1, an exact backtransformation will be used with no correction.
	
Details:

	For episodic variables, the MCMC requires a separate indicator and
	amount to model the probability of consumption and the amount consumed. The
	amounts (but not the 0/1 indicators) of all variables are Box-Cox transformed,
	then standardized to a mean of 0 and variance of 2. Continuous covariates
	are standardized to a mean of 0 and a variance of 1. This standardization
	is done to meet the assumptions of the MCMC laid out in Zhang, et al.
	(2011). Indicator variables have the prefix ind_, standardized amounts
	have the prefix amt_, and standardized covariates have the prefix std_.
*/
%macro nci_multivar_preprocessor(input_data=,
																 episodic_variables=,
																 episodic_biomarkers=,
																 daily_variables=,
																 daily_biomarkers=,
																 continuous_covariates=,
																 boxcox_lambda_data=,
																 minimum_amount_data=,
																 outlib=WORK,
																 outname=);

	/*
	Cleans input_data and builds the two output data sets
	(outlib.outname_mcmc_in and outlib.outname_backtran).

	Processing steps:
		1. Append each biomarker to its variable list if it is not already there.
		2. Split input_data into _cleaned and _excluded (any missing value in a
		   variable or continuous covariate, or any negative variable value).
		3. Collapse the Box-Cox lambdas and minimum amounts into a single record.
		4. Create ind_ indicators (episodic only) and Box-Cox transformed amt_ values.
		5. Standardize each amt_ variable with PROC STDIZE (mult = sqrt(2)) and
		   record the centre and scale used.
		6. Standardize each continuous covariate into a std_ copy.
		7. Write the backtransformation data and the MCMC input data.

	Side effects:
		- boxcox_lambda_data and minimum_amount_data are sorted in place by variable.
		- Intermediate data sets are left in WORK: _cleaned, _excluded,
		  _lambda_minamount, _lambda_minamount_1record, _std_data, _std1.._stdN,
		  _centerscale1.._centerscaleN and _centerscale_data.
		- The excluded observations are printed.
	*/

	%local num_episodic_variables num_daily_variables num_variables i j variable
				 num_episodic_biomarkers num_daily_biomarkers biomarker
				 num_continuous_covariates covariate
				 sqroot2;

	/* Multiplier that gives the standardized amounts a variance of 2 */
	%let sqroot2 = 1.414213562373095;


	/*
	Add biomarkers into their corresponding variable lists.
	Both strings are padded with blanks so that only whole names match; a plain
	substring search would treat a biomarker as already listed whenever its name
	is contained in a longer variable name.
	*/
	%let num_episodic_biomarkers = %sysfunc(countw(&episodic_biomarkers., %str( )));
	%do i = 1 %to &num_episodic_biomarkers.;

		%let biomarker = %sysfunc(scan(&episodic_biomarkers., &i., %str( )));
		%if %index(%str( )%upcase(&episodic_variables.)%str( ), %str( )%upcase(&biomarker.)%str( )) = 0 %then %let episodic_variables = &episodic_variables. &biomarker.;
	%end;

	%let num_daily_biomarkers = %sysfunc(countw(&daily_biomarkers., %str( )));
	%do i = 1 %to &num_daily_biomarkers.;

		%let biomarker = %sysfunc(scan(&daily_biomarkers., &i., %str( )));
		%if %index(%str( )%upcase(&daily_variables.)%str( ), %str( )%upcase(&biomarker.)%str( )) = 0 %then %let daily_variables = &daily_variables. &biomarker.;
	%end;

	/* Variable and covariate list lengths */
	%let num_episodic_variables = %sysfunc(countw(&episodic_variables., %str( )));
	%let num_daily_variables = %sysfunc(countw(&daily_variables., %str( )));
	%let num_variables = %eval(&num_episodic_variables. + &num_daily_variables.);

	%let num_continuous_covariates = %sysfunc(countw(&continuous_covariates., %str( )));

	/*
	Stop early when there is nothing to model. Without this check the numbered
	ranges generated below (for example _lambda1-_lambda0) are invalid and the
	macro fails later with less helpful messages.
	*/
	%if &num_variables. = 0 %then %do;

		%put ERROR: nci_multivar_preprocessor requires at least one episodic or daily variable.;
		%return;
	%end;

	/* Remove missing and negative data */
	data _cleaned
			 _excluded;
		set &input_data.;

		if nmiss(of &episodic_variables. &daily_variables. &continuous_covariates.) ^= 0 then output _excluded;
		%do i = 1 %to &num_variables.;

			%let variable = %sysfunc(scan(&episodic_variables. &daily_variables., &i., %str( )));
			else if &variable. < 0 then output _excluded;
		%end;
		else output _cleaned;
	run;

	/* Transpose Box-Cox lambda and minimum amount data to 1 record */
	proc sort data=&boxcox_lambda_data.; by variable; run;
	proc sort data=&minimum_amount_data.; by variable; run;
	data _lambda_minamount;
		merge &boxcox_lambda_data.
					&minimum_amount_data.;
		by variable;
	run;

	/*
	_lambda(i) and _minamount(i) follow the order of the combined list
	(episodic variables first, then daily variables).
	*/
	data _lambda_minamount_1record (keep = _lambda1-_lambda&num_variables. _minamount1-_minamount&num_variables.);
		set _lambda_minamount end=last_obs;

		retain _lambda1-_lambda&num_variables.
					 _minamount1-_minamount&num_variables.;

		%do i = 1 %to &num_variables.;

			%let variable = %sysfunc(scan(&episodic_variables. &daily_variables., &i., %str( )));
			/* Case-insensitive match of the data value against the list entry */
			if compare(variable, "&variable.", "i") = 0 then do;

				_lambda&i. = tran_lambda;
				_minamount&i. = minamount;
			end;
		%end;

		if last_obs = 1 then output;
	run;

	/* Create indicator and amount variables */
	data _std_data;
		if _N_ = 1 then set _lambda_minamount_1record;
		set	_cleaned;

		/* i indexes the combined list; j indexes the list being processed */
		%let i = 0;

		%do j = 1 %to &num_episodic_variables.;

			%let i = %eval(&i. + 1);

			%let variable = %sysfunc(scan(&episodic_variables., &j., %str( )));
			if &variable. = 0 then do;

				/* Non-consumption: the amount is left missing */
				ind_&variable. = 0;
				amt_&variable. = .;
			end;
			else do;

				ind_&variable. = 1;
				/* Box-Cox transformation; lambda = 0 is the log transformation */
				if _lambda&i. = 0 then amt_&variable. = log(&variable.);
				else amt_&variable. = (&variable.**_lambda&i. - 1)/_lambda&i.;
			end;
		%end;

		%do j = 1 %to &num_daily_variables.;

			%let i = %eval(&i. + 1);

			/* The delimiter is passed as its own argument, as in every other scan call */
			%let variable = %sysfunc(scan(&daily_variables., &j., %str( )));

			/* Set zero values for daily variables to minimum amount */
			if &variable. = 0 then &variable. = _minamount&i.;

			if _lambda&i. = 0 then amt_&variable. = log(&variable.);
			else amt_&variable. = (&variable.**_lambda&i. - 1)/_lambda&i.;
		%end;
	run;

	/* Standardize variables to mean of zero and a variance of 2 */
	%do i = 1 %to &num_variables.;

		%let variable = %sysfunc(scan(&episodic_variables. &daily_variables., &i., %str( )));
		proc stdize data=_std_data nomiss method=std mult=&sqroot2. outstat=_std&i. out=_std_data;
			var amt_&variable.;
		run;

		/* Pull the location, scale and multiplier rows out of the OUTSTAT data */
		data _centerscale&i. (keep = variable tran_center tran_scale);
			set _std&i. end=last_obs;

			length variable $32.;

			retain location scale mult;

			if _TYPE_ = "LOCATION" then location = amt_&variable.;
			else if _TYPE_ = "SCALE" then scale = amt_&variable.;
			else if _TYPE_ = "MULT" then mult = amt_&variable.;

			if last_obs = 1 then do;

				variable = "&variable.";
				tran_center = location;
				/* Standard deviation divided by sqrt(2) */
				tran_scale = scale/mult;
				output;
			end;
		run;
	%end;

	data _centerscale_data;
		set _centerscale1 - _centerscale&num_variables.;
	run;

	/* Standardize covariates to mean of zero and a variance of 1 */
	data _std_data;
		set _std_data;

		/* Standardize a std_ copy so the original covariate is kept as is */
		%do i = 1 %to &num_continuous_covariates.;

			%let covariate = %sysfunc(scan(&continuous_covariates., &i., %str( )));
			std_&covariate. = &covariate.;
		%end;
	run;

	%do i = 1 %to &num_continuous_covariates.;

		%let covariate = %sysfunc(scan(&continuous_covariates., &i., %str( )));
		proc stdize data=_std_data nomiss method=std out=_std_data;
			var std_&covariate.;
		run;
	%end;

	/* Create backtransformation data */
	proc sort data=_centerscale_data; by variable; run;
	data &outlib..&outname._backtran;
		merge &boxcox_lambda_data.
					&minimum_amount_data.
					_centerscale_data;
		by variable;

		/*
		Check for biomarkers.
		The name is stripped and matched as a whole blank-delimited word. Searching
		for the padded value would fail whenever the column is longer than the name,
		and a substring search would flag any variable whose name is contained in a
		biomarker name.
		*/
		if not missing(variable) and findw("&episodic_biomarkers. &daily_biomarkers.", strip(variable), " ", "i") ^= 0 then biomarker = 1;
		else biomarker = 0;
	run;

	/* Output data with indicators, standardized amounts, and standardized covariates */
	data &outlib..&outname._mcmc_in;
		set _std_data;

		/* Create flag so that the MCMC macro knows that the preprocessor was used */
		_nci_multivar = 1;
	run;

	/* Print excluded data */
	proc print data=_excluded;
		title "Observations Excluded due to Missing or Negative Data";
	run;
	title;
%mend nci_multivar_preprocessor;

/*
Find minimum consumption amounts

Description:

	Calculate minimum non-zero amount of each food and nutrient on
	consumption days
	
Parameters:

	- input_data: A SAS data set.
	- row_subset: A conditional expression wrapped in %quote() indicating which rows of input_data to use for calculating the minimum amounts.
	- episodic_variables: Space-delimited list of episodic variables.
	- daily_variables: Space-delimited list of daily variables.
	- outlib: The SAS library to store output datasets. (default = WORK).
	
Output:

The following SAS data set is created in outlib:

	- minimum_amount_data: A SAS data set with the following columns:
												   - variable: The name of the food or nutrient variable.
												   - minamount: Half of the minimum non-zero amount consumed for the food or nutrient.
	
Details:

	For each food and nutrient a minimum amount of consumption is set at
	half of the smallest non-zero amount consumed. For nutrients, this amount
	replaces zero values so that the Box-Cox transformation is valid. For both
	foods and nutrients, the minimum amount is also used in
	nci_multivar_distrib as a lower bound on the backtransformed amount 
	consumed.
*/
%macro calculate_minimum_amount(input_data=,
																row_subset=,
																episodic_variables=,
																daily_variables=,
																outlib=WORK);

	/*
	Builds outlib.minimum_amount_data with one row per variable:
		- variable: the variable name as given in the lists.
		- minamount: half of the smallest value greater than zero found in the
		  rows of input_data selected by row_subset (all rows when it is empty).

	Episodic and daily variables are handled identically, so they are processed
	in one loop over the combined list (episodic first, then daily).

	Side effects: overwrites outlib.minimum_amount_data and leaves the
	intermediate data set _minamount in WORK.
	*/

	%local variable var_num all_variables num_all;

	/* Start from an empty data set so that rows can be appended one variable at a time */
	data &outlib..minimum_amount_data;
		set _NULL_;

		length variable $32.;
	run;

	%let all_variables = &episodic_variables. &daily_variables.;
	%let num_all = %sysfunc(countw(&all_variables., %str( )));

	%do var_num = 1 %to &num_all.;

		%let variable = %sysfunc(scan(&all_variables., &var_num., %str( )));

		/*
		Minimum of the strictly positive values.
		The row subset is wrapped in parentheses because AND binds tighter than OR:
		without them a subset containing OR would let rows through that fail the
		greater-than-zero filter. The emptiness test uses the length of the raw
		value so it does not depend on the value being macro-quoted.
		*/
		proc univariate data=&input_data. noprint;
			where &variable. > 0
			%if %length(%superq(row_subset)) > 0 %then %do;
				and (%unquote(&row_subset.))
			%end;
			;

			var &variable.;

			output out=_minamount min=minamount;
		run;

		/* Append the new row, labelling it and halving the observed minimum */
		data &outlib..minimum_amount_data (keep = variable minamount);
			set &outlib..minimum_amount_data
					_minamount (in = new);

			if new = 1 then do;

				variable = "&variable.";
				minamount = minamount/2;
			end;
		run;
	%end;
%mend calculate_minimum_amount;