/*
Title: Univariate Regression Calibration
Author: NCI/Information Management Services
Date: 3/24/2025
*/

/*
This example demonstrates univariate regression calibration using the NCI method.
*/

libname indata "./ncimultivar/data";

%include "./ncimultivar/macros/ncimultivar.sas";

/*
The distributions created by the NCI method can be used for further analysis such as regressions.

The effect of potassium (TPOTA) intake on systolic blood pressure (BPSY_AVG) will be measured from 2005-2010 NHANES data.
A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). 
The two nuisance covariates are whether the recall was on a weekend (Weekend) and whether the recall was on day 2 (Day2).

Subjects with missing values are removed, and categorical variables are transformed into binary indicators.
*/

/* Restrict the source data to the six example strata and flag second-day recalls. */
data input_dataset;
	set indata.nhcvd;

	**discard every record that falls outside the six selected strata;
	if SDMVSTRA not in (48, 54, 60, 66, 72, 78) then delete;

	**Day2 is 1 when DAY equals 2 and 0 otherwise;
	Day2 = (DAY eq 2);
run;

/* Keep complete cases only, then derive the smoking indicators and the Potassium alias. */
data input_dataset;
	set input_dataset;

	**a record is dropped as soon as any covariate, variable, or outcome is missing;
	if missing(SMK_REC)
		or missing(RIDAGEYR)
		or missing(RIAGENDR)
		or missing(Weekend)
		or missing(Day2)
		or missing(TPOTA)
		or missing(BPSY_AVG)
	then delete;

	**one 0/1 indicator per smoking status code (1, 2, 3);
	Current_Smoker = (SMK_REC eq 1);
	Former_Smoker  = (SMK_REC eq 2);
	Never_Smoker   = (SMK_REC eq 3);

	**copy TPOTA into a more readable variable name;
	Potassium = TPOTA;
run;

/*
The potassium variable can now be transformed and standardized for use in the MCMC algorithm.
*/

/*
Winsorization of extreme potassium values.
The macro call passes do_winsorization=Y and is restricted to first-recall rows (Day2 = 0).
NOTE(review): the cutoff applied in the data step below is hard-coded - presumably it was read
from the winsorization output of this call; confirm it again whenever the input data change.
*/
%boxcox_survey(input_data=input_dataset,
	variable=Potassium,
	weight=WTDRD1,
	id=SEQN,
	repeat_obs=DAY,
	row_subset=%quote(Day2 = 0),
	do_winsorization=Y);

data input_dataset;
	set input_dataset;

	**any potassium value below the cutoff is raised to the cutoff;
	if Potassium < 42.45263 then Potassium = 42.45263;
run;

/* Box-Cox survey on the first recall (Day2 = 0) to create the Box-Cox lambda data for Potassium. */
%boxcox_survey(input_data=input_dataset,
	variable=Potassium,
	covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend,
	weight=WTDRD1,
	row_subset=%quote(Day2 = 0));
							 
/* Minimum amount of potassium intake, computed from the first recall only (Day2 = 0). */
%calculate_minimum_amount(input_data=input_dataset,
	daily_variables=Potassium,
	row_subset=%quote(Day2 = 0));
													
/*
Run the MCMC preprocessor on the cleaned data.
bc_Potassium and minimum_amount_data are presumably the outputs of the two preceding macro calls - TODO confirm.
*/
%nci_multivar_preprocessor(input_data=input_dataset,
	daily_variables=Potassium,
	continuous_covariates=RIDAGEYR,
	minimum_amount_data=minimum_amount_data,
	boxcox_lambda_data=bc_Potassium,
	outname=model);
													 
/*
The MCMC measurement error model can now be fit.

To perform regression calibration, draws of the U matrix conditional on the mean MCMC parameters must be taken. 
This is accomplished using the num_post parameter in the %nci_multivar_mcmc() macro. 
Each conditional U matrix draw will be used in place of a simulated U matrix in %nci_multivar_distrib() to simulate a usual intake for each subject. 
To ensure that enough data is generated to produce good estimates of the model parameters, 500 conditional U matrices will be drawn in order to simulate 500 usual intakes for each subject.
*/

/*
Fit the MCMC measurement error model for Potassium.
num_post=500 requests the 500 conditional U matrix draws used later for regression calibration.
*/
%nci_multivar_mcmc(pre_mcmc_data=model,
	daily_variables=Potassium,
	default_covariates=Current_Smoker Former_Smoker std_RIDAGEYR RIAGENDR Day2 Weekend,
	id=SEQN,
	repeat_obs=DAY,
	weight=WTDRD1,
	num_mcmc_iterations=3000,
	num_burn=1000,
	num_thin=2,
	num_post=500,
	mcmc_seed=9999,
	outname=model);
									 
/*
A dataset with simulated usual intakes for each subject can now be created using %nci_multivar_distrib(). 
This dataset represents the conditional expectation of usual intake given the observed data for each subject to be used in the regression calibration procedure. 
It is not a prediction or calculation of the true usual intake for individuals.

The population-based dataset is created the same way as for distributions.

Each subject will have 500 simulated usual intakes from the 500 conditional U matrix draws taken from the output of %nci_multivar_mcmc(). 
In order to tell %nci_multivar_distrib() to use the conditional U matrix draws instead of simulating U matrices, the use_mcmc_u_matrices parameter must be set to Y.

The additional_output parameter is used to include the outcome variable and covariates in the distribution dataset to use in the regression.
The variables supplied in additional_output will be passed through from the population base dataset.
*/

/* Build the population base: two rows per subject, one for Weekend = 0 and one for Weekend = 1. */
proc sort data=model_mcmc_in;
	by SEQN;
run;

data distrib_pop;
	set model_mcmc_in;
	by SEQN;

	**only the first record of each subject is used;
	if not first.SEQN then delete;

	**Day2 is fixed at zero to factor out the effect of Day 2 recalls;
	Day2 = 0;

	**Weekend = 0 row, written with a weight of 4;
	Weekend = 0;
	Weekend_Weight = 4;
	output;

	**Weekend = 1 row, written with a weight of 3;
	Weekend = 1;
	Weekend_Weight = 3;
	output;
run;

/*
Simulate usual intakes for the population base.
use_mcmc_u_matrices=Y selects the conditional U matrix draws from the MCMC step instead of simulated U matrices.
additional_output lists the covariates and the outcome that are carried through for the regression.
*/
%nci_multivar_distrib(multivar_mcmc_model=model,
	distrib_population=distrib_pop,
	id=SEQN,
	weight=WTDRD1,
	nuisance_weight=Weekend_Weight,
	additional_output=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG,
	use_mcmc_u_matrices=Y,
	distrib_seed=99999,
	outname=model_distrib_out);
											
/*
To perform regression calibration, the 500 simulated intakes for each subject need to be averaged.
*/

/* Average the simulated usual intakes within each subject; the result has one mean per BY group. */
proc sort data=model_distrib_out;
	by SEQN WTDRD1 Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG;
run;

proc univariate noprint data=model_distrib_out;
	var usl_Potassium;

	**the weight, covariates, and outcome are BY variables so that they are kept in the output;
	by SEQN WTDRD1 Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG;

	output mean=usl_Potassium out=regression_data;
run;

/*
The mean simulated intakes are then used as a covariate in the regression model.

The effect per milligram of potassium is very small, so the potassium usual intake will be converted to grams so that the coefficient is more readable.
*/

/* Divide the mean simulated intake by 1000 so the coefficient is expressed per 1,000 mg of potassium. */
data regression_data;
	set regression_data;
	usl_Potassium = usl_Potassium / 1e3;
run;

/* Weighted linear model of blood pressure; printed output is suppressed and only the estimates table is kept. */
ods select none;

proc surveyreg data=regression_data;
	weight WTDRD1;
	model BPSY_AVG = usl_Potassium Current_Smoker Former_Smoker RIDAGEYR RIAGENDR;

	**capture the parameter estimates table in bp_model;
	ods output ParameterEstimates=bp_model;
run;

**turn displayed output back on for the steps that follow;
ods select all;

/* Summarize the captured model parameters for the BPSY_AVG response into bp_parameters. */
%summary_coef_surveyreg(parameter_estimates=bp_model,
	response=BPSY_AVG,
	outname=bp_parameters);
												
/* Print the parameter summary dataset under a descriptive title. */
proc print data=bp_parameters; 

	title "Systolic Blood Pressure vs. Usual Intake of Potassium";
run;

**TITLE is a global statement, so clear it here to keep it from carrying over to later output;
title;