/*
Gelman-Rubin Test for MCMC Convergence

Description:

	Tests an MCMC model specification for convergence using the
	Gelman-Rubin test. A model is fit multiple times with the specified
	parameters using different random seeds each time.
	
Parameters:

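	- num_chains: Number of MCMC chains to fit, each with a different random seed. Must be at least 2 so that the between-chain variance is defined. (default = 5)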
	- pre_mcmc_lib: The SAS library that contains the pre-MCMC datasets. (default = WORK)
	- pre_mcmc_data: Prefix for the pre-MCMC datasets (max 23 characters). The following datasets with the prefix must be present:
									 	 - _mcmc_in: A SAS data set with indicators, standardized amounts, and covariates for the MCMC model.
									 	 - _backtran: A SAS data set with the following columns:
									 	 							  - variable: The name of the variable.
									 	 							  - tran_lambda: The value of the lambda used to transform the variable.
									 	 							  - minamount: Half of the minimum non-zero value of the variable.
									 	 							  - tran_center: The mean of the Box-Cox transformed variable before standardization.
									 	 							  - tran_scale: The standard deviation of the Box-Cox transformed variable before standardization divided by sqrt(2).
									 	 							  - biomarker: Binary flag indicating whether the variable is a biomarker assumed to be unbiased on the transformed scale.
	- id: Variable that identifies each subject.
	- repeat_obs: Variable that distinguishes repeat observations for each subject.
	- weight: Variable with weighting for each subject.
	- episodic_variables: Space-delimited list of episodic variables.
	- episodic_indicators: Space-delimited list of consumption indicator variables for episodic variables. 
												 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- episodic_amounts: Space-delimited list of consumption amount variables for episodic variables.
											Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- daily_variables: Space-delimited list of daily variables.
	- daily_amounts: Space-delimited list of consumption amount variables for daily variables.
									 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- default_covariates: Space-delimited list of covariates to be used for episodic indicators, episodic amounts, and daily amounts. 
												Does not affect the never-consumer model if present.
	- episodic_indicator_covariates: Space-delimited list of covariates to be used for episodic indicators. 
																	 If specified, overwrites default_covariates for episodic indicators.
	- episodic_amount_covariates: Space-delimited list of covariates to be used for episodic amounts. 
																If specified, overwrites default_covariates for episodic amounts.
	- daily_amount_covariates: Space-delimited list of covariates to be used for daily amounts. 
														 If specified, overwrites default_covariates for daily amounts.
	- individual_covariates: A SAS data set of covariate lists for individual indicators and amounts.
													 It must contain the following columns:
													 	 - variable: Name of the indicator or amount.
													 	 - covariates: A space-delimited string of covariates for the indicator or amount.
													 Overrides default_covariates, episodic_indicator_covariates, episodic_amount_covariates, and daily_amount_covariates for each named variable.
	- default_intercept: Flag to include an intercept in the models for episodic indicators, episodic amounts, and daily amounts. (Y/N, default = Y)
											 Does not affect the never-consumer model if present.
	- episodic_indicator_intercept: Flag to include an intercept in the model for episodic indicators. (Y/N)
																	If specified, overwrites default_intercept for episodic indicators.
	- episodic_amount_intercept: Flag to include an intercept in the model for episodic amounts. (Y/N)
															 If specified, overwrites default_intercept for episodic amounts.
	- daily_amount_intercept: Flag to include an intercept in the model for daily amounts. (Y/N)
														If specified, overwrites default_intercept for daily amounts.
	- individual_intercept: A SAS data set of intercepts for individual indicators and amounts.
													It must contain the following columns:
														- variable: Name of the indicator or amount.
														- intercept: Flag to include an intercept in the model for the indicator or amount (Y/N).
	- never_consumer_variable: One episodic variable to allow never-consumers. 
														 Can be a variable already listed as episodic or a new variable.
	- never_consumer_indicator: Consumption indicator variable for the never-consumer variable.
															Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_amount: Consumption amount variable for the never-consumer variable.
													 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_covariates: Space-delimited list of covariates to be used for the never-consumer model. 
															 Not affected by default_covariates and must be specified separately.
	- never_consumer_intercept: Flag to include intercept in the never-consumer model.
															Not affected by default_intercept and must be specified separately. (Y/N, default = Y)
	- initial_mcmc_seed: Numeric starting seed for the random number generator. 
											 If specified, the seed will be incremented for each MCMC replicate. 
											 If blank, uses a randomly generated integer from -10^7 to 10^7, exclusive, for each replicate. (default = blank)
	- num_mcmc_iterations: Integer specifying the total number (including burn-in) of iterations in the MCMC chain. (default = 12000)
	- num_burn: Integer specifying the number of burn-in iterations in the MCMC chain. Must be smaller than num_mcmc_iterations. (default = 2000)
	- num_thin: Integer specifying the number of iterations between MCMC samples used for calculating posterior means. (default = 25)
	- sigma_u_prior: A SAS data set specifying a prior covariance matrix for the random effects (u). 
									 Must be a valid covariance matrix with dimensions equal to the total number of episodic indicators, episodic amounts, and daily amounts.
	- sigma_u_constant: Flag specifying if the covariance matrix of the random effects (u) should remain the same for each iteration. (Y/N, default = N)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	The following SAS data set is created in outlib:
	
		outname: A SAS data set with one observation containing the Gelman-Rubin statistic for each MCMC parameter (one column per parameter).
	
Details:

	The Gelman-Rubin test works by creating multiple MCMC chains with
	different random seeds and calculating the within-chain and between-chain
	variation (Gelman and Rubin, 1992). If the model parameters converge, there
	should be little to no difference between different chains which will cause
	the between-chain variance to fall to zero. This means that convergence can
	be assessed by comparing the total variance of the model parameters to the
	within-chain variance.
	
	The Gelman-Rubin statistic is the square root of the ratio between the
	total variance and the within-chain variance of a parameter. If the
	Gelman-Rubin statistic is close to 1, then the model has converged for that
	parameter. A cutoff of 1.1 for convergence is suggested by Gelman, et al.
	(2004).
*/
%macro gelman_rubin(num_chains=5,
										pre_mcmc_lib=WORK,
										pre_mcmc_data=,
										id=,
										repeat_obs=,
										weight=,
										episodic_variables=,
										episodic_indicators=,
										episodic_amounts=,
										daily_variables=,
										daily_amounts=,
										default_covariates=,
										episodic_indicator_covariates=,
										episodic_amount_covariates=,
										daily_amount_covariates=,
										individual_covariates=,
										default_intercept=Y,
										episodic_indicator_intercept=,
										episodic_amount_intercept=,
										daily_amount_intercept=,
										individual_intercept=,
										never_consumer_variable=,
										never_consumer_indicator=,
										never_consumer_amount=,
										never_consumer_covariates=,
										never_consumer_intercept=Y,
										initial_mcmc_seed=,
										num_mcmc_iterations=12000,
										num_burn=2000,
										num_thin=25,
										sigma_u_prior=,
										sigma_u_constant=N,
										outlib=WORK,
										outname=);

	/*
	Fits the same MCMC model num_chains times via nci_multivar_mcmc, collects
	the per-chain parameter means and variances produced by extract_parameters,
	and writes one row of Gelman-Rubin statistics (one column per parameter) to
	outlib.outname. Parameters are described in the header comment at the top
	of this file.

	Intermediate data sets written along the way: _mean1.._meanN,
	_variance1.._varianceN, _mcmc_means and _mcmc_variances, plus whatever
	nci_multivar_mcmc and extract_parameters create under the _model name.
	*/

	/* mcmc_seed is declared local so that the running seed does not leak into,
	   or overwrite, a macro variable of the same name in an outer scope. */
	%local i mcmc_seed;

	/* Fail fast on arguments that would otherwise only break after every chain
	   has already been fit. */
	%if %length(&num_chains.) = 0 %then %do;

		%put ERROR: gelman_rubin requires num_chains to be specified.;
		%return;
	%end;

	/* The between-chain variance divides by (num_chains - 1). */
	%if %eval(&num_chains. < 2) %then %do;

		%put ERROR: gelman_rubin requires num_chains to be at least 2 (got &num_chains.).;
		%return;
	%end;

	/* outname is used as the data set name in the final CREATE statement. */
	%if %length(&outname.) = 0 %then %do;

		%put ERROR: gelman_rubin requires outname to be specified.;
		%return;
	%end;

	%let mcmc_seed = &initial_mcmc_seed.;

	%do i = 1 %to &num_chains.;

		/* Fit the MCMC model for chain i */
		%nci_multivar_mcmc(pre_mcmc_lib=&pre_mcmc_lib.,
											 pre_mcmc_data=&pre_mcmc_data.,
											 id=&id.,
											 repeat_obs=&repeat_obs.,
											 weight=&weight.,
											 episodic_variables=&episodic_variables.,
											 episodic_indicators=&episodic_indicators.,
											 episodic_amounts=&episodic_amounts.,
											 daily_variables=&daily_variables.,
											 daily_amounts=&daily_amounts.,
											 default_covariates=&default_covariates.,
											 episodic_indicator_covariates=&episodic_indicator_covariates.,
											 episodic_amount_covariates=&episodic_amount_covariates.,
											 daily_amount_covariates=&daily_amount_covariates.,
											 individual_covariates=&individual_covariates.,
											 default_intercept=&default_intercept.,
											 episodic_indicator_intercept=&episodic_indicator_intercept.,
											 episodic_amount_intercept=&episodic_amount_intercept.,
											 daily_amount_intercept=&daily_amount_intercept.,
											 individual_intercept=&individual_intercept.,
											 never_consumer_variable=&never_consumer_variable.,
											 never_consumer_indicator=&never_consumer_indicator.,
											 never_consumer_amount=&never_consumer_amount.,
											 never_consumer_covariates=&never_consumer_covariates.,
											 never_consumer_intercept=&never_consumer_intercept.,
											 mcmc_seed=&mcmc_seed.,
											 num_mcmc_iterations=&num_mcmc_iterations.,
											 num_burn=&num_burn.,
											 num_thin=&num_thin.,
											 sigma_u_prior=&sigma_u_prior.,
											 sigma_u_constant=&sigma_u_constant.,
											 outname=_model);

		/* Calculate parameter means and variances; the code below reads the
		   results from _model_means and _model_cov */
		%extract_parameters(multivar_mcmc_model=_model,
												outname=_model);

		/* Keep a per-chain copy of the means because _model_means is rewritten
		   on the next iteration */
		data _mean&i.;
			set _model_means;
		run;

		proc iml;

			use _model_cov;
				read all var _NUM_ into mcmc_covariance[colname=parameter_names];
			close _model_cov;

			/* Only the diagonal (the variances) is needed; transpose it into a
			   single row so each chain contributes one observation */
			mcmc_variance = vecdiag(mcmc_covariance)`;

			create _variance&i. from mcmc_variance[colname=parameter_names];
				append from mcmc_variance;
			close _variance&i.;
		quit;

		/* Increment the seed if one was specified; a blank seed is passed
		   through unchanged for every chain */
		%if %length(&mcmc_seed.) > 0 %then %do;

			%let mcmc_seed = %eval(&mcmc_seed. + 1);
		%end;
	%end;

	/* Stack the per-chain results: one row per chain, one column per parameter */
	data _mcmc_means;
		set _mean1-_mean&num_chains.;
	run;

	data _mcmc_variances;
		set _variance1-_variance&num_chains.;
	run;

	proc iml;

		use _mcmc_means;
			read all var _ALL_ into means[colname=parameter_names];
		close _mcmc_means;

		use _mcmc_variances;
			read all var _ALL_ into variances;
		close _mcmc_variances;

		/* NOTE(review): presumably the number of retained post-burn-in samples
		   per chain - confirm against the thinning rule in nci_multivar_mcmc */
		num_samples = int((&num_mcmc_iterations. - &num_burn.)/&num_thin.);

		/* Calculate the joint mean of all chains (column means across chains) */
		joint_mean = means[:,];

		/* Calculate the within-chain, between-chain, and total variances for
		   each parameter.
		   NOTE(review): Gelman and Rubin (1992) weight the within-chain variance
		   by (n - 1)/n, whereas this code uses n/(n - 1). Confirm how
		   extract_parameters scales _model_cov before changing the factor. */
		within_variance = variances[:,];
		between_variance = (num_samples/(&num_chains. - 1)) # ((means - joint_mean) ## 2)[+,];
		total_variance = (num_samples/(num_samples - 1)) # within_variance + ((&num_chains. + 1)/(&num_chains.*num_samples)) # between_variance;

		/* Calculate the Gelman-Rubin statistics for each parameter.
		   If both total variance and within-chain variance are zero, the ratio
		   is undefined and the statistic is set to one. */
		gr_statistics = sqrt(total_variance/within_variance);
		if any(total_variance = 0 & within_variance = 0) then do;

			gr_statistics[loc(total_variance = 0 & within_variance = 0)] = 1;
		end;

		create &outlib..&outname. from gr_statistics[colname=parameter_names];
			append from gr_statistics;
		close &outlib..&outname.;
	quit;
%mend gelman_rubin;