%include "&sysincludefiledir./internal/mcmc_check_input.sas";
%include "&sysincludefiledir./internal/mcmc_process_variables.sas";

libname _mcmciml "&sysincludefiledir./internal/iml_modules";

/*
Fit MCMC measurement error model

Description:

	Fits a measurement error model for one or more episodic and/or
	daily variables using an MCMC method.
	
Parameters:

	- pre_mcmc_lib: The SAS library that contains the pre-MCMC datasets. (default = WORK)
	- pre_mcmc_data: Prefix for the pre-MCMC datasets (max 23 characters). The following datasets with the prefix must be present:
									 	 - _mcmc_in: A SAS data set with indicators, standardized amounts, and covariates for the MCMC model.
									 	 - _backtran: A SAS data set with the following columns:
									 	 							  - variable: The name of the variable.
									 	 							  - tran_lambda: The value of the lambda used to transform the variable.
									 	 							  - minamount: Half of the minimum non-zero value of the variable.
									 	 							  - tran_center: The mean of the Box-Cox transformed variable before standardization.
									 	 							  - tran_scale: The standard deviation of the Box-Cox transformed variable before standardization divided by sqrt(2).
									 	 							  - biomarker: Binary flag indicating whether the variable is a biomarker assumed to be unbiased on the transformed scale.
	- id: Variable that identifies each subject.
	- repeat_obs: Variable that distinguishes repeat observations for each subject.
	- weight: Variable with weighting for each subject.
	- episodic_variables: Space-delimited list of episodic variables.
	- episodic_indicators: Space-delimited list of consumption indicator variables for episodic variables. 
												 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- episodic_amounts: Space-delimited list of consumption amount variables for episodic variables.
											Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- daily_variables: Space-delimited list of daily variables.
	- daily_amounts: Space-delimited list of consumption amount variables for daily variables.
									 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- default_covariates: Space-delimited list of covariates to be used for episodic indicators, episodic amounts, and daily amounts. 
												Does not affect the never-consumer model if present.
	- episodic_indicator_covariates: Space-delimited list of covariates to be used for episodic indicators. 
																	 If specified, overwrites default_covariates for episodic indicators.
	- episodic_amount_covariates: Space-delimited list of covariates to be used for episodic amounts. 
																If specified, overwrites default_covariates for episodic amounts.
	- daily_amount_covariates: Space-delimited list of covariates to be used for daily amounts. 
														 If specified, overwrites default_covariates for daily amounts.
	- individual_covariates: A SAS data set of covariate lists for individual indicators and amounts.
													 It must contain the following columns:
													 	 - variable: Name of the indicator or amount.
													 	 - covariates: A space-delimited string of covariates for the indicator or amount.
													 
													 Overrides default_covariates, episodic_indicator_covariates, episodic_amount_covariates, and daily_amount_covariates for each named variable.
	- default_intercept: Flag to include an intercept in the models for episodic indicators, episodic amounts, and daily amounts. (Y/N, default = Y)
											 Does not affect the never-consumer model if present.
	- episodic_indicator_intercept: Flag to include an intercept in the model for episodic indicators. (Y/N).
																	If specified, overwrites default_intercept for episodic indicators.
	- episodic_amount_intercept: Flag to include an intercept in the model for episodic amounts. (Y/N)
															 If specified, overwrites default_intercept for episodic amounts.
	- daily_amount_intercept: Flag to include an intercept in the model for daily amounts. (Y/N)
														If specified, overwrites default_intercept for daily amounts.
	- individual_intercept: A SAS data set of intercepts for individual indicators and amounts.
													It must contain the following columns:
														- variable: Name of the indicator or amount.
														- intercept: Flag to include an intercept in the model for the indicator or amount (Y/N).
	- never_consumer_variable: One episodic variable to allow never-consumers. 
														 Can be a variable already listed as episodic or a new variable.
	- never_consumer_indicator: Consumption indicator variable for the never-consumer variable.
															Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_amount: Consumption amount variable for the never-consumer variable.
													 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_covariates: Space-delimited list of covariates to be used for the never-consumer model. 
															 Not affected by default_covariates and must be specified separately.
	- never_consumer_intercept: Flag to include intercept in the never-consumer model.
															Not affected by default_intercept and must be specified separately. (Y/N, default = Y)
	- mcmc_seed: Positive integer starting seed for the random number generator. 
							 If blank, uses a randomly generated integer from 1 to 2*10^7, inclusive. (default = blank)
	- num_mcmc_iterations: Integer specifying the total number (including burn-in) of iterations in the MCMC chain. (default = 12000)
	- num_burn: Integer specifying the number of burn-in iterations in the MCMC chain. Must be smaller than num_mcmc_iterations. (default = 2000)
	- num_thin: Integer specifying the number of iterations between MCMC samples used for calculating posterior means. (default = 25)
	- num_post: Integer specifying the number of random effect (u) matrix draws done after the MCMC chain is finished, conditional on the posterior mean of the parameters. 
							Used for regression calibration, see Regression Calibration section. (default = 0)
	- sigma_u_prior: A SAS data set specifying a prior covariance matrix for the random effects (u). 
									 Must be a valid covariance matrix with dimensions equal to the total number of episodic indicators, episodic amounts, and daily amounts.
	- sigma_u_constant: Flag specifying if the covariance matrix of the random effects (u) should remain the same for each iteration. (Y/N, default = N)
	- save_u_main: Flag specifying if random effects (u) matrices should be saved from the main MCMC chain.
								 If Y, saves u matrix every num_thin iterations after num_burn. 
								 This is used for fitting models with techniques such as multiple imputation. 
								 Does not affect conditional u matrices drawn after the MCMC chain. (Y/N, default = N)
	- save_all_u: Flag specifying if random effects (u) matrices are saved for every iteration. 
								Greatly increases disk space needed to store output. 
								Has no effect if save_u_main is N. (Y/N, default = N)
	- do_log_likelihood: Flag specifying if an estimate of the log-likelihood should be calculated after the MCMC chain has finished.
											See the Log-likelihood section for details. (Y/N, default = N)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The prefix used to name output datasets. (max 23 characters)
	
Output:

	The following SAS datasets are created in outlib and prefixed with outname:
	
		- _beta1-_betaJ: Fixed effect coefficients at every MCMC iteration for each of the J indicators and amounts.
		- _sigma_u: Random effect covariance matrix elements at every MCMC iteration.
		- _sigma_e: Residual error covariance matrix elements at every MCMC iteration.
		- _u_main: Random effect (u) matrices saved from the main MCMC chain.
							 Contains the following columns:
							 	 - iteration: The MCMC iteration that the u matrix was saved from.
							 	 - u_col1-u_colJ: The random intercepts for each of the J indicators and amounts.
		- _u_post: Random effect (u) matrices drawn after the MCMC chain conditional on the posterior means of the parameters.
							 Contains the following columns:
							 	 - post_mcmc_iteration: The number of iterations after the MCMC chain that the u matrix was drawn.
							 	 - u_col1-u_colJ: The random intercepts for each of the J indicators and amounts.
		- _vars: Variables, indicators, and amounts used in the model.
						 Contains the following columns:
						 	 - episodic_variable1-episodic_variableM: The names of the M episodic variables used in the model.
							 - episodic_indicator1-episodic_indicatorM: The names of the indicators for the M episodic variables used in the model.
							 - episodic_amount1-episodic_amountM: The names of the amounts for the M episodic variables used in the model.
							 - daily_variable1-daily_variableN: The names of the N daily variables used in the model.
							 - daily_amount1-daily_amountN: The names of the amounts for the N daily variables used in the model.
							 - num_episodic: The number of episodic variables in the model.
							 - num_daily: The number of daily variables in the model.
							 - has_never_consumers: Y/N flag for whether never-consumers are allowed for episodic_variable1.
		- _covars: Covariates for each indicator and amount in the model.
							 Contains the following columns:
							 	 - variable: The name of the indicator or amount.
								 - covariate1-covariateP: The names of the P covariates used in the model for the variable.
								 - num_covariates: The number of covariates for the variable.
								 - intercept: Y/N flag for whether an intercept was used in the model for the variable.
		- _backtran: The backtransformation dataset passed through from pre_mcmc_data.
		- _subjects: The subject information for the MCMC model.
								 Contains the following columns:
								 	 - subject: The subject identifier for each unique subject used in the model.
								 	 - weight: The weighting for each unique subject used in the model.
		- _iters: The iteration parameters used in the MCMC.
							Contains the following columns:
						  	- num_trace: Number of main chain MCMC traces saved.
								- num_mcmc_iterations: Total number of iterations in the MCMC chain including burn-in, same as num_mcmc_iterations parameter.
								- num_burn: Number of burn-in iterations that will be discarded when calculating posterior means, same as num_burn parameter.
								- num_thin: Number of iterations between MCMC samples used for calculating posterior means, same as num_thin parameter.
								- num_post: Number of u matrices drawn after the MCMC chain conditional on the posterior means of the parameters, same as num_post parameter.
								- saved_u_main: Y/N flag for whether u matrices were saved from the main MCMC chain, same as the save_u_main parameter.
		- _seed: The random number generator seed used to generate the results, see the mcmc_seed parameter for details.
		
	If a never-consumer variable is modeled, the following SAS data sets are created in outlib and prefixed with outname:
	
		- _alpha1: Coefficients of never-consumer model at every MCMC iteration.
		- _conprob1: Average probability that a subject is a consumer at every MCMC iteration.
		- _covarsnc: Covariates for the never-consumer model.
								 Contains the following columns:
								 	 - covariate1-covariateP: The names of the P covariates used in the never-consumer model.
									 - num_covariates: The number of covariates in the never-consumer model.
									 - intercept: Y/N flag for whether an intercept was included in the never-consumer model.		
												
	If do_log_likelihood is Y, the following SAS data sets are created in outlib and prefixed with outname:
	
		- _ll: The estimated marginal log-likelihood of the model.
												
About the Model:

	The model fit by this function is a multivariate
	mixed-effects model with fixed effects (beta) and a random intercept for
	each variable (u). Each daily variable has a single continuous outcome
	variable for the amount consumed. Each episodic variable has two model
	parts, a binary indicator of consumption and a continuous variable for
	amount consumed. A probit link function is used for modeling the binary
	indicator. The parts of the model (daily amounts, episodic indicators, and
	episodic amounts) can have separate covariate lists. The random intercepts
	are assumed to have multivariate Gaussian distribution with covariance
	matrix Sigma-u. The residual error is also assumed to have a
	multivariate Gaussian distribution with covariance matrix Sigma-e.
	
MCMC Procedure:

	The model is fit using a Gibbs sampler MCMC
	procedure. At each iteration, each parameter is updated sequentially using
	its distribution conditional on the other parameters. The conditional
	distributions and priors are outlined in Zhang, et al. (2011). After a
	specified burn-in period, subsequent samples of each parameter are averaged
	to calculate a posterior mean. The posterior means are the point estimates
	for the parameters. To ensure that the samples used to calculate the
	posterior mean are independent, the samples can be 'thinned'. For example,
	a thinning number of 5 means that every 5th sample after the burn-in period
	is used to calculate the posterior mean. When doing regression calibration,
	additional draws of the random intercept (u) matrix can be made
	conditional on the posterior means of beta, Sigma-u, and Sigma-e. This is
	useful when performing measurement error correction with regression
	calibration.
	
Never-consumers:

	The model can allow some subjects to be
	never-consumers of one episodic variable. If an episodic variable with
	never-consumers is specified, then the probability that each subject has
	any consumption is included as an additional part of the model for that
	variable. As with the other model parts, the never-consumer part of the
	model has its own covariate matrix and coefficients. To distinguish the
	coefficients from the regular fixed effect coefficients, they are referred
	to as alpha1. Only one variable can be allowed to have never-consumers.
	
Log-likelihood:

	An estimate of the log-likelihood marginalized over
	the random effects (u) can be output after the MCMC iterations have
	finished. The random effects are integrated out using the Laplace
	approximation. The posterior mode of the random effects is found by
	the BFGS algorithm with derivatives approximated by the central difference
	method. Since the model is fit using an MCMC method, the marginal
	log-likelihood is never used in fitting the model. The log-likelihood
	output by this function corresponds well to the log-likelihood found by
	maximum likelihood for univariate and bivariate models. For
	higher-dimensional models and models allowing never-consumers that cannot
	be fit using maximum likelihood, the estimated likelihood is experimental
	and other measures of model fit should be considered. Estimating the
	log-likelihood increases computation time, especially as the number of
	variables increases.
	
Regression Calibration:

	The model fit by nci_multivar_mcmc can
	be used in regression calibration for measurement error correction. When
	doing regression calibration, random effect (u) matrices must be drawn
	after the MCMC chain conditional on the posterior means of the parameters.
	This can be done by setting `num_post` to the number of post-MCMC u draws
	to be made.
*/
%macro nci_multivar_mcmc(
	pre_mcmc_lib=WORK,
	pre_mcmc_data=,
	id=,
	repeat_obs=,
	weight=,
	episodic_variables=,
	episodic_indicators=,
	episodic_amounts=,
	daily_variables=,
	daily_amounts=,
	default_covariates=,
	episodic_indicator_covariates=,
	episodic_amount_covariates=,
	daily_amount_covariates=,
	individual_covariates=,
	default_intercept=Y,
	episodic_indicator_intercept=,
	episodic_amount_intercept=,
	daily_amount_intercept=,
	individual_intercept=,
	never_consumer_variable=,
	never_consumer_indicator=,
	never_consumer_amount=,
	never_consumer_covariates=,
	never_consumer_intercept=Y,
	mcmc_seed=,
	num_mcmc_iterations=12000,
	num_burn=2000,
	num_thin=25,
	num_post=0,
	sigma_u_prior=,
	sigma_u_constant=N,
	save_u_main=N,
	save_all_u=N,
	do_log_likelihood=N,
	outlib=WORK,
	outname=);

	/*
	Entry point for the MCMC fit. The macro runs three macro-level preparation
	steps (input check, variable lists, covariate lists), then a single
	PROC IML session that initializes, checks, runs, and outputs the chain,
	and finally writes the seed data set.
	*/

	%create_logger();

	/* Step 1: check the input parameters. */
	%check_input(
		num_mcmc_iterations=&num_mcmc_iterations.,
		num_burn=&num_burn.,
		num_thin=&num_thin.,
		num_post=&num_post.,
		pre_mcmc_data=&pre_mcmc_lib..&pre_mcmc_data.,
		id=&id.,
		repeat_obs=&repeat_obs.,
		weight=&weight.,
		episodic_variables=&episodic_variables.,
		episodic_indicators=&episodic_indicators.,
		episodic_amounts=&episodic_amounts.,
		daily_variables=&daily_variables.,
		daily_amounts=&daily_amounts.,
		never_consumer_variable=&never_consumer_variable.,
		never_consumer_indicator=&never_consumer_indicator.,
		never_consumer_amount=&never_consumer_amount.,
		default_covariates=&default_covariates.,
		episodic_indicator_covariates=&episodic_indicator_covariates.,
		episodic_amount_covariates=&episodic_amount_covariates.,
		daily_amount_covariates=&daily_amount_covariates.,
		individual_covariates=&individual_covariates.,
		never_consumer_covariates=&never_consumer_covariates.,
		sigma_u_prior=&sigma_u_prior.);

	/* Step 2: process the variable lists (output prefix is outlib.outname). */
	%process_variable_lists(
		pre_mcmc_data=&pre_mcmc_lib..&pre_mcmc_data.,
		episodic_variables=&episodic_variables.,
		episodic_indicators=&episodic_indicators.,
		episodic_amounts=&episodic_amounts.,
		daily_variables=&daily_variables.,
		daily_amounts=&daily_amounts.,
		never_consumer_variable=&never_consumer_variable.,
		never_consumer_indicator=&never_consumer_indicator.,
		never_consumer_amount=&never_consumer_amount.,
		outname=&outlib..&outname.);

	/* Step 3: process the covariate lists, reading the _vars data set as variable_data. */
	%process_covariate_lists(
		variable_data=&outlib..&outname._vars,
		default_covariates=&default_covariates.,
		episodic_indicator_covariates=&episodic_indicator_covariates.,
		episodic_amount_covariates=&episodic_amount_covariates.,
		daily_amount_covariates=&daily_amount_covariates.,
		individual_covariates=&individual_covariates.,
		default_intercept=&default_intercept.,
		episodic_indicator_intercept=&episodic_indicator_intercept.,
		episodic_amount_intercept=&episodic_amount_intercept.,
		daily_amount_intercept=&daily_amount_intercept.,
		individual_intercept=&individual_intercept.,
		never_consumer_covariates=&never_consumer_covariates.,
		never_consumer_intercept=&never_consumer_intercept.,
		outname=&outlib..&outname.);

	proc iml;

		/* Load every module stored in the _mcmciml.mcmc_modules catalog. */
		reset storage=_mcmciml.mcmc_modules;
		load module=_all_;

		/* Step 4: initialize parameters for the main loop: seed, variable and covariate matrices, and priors. */
		mcmc_parameters = initialize_mcmc(
			"&pre_mcmc_lib..&pre_mcmc_data._mcmc_in",
			"&id.",
			"&repeat_obs.",
			"&weight.",
			"&outlib..&outname._vars",
			"&outlib..&outname._covars",
			&mcmc_seed.,
			"&sigma_u_prior.");

		/* Step 5: check the design matrices and priors. */
		call check_initialized(
			mcmc_parameters$"covariate_matrices",
			mcmc_parameters$"priors",
			"&save_u_main.",
			"&save_all_u.",
			&num_mcmc_iterations.,
			&num_burn.,
			&num_thin.,
			&num_post.,
			mcmc_parameters$"subject_data"$"has_never_consumers");

		/* Step 6: run the main MCMC loop. */
		mcmc_output = mcmc_main_loop(
			&num_mcmc_iterations.,
			&num_burn.,
			&num_thin.,
			&num_post.,
			mcmc_parameters$"subject_data"$"num_subjects",
			mcmc_parameters$"subject_data"$"num_episodic",
			mcmc_parameters$"subject_data"$"num_daily",
			mcmc_parameters$"subject_data"$"num_recalls",
			mcmc_parameters$"subject_data"$"recall_availability",
			mcmc_parameters$"subject_data"$"weighting",
			mcmc_parameters$"variable_matrices"$"episodic_indicators",
			mcmc_parameters$"subject_data"$"has_never_consumers",
			mcmc_parameters$"covariate_matrices"$"recall",
			mcmc_parameters$"covariate_matrices"$"wt_recall",
			mcmc_parameters$"covariate_matrices"$"wt_sq_sums",
			mcmc_parameters$"covariate_matrices"$"never_consumer",
			mcmc_parameters$"priors"$"alpha1_prior"$"mean",
			mcmc_parameters$"priors"$"alpha1_prior"$"covariance",
			mcmc_parameters$"priors"$"consumer_probabilities_prior",
			mcmc_parameters$"priors"$"beta_prior"$"mean",
			mcmc_parameters$"priors"$"beta_prior"$"covariance",
			mcmc_parameters$"priors"$"r_matrix_prior",
			mcmc_parameters$"priors"$"theta_matrix_prior",
			mcmc_parameters$"priors"$"v_matrix_prior",
			mcmc_parameters$"priors"$"sigma_e_prior",
			mcmc_parameters$"priors"$"sigma_u_prior",
			mcmc_parameters$"priors"$"u_matrix_prior",
			mcmc_parameters$"priors"$"w_matrix_prior",
			mcmc_parameters$"priors"$"xbeta_prior",
			mcmc_parameters$"priors"$"xbeta_u_prior",
			"&sigma_u_constant.",
			"&save_u_main.",
			"&save_all_u.",
			"&do_log_likelihood.");

		/* Step 7: create the output object under the outlib.outname prefix. */
		call output_mcmc(
			"&outlib..&outname.",
			mcmc_output$"log_likelihood",
			mcmc_output$"beta",
			mcmc_output$"sigma_u",
			mcmc_output$"sigma_e",
			mcmc_output$"alpha1",
			mcmc_output$"consumer_probabilities",
			mcmc_output$"u_matrices_main",
			mcmc_output$"u_matrices_post",
			mcmc_parameters$"subject_data"$"subjects",
			mcmc_parameters$"subject_data"$"weighting",
			&num_mcmc_iterations.,
			&num_burn.,
			&num_thin.,
			&num_post.,
			mcmc_output$"saved_u_main",
			mcmc_parameters$"subject_data"$"has_never_consumers");

		/* Copy the seed held in mcmc_parameters back into the mcmc_seed macro
		   variable so it is still available after PROC IML ends. */
		call symputx("mcmc_seed", mcmc_parameters$"mcmc_seed");
	quit;

	/* Write the seed to the _seed output data set. */
	data &outlib..&outname._seed;

		seed = &mcmc_seed.;
	run;
%mend nci_multivar_mcmc;