#Source code for the data generation and estimation functions in the paper Schnitzer and Cefalu (2017)

datagen_sparse<-function(ssize,seed=sample(1:100000,size=1)){
  set.seed(seed)
  W1<-rbinom(n=ssize,size=1,prob=0.1)
  W2<-rbinom(n=ssize,size=1,prob=0.2)
  W3<-rbinom(n=ssize,size=1,prob=0.3)
  pA<-plogis(-1+W1+W2+W3)
  A<-rbinom(n=ssize,size=1,prob=pA)
  Y<-rnorm(n=ssize,mean=(2*A+W1+W2+W3))
  #mean(Y[A==1])-mean(Y[A==0]) #very confounded, true effect = 2
  return(list(Y=Y,A=A,W1=W1,W2=W2,W3=W3))
}

#Correlated covariates
datagen1<-function(ssize,seed=sample(1:100000,size=1)){
  set.seed(seed)
  W1<-runif(n=ssize,min=-1,max=1)
  W2<-runif(n=ssize,min=(W1-1),max=(W1+1))
  W3<-runif(n=ssize,min=-1,max=1)
  W4<-runif(n=ssize,min=(W3-1),max=(W3+1))
  pA<-plogis(W2+W4+W3)
  A<-rbinom(n=ssize,size=1,prob=pA)
  Y<-rnorm(n=ssize,mean=(2*A+W1+W2+W4))
  #mean(Y[A==1])-mean(Y[A==0]) #very confounded, true effect = 2
  return(list(Y=Y,A=A,W1=W1,W2=W2,W3=W3,W4=W4))
}

#Positivity violations
datagen<-function(ssize,seed=sample(1:100000,size=1)){
  set.seed(seed)
  W1<-rnorm(n=ssize) #instrument
  W2<-rnorm(n=ssize) #confounder with weak effect on outcome
  W3<-rnorm(n=ssize) #confounder
  pA<-plogis(W1+W2+W3)
  A<-rbinom(n=ssize,size=1,prob=pA)
  Y<-rnorm(n=ssize,mean=(2*A+0.5*W2+W3))
  #mean(Y[A==1])-mean(Y[A==0]) #very confounded, true effect = 2
  return(list(Y=Y,A=A,W1=W1,W2=W2,W3=W3))
}

#Given W and A (binary), alpha, and a penalty vector or constant,
#run glmnet to get gn.
#W is a matrix that includes a unit vector.
gn_glmnet<-function(W,A,alpha,lambda){
  library(glmnet)
  J<-dim(W)[2]
  if (alpha>1 | alpha<0){stop("alpha must be between 0 and 1")}
  if(length(lambda)==1){
    mod<-glmnet(y=A,x=as.matrix(W),family="binomial",alpha=alpha,lambda=lambda,
                penalty.factor=c(0,rep(1,(J-1))),standardize=F)
  } else{
    if(length(lambda)!=(J-1)){stop("The penalty vector must be as long as the number of covariates (excluding the intercept)")}
    lambdamax<-max(lambda)
    lambdavect<-lambda/lambdamax
    mod<-glmnet(y=A,x=as.matrix(W),family="binomial",alpha=alpha,lambda=lambdamax,
                penalty.factor=c(0,lambdavect),standardize=F)
  }
  beta<-rep(NA,J)
  beta[2:J]<-mod$beta[2:J]
  beta[1]<-mod$a0
  gn<-plogis(t(beta%*%t(W)))
  return(list(gn=gn,beta=beta))
} #END gn_glmnet function

#Given data, alpha, a penalty vector or constant, the outcome type ("continuous" or binary),
#a Qn vector (not necessarily bounded), and boundmax and boundmin (used if the outcome is
#continuous, otherwise ignored), fit a TMLE with glmnet.
tmle_glmnet<-function(X_A,A,Y,alpha,lambda,outcome_type,Qnvect,
                      boundmax=(max(Y,Qnvect)+0.05),boundmin=(min(Y,Qnvect)-0.05)){
  X_A<-as.matrix(cbind(1,X_A))
  if(dim(X_A)[1]!=length(Y) | length(A)!=length(Y)){stop("The data dimensions do not correspond.")}
  if(length(Y)!=length(Qnvect)){stop("Qnvect needs to be the same length as the sample size.")}
  ssize<-length(Y)
  #fit the propensity score
  gn_mod<-gn_glmnet(W=X_A,A=A,alpha=alpha,lambda=lambda)
  gn<-gn_mod$gn
  beta<-gn_mod$beta
  #update step
  if (identical(outcome_type,"continuous")){
    Y.ub<-Y
    Qn.ub<-Qnvect
    d<-boundmin
    c<-boundmax
    Y<-(Y.ub-d)/(c-d)
    Qn<-(Qn.ub-d)/(c-d)
  } else {
    Qn<-Qnvect
    d<-0
    c<-1
  }
  initest.ub<-var.initest<-NA #returned as NA if the update step fails
  if (sum(gn<=.Machine$double.neg.eps,na.rm=T)==0 & sum(is.na(gn))==0){
    epsilon_init<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn[A==1])),offset=qlogis(Qn[A==1]),family=binomial())$coefficient
    Qnstar<-plogis(qlogis(Qn)+epsilon_init/gn) #update of the bounded Qn
    initest<-mean(Qnstar)
    initest.ub<-initest*(c-d)+d
    var.initest<-var((Y-Qnstar)*A/gn + (Qnstar-initest))/ssize*(c-d)^2
  } else {
    cat("\nThe TMLE update step was impossible due to missing or zero values in the propensity score\n")
  }
  return(list(EST=initest.ub,VAR.IF=var.initest,BETA=beta))
} #END tmle_glmnet function
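#Example (not from the original source): a minimal sketch of how the pieces above fit
#together, assuming glmnet is installed. We simulate from datagen_sparse(), build an
#initial Qn by ordinary least squares with A set to 1, and pass it to tmle_glmnet()
#with a small ridge penalty. The t_-prefixed object names are illustrative only.
t_dat<-datagen_sparse(ssize=1000,seed=1)
t_W<-cbind(t_dat$W1,t_dat$W2,t_dat$W3) #tmle_glmnet adds the unit vector itself
t_Qfit<-lm(t_dat$Y~t_dat$A+t_W) #initial outcome regression
t_Qn1<-as.vector(cbind(1,1,t_W)%*%coef(t_Qfit)) #predicted outcomes setting A=1
t_res<-tmle_glmnet(X_A=t_W,A=t_dat$A,Y=t_dat$Y,alpha=0,lambda=0.01,
                   outcome_type="continuous",Qnvect=t_Qn1)
t_res$EST #estimate of E(Y^1); here the true value is 2+0.1+0.2+0.3=2.6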
#Given a fitting function and data, fit a model for Qn and extract its parameters.
#Returns a list of parameters.
fit_Qn<-function(Qn_fit_func,W,A,Y){
  params<-Qn_fit_func(W=W,A=A,Y=Y)
  return(params)
}

#Example function to fit a model for Qn.
#Returns a list of parameters. Y is untransformed.
Qn_fit_lm<-function(W,A,Y){
  W<-as.matrix(W)
  Qn_mod<-lm.fit(y=Y,x=as.matrix(cbind(A,W))) #W contains the intercept
  beta<-Qn_mod$coefficients
  return(list(beta=beta))
}

#Given a prediction function, parameters, and baseline covariates,
#create the (unscaled) Qn vector.
get_Qn<-function(Qn_pred_func,params,W,Y){
  predinfo<-Qn_pred_func(params,W,Y)
  return(predinfo)
}

#Example function to create the (unscaled) Qn vector.
Qn_pred_lm<-function(params,W,Y){
  W<-as.matrix(W)
  #The parameters must be a list with the beta coefficients and, optionally,
  #the boundmax and boundmin constants
  beta<-params$beta
  Qn.ub<-as.vector(cbind(1,W)[,!is.na(beta)]%*%beta[!is.na(beta)]) #Setting A=1
  boundmin<-min(Y,Qn.ub)-0.05
  boundmax<-max(Y,Qn.ub)+0.05
  return(list(Qn=Qn.ub,boundmax=boundmax,boundmin=boundmin))
}

#Calculates the target loss function of the TMLE (used in the cross-validation).
#Y: univariate outcome (vector, length=n) (unscaled)
#A: treatment vector (length=n) where A=1 is the exposure level of interest
TMLE_loss<-function(Y,A,X_A,X_Y,beta,Qn_params,Qn_pred_func,epsilon,update,outcome_type){
  LOSS.sav<-rep(NA,length(epsilon))
  Y.ub<-Y
  Qnres<-get_Qn(Qn_pred_func,params=Qn_params,W=X_Y,Y=Y.ub)
  Qn.ub<-Qnres$Qn
  if (identical(outcome_type,"continuous")){ #If continuous outcome, scale Y and Qn
    d<-Qnres$boundmin
    c<-Qnres$boundmax
    Y<-(Y.ub-d)/(c-d)
    Qn<-(Qn.ub-d)/(c-d)
  } else {
    Qn<-Qn.ub
    d<-0
    c<-1
  }
  Qn_starC<-Qn
  for (j in 1:length(epsilon)){
    if (update[j]!=0){
      #If this was an update step, first update the current Q_star using the
      #previous step's epsilon and beta
      gn<-plogis(t(beta[j-1,]%*%t(X_A)))
      Qn_starC<-plogis(qlogis(Qn_starC)+update[j]/gn)
      #The fluctuation is then made with respect to the new current Q_star
      gn<-plogis(t(beta[j,]%*%t(X_A)))
      Qn_star<-plogis(qlogis(Qn_starC)+epsilon[j]/gn)
    } else { #not an update step, just changes Qn_star
      gn<-plogis(t(beta[j,]%*%t(X_A)))
      Qn_star<-plogis(qlogis(Qn_starC)+epsilon[j]/gn)
    }
    Qn_star[Qn_star<=0]<-.Machine$double.neg.eps
    Qn_star[Qn_star>=1]<-1-.Machine$double.neg.eps
    LOSS.sav[j]<- -sum((Y*log(Qn_star)+(1-Y)*log(1-Qn_star))*A)
  }
  return(LOSS.sav)
}
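#Example (not from the original source): any user-supplied pair of fitting and
#prediction functions with the signatures above can be swapped in. A minimal sketch
#for a binary outcome, assuming a main-terms logistic regression; the names
#Qn_fit_glm and Qn_pred_glm are illustrative only.
Qn_fit_glm<-function(W,A,Y){
  W<-as.matrix(W)
  Qn_mod<-glm.fit(y=Y,x=as.matrix(cbind(A,W)),family=binomial()) #W contains the intercept
  return(list(beta=Qn_mod$coefficients))
}
Qn_pred_glm<-function(params,W,Y){
  W<-as.matrix(W)
  beta<-params$beta
  #Predicted probability setting A=1; no bounding constants needed for a binary outcome
  Qn<-plogis(as.vector(cbind(1,W)[,!is.na(beta)]%*%beta[!is.na(beta)]))
  return(list(Qn=Qn))
}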
###Shrinkage CTMLE function
#Given a set of lambdas and an initial Qn, computes a sequence of treatment model
#coefficients that greedily minimizes the collaboratively penalized loss function.
#Returns an estimate of E(Y^a) and other summaries.
#Y and the initial Qn predictions are unscaled.
#Qn_pred_func should return boundmin and boundmax if the outcome is continuous.
Shrinkage_CTMLE<-function(lambdaset,alpha=0,outcome_type,stop=length(lambdaset),
                          X_A,X_Y,A,Y,ret.var=F,Qn_pred_func,Qn_params){
  X_Y<-as.matrix(X_Y)
  X_A<-as.matrix(X_A)
  Y.ub<-Y
  ssize<-dim(X_Y)[1]
  Qnres<-get_Qn(Qn_pred_func,params=Qn_params,W=X_Y,Y=Y.ub)
  Qn.ub<-Qnres$Qn
  if (identical(outcome_type,"continuous")){ #If continuous outcome, scale Y and Qn
    d<-Qnres$boundmin
    c<-Qnres$boundmax
    Y<-(Y.ub-d)/(c-d)
    Qn<-(Qn.ub-d)/(c-d)
  } else {
    Qn<-Qn.ub
    d<-0
    c<-1
  }
  #Iterate fluctuations and shrinkage
  J<-dim(X_A)[2] #number of coefficients in the gn model, including intercept
  L<-length(lambdaset) #number of penalties to be applied
  est.ub<-rep(NA,L) #save estimates at every step
  var.est<-rep(NA,L) #save variance estimates at every step
  beta.sav<-matrix(nrow=L,ncol=J) #save coefficients in the gn model at every step
  #gnloss<-rep(NA,L) #to keep track of the error in gn (should decrease with each step)
  #Qnloss<-rep(NA,L) #to keep track of the error in Qn (should decrease with each step)
  update<-rep(0,L) #if there is an update (to get a new current Qn), saves the epsilon value, 0 otherwise
  eps<-rep(NA,L) #saves the TMLE fluctuation from each optimal forward step
  QnstarC<-Qn #initialize the current Qn
  j<-1
  #initialize beta
  lambda<-lambdaset[1]
  gn_mod<-gn_glmnet(W=X_A,A=A,alpha=alpha,lambda=lambda)
  beta.sav[j,]<-gn_mod$beta
  gn<-gn_mod$gn
  eps[j]<-epsilon<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn[A==1])),offset=qlogis(QnstarC[A==1]),family=binomial())$coefficient
  Qnstar<-plogis(qlogis(QnstarC)+epsilon/gn) #update of the bounded Qn
  est<-mean(Qnstar)
  est.ub[j]<-est*(c-d)+d
  Qnstar_last<-Qnstar #current optimal Qnstar (loss is necessarily improved over QnstarC)
  if (stop>1){ #iteratively update beta then epsilon until convergence
    for (j in 2:stop){
      lambda<-lambdaset[j]
      #update beta given the next lambda (like a forward step)
      gn_mod<-gn_glmnet(W=X_A,A=A,alpha=alpha,lambda=lambda)
      beta.sav[j,]<-gn_mod$beta
      gn<-gn_mod$gn
      eps[j]<-epsilon<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn[A==1])),offset=qlogis(QnstarC[A==1]),family=binomial())$coefficient
      Qnstar<-plogis(qlogis(QnstarC)+epsilon/gn) #update of the bounded Qn
      #Is the updated Q* better than the last? If so, keep it as the best estimate. If not...
      improve<-((-sum((Y*log(Qnstar_last)+(1-Y)*log(1-Qnstar_last))[A==1])) -
                (-sum((Y*log(Qnstar)+(1-Y)*log(1-Qnstar))[A==1])))>=0
      if (improve==F){ #j must be > 1
        beta_last<-beta.sav[j-1,]
        gn_last<-plogis(t(beta_last%*%t(X_A)))
        update[j]<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn_last[A==1])),offset=qlogis(QnstarC[A==1]),family=binomial())$coefficient
        QnstarC<-plogis(qlogis(QnstarC)+update[j]/gn_last)
        #now calculate the updated Qstar from the new current one
        eps[j]<-epsilon<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn[A==1])),offset=qlogis(QnstarC[A==1]),family=binomial())$coefficient
        Qnstar<-plogis(qlogis(QnstarC)+epsilon/gn) #update of the bounded Qn
      }
      est<-mean(Qnstar)
      est.ub[j]<-est*(c-d)+d
      if (ret.var==T){
        if (sum(gn<=.Machine$double.neg.eps)==0){
          var.est[j]<-var((Y-Qnstar)*A/gn + (Qnstar-est))/ssize*(c-d)^2
        } else { var.est[j]<-NA }
      }
      Qnstar_last<-Qnstar #remember the Qnstar from the last iteration
      #gnloss[j]<- -sum(A*log(gn)+(1-A)*log(1-gn))
      #Qnloss[j]<- -sum((Y*log(Qnstar)+(1-Y)*log(1-Qnstar))*A)
    } #end j loop
  } #end if stop>1 condition
  return(list(est=est.ub,var.est=var.est,beta=beta.sav,eps.sav=eps,upd.sav=update,
              Qn_params=Qn_params,Qn_pred_func=Qn_pred_func,outcome_type=outcome_type))
}
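#Example (not from the original source): a direct call to Shrinkage_CTMLE on simulated
#data, assuming a decreasing ridge-penalty sequence; the grid and the s_-prefixed names
#are illustrative only. In practice the stopping index would be chosen by
#cross-validation (see the CV function below).
s_dat<-datagen(ssize=500,seed=2)
s_W<-cbind(1,s_dat$W1,s_dat$W2,s_dat$W3) #include the unit vector
s_Qp<-fit_Qn(Qn_fit_func=Qn_fit_lm,W=s_W,A=s_dat$A,Y=s_dat$Y)
s_fit<-Shrinkage_CTMLE(lambdaset=exp(seq(log(5),log(0.001),length.out=10)),alpha=0,
                       outcome_type="continuous",X_A=s_W,X_Y=s_W,A=s_dat$A,Y=s_dat$Y,
                       ret.var=T,Qn_pred_func=Qn_pred_lm,Qn_params=s_Qp)
s_fit$est #sequence of E(Y^1) estimates along the penalty path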
######
#Componentwise shrinkage CTMLE
#Given a vector lambdaset, produces a sequence of TMLE models, decreasing in error in both Qn and gn
CShrinkage_CTMLE<-function(lambdaset,alpha=0,outcome_type,
                           stop=((length(lambdaset)-1)*(dim(X_A)[2]-1)+1),
                           X_A,X_Y,A,Y,ret.var=F,Qn_pred_func,Qn_params){
  X_Y<-as.matrix(X_Y)
  X_A<-as.matrix(X_A)
  Y.ub<-Y
  ssize<-dim(X_Y)[1]
  Qnres<-get_Qn(Qn_pred_func,params=Qn_params,W=X_Y,Y=Y.ub)
  Qn.ub<-Qnres$Qn
  if (identical(outcome_type,"continuous")){ #If continuous outcome, scale Y and Qn
    d<-Qnres$boundmin
    c<-Qnres$boundmax
    Y<-(Y.ub-d)/(c-d)
    Qn<-(Qn.ub-d)/(c-d)
  } else {
    Qn<-Qn.ub
    d<-0
    c<-1
  }
  #Iterate fluctuations and shrinkage
  J<-dim(X_A)[2] #number of coefficients in the gn model, including intercept (always 2+)
  K<-(length(lambdaset)-1)*(J-1) #number of penalty combinations to be applied
  est.ub<-rep(NA,K+1) #save estimates at every step
  var.est<-rep(NA,K+1) #save variance estimates at every step
  beta.sav<-matrix(nrow=(K+1),ncol=J) #save coefficients in the gn model at every step
  path.sav<-rep(0,K+1) #saves which covariate takes the step
  upd.sav<-rep(0,K+1) #saves epsilons where updates to Qn_C happen
  eps.sav<-rep(NA,K+1) #saves epsilons
  loss.sav<-rep(0,K+1) #saves loss (academic)
  #gnloss.sav<-rep(0,K+1) #saves loss on gn (academic)
  lambdamat<-matrix(rep(lambdaset,J-1),ncol=J-1,byrow=F)
  ind<-matrix(F,nrow=length(lambdaset),ncol=(J-1)) #initiate the index within lambdamat
  ind[1,]<-rep(T,J-1)
  #initialize the current Qn
  Qn_C<-Qn
  #initialize beta
  gn_mod<-gn_glmnet(W=X_A,A=A,alpha=alpha,lambda=lambdamat[ind])
  beta.sav[1,]<-gn_mod$beta
  gn_k<-gn_mod$gn
  eps.sav[1]<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn_k[A==1])),offset=qlogis(Qn_C[A==1]),family=binomial())$coefficient
  Qn_k<-plogis(qlogis(Qn_C)+eps.sav[1]/gn_k) #update of the bounded Qn
  est<-mean(Qn_k)
  est.ub[1]<-est*(c-d)+d
  if (ret.var==T){
    if (sum(gn_k<=.Machine$double.neg.eps)==0){
      var.est[1]<-var((Y-Qn_k)*A/gn_k + (Qn_k-est))/ssize*(c-d)^2
    } else { var.est[1]<-NA }
  }
  if (stop>1){ #max is K+1
    for (k in 2:stop){ #loop through the total number of steps
      Qn_k[Qn_k<=0]<-.Machine$double.neg.eps
      Qn_k[Qn_k>=1]<-1-.Machine$double.neg.eps
      loss.sav[k-1]<-loss_active<- -sum(A*(Y*log(Qn_k)+(1-Y)*log(1-Qn_k)))
      #gnloss.sav[k-1]<- -sum(A*log(gn_k)+(1-A)*log(1-gn_k))
      loss_cand<-rep(NA,J-1)
      ind_cand<-list()
      beta_cand<-list()
      cand_func<-list()
      for (j in 1:(J-1)){
        if (ind[length(lambdaset),j]!=T){ #check whether the coefficient is fully grown
          cand<-take_a_step(j=j,A=A,X_A=X_A,alpha=alpha,lambdamat=lambdamat,ind=ind)
          beta_cand[[j]]<-cand$beta; ind_cand[[j]]<-cand$ind
          cand_func[[j]]<-eval_candidate(Y=Y,A=A,Qn=Qn_C,beta=beta_cand[[j]],X_A=X_A)
          loss_cand[j]<-cand_func[[j]]$loss
        } else {loss_cand[j]<-NA} #If the coefficient is already fully grown, it can't take a step
      }
      if (min(loss_cand,na.rm=T)==Inf){ #begin check for all NA
        beta.sav[k,]<-beta<-rep(NA,J); eps.sav[k]<-NA
        path.sav[k]<-NA
      } else {
        if (min(loss_cand,na.rm=T)<=loss_active){
          j<-which.min(loss_cand)
          ind<-ind_cand[[j]]
          beta.sav[k,]<-beta<-beta_cand[[j]]
          gn_k<-cand_func[[j]]$gn
          eps.sav[k]<-cand_func[[j]]$eps
          Qn_k<-cand_func[[j]]$Qnstar
          path.sav[k]<-j
        }
        if (min(loss_cand,na.rm=T)>loss_active){
          #if no step improves, do a TMLE update with the current gn_k and Qn_C
          upd.sav[k]<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn_k[A==1])),offset=qlogis(Qn_C[A==1]),family=binomial())$coefficient
          Qn_C<-plogis(qlogis(Qn_C)+upd.sav[k]/gn_k)
          for (j in 1:(J-1)){ #Now, take a step with the updated Qn_C
            if (ind[length(lambdaset),j]!=T){ #check whether the coefficient is fully grown
              cand<-take_a_step(j=j,A=A,X_A=X_A,alpha=alpha,lambdamat=lambdamat,ind=ind)
              beta_cand[[j]]<-cand$beta; ind_cand[[j]]<-cand$ind
              cand_func[[j]]<-eval_candidate(Y=Y,A=A,Qn=Qn_C,beta=beta_cand[[j]],X_A=X_A)
              loss_cand[j]<-cand_func[[j]]$loss
            } else {loss_cand[j]<-Inf} #If the coefficient is already fully grown, it can't take a step
          }
          j<-which.min(loss_cand)
          ind<-ind_cand[[j]]
          beta.sav[k,]<-beta<-beta_cand[[j]]
          gn_k<-cand_func[[j]]$gn
          eps.sav[k]<-cand_func[[j]]$eps
          Qn_k<-cand_func[[j]]$Qnstar
          path.sav[k]<-j
        }
      } #end check for all NA
      est<-mean(Qn_k)
      est.ub[k]<-est*(c-d)+d
      if (ret.var==T){
        if (sum(gn_k<=.Machine$double.neg.eps,na.rm=T)==0 & sum(is.na(gn_k))==0){
          var.est[k]<-var((Y-Qn_k)*A/gn_k + (Qn_k-est))/ssize*(c-d)^2
        } else { var.est[k]<-NA }
      }
    } #end of k loop
    Qn_k[Qn_k<=0]<-.Machine$double.neg.eps
    Qn_k[Qn_k>=1]<-1-.Machine$double.neg.eps
    loss.sav[K+1]<- -sum(A*(Y*log(Qn_k)+(1-Y)*log(1-Qn_k)))
    #gnloss.sav[K+1]<- -sum(A*log(gn_k)+(1-A)*log(1-gn_k))
  } #end of if stop>1
  return(list(est=est.ub,var.est=var.est,beta=beta.sav,eps.sav=eps.sav,upd.sav=upd.sav,
              path.sav=path.sav,Qn_params=Qn_params,Qn_pred_func=Qn_pred_func,outcome_type=outcome_type))
}
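#Illustration (not from the original source) of the componentwise penalty bookkeeping
#used above: each column of ind flags the current position of one covariate within
#lambdaset, and lambdamat[ind] extracts the current per-covariate penalty vector passed
#to gn_glmnet. The demo_-prefixed objects are illustrative only.
demo_lambdaset<-c(4,2,0)
demo_lambdamat<-matrix(rep(demo_lambdaset,2),ncol=2,byrow=F) #two covariates
demo_ind<-matrix(F,nrow=3,ncol=2)
demo_ind[1,]<-T
demo_lambdamat[demo_ind] #c(4,4): both covariates start fully penalized
demo_ind[,2]<-c(F,T,F) #covariate 2 takes one step forward
demo_lambdamat[demo_ind] #c(4,2): covariate 2 is now less penalized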
eval_candidate<-function(Y,A,Qn,beta,X_A){
  gn<-plogis(t(beta[!is.na(beta)]%*%t(X_A[,!is.na(beta)])))
  #TMLE fluctuation to get epsilon
  if (sum(gn<=.Machine$double.neg.eps,na.rm=F)==0 & sum(is.na(gn))==0){ #If no subject has a zero clever covariate
    eps<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn[A==1])),offset=qlogis(Qn[A==1]),family=binomial())$coefficient
    Qnstar<-plogis(qlogis(Qn)+eps/gn) #update of the bounded Qn
    Qnstar[Qnstar<=0]<-.Machine$double.neg.eps
    Qnstar[Qnstar>=1]<-1-.Machine$double.neg.eps
    l<- -sum((Y*log(Qnstar)+(1-Y)*log(1-Qnstar))*A)
  } else {
    l<-Inf; eps<-NA; Qnstar<-NA
  }
  return(list(loss=l,eps=eps,Qnstar=Qnstar,gn=gn,beta=beta))
}

#For a given coordinate, return beta after a single step forward
take_a_step<-function(j,A,X_A,alpha,lambdamat,ind){
  J<-dim(X_A)[2]
  cur<-which(ind[,j])
  ind[cur,j]<-F
  ind[cur+1,j]<-T
  if (sum(lambdamat[ind])!=0){
    gn_mod<-gn_glmnet(W=X_A,A=A,alpha=alpha,lambda=lambdamat[ind])
    beta<-gn_mod$beta
  } else { #all penalties are zero: fit an unpenalized logistic regression
    beta<-glm.fit(y=A,x=as.matrix(X_A),family=binomial())$coefficients
  }
  return(list(beta=beta,ind=ind))
}
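#Example (not from the original source): a direct call to CShrinkage_CTMLE, which uses
#eval_candidate() and take_a_step() defined above. The penalty grid and cs_-prefixed
#names are illustrative only; in practice the stopping point is chosen by
#cross-validation through ctmle_glmnet() at the end of this file.
cs_dat<-datagen1(ssize=500,seed=3)
cs_W<-cbind(1,cs_dat$W1,cs_dat$W2,cs_dat$W3,cs_dat$W4) #include the unit vector
cs_Qp<-fit_Qn(Qn_fit_func=Qn_fit_lm,W=cs_W,A=cs_dat$A,Y=cs_dat$Y)
cs_fit<-CShrinkage_CTMLE(lambdaset=c(2,0.5,0.1,0),alpha=0,outcome_type="continuous",
                         X_A=cs_W,X_Y=cs_W,A=cs_dat$A,Y=cs_dat$Y,ret.var=T,
                         Qn_pred_func=Qn_pred_lm,Qn_params=cs_Qp)
cs_fit$est #sequence of E(Y^1) estimates along the componentwise path
cs_fit$path.sav #which covariate stepped forward at each iteration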
#Variable selection CTMLE
CTMLE<-function(outcome_type,stop=(dim(X_A)[2]),X_A,X_Y,A,Y,ret.var=F,Qn_pred_func,Qn_params){
  X_Y<-as.matrix(X_Y)
  X_A<-as.matrix(X_A)
  Y.ub<-Y
  ssize<-dim(X_Y)[1]
  Qnres<-get_Qn(Qn_pred_func,params=Qn_params,W=X_Y,Y=Y.ub)
  Qn.ub<-Qnres$Qn
  if (identical(outcome_type,"continuous")){ #If continuous outcome, scale Y and Qn
    d<-Qnres$boundmin
    c<-Qnres$boundmax
    Y<-(Y.ub-d)/(c-d)
    Qn<-(Qn.ub-d)/(c-d)
  } else {
    Qn<-Qn.ub
    d<-0
    c<-1
  }
  #C-TMLE
  J<-dim(X_A)[2] #always 2+, includes the intercept
  est.ub<-rep(NA,J)
  path.sav<-rep(NA,J-1)
  var.est<-rep(NA,J)
  beta.sav<-matrix(nrow=J,ncol=J)
  upd.sav<-rep(0,J) #saves epsilons where updates to Qn_C happen
  eps.sav<-rep(NA,J) #saves epsilons
  loss.sav<-rep(0,J) #saves loss (academic)
  #gnloss.sav<-rep(0,J) #saves loss on gn (academic)
  #initialize the current Qn
  Qn_C<-Qn
  #initialize beta with an intercept-only treatment model
  mat_C<-as.matrix(rep(1,ssize))
  gmod<-glm(A~1,family="binomial")
  beta<-gmod$coefficient
  beta.sav[1,]<-c(beta,rep(0,J-1))
  #The first step is forced through because it will necessarily improve the loss
  gn_k<-plogis(t(beta%*%t(mat_C)))
  eps.sav[1]<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn_k[A==1])),offset=qlogis(Qn_C[A==1]),family=binomial())$coefficient
  Qn_k<-plogis(qlogis(Qn_C)+eps.sav[1]/gn_k) #update of the bounded Qn
  est<-mean(Qn_k)
  est.ub[1]<-est*(c-d)+d
  if (ret.var==T){
    if (sum(gn_k<=.Machine$double.neg.eps)==0){
      var.est[1]<-var((Y-Qn_k)*A/gn_k + (Qn_k-est))/ssize*(c-d)^2
    } else { var.est[1]<-NA }
  }
  avail.covars<-1:(J-1)
  used.covars<-NULL
  if (stop>1){ #max is J
    for (k in 2:stop){ #loop through the total number of steps
      Qn_k[Qn_k<=0]<-.Machine$double.neg.eps
      Qn_k[Qn_k>=1]<-1-.Machine$double.neg.eps
      loss.sav[k-1]<-loss_active<- -sum(A*(Y*log(Qn_k)+(1-Y)*log(1-Qn_k)))
      #gnloss.sav[k-1]<- -sum(A*log(gn_k)+(1-A)*log(1-gn_k))
      #pick the next variable
      loss_vect<-NULL
      cand_func<-list()
      for (j in avail.covars){
        mat_C_temp<-cbind(mat_C,X_A[,j+1])
        gmod<-glm.fit(y=A,x=as.matrix(mat_C_temp),family=binomial())
        beta<-gmod$coefficient
        cand_func[[j]]<-eval_candidate(Y=Y,A=A,Qn=Qn_C,beta=beta,X_A=mat_C_temp)
        loss_vect<-c(loss_vect,cand_func[[j]]$loss)
      }
      j<-avail.covars[which.min(loss_vect)] #this is the Wj that leads to the lowest loss
      loss_cand<-loss_vect[which.min(loss_vect)]
      if(loss_cand==Inf) break
      if (loss_cand<loss_active){ #take this step and keep the current Qn_C
        avail.covars<-avail.covars[-which(avail.covars==j)] #covariate is no longer available
        used.covars<-c(used.covars,j) #covariate is now included in g
        mat_C<-as.matrix(cbind(mat_C,X_A[,j+1]))
        beta<-cand_func[[j]]$beta
        beta.sav[k,]<-rep(0,J)
        beta.sav[k,c(1,(used.covars+1))]<-beta
        Qn_k<-cand_func[[j]]$Qnstar
        eps.sav[k]<-cand_func[[j]]$eps
        gn_k<-cand_func[[j]]$gn
      } else { #otherwise, do the TMLE update to get a new Qn_C
        upd.sav[k]<-glm.fit(y=Y[A==1],x=as.matrix(1/(gn_k[A==1])),offset=qlogis(Qn_C[A==1]),family=binomial())$coefficient
        Qn_C<-plogis(qlogis(Qn_C)+upd.sav[k]/gn_k) #note: the gn_k here is the previous step's
        #now take the step and get a new Qn_k
        loss_vect<-NULL
        cand_func<-list()
        for (j in avail.covars){
          mat_C_temp<-cbind(mat_C,X_A[,j+1])
          gmod<-glm.fit(y=A,x=as.matrix(mat_C_temp),family=binomial())
          beta<-gmod$coefficient
          cand_func[[j]]<-eval_candidate(Y=Y,A=A,Qn=Qn_C,beta=beta,X_A=mat_C_temp)
          loss_vect<-c(loss_vect,cand_func[[j]]$loss)
        }
        j<-avail.covars[which.min(loss_vect)] #this is the Wj that leads to the lowest loss
        avail.covars<-avail.covars[-which(avail.covars==j)] #covariate is no longer available
        used.covars<-c(used.covars,j) #covariate is now included in g
        mat_C<-as.matrix(cbind(mat_C,X_A[,j+1]))
        beta<-cand_func[[j]]$beta
        beta.sav[k,]<-rep(0,J)
        beta.sav[k,c(1,(used.covars+1))]<-beta
        Qn_k<-cand_func[[j]]$Qnstar
        eps.sav[k]<-cand_func[[j]]$eps
        gn_k<-cand_func[[j]]$gn
      }
      est<-mean(Qn_k)
      est.ub[k]<-est*(c-d)+d
      if (ret.var==T){
        if (sum(gn_k<=.Machine$double.neg.eps,na.rm=T)==0 & sum(is.na(gn_k))==0){
          var.est[k]<-var((Y-Qn_k)*A/gn_k + (Qn_k-est))/ssize*(c-d)^2
        } else { var.est[k]<-NA }
      }
    } #end of k loop
    loss.sav[stop]<- -sum(A*(Y*log(Qn_k)+(1-Y)*log(1-Qn_k)))
    #gnloss.sav[stop]<- -sum(A*log(gn_k)+(1-A)*log(1-gn_k))
  } #end of if stop>1
  if (length(used.covars)>0){
    path.sav[1:length(used.covars)]<-used.covars
  }
  return(list(est=est.ub,var.est=var.est,beta=beta.sav,eps.sav=eps.sav,upd.sav=upd.sav,
              path.sav=path.sav,Qn_params=Qn_params,Qn_pred_func=Qn_pred_func,outcome_type=outcome_type))
}
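#Example (not from the original source): running the variable selection CTMLE to the
#full greedy path on simulated data with an instrument; the ct_-prefixed names are
#illustrative only.
ct_dat<-datagen(ssize=500,seed=4)
ct_W<-cbind(1,ct_dat$W1,ct_dat$W2,ct_dat$W3) #include the unit vector
ct_Qp<-fit_Qn(Qn_fit_func=Qn_fit_lm,W=ct_W,A=ct_dat$A,Y=ct_dat$Y)
ct_fit<-CTMLE(outcome_type="continuous",X_A=ct_W,X_Y=ct_W,A=ct_dat$A,Y=ct_dat$Y,
              ret.var=T,Qn_pred_func=Qn_pred_lm,Qn_params=ct_Qp)
ct_fit$path.sav #order in which the covariates entered the propensity score model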
######
#A common CV function that can run cross-validation for any of the above three methods.
#method indicates either variable selection CTMLE ("CTMLE"), Shrinkage CTMLE ("SCTMLE"),
#or Componentwise Shrinkage CTMLE ("CSCTMLE")
CV<-function(lambdaset=NULL,alpha=0,stop=NULL,R,X_A,X_Y,A,Y,method,Qn_params,Qn_pred_func,outcome_type){
  ssize<-length(Y)
  #The user may provide an early stopping point. Otherwise, use the maximum (which depends on the method).
  if(identical(stop,NULL)){
    if(identical(method,"CTMLE")){ stop<-dim(X_A)[2] }
    if(identical(method,"SCTMLE")){ stop<-length(lambdaset) }
    if(identical(method,"CSCTMLE")){
      J<-dim(X_A)[2] #always 2+
      K<-(length(lambdaset)-1)*(J-1)
      stop<-K+1
    }
  } else { #check that the provided stop is not too big; if it is, set it to the maximum
    if(identical(method,"CTMLE")){
      if(stop>dim(X_A)[2]){ stop<-dim(X_A)[2] }
    }
    if(identical(method,"SCTMLE")){
      if(stop>length(lambdaset)){ stop<-length(lambdaset) }
    }
    if(identical(method,"CSCTMLE")){
      J<-dim(X_A)[2] #always 2+
      K<-(length(lambdaset)-1)*(J-1)
      if(stop>(K+1)){ stop<-(K+1) }
    }
  }
  lossval<-matrix(nrow=R,ncol=stop)
  #Randomly shuffle the data
  shuffle<-sample(ssize)
  X_As<-as.data.frame(as.matrix(X_A[shuffle,]))
  X_Ys<-as.data.frame(as.matrix(X_Y[shuffle,]))
  As<-A[shuffle]
  Ys<-Y[shuffle]
  #Create R equally sized folds
  folds<-cut(seq(1,length(Ys)),breaks=R,labels=FALSE)
  #Perform R-fold cross-validation
  for(k in 1:R){
    #Segment the data by fold using the which() function
    testIndexes<-which(folds==k,arr.ind=TRUE)
    #training data
    X_A1<-X_As[-testIndexes,]
    X_Y1<-X_Ys[-testIndexes,]
    A1<-As[-testIndexes]
    Y1<-Ys[-testIndexes]
    #test data
    X_A2<-as.matrix(X_As[testIndexes,])
    X_Y2<-as.matrix(X_Ys[testIndexes,])
    A2<-As[testIndexes]
    Y2<-Ys[testIndexes]
    #run the model on the training data
    if(identical(method,"CTMLE")){
      TMLEmod<-CTMLE(outcome_type=outcome_type,stop=stop,X_A=X_A1,X_Y=X_Y1,A=A1,Y=Y1,
                     Qn_pred_func=Qn_pred_func,Qn_params=Qn_params)
    }
    if(identical(method,"SCTMLE")){
      TMLEmod<-Shrinkage_CTMLE(lambdaset=lambdaset,stop=stop,alpha=alpha,outcome_type=outcome_type,
                               X_A=X_A1,X_Y=X_Y1,A=A1,Y=Y1,Qn_pred_func=Qn_pred_func,Qn_params=Qn_params)
    }
    if(identical(method,"CSCTMLE")){
      TMLEmod<-CShrinkage_CTMLE(lambdaset=lambdaset,stop=stop,alpha=alpha,outcome_type=outcome_type,
                                X_A=X_A1,X_Y=X_Y1,A=A1,Y=Y1,Qn_pred_func=Qn_pred_func,Qn_params=Qn_params)
    }
    betat<-TMLEmod$beta
    epsilont<-TMLEmod$eps.sav
    updatet<-TMLEmod$upd.sav
    #Test error on the held-out data (keep the first stop entries, since the saved
    #sequences are allocated at their maximum length)
    lossval[k,]<-TMLE_loss(Y=Y2,A=A2,X_A=X_A2,X_Y=X_Y2,beta=betat,update=updatet,epsilon=epsilont,
                           Qn_params=Qn_params,Qn_pred_func=Qn_pred_func,outcome_type=outcome_type)[1:stop]
  }
  lossseq<-colMeans(lossval)
  return(list(lossseq=lossseq,lambdaset=lambdaset))
}
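#Example (not from the original source): CV is normally called internally by
#ctmle_glmnet() below, but it can be run directly to inspect the cross-validated TMLE
#loss sequence; the cv_-prefixed names are illustrative only.
cv_dat<-datagen(ssize=500,seed=5)
cv_W<-cbind(1,cv_dat$W1,cv_dat$W2,cv_dat$W3)
cv_Qp<-fit_Qn(Qn_fit_func=Qn_fit_lm,W=cv_W,A=cv_dat$A,Y=cv_dat$Y)
cv_res<-CV(R=5,X_A=cv_W,X_Y=cv_W,A=cv_dat$A,Y=cv_dat$Y,method="CTMLE",
           Qn_params=cv_Qp,Qn_pred_func=Qn_pred_lm,outcome_type="continuous")
which.min(cv_res$lossseq) #step at which the cross-validated TMLE loss is smallest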
###
#Fit one of the 3 types of CTMLE: variable selection CTMLE ("CTMLE"), Shrinkage CTMLE ("SCTMLE"),
#or Componentwise Shrinkage CTMLE ("CSCTMLE")
ctmle_glmnet<-function(Y,A,X_A,X_Y,outcome_type,method,Qn_fit_func=Qn_fit_lm,Qn_pred_func=Qn_pred_lm,
                       lambda=NULL,alpha=0,stop=NULL,n.cv=5){
  X_A<-as.matrix(X_A)
  X_Y<-as.matrix(X_Y)
  if(dim(X_A)[1]!=length(Y) | length(A)!=length(Y) | dim(X_Y)[1]!=length(Y)){stop("The data dimensions do not correspond.")}
  #check that lambda is a vector (size 2+) if using SCTMLE or CSCTMLE
  if(identical(method,"SCTMLE")|identical(method,"CSCTMLE")){
    if(length(lambda)<2){stop("lambda must be a vector of penalties with length greater than 1")}
    lambda<-sort(lambda,decreasing=T)
  }
  #fit the Qn-model to get its parameters
  Qn_params<-fit_Qn(Qn_fit_func=Qn_fit_func,W=X_Y,A=A,Y=Y)
  cverror<-CV(lambdaset=lambda,alpha=alpha,stop=stop,R=n.cv,X_A=X_A,X_Y=X_Y,A=A,Y=Y,method=method,
              Qn_params=Qn_params,Qn_pred_func=Qn_pred_func,outcome_type=outcome_type)$lossseq
  cv.stop<-min(which(cverror==min(cverror,na.rm=T)))
  if(identical(method,"CTMLE")){
    TMLEmod<-CTMLE(outcome_type=outcome_type,stop=cv.stop,X_A=X_A,X_Y=X_Y,A=A,Y=Y,
                   Qn_pred_func=Qn_pred_func,Qn_params=Qn_params,ret.var=T)
    cat("\n",cv.stop-1," covariate(s) was/were selected into the propensity score model\n")
    history<-list(beta=TMLEmod$beta,var.select=TMLEmod$path.sav,est=TMLEmod$est,cverror=cverror[1:cv.stop])
  }
  if(identical(method,"SCTMLE")){
    TMLEmod<-Shrinkage_CTMLE(lambdaset=lambda,stop=cv.stop,alpha=alpha,outcome_type=outcome_type,
                             X_A=X_A,X_Y=X_Y,A=A,Y=Y,Qn_pred_func=Qn_pred_func,Qn_params=Qn_params,ret.var=T)
    cat("\nA penalty of ",lambda[cv.stop]," was selected for the propensity score model\n") #check this
    history<-list(beta=TMLEmod$beta,est=TMLEmod$est,cverror=cverror[1:cv.stop])
  }
  if(identical(method,"CSCTMLE")){
    TMLEmod<-CShrinkage_CTMLE(lambdaset=lambda,stop=cv.stop,alpha=alpha,outcome_type=outcome_type,
                              X_A=X_A,X_Y=X_Y,A=A,Y=Y,Qn_pred_func=Qn_pred_func,Qn_params=Qn_params,ret.var=T)
    #create the chosen penalty vector
    lambdachosen<-rep(max(lambda),dim(X_A)[2]-1)
    for (s in 1:(dim(X_A)[2]-1)){
      index<-sum(TMLEmod$path.sav==s,na.rm=T)
      lambdachosen[s]<-lambda[index+1]
    }
    cat("\nThe penalty vector ",lambdachosen," was selected for the propensity score model\n")
    history<-list(beta=TMLEmod$beta,lambda.select=TMLEmod$path.sav,est=TMLEmod$est,cverror=cverror[1:cv.stop])
  }
  return(list(est=TMLEmod$est[cv.stop],var.est=TMLEmod$var.est[cv.stop],beta=TMLEmod$beta[cv.stop,],history=history))
}
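#Example (not from the original source): an end-to-end sketch on simulated data with
#correlated covariates. The penalty grid and ex_-prefixed names are illustrative only;
#the wrapper sorts the grid in decreasing order and picks the stopping point by
#n.cv-fold cross-validation.
ex_dat<-datagen1(ssize=1000,seed=6)
ex_W<-cbind(1,ex_dat$W1,ex_dat$W2,ex_dat$W3,ex_dat$W4) #include the unit vector
ex_fit<-ctmle_glmnet(Y=ex_dat$Y,A=ex_dat$A,X_A=ex_W,X_Y=ex_W,outcome_type="continuous",
                     method="CSCTMLE",lambda=c(2,0.5,0.1,0),alpha=0,n.cv=5)
ex_fit$est #estimate of E(Y^1)
ex_fit$est-1.96*sqrt(ex_fit$var.est) #lower bound of an approximate 95% confidence interval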