# Pohle, J., Langrock, R., van der Schaar, M., King, R. and Jensen, F.H.: #
# A primer on coupled state-switching models for multiple interacting time series #
#''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''#

# R code for the simulation experiments described in Section 3 #



##### 1) Load packages and required functions #####

library(Rcpp)
library(RcppArmadillo)
sourceCpp("Pohle_et_al_mllkRcpp.cpp") # C++ code for forward algorithm to evaluate HMM log-likelihood
source('Pohle_et_al_Functions_simulation_study.R') # functions used for parameter estimation for the different models considered



##### 2) DGP: Cartesian product CHMM #####

# number of observations for training and training+test set
T_train=1000
T_all<-1.1*T_train

# number of states per variable
NpV<-2

# number of state combinations
N<-NpV^2

# parameters for state-dependent normal distributions
# mean values
mu1<-c(2,6) # variable 1
mu2<-c(2,5) # variable 2
# standard deviations
sigma1<-sigma2<-rep(1.5,2) 

# transition probability matrix
gamma<-matrix(0,4,4)
gamma[1,]<-c(0.9,0.02,0.02,0.06)
gamma[4,]<-rev(gamma[1,])
gamma[2,]<-c(0.09,0.8,0.02,0.09)
gamma[3,]<-rev(gamma[2,])

# stationary distribution
delta<-solve(t(diag(4)-gamma+matrix(1,4,4)),rep(1,4))



##### 3) Simulation study #####

# number of simulation runs
Runs<-1000

# index needed for calculation of CHMM log-likelihood
ind_states<-rev(expand.grid(1:NpV,1:NpV)) # order of the state vectors (rows correspond to the bivariate state vectors)
ind_NpV<-array(0,dim=c(NpV,NpV,2)) # index array used inside the CHMM mllk function
for(m in 1:2){
  for(j in 1:NpV)
    ind_NpV[,j,m]<-which(ind_states[,m]==j)
}

# initialisations:

# objects to save estimated models
mod_coupled<-vector('list')
mod1_univariate<-vector('list')
mod2_univariate<-vector('list')
mod_bivariate<-vector('list')
# objects to save log-likelihood values
LL_train<-matrix(NA,Runs,4)
LL_test<-matrix(NA,Runs,4)
LL_all<-matrix(NA,Runs,4)
# objects to save number of falsely decoded states in the Viterbi sequence
fds_train<-matrix(NA,Runs,5)
fds_test<-matrix(NA,Runs,5)
fds_all<-matrix(NA,Runs,5)
# to control if label switching occurred:
ind.label<-matrix(NA,Runs,4)
  

# run simulation study:

set.seed(1000)
plot(NA,type='l',xlab='simulation run',ylab='',main="simulation progress",ylim=c(0,1),xlim=c(0,Runs),bty='n')
for(run in 1:Runs){
  points(run,0.5,pch=20,cex=2)
  
  # data generation
  s<-y_1<-y_2<-numeric(T_all) # y1 and y2 save the realised values for variable 1 and 2, respectively
  s_1<-s_2<-rep(1,T_all)      # s saves the overall states (values from 1 to 4), s_1 and s_2 save the individual states (values from 1 to 2)
  s[1]<-sample(1:4,1,prob=delta)
  if(s[1]==3 | s[1]==4) s_1[1]=2
  if(s[1]==2 | s[1]==4) s_2[1]=2
  y_1[1]<-rnorm(1,mu1[s_1[1]],sigma1[s_1[1]])
  y_2[1]<-rnorm(1,mu2[s_2[1]],sigma2[s_2[1]])
  for(t in 2:T_all){
    s[t]<-sample(1:N,1,prob=gamma[s[t-1],])
    if(s[t]==3 | s[t]==4) s_1[t]=2
    if(s[t]==2 | s[t]==4) s_2[t]=2
    y_1[t]<-rnorm(1,mu1[s_1[t]],sigma1[s_1[t]])
    y_2[t]<-rnorm(1,mu2[s_2[t]],sigma2[s_2[t]])
  }
  data_all<-cbind(y_1,y_2)
  data_train<-data_all[1:T_train,]
  data_test<-data_all[(T_train+1):T_all,]
    
  # a) Estimation Cartesian product CHMMs
  
  # starting values
  mu0<-c(mu1,mu2)
  sigma0<-c(sigma1,sigma2)
  gamma0<-gamma

  # numerical maximum log-likelihood estimation
  try(mod<-mle_coupled(data_train,mu0,sigma0,gamma0,NpV))

  # saving model and calculation of relevant likelihood values and Viterbi sequences
  if(exists('mod')){
    mod_coupled[[run]]<-mod
    LL_train[run,1]<-(-mod_coupled[[run]]$mllk)
    parvect_coupled<-n2w_coupled(mod_coupled[[run]]$mu,mod_coupled[[run]]$sigma,mod_coupled[[run]]$gamma,N)
    LL_test[run,1]<-(-mllk_coupled(parvect_coupled,data_test,NpV,N,ind_NpV))
    LL_all[run,1]<-(-mllk_coupled(parvect_coupled,data_all,NpV,N,ind_NpV))
    ind.label[run,1]<-sum(mod_coupled[[run]]$mu[1]>mod_coupled[[run]]$mu[2])+sum(mod_coupled[[run]]$mu[3]>mod_coupled[[run]]$mu[4])
    s_coupled<-viterbi_coupled(data_all,mod_coupled[[run]],NpV)
    fds_all[run,1]<-sum(!s==s_coupled)
    s_coupled<-viterbi_coupled(data_train,mod_coupled[[run]],NpV)
    fds_train[run,1]<-sum(!s[1:T_train]==s_coupled)
    s_coupled<-viterbi_coupled(data_test,mod_coupled[[run]],NpV)
    fds_test[run,1]<-sum(!s[(1+T_train):T_all]==s_coupled)
    rm(mod)
  }
    
  # b) Estimation bivariate HMMs
  
  # starting values
  mu0<-c(mu1,mu2)
  sigma0<-c(sigma1,sigma2)
  gamma0<-matrix(0.1,2,2)
  diag(gamma0)=0.9
    
  # numerical maximum log-likelihood estimation
  try(mod<-mle_bivariate(data_train,mu0,sigma0,gamma0,NpV))
  
  # saving model and calculation of relevant likelihood values and Viterbi sequences
  if(exists('mod')){
    mod_bivariate[[run]]<-mod
    LL_train[run,2]<-(-mod_bivariate[[run]]$mllk)
    parvect_bivariate<-n2w_bivariate(mod_bivariate[[run]]$mu,mod_bivariate[[run]]$sigma,mod_bivariate[[run]]$gamma,NpV)
    LL_test[run,2]<- (-mllk_bivariate(parvect_bivariate,data_test,NpV))
    LL_all[run,2]<- (-mllk_bivariate(parvect_bivariate,data_all,NpV))
    ind.label[run,2]<-sum(mod_bivariate[[run]]$mu[1]>mod_bivariate[[run]]$mu[2])
    s_bivariate<-viterbi_bivariate(data_all,mod_bivariate[[run]],NpV)
    fds_all[run,2]<-sum(!s_1==s_bivariate | !s_2==s_bivariate)
    s_bivariate<-viterbi_bivariate(data_train,mod_bivariate[[run]],NpV)
    fds_train[run,2]<-sum(!s_1[1:T_train]==s_bivariate | !s_2[1:T_train]==s_bivariate)
    s_bivariate<-viterbi_bivariate(data_test,mod_bivariate[[run]],NpV)
    fds_test[run,2]<-sum(!s_1[(1+T_train):T_all]==s_bivariate | !s_2[(1+T_train):T_all]==s_bivariate)
    rm(mod)
  }
    
  # c) Estimation univariate HMMs
    
  # starting values
    mu01<-mu1
    mu02<-mu2
    sigma01<-sigma1
    sigma02<-sigma2
    gamma0<-matrix(0.1,2,2)
    diag(gamma0)=0.9
    
    # numerical maximum log-likelihood estimation
    try(mod1<-mle_univariate(y_1[1:T_train],mu01,sigma01,gamma0,NpV))
    try(mod2<-mle_univariate(y_2[1:T_train],mu02,sigma02,gamma0,NpV))

    # saving model and calculation of relevant likelihood values and Viterbi sequences
    if(exists('mod1')){
      if(exists('mod2')){
        mod1_univariate[[run]]<-mod1
        LL_train[run,3]<- (-mod1_univariate[[run]]$mllk)
        parvect_univariate<-n2w_univariate(mod1_univariate[[run]]$mu,mod1_univariate[[run]]$sigma,mod1_univariate[[run]]$gamma,NpV)
        LL_test[run,3]<- (-mllk_univariate(parvect_univariate,y_1[(T_train+1):T_all],NpV))
        LL_all[run,3]<- (-mllk_univariate(parvect_univariate,y_1,NpV))
        mod2_univariate[[run]]<-mod2
        LL_train[run,4]<- (-mod2_univariate[[run]]$mllk)
        parvect_univariate<-n2w_univariate(mod2_univariate[[run]]$mu,mod2_univariate[[run]]$sigma,mod2_univariate[[run]]$gamma,NpV)
        LL_test[run,4]<- (-mllk_univariate(parvect_univariate,y_2[(T_train+1):T_all],NpV))
        LL_all[run,4]<- (-mllk_univariate(parvect_univariate,y_2,NpV))
        s1_univariate<-viterbi_univariate(y_1[(1+T_train):T_all],mod=mod1_univariate[[run]],NpV)
        s2_univariate<-viterbi_univariate(y_2[(1+T_train):T_all],mod=mod2_univariate[[run]],NpV)
        fds_test[run,3]<-sum(!s_1[(1+T_train):T_all]==s1_univariate)
        fds_test[run,4]<-sum(!s_2[(1+T_train):T_all]==s2_univariate)
        fds_test[run,5]<-sum(!s_1[(1+T_train):T_all]==s1_univariate | !s_2[(1+T_train):T_all]==s2_univariate)
        s1_univariate<-viterbi_univariate(y_1,mod=mod1_univariate[[run]],NpV)
        s2_univariate<-viterbi_univariate(y_2,mod=mod2_univariate[[run]],NpV)
        fds_all[run,3]<-sum(!s_1==s1_univariate)
        fds_all[run,4]<-sum(!s_2==s2_univariate)
        fds_all[run,5]<-sum(!s_1==s1_univariate | !s_2==s2_univariate)
        s1_univariate<-viterbi_univariate(y_1[1:T_train],mod=mod1_univariate[[run]],NpV)
        s2_univariate<-viterbi_univariate(y_2[1:T_train],mod=mod2_univariate[[run]],NpV)
        fds_train[run,3]<-sum(!s_1[1:T_train]==s1_univariate)
        fds_train[run,4]<-sum(!s_2[1:T_train]==s2_univariate)
        fds_train[run,5]<-sum(!s_1[1:T_train]==s1_univariate | !s_2[1:T_train]==s2_univariate)
        rm(mod1,mod2)
    } 
  }
}



##### 4) Estimation accuracy - plot estimated state-dependent distributions #####

# colours used
col<-c(rgb(176,048,096,maxColorValue=255),rgb(000,139,139,maxColorValue=255))
a<-0.2
colt<-c(rgb(176,048,096,maxColorValue=255,alpha=a*255),rgb(000,139,139,maxColorValue=255,alpha=a*255))

par(mfrow=c(3,2),mar=c(4,4,1,1),mgp=c(2.7,1,0))

# a) CHMM

# variable 1
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(1)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod_coupled[[i]]
  lines(z,dnorm(z,mod$mu[1],sd=mod$sigma[1]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[2],sd=mod$sigma[2]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu1[1],sd=sigma1[1]),type='l',lwd=3,ylim=c(0,0.4))
lines(z,dnorm(z,mu1[2],sd=sigma1[2]),type='l',lwd=3)

# variable 2
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(2)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod_coupled[[i]]
  lines(z,dnorm(z,mod$mu[3],sd=mod$sigma[3]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[4],sd=mod$sigma[4]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu2[1],sd=sigma2[1]),type='l',lwd=3)
lines(z,dnorm(z,mu2[2],sd=sigma2[2]),type='l',lwd=3)


# b) bivariate HMM

# variable 1
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(1)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod_bivariate[[i]]
  lines(z,dnorm(z,mod$mu[1],sd=mod$sigma[1]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[2],sd=mod$sigma[2]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu1[1],sd=sigma1[1]),type='l',lwd=3,ylim=c(0,0.4))
lines(z,dnorm(z,mu1[2],sd=sigma1[2]),type='l',lwd=3)

# variable 2
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(2)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod_bivariate[[i]]
  lines(z,dnorm(z,mod$mu[3],sd=mod$sigma[3]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[4],sd=mod$sigma[4]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu2[1],sd=sigma2[1]),type='l',lwd=3)
lines(z,dnorm(z,mu2[2],sd=sigma2[2]),type='l',lwd=3)


# c) univariate HMMs

# variable 1
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(1)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod1_univariate[[i]]
  lines(z,dnorm(z,mod$mu[1],sd=mod$sigma[1]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[2],sd=mod$sigma[2]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu1[1],sd=sigma1[1]),type='l',lwd=3,ylim=c(0,0.4))
lines(z,dnorm(z,mu1[2],sd=sigma1[2]),type='l',lwd=3)

# variable 2
z<-seq(-2,10,by=0.01)
plot(NA,bty='n',ylim=c(0,0.35),xlim=c(-2,10),main="",xlab=expression('y'^'(2)'),ylab='density',cex.axis=1.5,cex.lab=1.5)
for(i in 1:Runs){
  mod<-mod2_univariate[[i]]
  lines(z,dnorm(z,mod$mu[1],sd=mod$sigma[1]),lwd=3,col=colt[1])
  lines(z,dnorm(z,mod$mu[2],sd=mod$sigma[2]),lwd=3,col=colt[2])
}
lines(z,dnorm(z,mu2[1],sd=sigma2[1]),type='l',lwd=3)
lines(z,dnorm(z,mu2[2],sd=sigma2[2]),type='l',lwd=3)



##### 5) Classification performance - average percentage of falsely decoded states #####

# training data set
average_fsd_train<-round(apply(fds_train,2,mean)/T_train*100,1)

# test data set
average_fsd_test<-round(apply(fds_test,2,mean),1)

# table
average_fsd<-rbind(average_fsd_train,average_fsd_test)
colnames(average_fsd)<-c('CHMM', 'bi. HMM', 'uni. HMM v1 ','uni. HMM v2','uni. HMM combined')

average_fsd



##### 6) Forecasting performance - conditional log-likelihood of test set given training data #####

# calculation of conditional log-likelihood of test set given training set
cond_ll<-matrix(NA,Runs,3)
cond_ll[,1]<-LL_all[,1]-LL_train[,1] # CHMM
cond_ll[,2]<-LL_all[,2]-LL_train[,2] # bi. HMM
cond_ll[,3]<-LL_all[,3]-LL_train[,3]+LL_all[,4]-LL_train[,4] #ind. HMMs

# percentage of largest conditional log likelihood among the fitted models
max_cond_ll<-matrix(as.numeric(table(apply(cond_ll,1,which.max))/Runs*100),nrow=1)
colnames(max_cond_ll)<-c('CHMM', 'bi. HMM', 'uni. HMM')
max_cond_ll
