# Maximum approximate likelihood estimation of general continuous-time state-space models -- R code

## this file contains the R code for reproducing the results of the case study presented in Section 4
## the code was tested in R (version 4.0.2) on Windows 10 with packages...

# ...dplyr (version 1.0.2)
# ...splines (version 4.0.2)
# ...gamlss (version 5.2.0)
# ...gamlss.mx (version 6.0.0)
# ...expm (version 0.999.5)

library(dplyr)
library(splines)
library(gamlss)
library(gamlss.mx)
library(expm)


data <- read.csv("DataStatModelling.csv")
head(data, n = 10)


### compute Bx, the B-spline base matrix (= design matrix) ###

nb <- 8                # number of basic B-splines
ord <- 4               # spline order (order 4 = cubic, degree 3)
degree <- ord - 1
nrknots <- nb - (ord - 2)
knots <- seq(7, 35, length = nrknots + 2 * degree) # knot sequence spanning ages 7-35

# one design matrix per individual, evaluated at that individual's observed ages;
# Bx is indexed in the same order as unique(data$ID)
ids <- unique(data$ID)
Bx <- vector("list", length(ids)) # preallocate (was grown element-wise)
for (i in seq_along(ids)) {
  x <- data$age[which(data$ID == ids[i])]
  Bx[[i]] <- splineDesign(knots, x, ord, outer.ok = TRUE)
}



# likelihood function of the continuous-time SSM
#
# The OU state process is approximated on a fine grid of m intervals
# (essentially an HMM with m states); the likelihood is evaluated with the
# scaled forward algorithm.
#
# theta.star: vector containing (unconstrained) initial values for the model parameters:
#   [1] log thetaOU (drift), [2] log sigmaOU (diffusion),
#   [3] log size (neg. binomial dispersion), [4:] spline coefficients (2 gender columns)
# m:  number of intervals used to discretise the state space
# bm: range of state values (state space discretised on [-bm, bm])
# obs: data frame with columns ID, inc, gender (coded 0/1), timeDiff (values in 1..4)
# Bx: list of B-spline design matrices, one per individual (same order as unique(obs$ID))
#
# returns: negative log-likelihood (scalar), to be minimised by nlm()
L_nBi <- function(theta.star, m, bm, obs, Bx){
  thetaOU <- exp(theta.star[1])
  sigmaOU <- exp(theta.star[2])
  size <- exp(theta.star[3])
  phis <- matrix(theta.star[4:length(theta.star)], ncol = 2, byrow = FALSE)

  b <- seq(-bm, bm, length = m + 1) # specify boundaries of m intervals
  h <- b[2] - b[1] # h is the length of each interval
  bstar <- (b[-1] + b[-(m + 1)]) * 0.5 # midpoints of the m intervals

  # initial distribution = stationary distribution of the OU process
  delta <- dnorm(bstar, 0, sqrt(sigmaOU^2 / (2 * thetaOU))) * h

  deltat <- 1:4 # possible time lags between consecutive observations
  Gammas <- array(0, dim = c(m, m, length(deltat))) # one t.p.m. per time lag
  for (j in seq_along(deltat)) {
    Dt <- deltat[j]
    Gamma <- matrix(0, m, m)
    for (i in seq_len(m)) {
      # conditional (transition) distribution of the OU process after time Dt
      Gamma[i, ] <- diff(pnorm(b, exp(-thetaOU * Dt) * bstar[i],
                               sqrt((1 - exp(-2 * thetaOU * Dt)) * sigmaOU^2 / (2 * thetaOU))))
    }
    Gamma <- Gamma / rowSums(Gamma) # renormalise: mass outside [-bm, bm] is truncated
    Gammas[, , j] <- Gamma
  }

  ids <- unique(obs$ID) # hoisted: was recomputed for every individual
  l.all <- 0
  for (i in seq_along(ids)) {
    obsID <- obs[which(obs$ID == ids[i]), ]
    Bmatrix <- Bx[[i]]

    # state-dependent neg. binomial densities for every observation x state
    allprobs <- matrix(1, ncol = length(bstar), nrow = nrow(obsID))
    for (t in seq_len(nrow(obsID))) {
      allprobs[t, ] <- dnbinom(obsID$inc[t], size = size,
                               mu = exp(bstar + as.vector(Bmatrix[t, ] %*% phis[, obsID$gender[t] + 1])) )
    }

    # forward algorithm (scaled to avoid numerical underflow):
    foo <- delta * allprobs[1, ]
    lscale <- log(sum(foo))
    phi <- foo / sum(foo)
    if (nrow(obsID) > 1) { # guard: 2:n would run backwards for single-observation individuals
      for (t in 2:nrow(obsID)) {
        foo <- phi %*% Gammas[, , obsID$timeDiff[t]] * allprobs[t, ]
        sumfoo <- sum(foo)
        lscale <- lscale + log(sumfoo)
        phi <- foo / sumfoo
      }
    }

    l.all <- l.all + lscale
  }
  return(-l.all)
}



######################################## PLEASE NOTE: ######################################################
######### results may differ from the paper as the original data cannot be made publicly available ########
######### the provided data set has been simulated based on the fitted model ##############################



##### fit continuous-time SSM #####

# initial values: thetaOU, sigmaOU, size (= dispersion parameter), omegas (spline coefficients)
theta.star <- c(log(0.3), log(2), log(0.8), rep(0, 2 * nb))

# all arguments forwarded to L_nBi are named: positional matching through
# nlm's `...` is fragile if the likelihood's argument order changes
mod <- nlm(L_nBi, theta.star, m = 100, bm = 9, obs = data, Bx = Bx,
           print.level = 1, iterlim = 10000, hessian = TRUE)

# back-transform: theta (drift parameter) & sigma (diffusion coefficient) of the OU process,
# and dispersion parameter of the neg. binomial distribution
# (was [1:2], which dropped the dispersion parameter the comment promised)
exp(mod$estimate[1:3])



##### Benchmark models: GAM and GAMM with random intercept term #####

# data preparation: select columns by position
# NOTE(review): c(1:2, 4:5) assumes a fixed column layout of DataStatModelling.csv
# (presumably ID, age, gender, inc) -- verify against the actual file header
obs <- data[, c(1:2, 4:5)]
head(obs)
obs$gender <- as.factor(obs$gender)
obs$ID <- as.factor(obs$ID)

# GAM: neg. binomial response (NBI), penalised spline of age varying by gender
mod.gam <- gamlss(inc ~ pvc(age, by = gender), data = obs, family = NBI)
# GAMM: same fixed effects plus a random intercept per individual,
# fitted via Gaussian quadrature ("gq") with K = 10 mass points (gamlss.mx)
mod.gamm <- gamlssNP(inc ~ pvc(age, by = gender), K = 10, random =~ 1|ID, mixture = "gq",
                     data = obs, family = NBI)





##### state decoding using the Viterbi algorithm #####

# global state decoding for the fitted continuous-time SSM
#
# mod: nlm() fit for L_nBi (mod$estimate in the same parameter order)
# obs, Bx: as in L_nBi
# m, bm: discretisation of the state space (may be finer than used in fitting)
#
# returns a data.frame with one row per observation:
#   states: decoded state index (1..m)
#   bstar:  state-space midpoint of the decoded state
#   eta:    fixed-effect (spline) linear predictor
viterbi <- function(mod, obs, m, bm, Bx){
  thetaOU <- exp(mod$estimate[1])
  sigmaOU <- exp(mod$estimate[2])
  size <- exp(mod$estimate[3])
  phis <- matrix(mod$estimate[4:length(mod$estimate)], ncol = 2, byrow = FALSE)

  b <- seq(-bm, bm, length = m + 1)
  h <- b[2] - b[1]
  bstar <- (b[-1] + b[-(m + 1)]) * 0.5

  # initial distribution = stationary distribution of the OU process
  delta <- dnorm(bstar, 0, sqrt(sigmaOU^2 / (2 * thetaOU))) * h

  deltat <- 1:4
  Gammas <- array(0, dim = c(m, m, length(deltat)))
  for (j in seq_along(deltat)) {
    Dt <- deltat[j]
    Gamma <- matrix(0, m, m)
    for (i in seq_len(m)) {
      Gamma[i, ] <- diff(pnorm(b, exp(-thetaOU * Dt) * bstar[i],
                               sqrt((1 - exp(-2 * thetaOU * Dt)) * sigmaOU^2 / (2 * thetaOU))))
    }
    Gamma <- Gamma / rowSums(Gamma)
    Gammas[, , j] <- Gamma
  }

  ids <- unique(obs$ID)
  # collect per-individual results in preallocated lists (was grown with c())
  statesList <- etaList <- vector("list", length(ids))
  for (i in seq_along(ids)) {
    obsID <- obs[which(obs$ID == ids[i]), ]
    Bmatrix <- Bx[[i]]
    n <- nrow(obsID)

    allprobs <- matrix(1, ncol = length(bstar), nrow = n)
    eta <- rep(NA_real_, n)
    for (t in seq_len(n)) {
      eta[t] <- as.vector(Bmatrix[t, ] %*% phis[, obsID$gender[t] + 1])
      allprobs[t, ] <- dnbinom(obsID$inc[t], size = size, mu = exp(bstar + eta[t]) )
    }

    # forward pass: xi[t, j] = scaled prob. of the most likely path ending in state j
    xi <- matrix(0, nrow = n, ncol = m)
    foo <- delta * allprobs[1, ]
    xi[1, ] <- foo / sum(foo)
    if (n > 1) { # guard: 2:n would run backwards for single-observation individuals
      for (t in 2:n) {
        foo <- apply(xi[t - 1, ] * Gammas[, , obsID$timeDiff[t]], 2, max) * allprobs[t, ]
        xi[t, ] <- foo / sum(foo)
      }
    }
    # backtracking from the most likely final state
    iv <- numeric(n)
    iv[n] <- which.max(xi[n, ])
    if (n > 1) { # guard: (n - 1):1 would run forwards (0:1) when n == 1
      for (t in (n - 1):1) {
        iv[t] <- which.max(Gammas[, , obsID$timeDiff[t + 1]][, iv[t + 1]] * xi[t, ])
      }
    }

    statesList[[i]] <- iv
    etaList[[i]] <- eta
  }
  states <- unlist(statesList)
  df <- data.frame(states = states, bstar = bstar[states], eta = unlist(etaList))
  return(df)
}


# decode the most likely state sequence; uses a finer state-space grid
# (m = 500) than the m = 100 used when fitting the model
vit <- viterbi(mod, obs = data, m = 500, bm = 9, Bx)




##### continuous-time HMMs #####

# likelihood function of the continuous-time HMM benchmark (m discrete states)
#
# theta.star layout:
#   [1] log size (neg. binomial dispersion), [2:(m + 1)] state-dependent means,
#   next 2 * ncol(Bx[[1]]) entries: spline coefficients (2 gender columns),
#   remaining m * (m - 1) entries: log off-diagonal transition intensities
# m:  number of HMM states
# obs, Bx: as in L_nBi
#
# returns: negative log-likelihood (scalar), to be minimised by nlm()
L_ctHMM <- function(theta.star, m, obs, Bx){
  nbs <- ncol(Bx[[1]]) * 2
  size <- exp(theta.star[1])
  mu_state <- theta.star[1 + 1:m]
  phis <- matrix(theta.star[(m + 1) + 1:nbs], ncol = 2, byrow = FALSE)
  qs <- exp(theta.star[(nbs + m + 2):length(theta.star)])

  # generator (transition intensity) matrix Q: fill the off-diagonal entries
  # column-wise with qs, then set the diagonal to minus the row sums
  Q <- diag(m)
  Q[!Q] <- qs
  diag(Q) <- 0
  diag(Q) <- -apply(Q, 1, sum)
  deltat <- 1:4 # possible time lags between consecutive observations (was a hard-coded 4)
  Qube <- array(0, dim = c(m, m, length(deltat)))
  for (i in seq_along(deltat)) {
    Qube[, , i] <- expm(Q * deltat[i]) # transition probability matrix over lag deltat[i]
  }

  # initial distribution: approximate the stationary distribution by evolving
  # the chain for a long time (any row of expm(Q * t) for large t)
  delta <- expm(Q * 1000)[1, ]
  #delta <- solve(t(1 - Q), rep(1, m))

  ids <- unique(obs$ID) # hoisted: was recomputed for every individual
  l.all <- 0
  for (i in seq_along(ids)) {
    obsID <- obs[which(obs$ID == ids[i]), ]
    Bmatrix <- Bx[[i]]

    # state-dependent neg. binomial densities for every observation x state
    allprobs <- matrix(1, ncol = m, nrow = nrow(obsID))
    for (t in seq_len(nrow(obsID))) {
      allprobs[t, ] <- dnbinom(obsID$inc[t], size = size, mu = exp(mu_state +
                               as.vector(Bmatrix[t, ] %*% phis[, obsID$gender[t] + 1])) )
    }

    # forward algorithm (scaled to avoid numerical underflow):
    foo <- delta * allprobs[1, ]
    lscale <- log(sum(foo))
    phi <- foo / sum(foo)

    if (nrow(obsID) > 1) { # guard: 2:n would run backwards for single-observation individuals
      for (t in 2:nrow(obsID)) {
        foo <- phi %*% Qube[, , obsID$timeDiff[t]] * allprobs[t, ]
        sumfoo <- sum(foo)
        lscale <- lscale + log(sumfoo)
        phi <- foo / sumfoo
      }
    }

    l.all <- l.all + lscale
  }

  return(-l.all)
}


### fit continuous-time HMM: example code for 3 states ###

m <- 3                  # number of states
qs <- rep(0.05, 6)      # initial off-diagonal transition intensities (m * (m - 1) = 6)
mu_state <- c(2, 0, -2) # initial state-dependent means

# size, mu_state, omegas (spline coefficients), log transition intensities
theta.star <- c(log(0.8), mu_state, rep(0, 2 * nb), log(qs))

# all arguments forwarded to L_ctHMM are named: positional matching through
# nlm's `...` is fragile if the likelihood's argument order changes
mod.HMM <- nlm(L_ctHMM, theta.star, m = m, obs = data, Bx = Bx,
               print.level = 2, iterlim = 10000, hessian = TRUE)

