# Maximum approximate likelihood estimation of general continuous-time state-space models -- R code

## this file contains the R-Code for reproducing the simulation experiments shown in Section 3
## the code was tested in R (version 4.0.2) on Windows 10


## brief overview of the objects used throughout this script:

# m: number of intervals
# bm: range of state values
# theta.star: vector containing (unconstrained) initial values for the model parameters, 
  # i.e. theta & sigma (drift parameter & diffusion coefficient of the OU process) and alpha
# deltat: length of time intervals between consecutive observations
# match2array: matches the interval lengths to the Gamma array used in the likelihood evaluation (L_Pois)
# T: number of observations


# likelihood function
L_Pois <- function(theta.star, m, bm, obs, deltat, match2array){
  thetaOU <- exp(theta.star[1])
  sigmaOU <- exp(theta.star[2])
  alpha <- exp(theta.star[3])
  
  b <- seq(-bm, bm, length = m + 1) # specify boundaries of m intervals
  h <- b[2] - b[1] # h is the length of each interval
  bstar <- (b[-1] + b[-(m + 1)]) * 0.5 # midpoints of the m intervals
  
  delta <- dnorm(bstar, 0, sqrt(sigmaOU^2 / (2 * thetaOU))) * h # initial distribution = stationary distribution of OU process
  
  deltat <- unique(deltat)
  Gammas <- array(0, dim = c(m, m, length(deltat))) # transition probability matrices for different time intervals
  for (j in 1:length(deltat)) {
    Dt <- sort(deltat)[j]
    Gamma <- matrix(0, m, m)
    for (i in 1:m){
      Gamma[i,] <- diff(pnorm(b, exp(-thetaOU * Dt) * bstar[i],
                              sqrt((1 - exp(-2 * thetaOU * Dt)) * sigmaOU^2 / (2 * thetaOU)))) # conditional distribution of OU process
    }
    Gamma <- Gamma / rowSums(Gamma)
    Gammas[,,j] <- Gamma
  }
  
  # forward algorithm:
  lscale <- 0
  foo <- delta * dpois(obs[1], lambda = alpha * exp(bstar))
  lscale <- log(sum(foo))
  phi <- foo / sum(foo)
  
  for (t in 2:length(obs)){
    foo <- phi %*% Gammas[,,match2array[t]] * dpois(obs[t], lambda = alpha * exp(bstar))
    sumfoo <- sum(foo) 
    lscale <- lscale + log(sumfoo) 
    phi <- foo / sumfoo
  }
  return(-lscale)
}


# function to simulate data from a continous-time Poisson SSM
simCountData <- function(thetaOU, sigmaOU, alpha, deltat, n.obs){
  st <- count <- numeric(n.obs)
  lambda <- alpha * exp(st[1])
  count[1] <- rpois(1, lambda)
  for (t in 2:n.obs){
    st[t] <- rnorm(1, exp(-thetaOU * deltat[t]) * st[t - 1], 
                   sqrt((1 - exp(-2 * thetaOU * deltat[t])) * sigmaOU^2 / (2 * thetaOU))) # conditional distribution of OU process
    lambda <- alpha * exp(st[t])
    count[t] <- rpois(1, lambda)
  }
  df <- data.frame(time = cumsum(deltat), state = st, count = count,
                   theta = rep(thetaOU, n.obs), sigma = rep(sigmaOU, n.obs))
  return(df)
}



##### effect of number of intervals m on estimation accuracy #####

alpha <- 200
T <- 2000
set.seed(1)
deltat <- rpois(T, 30) / 24
summary(deltat)

sortDeltat <- sort(unique(deltat))
match2array <- rep(NA, T)
for (i in 1:T) {
  match2array[i] <- which(sortDeltat == deltat[i])
}

m <- c(20, 30, 50, 100, 150)

thetaOU <- c(0.02, 0.5, 2) 
sigmaOU <- c(0.1, 0.5, 1) 
sqrt(sigmaOU^2 / (2 * thetaOU)) # standard deviation of the stationary distribution of the OU process


### setting 1 ###

theta <- thetaOU[1]
sigma <- sigmaOU[1]

# visualize stationary distribution
x <- seq(-5, 5, by = 0.01)
plot(x, dnorm(x, 0, sqrt(sigma^2 / (2 * theta)) ), type="l")

set.seed(1)
obs <- simCountData(theta, sigma, alpha, deltat, T)

theta.star <- c(log(0.05), log(0.2), log(250)) # thetaOU, sigmaOU, alpha

mods <- list()
for (i in 1:length(m)) {
  startzeit <- Sys.time()
  mods[[i]] <- nlm(L_Pois, theta.star, m = m[i], bm = 2.5, obs$count, deltat, match2array, print.level = 1, iterlim = 1000, hessian = FALSE)
  mods[[i]]$duration <- Sys.time() - startzeit
  print(mods[[i]]$duration)
}


### setting 2 ###

theta <- thetaOU[2]
sigma <- sigmaOU[2]

set.seed(1)
obs <- simCountData(theta, sigma, alpha, deltat, T)


theta.star <- c(log(0.3), log(0.7), log(250))

mods2 <- list()
for (i in 1:length(m)) {
  startzeit <- Sys.time()
  mods2[[i]] <- nlm(L_Pois, theta.star, m = m[i], bm = 2.5, obs$count, deltat, match2array, print.level = 1, iterlim = 1000, hessian = FALSE)
  mods2[[i]]$duration <- Sys.time() - startzeit
  print(mods2[[i]]$duration)
}


### setting 3 ### 

theta <- thetaOU[3]
sigma <- sigmaOU[3]

set.seed(1)
obs <- simCountData(theta, sigma, alpha, deltat, T)

theta.star <- c(log(1.5), log(1.5), log(250))

mods3 <- list()
for (i in 1:length(m)) {
  startzeit <- Sys.time()
  mods3[[i]] <- nlm(L_Pois, theta.star, m = m[i], bm = 2.5, obs$count, deltat, match2array, print.level = 1, iterlim = 1000, hessian = FALSE)
  mods3[[i]]$duration <- Sys.time() - startzeit
  print(mods3[[i]]$duration)
}



### estimation results for all settings ###

results <- data.frame(setting = rep(c(1, 2, 3), each = 5), m = rep(m, 3), 
                      theta = 0, sigma = 0, alpha = 0, time = 0, llk = 0)
for (i in 1:3) {
  if(i == 1){fits <- mods}
  if(i == 2){fits <- mods2}
  if(i == 3){fits <- mods3}
  for(j in 1:length(m)){
    results[j + (5 * (i - 1)), 3] <- exp(fits[[j]]$estimate[1])
    results[j + (5 * (i - 1)), 4] <- exp(fits[[j]]$estimate[2])
    results[j + (5 * (i - 1)), 5] <- exp(fits[[j]]$estimate[3])
    results[j + (5 * (i - 1)), 6] <- fits[[j]]$duration
    results[j + (5 * (i - 1)), 7] <- fits[[j]]$minimum
  }
}
results





##### empirical check of estimators' consistency #####

alpha <- 200
theta <- 0.5
sigma <- 0.5

theta.star <- c(log(0.3), log(0.7), log(250)) # thetaOU, sigmaOU, alpha

bm <- 2.5
m <- 100

T <- c(2000, 5000, 10000)


##################### WARNING: the following code runs for a few hours ####################################

for (j in 1:length(T)) {
  set.seed(1)
  deltat <- rpois(T[j], 30) / 24
  
  sortDeltat <- sort(unique(deltat))
  match2array <- rep(NA, T[j])
  for (i in 1:T[j]) {
    match2array[i] <- which(sortDeltat == deltat[i])
  }
  
  mods <- list()
  for (k in 1:200) {
    obs <- simCountData(theta, sigma, alpha, deltat, T[j])
    startzeit <- Sys.time()
    mods[[k]] <- nlm(L_Pois, theta.star, m = m, bm = bm, obs$count, deltat, match2array, 
                     print.level = 1, iterlim = 1000, hessian = FALSE)
    mods[[k]]$duration <- Sys.time() - startzeit
  }
  
  assign(paste0("mods", T[j]), mods)
}
