--- title: "dfaTMB: Dynamic Factor Analysis" author: "Eli Holmes" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: yes vignette: > %\VignetteIndexEntry{dfaTMB: Dynamic Factor Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- \newcommand{\QQ}{\mathbf{Q}} \newcommand{\RR}{\mathbf{R}} \newcommand{\xx}{\mathbf{x}} ```{r setup, include = FALSE} # knitr options knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) set.seed(1234) ``` *`dfaTMB()` will be replaced with `MARSS(..., method="TMB")`* The `dfaTMB()` function allows you to fit DFAs with the same form as `MARSS(x, form="dfa")`. This has a diagonal $\QQ$ with 1 on the diagonal and a stochastic $\xx_1$ with mean 0 and variance of 5 (diagonal variance-covariance matrix). There are only 3 options allowed for $\RR$: diagonal and equal, diagonal and unequal, and unconstrained. Example data ```{r} library(MARSS) data(lakeWAplankton, package = "MARSS") phytoplankton <- c("Cryptomonas", "Diatoms", "Greens", "Unicells", "Other.algae") dat <- as.data.frame(lakeWAplanktonTrans) |> subset(Year >= 1980 & Year <= 1989) |> subset(select=phytoplankton) |> t() |> MARSS::zscore() ``` Fit with MARSS ```{r} m1.em <- MARSS(dat, model=list(R='unconstrained', m=1, tinitx=1), form='dfa', z.score=FALSE, silent = TRUE) ``` Fit with TMB. Note the syntax will be updated to match MARSS(). ```{r} library(marssTMB) m1.tmb <- dfaTMB(dat, model=list(m=1, R='unconstrained')) ``` Compare parameter estimates ```{r} library(tidyr) pars <- data.frame(name = c(paste0("R", rownames(coefficients(m1.em)$R)), paste0("Z", rownames(coefficients(m1.em)$Z))), EM = c(coefficients(m1.em)$R, coefficients(m1.em)$Z), TMB = c(as.vector(m1.tmb$Estimates$R[lower.tri(m1.tmb$Estimates$R,diag=TRUE)]), as.vector(m1.tmb$Estimates$Z))) pars <- pars %>% tidyr::pivot_longer(2:3, names_to = "model") ``` ```{r} library(ggplot2) dodge <- position_dodge(width=0.5) ggplot(pars, aes(x=name, y=value, col=model)) + geom_point(position=dodge) + ggtitle("same estimates") ``` Compare two states models. ```{r} m1.bfgs <- MARSS(dat, model=list(R='unconstrained', m=2, tinitx=1), form='dfa', z.score=FALSE, silent = TRUE, method="BFGS") m1.tmb <- dfaTMB(dat, model=list(m=2, R='unconstrained')) c(m1.bfgs$logLik, m1.tmb$logLik) ``` ```{r echo=FALSE} ZmatGen <- function(n, m) { tZ <- matrix(0.5, nrow = n, ncol = m) if (m > 1) { for (i in 1:(m - 1)) { tZ[i, (m - (m - 2) + (i - 1)):m] <- rep(0, m - 1 - (i - 1)) } return(tZ) } else { return(tZ) } } pars <- data.frame(name = c(paste0("R", rownames(coefficients(m1.bfgs)$R)), paste0("Z", rownames(coefficients(m1.bfgs)$Z))), EM = c(coefficients(m1.bfgs)$R, coefficients(m1.bfgs)$Z), TMB = c(as.vector(m1.tmb$Estimates$R[lower.tri(m1.tmb$Estimates$R,diag=TRUE)]), as.vector(m1.tmb$Estimates$Z[ZmatGen(nrow(dat), 2)!=0]))) pars <- pars %>% tidyr::pivot_longer(2:3, names_to = "model") dodge <- position_dodge(width=0.5) ggplot(pars, aes(x=name, y=value, col=model)) + geom_point(position=dodge) + ggtitle("same estimates") ``` Compare two states models with R diagonal and equal ```{r} mod.list <- list(R='diagonal and equal', m=2, tinitx=1) m1.bfgs <- MARSS(dat, model=mod.list, form='dfa', z.score=FALSE, silent = TRUE, method="BFGS") m1.tmb <- dfaTMB(dat, model=mod.list) c(m1.bfgs$logLik, m1.tmb$logLik) ``` ## Include a comparison with covariates ```{r} data(lakeWAplankton, package = "MARSS") phytoplankton <- c("Cryptomonas", "Diatoms", "Greens", "Unicells", "Other.algae") dat <- as.data.frame(lakeWAplanktonTrans) |> subset(Year >= 1980 & Year <= 1989) |> subset(select=phytoplankton) |> t() |> MARSS::zscore() # add a temperature covariate temp <- as.data.frame(lakeWAplanktonTrans) |> subset(Year >= 1980 & Year <= 1989) |> subset(select=Temp) covar <- t(temp) m_cov_tmb <- dfaTMB(dat, model=list(m=1, R='diagonal and unequal'), EstCovar = TRUE, Covars = covar) m_cov_tmb$Estimates$D # add a 2nd covariate TP <- as.data.frame(lakeWAplanktonTrans) |> subset(Year >= 1980 & Year <= 1989) |> subset(select=TP) covar <- rbind(covar, t(TP)) m_cov2_tmb <- dfaTMB(dat, model=list(m=1, R='diagonal and unequal'), EstCovar = TRUE, Covars = covar) m_cov2_tmb$Estimates$D ``` ## Look at a bigger data set Compare three states models with R diagonal and unequal and 12 time series. The EM algorithm and TMB algorithms struggles to converge with DFA models. This is specific to this set of time series. ```{r} dat2 <- rbind(dat, dat+matrix(rnorm(nrow(dat)*ncol(dat),0,0.2), nrow(dat), ncol(dat))) dat2 <- MARSS::zscore(dat2) mod.list <- list(R='diagonal and unequal', m=3, tinitx=1) # Control set to match setting in MARSS.tmp() t1.bfgs <- system.time(m1.bfgs <- MARSS(dat2, model=mod.list, form='dfa', z.score=FALSE, silent = TRUE, method="BFGS", control = list(reltol = 1e-12, maxit=2000)))[1] t1.em <- system.time(m1.em <- MARSS(dat2, model=mod.list, form='dfa', z.score=FALSE, silent = TRUE))[1] t1.tmb <- system.time(m1.tmb <- dfaTMB(dat2, model=mod.list))[1] t1.tmb2 <- system.time(m1.tmb2 <- dfaTMB(dat2, model=mod.list, fun.opt="optim"))[1] ``` TMB + nlminb() is much faster. But at the default convergence settings, the Kalman-Filter + BFGS ran longer and got to a higher maximum likelihood. This seems to be an issue with TMB for both the optimizers. Also the speed savings seems to be due to nlminb() not TMB per se. ```{r} # Log-likelihood c(em=m1.em$logLik, kfoptim=m1.bfgs$logLik, TMBnlminb=m1.tmb$logLik, TMBoptim=m1.tmb2$logLik) # Time c(em=t1.em, kfoptim=t1.bfgs, TMBnlminb=t1.tmb, TMBoptim=t1.tmb2) ``` It might be an odd initial condition issue as if BFGS is started with EM, it does not find the lower log-likelihood but ends up at the higher value that EM and TMB found. Note, this varies a bit. This is for `set.seed(1234)`. ```{r} m1 <- MARSS(dat2, model=mod.list, form='dfa', z.score=FALSE, silent = TRUE, control = list(minit=1, maxit=10)) t2.bfgs <- system.time(m2.bfgs <- MARSS(dat2, model=mod.list, form='dfa', z.score=FALSE, silent = TRUE, method="BFGS", inits = m1))[1] c(t2.bfgs, m2.bfgs$logLik) ``` ```{r} df <- data.frame(t=1:37, bfgs=tidy(m1.bfgs)$estimate, em=tidy(m1.em)$estimate) |> pivot_longer(2:3) ggplot(df, aes(x=t, y=value, col=name)) + geom_point() ```