diff --git a/.Rbuildignore b/.Rbuildignore index 587f5c2..2fc4eb5 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,5 @@ ^CLAUDE\.md$ ^inst/validation/snapshots$ ^\.github$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index 050561d..d061d71 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ Rplots.pdf vignettes/*.html vignettes/*.R inst/validation/snapshots/*.rds +.positai diff --git a/DESCRIPTION b/DESCRIPTION index 0b75779..ea56a6f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,6 +18,7 @@ Suggests: ARTnetData, knitr, rmarkdown, + survival, testthat (>= 3.0.0) VignetteBuilder: knitr RoxygenNote: 7.3.3 diff --git a/R/NetParams.R b/R/NetParams.R index 1fddf02..df507f5 100644 --- a/R/NetParams.R +++ b/R/NetParams.R @@ -1,3 +1,74 @@ +# Joint log-linear lm on log(duration.time) for ongoing partnerships. +# Same length-bias convention as the "empirical" default (uses only +# ongoing, relies on the memoryless property to read stratum-specific +# predicted durations as the median full-duration quantity that feeds +# the geometric transformation in build_netstats). Returns the fitted +# model plus a vector of per-row predicted durations on the training +# set so callers can stratify however they like. +fit_joint_lm_dur <- function(l_layer, race, geog.lvl) { + ongoing <- l_layer[l_layer$ongoing2 == 1 & + !is.na(l_layer$duration.time) & + l_layer$duration.time > 0, , drop = FALSE] + if (nrow(ongoing) < 20) return(NULL) + terms <- c("index.age.grp", "sqrt(index.age.grp)", "hiv2", "same.age.grp") + if (isTRUE(race)) { + terms <- c(terms, "as.factor(race.cat.num)", "same.race") + } + if (!is.null(geog.lvl)) { + terms <- c("geogYN", terms) + } + fml <- reformulate(terms, response = "log(duration.time)") + fit <- tryCatch( + suppressWarnings(lm(fml, data = ongoing)), + error = function(e) NULL + ) + if (is.null(fit)) return(NULL) + fitted_dur <- exp(predict(fit, newdata = ongoing)) + list(model = fit, ongoing = ongoing, fitted_dur = fitted_dur) +} + + +# Given a partnership-level layer and a non-default duration.method, +# return a list(homog, byage, joint_duration_model) of duration +# summaries matching the shape of the empirical output (so that the +# geometric transformation, smoothing, and sex.cess.mod logic in the +# calling layer block apply uniformly). `byage_strata` is the vector +# of stratum keys expected in the output (first = 0 for non-matched, +# subsequent = index.age.grp values for matched-within-age-group). +compute_alt_durs <- function(l_layer, duration.method, byage_strata, + race, geog.lvl) { + stopifnot(duration.method == "joint_lm") + + res <- fit_joint_lm_dur(l_layer, race = race, geog.lvl = geog.lvl) + if (is.null(res)) { + return(list(homog = data.frame(mean.dur = NA_real_, median.dur = NA_real_), + byage = data.frame(index.age.grp = byage_strata, + mean.dur = NA_real_, median.dur = NA_real_), + model = NULL)) + } + # Stratum medians from predicted durations on the training set. + byage <- do.call(rbind, lapply(byage_strata, function(k) { + sub <- if (k == 0) { + res$ongoing[res$ongoing$same.age.grp == 0, , drop = FALSE] + } else { + res$ongoing[res$ongoing$same.age.grp == 1 & + res$ongoing$index.age.grp == k, , drop = FALSE] + } + if (nrow(sub) == 0) { + data.frame(index.age.grp = k, mean.dur = NA_real_, median.dur = NA_real_) + } else { + pred_sub <- exp(predict(res$model, newdata = sub)) + data.frame(index.age.grp = k, + mean.dur = mean(pred_sub, na.rm = TRUE), + median.dur = median(pred_sub, na.rm = TRUE)) + } + })) + homog <- data.frame(mean.dur = mean(res$fitted_dur, na.rm = TRUE), + median.dur = median(res$fitted_dur, na.rm = TRUE)) + list(homog = homog, byage = byage, model = res$model) +} + + # Fit a joint GLM predicting `response` from all available individual # attributes (age, race, the concurrent-layer degree, HIV status, # geography). Candidate interactions are considered one at a time and @@ -56,6 +127,37 @@ fit_joint_glm <- function(d, response, main_terms, #' oldest and second oldest age groups. #' @param oo.nquants Number of quantiles to split the one-off partnership risk distribution (count #' of one-off partners per unit time). +#' @param duration.method Character. Controls how partnership durations for dissolution-coef +#' estimation are computed. One of: +#' \itemize{ +#' \item `"empirical"` (default) — mean / median of observed durations among ongoing +#' partnerships, stratified by (age-match x index.age.grp). Relies on the +#' geometric / memoryless assumption that median elapsed time in ongoing +#' partnerships equals median full-partnership duration. Byte-identical to the +#' pre-refactor behavior. +#' \item `"joint_lm"` — log-linear `lm(log(duration.time) ~ )` on ongoing partnerships; per-stratum medians computed from model +#' predictions. The covariate-adjusted version of `"empirical"`: targets the +#' same cross-sectional age-of-extant-ties quantity (via the same geometric +#' transformation), but corrects for confounding of the duration estimate by +#' other attributes — the direct analog of the marginal-vs-joint fix that +#' `method = "joint"` applies to formation stats. Fitted model is stored at +#' `netparams$$joint_duration_model`; see issue #73 for the follow-up +#' to predict per-dyad on the synthetic target population in `build_netstats`. +#' } +#' Both methods flow through the same geometric transformation +#' (`mean.dur.adj = 1 / (1 - 2^(-1 / median))`) that `build_netstats` passes to +#' `dissolution_coefs()`, so the TERGM offset structure is identical across +#' methods. The TERGM dissolution is geometric (constant hazard per timestep), so +#' under the inspection-paradox identity the `duration` parameter it consumes +#' simultaneously determines the simulated mean partnership duration and the +#' simulated mean age of extant ties at cross-section; both methods target the +#' observable latter quantity. A prior `"weibull_strat"` option targeting the +#' underlying mean full-partnership duration was explored and removed: its +#' output can't be faithfully honored by a geometric TERGM without a non-standard +#' age-dependent dissolution term, making it a diagnostic rather than a +#' production target. The one-off layer has no durations, so this flag only +#' affects main and casual-layer output. #' @param method Character. Either `"existing"` (default) or `"joint"`. `"existing"` reproduces #' the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear #' fit for each ERGM target statistic. `"joint"` leaves all of those outputs intact **and** @@ -115,9 +217,11 @@ fit_joint_glm <- function(d, response, main_terms, build_netparams <- function(epistats, smooth.main.dur = FALSE, oo.nquants = 5, + duration.method = c("empirical", "joint_lm"), method = c("existing", "joint"), browser = FALSE) { method <- match.arg(method) + duration.method <- match.arg(duration.method) # Ensures that ARTnetData is installed if (system.file(package = "ARTnetData") == "") stop(missing_data_msg) @@ -592,6 +696,49 @@ build_netparams <- function(epistats, durs.main.all <- durs.main.all[, c(3, 1, 2, 4, 5)] out$main$durs.main.byage <- durs.main.all + + ## duration.method override (see #63 phase 3) ---- + # Replace the empirical stratum medians with model-based ones and + # re-apply the geometric transformation. Under joint_lm the predicted + # stratum medians flow through the same `mean.dur.adj = + # 1/(1 - 2^(-1/median))` formula empirical uses — the only difference + # is that the medians are covariate-adjusted rather than marginal. + if (duration.method == "joint_lm") { + alt <- compute_alt_durs( + l_layer = lmain[lmain$RAI == 1 | lmain$IAI == 1, , drop = FALSE], + duration.method = duration.method, + byage_strata = out$main$durs.main.byage$index.age.grp, + race = race, geog.lvl = geog.lvl + ) + # homog + if (!is.na(alt$homog$median.dur)) { + out$main$durs.main.homog$mean.dur <- alt$homog$mean.dur + out$main$durs.main.homog$median.dur <- alt$homog$median.dur + out$main$durs.main.homog$rates.main.adj <- + 1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur)) + out$main$durs.main.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$main$durs.main.homog$median.dur))) + } + # byage: per-stratum replacement, with empirical fallback if a stratum + # fit failed (keeps the row populated so dissolution_coefs doesn't NA). + for (i in seq_len(nrow(out$main$durs.main.byage))) { + k <- out$main$durs.main.byage$index.age.grp[i] + j <- which(alt$byage$index.age.grp == k) + if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { + out$main$durs.main.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$main$durs.main.byage$median.dur[i] <- alt$byage$median.dur[j] + out$main$durs.main.byage$rates.main.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$main$durs.main.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } + } + if (!is.null(alt$model)) { + out$main$joint_duration_model <- alt$model + } + } + + if (smooth.main.dur == TRUE) { n2 <- nrow(durs.main.all) n1 <- n2 - 1 @@ -984,6 +1131,41 @@ build_netparams <- function(epistats, durs.casl.all <- durs.casl.all[, c(3, 1, 2, 4, 5)] out$casl$durs.casl.byage <- durs.casl.all + + ## duration.method override (see #63 phase 3) ---- + if (duration.method == "joint_lm") { + alt <- compute_alt_durs( + l_layer = lcasl[lcasl$RAI == 1 | lcasl$IAI == 1, , drop = FALSE], + duration.method = duration.method, + byage_strata = out$casl$durs.casl.byage$index.age.grp, + race = race, geog.lvl = geog.lvl + ) + if (!is.na(alt$homog$median.dur)) { + out$casl$durs.casl.homog$mean.dur <- alt$homog$mean.dur + out$casl$durs.casl.homog$median.dur <- alt$homog$median.dur + out$casl$durs.casl.homog$rates.casl.adj <- + 1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur)) + out$casl$durs.casl.homog$mean.dur.adj <- + 1 / (1 - 2^(-1 / (wt * out$casl$durs.casl.homog$median.dur))) + } + for (i in seq_len(nrow(out$casl$durs.casl.byage))) { + k <- out$casl$durs.casl.byage$index.age.grp[i] + j <- which(alt$byage$index.age.grp == k) + if (length(j) == 1 && !is.na(alt$byage$median.dur[j])) { + out$casl$durs.casl.byage$mean.dur[i] <- alt$byage$mean.dur[j] + out$casl$durs.casl.byage$median.dur[i] <- alt$byage$median.dur[j] + out$casl$durs.casl.byage$rates.casl.adj[i] <- + 1 - 2^(-1 / (wt * alt$byage$median.dur[j])) + out$casl$durs.casl.byage$mean.dur.adj[i] <- + 1 / (1 - 2^(-1 / (wt * alt$byage$median.dur[j]))) + } + } + if (!is.null(alt$model)) { + out$casl$joint_duration_model <- alt$model + } + } + + # If sexual cessation model, then set diss coef for age grp above boundary to 1 if (sex.cess.mod == TRUE) { index.age.grp <- max(out$casl$durs.casl.byage$index.age.grp) + 1 diff --git a/man/build_netparams.Rd b/man/build_netparams.Rd index 85a2f6d..14cc261 100644 --- a/man/build_netparams.Rd +++ b/man/build_netparams.Rd @@ -8,6 +8,7 @@ build_netparams( epistats, smooth.main.dur = FALSE, oo.nquants = 5, + duration.method = c("empirical", "joint_lm"), method = c("existing", "joint"), browser = FALSE ) @@ -21,6 +22,37 @@ oldest and second oldest age groups.} \item{oo.nquants}{Number of quantiles to split the one-off partnership risk distribution (count of one-off partners per unit time).} +\item{duration.method}{Character. Controls how partnership durations for dissolution-coef +estimation are computed. One of: +\itemize{ +\item \code{"empirical"} (default) — mean / median of observed durations among ongoing +partnerships, stratified by (age-match x index.age.grp). Relies on the +geometric / memoryless assumption that median elapsed time in ongoing +partnerships equals median full-partnership duration. Byte-identical to the +pre-refactor behavior. +\item \code{"joint_lm"} — log-linear \verb{lm(log(duration.time) ~ )} on ongoing partnerships; per-stratum medians computed from model +predictions. The covariate-adjusted version of \code{"empirical"}: targets the +same cross-sectional age-of-extant-ties quantity (via the same geometric +transformation), but corrects for confounding of the duration estimate by +other attributes — the direct analog of the marginal-vs-joint fix that +\code{method = "joint"} applies to formation stats. Fitted model is stored at +\verb{netparams$$joint_duration_model}; see issue #73 for the follow-up +to predict per-dyad on the synthetic target population in \code{build_netstats}. +} +Both methods flow through the same geometric transformation +(\code{mean.dur.adj = 1 / (1 - 2^(-1 / median))}) that \code{build_netstats} passes to +\code{dissolution_coefs()}, so the TERGM offset structure is identical across +methods. The TERGM dissolution is geometric (constant hazard per timestep), so +under the inspection-paradox identity the \code{duration} parameter it consumes +simultaneously determines the simulated mean partnership duration and the +simulated mean age of extant ties at cross-section; both methods target the +observable latter quantity. A prior \code{"weibull_strat"} option targeting the +underlying mean full-partnership duration was explored and removed: its +output can't be faithfully honored by a geometric TERGM without a non-standard +age-dependent dissolution term, making it a diagnostic rather than a +production target. The one-off layer has no durations, so this flag only +affects main and casual-layer output.} + \item{method}{Character. Either \code{"existing"} (default) or \code{"joint"}. \code{"existing"} reproduces the pre-refactor behavior byte-for-byte: a separate univariate Poisson/binomial/linear fit for each ERGM target statistic. \code{"joint"} leaves all of those outputs intact \strong{and} diff --git a/tests/testthat/test-duration-method.R b/tests/testthat/test-duration-method.R new file mode 100644 index 0000000..632857b --- /dev/null +++ b/tests/testthat/test-duration-method.R @@ -0,0 +1,125 @@ +# Tests for the duration.method flag added in PR for #63 phase 3. + +skip_without_artnetdata <- function() { + testthat::skip_if(system.file(package = "ARTnetData") == "", + "ARTnetData not installed") +} + +cache_env <- new.env(parent = emptyenv()) + +get_np <- function(duration.method) { + key <- duration.method + if (!is.null(cache_env[[key]])) return(cache_env[[key]]) + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + set.seed(20260419L) + np <- build_netparams(epistats, smooth.main.dur = TRUE, + duration.method = duration.method, + method = "existing") + cache_env[[key]] <- np + np +} + +test_that("default duration.method is 'empirical'", { + skip_without_artnetdata() + np_default <- get_np("empirical") + # No joint_duration_model attached when method is empirical + expect_null(np_default$main$joint_duration_model) + expect_null(np_default$casl$joint_duration_model) +}) + +test_that("joint_lm stores fitted model at joint_duration_model", { + skip_without_artnetdata() + np <- get_np("joint_lm") + for (layer in c("main", "casl")) { + m <- np[[layer]]$joint_duration_model + expect_s3_class(m, "lm") + # Fit should have a reasonable number of observations + expect_gt(stats::nobs(m), 500) + # Response is log(duration.time) + expect_true(grepl("log\\(duration\\.time\\)", + as.character(formula(m))[2])) + } +}) + +test_that("all duration.methods preserve output shape for dissolution_coefs", { + skip_without_artnetdata() + # Every method must produce durs..byage with the columns + # dissolution_coefs() needs downstream, in the same order and types. + expected_cols <- c("index.age.grp", "mean.dur", "median.dur", + "rates.main.adj", "mean.dur.adj") + for (dm in c("empirical", "joint_lm")) { + df <- get_np(dm)$main$durs.main.byage + expect_identical(colnames(df), expected_cols, + info = paste("column mismatch for", dm)) + expect_gt(nrow(df), 0) + expect_true(all(is.finite(df$mean.dur.adj)), + info = paste("mean.dur.adj must be finite for", dm)) + } +}) + +test_that("duration.method does not affect non-duration netparams fields", { + skip_without_artnetdata() + np_e <- get_np("empirical") + np_j <- get_np("joint_lm") + # Everything except durs.*.byage, durs.*.homog, joint_duration_model + # should be identical across methods. + for (layer in c("main", "casl", "inst", "all")) { + if (is.null(np_e[[layer]])) next + common_fields <- setdiff( + names(np_e[[layer]]), + c(paste0("durs.", layer, ".byage"), + paste0("durs.", layer, ".homog"), + "joint_duration_model") + ) + for (f in common_fields) { + expect_equal(np_j[[layer]][[f]], np_e[[layer]][[f]], + info = paste0("[joint_lm ", layer, "$", f, "]")) + } + } +}) + +test_that("joint_lm rejects invalid method arg with clear error", { + skip_without_artnetdata() + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + expect_error( + build_netparams(epistats, duration.method = "weibull_strat", + method = "existing"), + regexp = "should be one of" + ) +}) + +test_that("joint_lm can be combined with method = 'joint' in build_netstats", { + skip_without_artnetdata() + set.seed(20260419L) + epistats <- build_epistats( + geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7 + ) + set.seed(20260419L) + np <- build_netparams(epistats, smooth.main.dur = TRUE, + duration.method = "joint_lm", + method = "joint") + set.seed(20260419L) + ns <- build_netstats(epistats, np, + expect.mort = 0.000478213, network.size = 3000, + method = "joint") + # Dissolution coefs still valid + expect_s3_class(ns$main$diss.byage, "disscoef") + expect_s3_class(ns$casl$diss.byage, "disscoef") + expect_true(all(is.finite(ns$main$diss.byage$coef.diss))) + # Both the ego-level joint_model and the dyad joint_duration_model + # coexist on netparams + expect_s3_class(np$main$joint_model, "glm") + expect_s3_class(np$main$joint_duration_model, "lm") +})