Also adding generalizability to exportToCsv. Deprecating attrtion fra…

…ction
OHDSI · Aug 31, 2023 · 61d2bc5 · 61d2bc5
1 parent 1c24f56
commit 61d2bc5
Show file tree

Hide file tree

Showing 10 changed files with 418 additions and 260 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -7,9 +7,17 @@ Changes:
 
 2. Added the `getGeneralizabilityTable()` function.
 
-3. Improved computation of overall standard deviation when computing covariate balance. Should produce more accurate balance estimations.
+3. Improved computation of overall standard deviation when computing covariate balance (actually computing the SD instead of taking the mean of the target and comparator). Should produce more accurate balance estimations.
 
-4. Generated population objects now keep track of likely target estimator (e.g. 'ATT'). Informs selection of base population when calling `getGeneralizabilityTable()`. 
+4. Generated population objects now keep track of likely target estimator (e.g. 'ATT', or 'ATE'). This informs selection of base population when calling `getGeneralizabilityTable()`. 
+
+5. Deprecated the `attritionFractionThreshold` argument of the `createCmDiagnosticThresholds` function, and instead added the `generalizabilitySdmThreshold` argument.
+
+6. The results schema specifications of the `exportToCsv()` function has changed:
+    - Removed the `attrition_fraction` and `attrition_diagnostic` fields from the `cm_diagnostics_summary ` table.
+    - Added the `generalizability_max_sdm` and `generalizabiltiy_diagnostic` fields to the `cm_diagnostics_summary` table.
+    - Added the `mean_before`, `mean_after`, `target_std_diff`, `comparator_std_diff`, and `target_comparator_std_diff` fields to both the `cm_covariate_balance` and `cm_shared_covariate_balance` tables.
+
 
 Bugfixes:
 

diff --git a/R/Export.R b/R/Export.R
@@ -45,10 +45,11 @@ getResultsDataModel <- function() {
 #'                              any covariate has an SDM exceeding this threshold, the diagnostic will
 #'                              fail.
 #' @param equipoiseThreshold    What is the minimum required equipoise?
-#' @param attritionFractionThreshold What is the maximum allowed attrition fraction? If the attrition
-#'                                   between the input target cohort and the target cohort entering the
-#'                                   outcome model is greater than this fraction, the diagnostic will
-#'                                   fail.
+#' @param attritionFractionThreshold DEPRECATED. See `generalizabilitySdmThreshold` instead.
+#' @param generalizabilitySdmThreshold What is the maximum allowed standardized difference of mean
+#'                                     (SDM)when comparing the population before and after PS
+#'                                     adjustments? If the SDM is greater than this value, the diagnostic
+#'                                     will fail.
 #'
 #' @return
 #' An object of type `CmDiagnosticThresholds`.
@@ -58,14 +59,19 @@ createCmDiagnosticThresholds <- function(mdrrThreshold = 10,
                                          easeThreshold = 0.25,
                                          sdmThreshold = 0.1,
                                          equipoiseThreshold = 0.2,
-                                         attritionFractionThreshold = 1) {
+                                         attritionFractionThreshold = NULL,
+                                         generalizabilitySdmThreshold = 1) {
   errorMessages <- checkmate::makeAssertCollection()
   checkmate::assertNumeric(mdrrThreshold, len = 1, lower = 0, add = errorMessages)
   checkmate::assertNumeric(easeThreshold, len = 1, lower = 0, add = errorMessages)
   checkmate::assertNumeric(sdmThreshold, len = 1, lower = 0, add = errorMessages)
   checkmate::assertNumeric(equipoiseThreshold, len = 1, lower = 0, add = errorMessages)
-  checkmate::assertNumeric(attritionFractionThreshold, len = 1, lower = 0, add = errorMessages)
+  checkmate::assertNumeric(generalizabilitySdmThreshold, len = 1, lower = 0, add = errorMessages)
   checkmate::reportAssertions(collection = errorMessages)
+  if (!is.null(attritionFractionThreshold)) {
+    warning("The attritionFractionThreshold argument is deprecated and will be ignored. ",
+    "See generalizabilitySdmThreshold instead.")
+  }
   thresholds <- list()
   for (name in names(formals(createCmDiagnosticThresholds))) {
     thresholds[[name]] <- get(name)
@@ -535,7 +541,8 @@ exportCohortMethodResults <- function(outputFolder,
       "calibratedCi95Ub",
       "calibratedP",
       "calibratedLogRr",
-      "calibratedSeLogRr"
+      "calibratedSeLogRr",
+      "targetEstimator"
     ) %>%
     mutate(databaseId = !!databaseId) %>%
     enforceMinCellValue("targetSubjects", minCellCount) %>%
@@ -579,7 +586,8 @@ exportCmInteractionResults <- function(outputFolder,
         "calibratedCi95Ub",
         "calibratedP",
         "calibratedLogRr",
-        "calibratedSeLogRr"
+        "calibratedSeLogRr",
+        "targetEstimator"
       ) %>%
       mutate(databaseId = !!databaseId) %>%
       enforceMinCellValue("targetSubjects", minCellCount) %>%
@@ -730,22 +738,39 @@ tidyBalance <- function(balance, minCellCount) {
            stdDiffBefore = "beforeMatchingStdDiff",
            targetMeanAfter = "afterMatchingMeanTarget",
            comparatorMeanAfter = "afterMatchingMeanComparator",
-           stdDiffAfter = "afterMatchingStdDiff"
+           stdDiffAfter = "afterMatchingStdDiff",
+           meanBefore = "beforeMatchingMean",
+           meanAfter = "afterMatchingMean",
+           "targetStdDiff",
+           "comparatorStdDiff",
+           "targetComparatorStdDiff",
+
     ) %>%
     mutate(
       targetMeanBefore = ifelse(is.na(.data$targetMeanBefore), 0, .data$targetMeanBefore),
       comparatorMeanBefore = ifelse(is.na(.data$comparatorMeanBefore), 0, .data$comparatorMeanBefore),
       stdDiffBefore = ifelse(is.na(.data$stdDiffBefore), 0, .data$stdDiffBefore),
       targetMeanAfter = ifelse(is.na(.data$targetMeanAfter), 0, .data$targetMeanAfter),
       comparatorMeanAfter = ifelse(is.na(.data$comparatorMeanAfter), 0, .data$comparatorMeanAfter),
-      stdDiffAfter = ifelse(is.na(.data$stdDiffAfter), 0, .data$stdDiffAfter)
+      stdDiffAfter = ifelse(is.na(.data$stdDiffAfter), 0, .data$stdDiffAfter),
+      meanBefore = ifelse(is.na(.data$meanBefore), 0, .data$meanBefore),
+      meanAfter = ifelse(is.na(.data$stdDiffAfter), 0, .data$meanAfter),
+      targetStdDiff = ifelse(is.na(.data$targetStdDiff), 0, .data$targetStdDiff),
+      comparatorStdDiff = ifelse(is.na(.data$comparatorStdDiff), 0, .data$comparatorStdDiff),
+      targetComparatorStdDiff = ifelse(is.na(.data$targetComparatorStdDiff), 0, .data$targetComparatorStdDiff)
     ) %>%
     filter(!(round(.data$targetMeanBefore) == 0 &
                round(.data$comparatorMeanBefore, 3) == 0 &
                round(.data$stdDiffBefore, 3) == 0 &
                round(.data$targetMeanAfter, 3) == 0 &
                round(.data$comparatorMeanAfter, 3) == 0 &
-               round(.data$stdDiffAfter, 3) == 0)) %>%
+               round(.data$stdDiffAfter, 3) == 0 &
+               round(.data$meanBefore, 3) == 0 &
+               round(.data$meanAfter, 3) == 0 &
+               round(.data$targetStdDiff, 3) == 0 &
+               round(.data$comparatorStdDiff, 3) == 0 &
+               round(.data$targetComparatorStdDiff, 3) == 0)
+           ) %>%
     enforceMinCellValue("targetMeanBefore",
                         minCellCount / inferredTargetBeforeSize,
                         silent = TRUE
@@ -762,13 +787,26 @@ tidyBalance <- function(balance, minCellCount) {
                         minCellCount / inferredComparatorAfterSize,
                         silent = TRUE
     ) %>%
+    enforceMinCellValue("meanBefore",
+                        minCellCount / inferredComparatorAfterSize,
+                        silent = TRUE
+    ) %>%
+    enforceMinCellValue("meanAfter",
+                        minCellCount / inferredComparatorAfterSize,
+                        silent = TRUE
+    ) %>%
     mutate(
       targetMeanBefore = round(.data$targetMeanBefore, 3),
       comparatorMeanBefore = round(.data$comparatorMeanBefore, 3),
       stdDiffBefore = round(.data$stdDiffBefore, 3),
       targetMeanAfter = round(.data$targetMeanAfter, 3),
       comparatorMeanAfter = round(.data$comparatorMeanAfter, 3),
-      stdDiffAfter = round(.data$stdDiffAfter, 3)
+      stdDiffAfter = round(.data$stdDiffAfter, 3),
+      meanBefore = round(.data$meanBefore, 3),
+      meanAfter = round(.data$meanAfter, 3),
+      targetStdDiff = round(.data$targetStdDiff, 3),
+      comparatorStdDiff = round(.data$comparatorStdDiff, 3),
+      targetComparatorStdDiff = round(.data$targetComparatorStdDiff, 3)
     ) %>%
     return()
 }
@@ -1119,82 +1157,79 @@ exportDiagnosticsSummary <- function(outputFolder,
                                      cmDiagnosticThresholds) {
   message("- diagnostics_summary table")
   reference <- getFileReference(outputFolder)
+  resultsSummary <- getResultsSummary(outputFolder)
 
-  getMaxSdm <- function(balanceFile) {
+  getMaxSdms <- function(balanceFile) {
     balance <- readRDS(file.path(outputFolder, balanceFile))
     if (nrow(balance) == 0) {
-      return(as.numeric(NA))
+      tibble(balanceFile = !!balanceFile,
+             maxSdm = as.numeric(NA),
+             maxTargetSdm = as.numeric(NA),
+             maxComparatorSdm = as.numeric(NA),
+             maxTargetComparatorSdm = as.numeric(NA)) %>%
+        return()
     } else {
-      return(max(abs(balance$afterMatchingStdDiff), na.rm = TRUE))
+      tibble(balanceFile = !!balanceFile,
+             maxSdm = as.numeric(max(abs(balance$afterMatchingStdDiff), na.rm = TRUE)),
+             maxTargetSdm = as.numeric(max(abs(balance$targetStdDiff), na.rm = TRUE)),
+             maxComparatorSdm = as.numeric(max(abs(balance$comparatorStdDiff), na.rm = TRUE)),
+             maxTargetComparatorSdm = as.numeric(max(abs(balance$targetComparatorStdDiff), na.rm = TRUE))) %>%
+        return()
     }
   }
-
   getEquipoise <- function(sharedPsFile) {
     ps <- readRDS(file.path(outputFolder, sharedPsFile))
-    return(computeEquipoise(ps))
+    tibble(sharedPsFile = !!sharedPsFile,
+           equipoise = computeEquipoise(ps)) %>%
+      return()
   }
 
   balanceFiles <- reference %>%
     filter(.data$balanceFile != "") %>%
     distinct(.data$balanceFile) %>%
     pull()
-  maxSdm <- as.numeric(sapply(balanceFiles, getMaxSdm))
-
+  maxSdm <- bind_rows(lapply(balanceFiles, getMaxSdms)) %>%
+    select("balanceFile", "maxSdm")
   sharedBalanceFiles <- reference %>%
     filter(.data$sharedBalanceFile != "") %>%
     distinct(.data$sharedBalanceFile) %>%
     pull()
-  sharedMaxSdm <- as.numeric(sapply(sharedBalanceFiles, getMaxSdm))
-
+  sharedMaxSdm <- bind_rows(lapply(sharedBalanceFiles, getMaxSdms)) %>%
+    rename(sharedBalanceFile = "balanceFile",
+           sharedMaxSdm = "maxSdm")
   sharedPsFiles <- reference %>%
     filter(.data$sharedPsFile != "") %>%
     distinct(.data$sharedPsFile) %>%
     pull()
-  equipoise <- as.numeric(sapply(sharedPsFiles, getEquipoise))
-
-  results1 <- reference %>%
+  equipoise <- bind_rows(lapply(sharedPsFiles, getEquipoise))
+  results <- reference %>%
     filter(.data$outcomeOfInterest) %>%
-    left_join(tibble(
-      balanceFile = balanceFiles,
-      maxSdm = maxSdm
-    ),
-    by = "balanceFile"
-    ) %>%
-    left_join(tibble(
-      sharedBalanceFile = sharedBalanceFiles,
-      sharedMaxSdm = sharedMaxSdm
-    ),
-    by = "sharedBalanceFile"
-    ) %>%
-    left_join(tibble(
-      sharedPsFile = sharedPsFiles,
-      equipoise = equipoise
-    ),
-    by = "sharedPsFile"
-    ) %>%
+    inner_join(
+      resultsSummary,
+      by = join_by("analysisId", "targetId", "comparatorId", "outcomeId")) %>%
+    left_join(maxSdm, by = "balanceFile") %>%
+    left_join(sharedMaxSdm, by = "sharedBalanceFile") %>%
+    mutate(generalizabilityMaxSdm = if_else(.data$targetEstimator == "att",
+                                            .data$maxTargetSdm,
+                                            if_else(.data$targetEstimator == "atu",
+                                                    .data$maxComparatorSdm,
+                                                    .data$maxTargetComparatorSdm))) %>%
+    left_join(equipoise, by = "sharedPsFile") %>%
     select(
       "analysisId",
       "targetId",
       "comparatorId",
       "outcomeId",
       "maxSdm",
       "sharedMaxSdm",
-      "equipoise"
-    )
-
-  results2 <- getResultsSummary(outputFolder) %>%
-    select(
-      "analysisId",
-      "targetId",
-      "comparatorId",
-      "outcomeId",
+      "equipoise",
       "mdrr",
-      "attritionFraction",
+      "generalizabilityMaxSdm",
       "ease"
     )
 
-  results <- results1 %>%
-    inner_join(results2, by = c("analysisId", "targetId", "comparatorId", "outcomeId")) %>%
+  # Apply diagnostics thresholds:
+  results <- results %>%
     mutate(balanceDiagnostic = case_when(
       is.na(.data$maxSdm) ~ "NOT EVALUATED",
       .data$maxSdm < cmDiagnosticThresholds$sdmThreshold ~ "PASS",
@@ -1215,9 +1250,9 @@ exportDiagnosticsSummary <- function(outputFolder,
       .data$mdrr < cmDiagnosticThresholds$mdrrThreshold ~ "PASS",
       TRUE ~ "FAIL"
     )) %>%
-    mutate(attritionDiagnostic = case_when(
-      is.na(.data$attritionFraction) ~ "NOT EVALUATED",
-      .data$attritionFraction < cmDiagnosticThresholds$attritionFractionThreshold ~ "PASS",
+    mutate(generalizabilityDiagnostic = case_when(
+      is.na(.data$generalizabilityMaxSdm) ~ "NOT EVALUATED",
+      .data$generalizabilityMaxSdm < cmDiagnosticThresholds$generalizabilitySdmThreshold ~ "PASS",
       TRUE ~ "FAIL"
     )) %>%
     mutate(easeDiagnostic = case_when(
@@ -1226,13 +1261,18 @@ exportDiagnosticsSummary <- function(outputFolder,
       TRUE ~ "FAIL"
     )) %>%
     mutate(unblind = ifelse(.data$mdrrDiagnostic != "FAIL" &
-                              .data$attritionDiagnostic != "FAIL" &
+                              .data$generalizabilityDiagnostic != "FAIL" &
                               .data$easeDiagnostic != "FAIL" &
                               .data$equipoiseDiagnostic != "FAIL" &
                               .data$balanceDiagnostic != "FAIL" &
                               .data$sharedBalanceDiagnostic != "FAIL", 1, 0),
            databaseId = !!databaseId)
 
+  # Add deprecated fields:
+  results <- results %>%
+    mutate(attritionFraction = as.numeric(NA),
+           attritionDiagnostic = "NOT EVALUATED")
+
   if (nrow(results) == 0) {
     results <- createEmptyResult("cm_diagnostics_summary")
   }

diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
@@ -473,7 +473,7 @@ runCmAnalyses <- function(connectionDetails,
       tasks <- split(subset, subset$sharedPsFile)
       cluster <- ParallelLogger::makeCluster(min(length(tasks), multiThreadingSettings$trimMatchStratifyThreads))
       ParallelLogger::clusterRequire(cluster, "CohortMethod")
-      dummy <- ParallelLogger::clusterApply(cluster, tasks, addPsToStudyPop, outputFolder = outputFolder)
+      dummy <- ParallelLogger::clusterApply(cluster, tasks, addPsToStudyPopForSubset, outputFolder = outputFolder)
       ParallelLogger::stopCluster(cluster)
     }
   }
@@ -805,25 +805,37 @@ doFitSharedPsModel <- function(params, refitPsForEveryStudyPopulation) {
   return(NULL)
 }
 
-addPsToStudyPop <- function(subset, outputFolder) {
+addPsToStudyPopForSubset <- function(subset, outputFolder) {
   ps <- readRDS(file.path(outputFolder, subset$sharedPsFile[1]))
 
   addToStudyPop <- function(i) {
     refRow <- subset[i, ]
     studyPop <- readRDS(file.path(outputFolder, refRow$studyPopFile))
-    newMetaData <- attr(studyPop, "metaData")
-    newMetaData$psModelCoef <- attr(ps, "metaData")$psModelCoef
-    newMetaData$psModelPriorVariance <- attr(ps, "metaData")$psModelPriorVariance
-    idx <- match(studyPop$rowId, ps$rowId)
-    studyPop$propensityScore <- ps$propensityScore[idx]
-    studyPop$iptw <- ps$iptw[idx]
-    attr(studyPop, "metaData") <- newMetaData
+    studyPop <- addPsToStudyPopulation(studyPop, ps)
     saveRDS(studyPop, file.path(outputFolder, refRow$psFile))
     return(NULL)
   }
   plyr::l_ply(1:nrow(subset), addToStudyPop)
 }
 
+addPsToStudyPopulation <- function(studyPopulation, ps) {
+  # Merge meta-data
+  newMetaData <- attr(studyPopulation, "metaData")
+  psMetaData <-  attr(ps, "metaData")
+  missingColumns <- setdiff(names(psMetaData), names(newMetaData))
+  newMetaData <- append(newMetaData, psMetaData[missingColumns])
+  attr(studyPopulation, "metaData") <- newMetaData
+
+  # Merge data
+  missingColumns <- setdiff(names(ps), names(studyPopulation))
+  idx <- match(studyPopulation$rowId, ps$rowId)
+  studyPopulation <- bind_cols(
+    studyPopulation,
+    ps[idx, missingColumns]
+  )
+  return(studyPopulation)
+}
+
 applyTrimMatchStratify <- function(ps, arguments) {
   if (!is.null(arguments$trimByPsArgs)) {
     functionArgs <- arguments$trimByPsArgs
@@ -943,12 +955,8 @@ doFitOutcomeModelPlus <- function(params) {
   studyPop <- do.call("createStudyPopulation", args)
 
   if (!is.null(params$args$createPsArgs)) {
-    # Add PS
     ps <- getPs(params$sharedPsFile)
-    idx <- match(studyPop$rowId, ps$rowId)
-    studyPop$propensityScore <- ps$propensityScore[idx]
-    studyPop$iptw <- ps$iptw[idx]
-    ps <- studyPop
+    ps <- addPsToStudyPopulation(studyPop, ps)
   } else {
     ps <- studyPop
   }
@@ -1724,7 +1732,7 @@ summarizeResults <- function(referenceTable, outputFolder, mainFileName, interac
         seLogRr = if (is.null(coefficient)) NA else outcomeModel$outcomeModelTreatmentEstimate$seLogRr,
         llr = if (is.null(coefficient)) NA else outcomeModel$outcomeModelTreatmentEstimate$llr,
         mdrr = !!mdrr,
-        attritionFraction = !!attritionFraction
+        targetEstimator = outcomeModel$targetEstimator
       )
 
     mainResults[[i]] <- mainResult
@@ -1741,7 +1749,8 @@ summarizeResults <- function(referenceTable, outputFolder, mainFileName, interac
             ci95Ub = exp(outcomeModel$outcomeModelInteractionEstimates$logUb95[j]),
             p = !!p,
             logRr = outcomeModel$outcomeModelInteractionEstimates$logRr[j],
-            seLogRr = outcomeModel$outcomeModelInteractionEstimates$seLogRr[j]
+            seLogRr = outcomeModel$outcomeModelInteractionEstimates$seLogRr[j],
+            targetEstimator = outcomeModel$targetEstimator
           )
       }
     }