Getting Started
The following script is used to create and save cached synthetic
CDISC data to the data/
directory to use in examples and
tests in the tern
package.
Setup & Helper Functions
library(dplyr)
library(formatters)
study_duration_secs <- 31556952 * 2
sample_fct <- function(x, N, ...) { # nolint
checkmate::assert_number(N)
factor(sample(x, N, replace = TRUE, ...), levels = if (is.factor(x)) levels(x) else x)
}
retain <- function(df, value_var, event, outside = NA) {
indices <- c(1, which(event == TRUE), nrow(df) + 1)
values <- c(outside, value_var[event == TRUE])
rep(values, diff(indices))
}
relvar_init <- function(relvar1, relvar2) {
if (length(relvar1) != length(relvar2)) {
message(simpleError(
"The argument value length of relvar1 and relvar2 differ. They must contain the same number of elements."
))
return(NA)
}
return(list("relvar1" = relvar1, "relvar2" = relvar2))
}
rel_var <- function(df = NULL, var_name = NULL, var_values = NULL, related_var = NULL) {
if (is.null(df)) {
message("Missing data frame argument value.")
return(NA)
} else {
n_relvar1 <- length(unique(df[, related_var, drop = TRUE]))
n_relvar2 <- length(var_values)
if (n_relvar1 != n_relvar2) {
message(paste("Unequal vector lengths for", related_var, "and", var_name))
return(NA)
} else {
relvar1 <- unique(df[, related_var, drop = TRUE])
relvar2_values <- rep(NA, nrow(df))
for (r in seq_len(length(relvar1))) {
matched <- which(df[, related_var, drop = TRUE] == relvar1[r])
relvar2_values[matched] <- var_values[r]
}
return(relvar2_values)
}
}
}
visit_schedule <- function(visit_format = "WEEK",
n_assessments = 10L,
n_days = 5L) {
if (!(toupper(visit_format) %in% c("WEEK", "CYCLE"))) {
message("Visit format value must either be: WEEK or CYCLE")
return(NA)
}
if (toupper(visit_format) == "WEEK") {
assessments <- 1:n_assessments
assessments_ord <- -1:n_assessments
visit_values <- c("SCREENING", "BASELINE", paste(toupper(visit_format), assessments, "DAY", (assessments * 7) + 1))
} else if (toupper(visit_format) == "CYCLE") {
cycles <- sort(rep(1:n_assessments, times = 1, each = n_days))
days <- rep(seq(1:n_days), times = n_assessments, each = 1)
assessments_ord <- 0:(n_assessments * n_days)
visit_values <- c("SCREENING", paste(toupper(visit_format), cycles, "DAY", days))
}
visit_values <- stats::reorder(factor(visit_values), assessments_ord)
}
ADSL
generate_adsl <- function(N = 200) { # nolint
set.seed(1)
sys_dtm <- as.numeric(strptime("20/2/2019 11:16:16.683", "%d/%m/%Y %H:%M:%OS"))
country_site_prob <- c(.5, .121, .077, .077, .075, .052, .046, .025, .014, .003)
adsl <- tibble::tibble(
STUDYID = rep("AB12345", N) %>% with_label("Study Identifier"),
COUNTRY = sample_fct(
c("CHN", "USA", "BRA", "PAK", "NGA", "RUS", "JPN", "GBR", "CAN", "CHE"),
N,
prob = country_site_prob
) %>% with_label("Country"),
SITEID = sample_fct(1:20, N, prob = rep(country_site_prob, times = 2)),
SUBJID = paste("id", seq_len(N), sep = "-") %>% with_label("Subject Identifier for the Study"),
AGE = (sapply(stats::rchisq(N, df = 5, ncp = 10), max, 0) + 20) %>% with_label("Age"),
SEX = c("F", "M") %>% sample_fct(N, prob = c(.52, .48)) %>% with_label("Sex"),
ARMCD = c("ARM A", "ARM B", "ARM C") %>% sample_fct(N) %>% with_label("Planned Arm Code"),
ARM = dplyr::recode(
.data$ARMCD,
"ARM A" = "A: Drug X", "ARM B" = "B: Placebo", "ARM C" = "C: Combination"
) %>% with_label("Description of Planned Arm"),
ACTARMCD = .data$ARMCD %>% with_label("Actual Arm Code"),
ACTARM = .data$ARM %>% with_label("Description of Actual Arm"),
RACE = c(
"ASIAN", "BLACK OR AFRICAN AMERICAN", "WHITE", "AMERICAN INDIAN OR ALASKA NATIVE",
"MULTIPLE", "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER", "OTHER", "UNKNOWN"
) %>%
sample_fct(N, prob = c(.55, .23, .16, .05, .004, .003, .002, .002)) %>%
with_label("Race"),
TRTSDTM = as.POSIXct(
sys_dtm + sample(seq(0, study_duration_secs), size = N, replace = TRUE),
origin = "1970-01-01"
) %>% with_label("Datetime of First Exposure to Treatment"),
TRTEDTM = as.POSIXct(as.numeric(.data$TRTSDTM) + study_duration_secs, origin = "1970-01-01") %>%
with_label("Datetime of Last Exposure to Treatment"),
EOSDY = as.numeric(ceiling(difftime(.data$TRTEDTM, .data$TRTSDTM, units = "days"))) %>%
with_label("End of Study Relative Day"),
STRATA1 = c("A", "B", "C") %>% sample_fct(N) %>% with_label("Stratification Factor 1"),
STRATA2 = c("S1", "S2") %>% sample_fct(N) %>% with_label("Stratification Factor 2"),
BMRKR1 = stats::rchisq(N, 6) %>% with_label("Continuous Level Biomarker 1"),
BMRKR2 = sample_fct(c("LOW", "MEDIUM", "HIGH"), N) %>% with_label("Continuous Level Biomarker 2")
)
# associate sites with countries and regions
tern_ex_adsl <- adsl %>%
dplyr::mutate(
SITEID = paste0(.data$COUNTRY, "-", .data$SITEID) %>% with_label("Study Site Identifier"),
REGION1 = factor(dplyr::case_when(
COUNTRY %in% c("NGA") ~ "Africa",
COUNTRY %in% c("CHN", "JPN", "PAK") ~ "Asia",
COUNTRY %in% c("RUS") ~ "Eurasia",
COUNTRY %in% c("GBR") ~ "Europe",
COUNTRY %in% c("CAN", "USA") ~ "North America",
COUNTRY %in% c("BRA") ~ "South America",
TRUE ~ as.character(NA)
)) %>% with_label("Geographic Region 1"),
SAFFL = factor("Y") %>% with_label("Safety Population Flag")
) %>%
dplyr::mutate(USUBJID = paste(.data$STUDYID, .data$SITEID, .data$SUBJID, sep = "-") %>%
with_label("Unique Subject Identifier"))
save(tern_ex_adsl, file = "data/tern_ex_adsl.rda", compress = "xz")
}
ADAE
generate_adae <- function(adsl = tern_ex_adsl,
max_n_aes = 5) {
set.seed(1)
lookup_ae <- tibble::tribble(
~AEBODSYS, ~AELLT, ~AEDECOD, ~AEHLT, ~AEHLGT, ~AETOXGR, ~AESOC, ~AESER, ~AEREL,
"cl A.1", "llt A.1.1.1.1", "dcd A.1.1.1.1", "hlt A.1.1.1", "hlgt A.1.1", "1", "cl A", "N", "N",
"cl A.1", "llt A.1.1.1.2", "dcd A.1.1.1.2", "hlt A.1.1.1", "hlgt A.1.1", "2", "cl A", "Y", "N",
"cl B.1", "llt B.1.1.1.1", "dcd B.1.1.1.1", "hlt B.1.1.1", "hlgt B.1.1", "5", "cl B", "Y", "Y",
"cl B.2", "llt B.2.1.2.1", "dcd B.2.1.2.1", "hlt B.2.1.2", "hlgt B.2.1", "3", "cl B", "N", "N",
"cl B.2", "llt B.2.2.3.1", "dcd B.2.2.3.1", "hlt B.2.2.3", "hlgt B.2.2", "1", "cl B", "Y", "N",
"cl C.1", "llt C.1.1.1.3", "dcd C.1.1.1.3", "hlt C.1.1.1", "hlgt C.1.1", "4", "cl C", "N", "Y",
"cl C.2", "llt C.2.1.2.1", "dcd C.2.1.2.1", "hlt C.2.1.2", "hlgt C.2.1", "2", "cl C", "N", "Y",
"cl D.1", "llt D.1.1.1.1", "dcd D.1.1.1.1", "hlt D.1.1.1", "hlgt D.1.1", "5", "cl D", "Y", "Y",
"cl D.1", "llt D.1.1.4.2", "dcd D.1.1.4.2", "hlt D.1.1.4", "hlgt D.1.1", "3", "cl D", "N", "N",
"cl D.2", "llt D.2.1.5.3", "dcd D.2.1.5.3", "hlt D.2.1.5", "hlgt D.2.1", "1", "cl D", "N", "Y"
)
aag <- utils::read.table(
sep = ",", header = TRUE,
text = paste(
"NAMVAR,SRCVAR,GRPTYPE,REFNAME,REFTERM,SCOPE",
"CQ01NAM,AEDECOD,CUSTOM,D.2.1.5.3/A.1.1.1.1 aesi,dcd D.2.1.5.3,",
"CQ01NAM,AEDECOD,CUSTOM,D.2.1.5.3/A.1.1.1.1 aesi,dcd A.1.1.1.1,",
"SMQ01NAM,AEDECOD,SMQ,C.1.1.1.3/B.2.2.3.1 aesi,dcd C.1.1.1.3,BROAD",
"SMQ01NAM,AEDECOD,SMQ,C.1.1.1.3/B.2.2.3.1 aesi,dcd B.2.2.3.1,BROAD",
"SMQ02NAM,AEDECOD,SMQ,Y.9.9.9.9/Z.9.9.9.9 aesi,dcd Y.9.9.9.9,NARROW",
"SMQ02NAM,AEDECOD,SMQ,Y.9.9.9.9/Z.9.9.9.9 aesi,dcd Z.9.9.9.9,NARROW",
sep = "\n"
), stringsAsFactors = FALSE
)
adae <- Map(
function(id, sid) {
n_aes <- sample(c(0, seq_len(max_n_aes)), 1)
i <- sample(seq_len(nrow(lookup_ae)), n_aes, TRUE)
dplyr::mutate(
lookup_ae[i, ],
USUBJID = id,
STUDYID = sid
)
},
adsl$USUBJID,
adsl$STUDYID
) %>%
Reduce(rbind, .) %>%
`[`(c(10, 11, 1, 2, 3, 4, 5, 6, 7, 8, 9)) %>%
dplyr::mutate(AETERM = gsub("dcd", "trm", .data$AEDECOD)) %>%
dplyr::mutate(AESEV = dplyr::case_when(
AETOXGR == 1 ~ "MILD",
AETOXGR %in% c(2, 3) ~ "MODERATE",
AETOXGR %in% c(4, 5) ~ "SEVERE"
))
# merge adsl to be able to add AE date and study day variables
adae <- dplyr::inner_join(adsl, adae, by = c("STUDYID", "USUBJID"), multiple = "all") %>%
dplyr::rowwise() %>%
dplyr::mutate(trtsdt_int = as.numeric(as.Date(.data$TRTSDTM))) %>%
dplyr::mutate(trtedt_int = dplyr::case_when(
!is.na(.data$TRTEDTM) ~ as.numeric(as.Date(.data$TRTEDTM)),
is.na(.data$TRTEDTM) ~ floor(trtsdt_int + (study_duration_secs) / 86400)
)) %>%
dplyr::mutate(ASTDTM = as.POSIXct(
(sample(.data$trtsdt_int:.data$trtedt_int, size = 1) * 86400),
origin = "1970-01-01"
)) %>%
dplyr::arrange(.data$STUDYID, .data$USUBJID, .data$ASTDTM, .data$AETERM)
adae <- adae %>%
dplyr::group_by(.data$USUBJID) %>%
dplyr::mutate(AESEQ = seq_len(dplyr::n())) %>%
dplyr::mutate(ASEQ = .data$AESEQ) %>%
dplyr::ungroup() %>%
dplyr::arrange(
.data$STUDYID,
.data$USUBJID,
.data$ASTDTM,
.data$AETERM,
.data$AESEQ
)
outcomes <- c(
"UNKNOWN",
"NOT RECOVERED/NOT RESOLVED",
"RECOVERED/RESOLVED WITH SEQUELAE",
"RECOVERING/RESOLVING",
"RECOVERED/RESOLVED"
)
adae <- adae %>%
dplyr::mutate(AEOUT = factor(ifelse(
.data$AETOXGR == "5",
"FATAL",
as.character(sample_fct(outcomes, nrow(adae), prob = c(0.1, 0.2, 0.1, 0.3, 0.3)))
))) %>%
dplyr::mutate(TRTEMFL = ifelse(.data$ASTDTM >= .data$TRTSDTM, "Y", ""))
l_aag <- split(aag, interaction(aag$NAMVAR, aag$SRCVAR, aag$GRPTYPE, drop = TRUE))
# Create aesi flags
l_aesi <- lapply(l_aag, function(d_adag, d_adae) {
names(d_adag)[names(d_adag) == "REFTERM"] <- d_adag$SRCVAR[1]
names(d_adag)[names(d_adag) == "REFNAME"] <- d_adag$NAMVAR[1]
if (d_adag$GRPTYPE[1] == "CUSTOM") {
d_adag <- d_adag[-which(names(d_adag) == "SCOPE")]
} else if (d_adag$GRPTYPE[1] == "SMQ") {
names(d_adag)[names(d_adag) == "SCOPE"] <- paste0(substr(d_adag$NAMVAR[1], 1, 5), "SC")
}
d_adag <- d_adag[-which(names(d_adag) %in% c("NAMVAR", "SRCVAR", "GRPTYPE"))]
d_new <- dplyr::left_join(x = d_adae, y = d_adag, by = intersect(names(d_adae), names(d_adag)))
d_new[, dplyr::setdiff(names(d_new), names(d_adae)), drop = FALSE]
}, adae)
tern_ex_adae <- dplyr::bind_cols(adae, l_aesi)
tern_ex_adae <- tern_ex_adae %>%
dplyr::select(-"trtsdt_int", -"trtedt_int") %>%
var_relabel(
STUDYID = "Study Identifier",
USUBJID = "Unique Subject Identifier",
AEBODSYS = "Body System or Organ Class",
AELLT = "Lowest Level Term",
AEDECOD = "Dictionary-Derived Term",
AEHLT = "High Level Term",
AEHLGT = "High Level Group Term",
AETOXGR = "Analysis Toxicity Grade",
AESOC = "Primary System Organ Class",
AESER = "Serious Event",
AEREL = "Analysis Causality",
AETERM = "Reported Term for the Adverse Event",
AESEV = "Severity/Intensity",
ASTDTM = "Analysis Start Datetime",
AESEQ = "Sponsor-Defined Identifier",
ASEQ = "Analysis Sequence Number",
AEOUT = "Outcome of Adverse Event",
TRTEMFL = "Treatment Emergent Analysis Flag",
CQ01NAM = "CQ 01 Reference Name",
SMQ01NAM = "SMQ 01 Reference Name",
SMQ01SC = "SMQ 01 Scope",
SMQ02NAM = "SMQ 02 Reference Name",
SMQ02SC = "SMQ 02 Scope"
)
save(tern_ex_adae, file = "data/tern_ex_adae.rda", compress = "xz")
}
ADLB
generate_adlb <- function(adsl = tern_ex_adsl,
max_n_lbs = 5L) {
set.seed(1)
lbcat <- c("CHEMISTRY", "CHEMISTRY", "IMMUNOLOGY")
param <- c(
"Alanine Aminotransferase Measurement",
"C-Reactive Protein Measurement",
"Immunoglobulin A Measurement"
)
paramcd <- c("ALT", "CRP", "IGA")
paramu <- c("U/L", "mg/L", "g/L")
aval_mean <- c(20, 1, 2)
visit_format <- "WEEK"
n_assessments <- 5L
n_days <- 5L
# validate and initialize related variables
lbcat_init_list <- relvar_init(param, lbcat)
param_init_list <- relvar_init(param, paramcd)
unit_init_list <- relvar_init(param, paramu)
adlb <- expand.grid(
STUDYID = unique(adsl$STUDYID),
USUBJID = adsl$USUBJID,
PARAM = as.factor(param_init_list$relvar1),
AVISIT = visit_schedule(visit_format = visit_format, n_assessments = n_assessments, n_days = n_days),
stringsAsFactors = FALSE
)
# assign AVAL based on different test
adlb <- adlb %>%
dplyr::mutate(AVAL = stats::rnorm(nrow(adlb), mean = 1, sd = 0.2)) %>%
dplyr::left_join(data.frame(PARAM = param, ADJUST = aval_mean), by = "PARAM") %>%
dplyr::mutate(AVAL = .data$AVAL * .data$ADJUST) %>%
dplyr::select(-"ADJUST")
# assign related variable values: PARAMxLBCAT are related
adlb$LBCAT <- as.factor(rel_var(
df = adlb,
var_name = "LBCAT",
var_values = lbcat_init_list$relvar2,
related_var = "PARAM"
))
# assign related variable values: PARAMxPARAMCD are related
adlb$PARAMCD <- as.factor(rel_var(
df = adlb,
var_name = "PARAMCD",
var_values = param_init_list$relvar2,
related_var = "PARAM"
))
adlb$AVALU <- as.factor(rel_var(
df = adlb,
var_name = "AVALU",
var_values = unit_init_list$relvar2,
related_var = "PARAM"
))
adlb <- adlb %>% dplyr::mutate(AVISITN = dplyr::case_when(
AVISIT == "SCREENING" ~ -1,
AVISIT == "BASELINE" ~ 0,
(grepl("^WEEK", AVISIT) | grepl("^CYCLE", AVISIT)) ~ as.numeric(AVISIT) - 2,
TRUE ~ NA_real_
))
adlb <- adlb %>%
dplyr::mutate(AVISITN = dplyr::case_when(
AVISIT == "SCREENING" ~ -1,
AVISIT == "BASELINE" ~ 0,
(grepl("^WEEK", AVISIT) | grepl("^CYCLE", AVISIT)) ~ as.numeric(AVISIT) - 2,
TRUE ~ NA_real_
))
# order to prepare for change from screening and baseline values
adlb <- adlb[order(adlb$STUDYID, adlb$USUBJID, adlb$PARAMCD, adlb$AVISITN), ]
adlb <- Reduce(rbind, lapply(split(adlb, adlb$USUBJID), function(x) {
x$STUDYID <- adsl$STUDYID[which(adsl$USUBJID == x$USUBJID[1])]
x$ABLFL2 <- ifelse(x$AVISIT == "SCREENING", "Y", "")
x$ABLFL <- ifelse(toupper(visit_format) == "WEEK" & x$AVISIT == "BASELINE",
"Y",
ifelse(toupper(visit_format) == "CYCLE" & x$AVISIT == "CYCLE 1 DAY 1", "Y", "")
)
x
}))
adlb$BASE <- ifelse(adlb$ABLFL2 != "Y", retain(adlb, adlb$AVAL, adlb$ABLFL == "Y"), NA)
anrind_choices <- c("HIGH", "LOW", "NORMAL")
adlb <- adlb %>%
dplyr::mutate(BASETYPE = "LAST") %>%
dplyr::mutate(ANRIND = sample_fct(anrind_choices, nrow(adlb), prob = c(0.1, 0.1, 0.8))) %>%
dplyr::mutate(ANRLO = dplyr::case_when(
.data$PARAMCD == "ALT" ~ 7,
.data$PARAMCD == "CRP" ~ 8,
.data$PARAMCD == "IGA" ~ 0.8
)) %>%
dplyr::mutate(ANRHI = dplyr::case_when(
.data$PARAMCD == "ALT" ~ 55,
.data$PARAMCD == "CRP" ~ 10,
.data$PARAMCD == "IGA" ~ 3
)) %>%
dplyr::mutate(DTYPE = NA) %>%
dplyr::mutate(ATOXGR = factor(dplyr::case_when(
.data$ANRIND == "LOW" ~ sample(
c("-1", "-2", "-3", "-4", "-5"),
nrow(adlb),
replace = TRUE,
prob = c(0.30, 0.25, 0.20, 0.15, 0)
),
.data$ANRIND == "HIGH" ~ sample(
c("1", "2", "3", "4", "5"),
nrow(adlb),
replace = TRUE,
prob = c(0.30, 0.25, 0.20, 0.15, 0)
),
.data$ANRIND == "NORMAL" ~ "0"
))) %>%
dplyr::group_by(.data$USUBJID, .data$PARAMCD, .data$BASETYPE) %>%
dplyr::mutate(BTOXGR = .data$ATOXGR[.data$ABLFL == "Y"]) %>%
dplyr::ungroup()
# High and low descriptions of the different PARAMCD values
# This is currently hard coded as the GDSR does not have these descriptions yet
grade_lookup <- tibble::tribble(
~PARAMCD, ~ATOXDSCL, ~ATOXDSCH,
"ALB", "Hypoalbuminemia", NA_character_,
"ALKPH", NA_character_, "Alkaline phosphatase increased",
"ALT", NA_character_, "Alanine aminotransferase increased",
"AST", NA_character_, "Aspartate aminotransferase increased",
"BILI", NA_character_, "Blood bilirubin increased",
"CA", "Hypocalcemia", "Hypercalcemia",
"CHOLES", NA_character_, "Cholesterol high",
"CK", NA_character_, "CPK increased",
"CREAT", NA_character_, "Creatinine increased",
"CRP", NA_character_, "C reactive protein increased",
"GGT", NA_character_, "GGT increased",
"GLUC", "Hypoglycemia", "Hyperglycemia",
"HGB", "Anemia", "Hemoglobin increased",
"IGA", NA_character_, "Immunoglobulin A increased",
"POTAS", "Hypokalemia", "Hyperkalemia",
"LYMPH", "CD4 lymphocytes decreased", NA_character_,
"PHOS", "Hypophosphatemia", NA_character_,
"PLAT", "Platelet count decreased", NA_character_,
"SODIUM", "Hyponatremia", "Hypernatremia",
"WBC", "White blood cell decreased", "Leukocytosis",
)
# merge grade_lookup onto adlb
adlb <- dplyr::left_join(adlb, grade_lookup, by = "PARAMCD")
adlb <- var_relabel(
adlb,
STUDYID = "Study Identifier",
USUBJID = "Unique Subject Identifier"
)
# merge adsl to be able to add LB date and study day variables
adlb <- dplyr::inner_join(
adsl,
adlb,
by = c("STUDYID", "USUBJID"),
multiple = "all"
) %>%
dplyr::rowwise() %>%
dplyr::mutate(trtsdt_int = as.numeric(as.Date(.data$TRTSDTM))) %>%
dplyr::mutate(trtedt_int = dplyr::case_when(
!is.na(TRTEDTM) ~ as.numeric(as.Date(.data$TRTEDTM)),
is.na(TRTEDTM) ~ floor(.data$trtsdt_int + study_duration_secs / 86400)
)) %>%
dplyr::mutate(ADTM = as.POSIXct(
(sample(.data$trtsdt_int:.data$trtedt_int, size = 1) * 86400),
origin = "1970-01-01"
)) %>%
dplyr::select(-"trtsdt_int", -"trtedt_int") %>%
dplyr::ungroup() %>%
dplyr::arrange(.data$STUDYID, .data$USUBJID, .data$ADTM)
adlb <- adlb %>%
dplyr::mutate(ASPID = sample(seq_len(dplyr::n()))) %>%
dplyr::group_by(.data$USUBJID) %>%
dplyr::mutate(LBSEQ = seq_len(dplyr::n())) %>%
dplyr::ungroup() %>%
dplyr::arrange(
.data$STUDYID,
.data$USUBJID,
.data$PARAMCD,
.data$BASETYPE,
.data$AVISITN,
.data$DTYPE,
.data$ADTM,
.data$LBSEQ,
.data$ASPID
)
adlb <- adlb %>% dplyr::mutate(ONTRTFL = factor(dplyr::case_when(
is.na(.data$TRTSDTM) ~ "",
is.na(.data$ADTM) ~ "Y",
(.data$ADTM < .data$TRTSDTM) ~ "",
(.data$ADTM > .data$TRTEDTM) ~ "",
TRUE ~ "Y"
)))
flag_variables <- function(data,
apply_grouping,
apply_filter,
apply_mutate) {
data_compare <- data %>%
dplyr::mutate(row_check = seq_len(nrow(data)))
data <- data_compare %>%
{
if (apply_grouping == TRUE) {
dplyr::group_by(., .data$USUBJID, .data$PARAMCD, .data$BASETYPE, .data$AVISIT)
} else {
dplyr::group_by(., .data$USUBJID, .data$PARAMCD, .data$BASETYPE)
}
} %>%
dplyr::arrange(.data$ADTM, .data$ASPID, .data$LBSEQ) %>%
{
if (apply_filter == TRUE) {
dplyr::filter(
.,
(.data$AVISIT != "BASELINE" & .data$AVISIT != "SCREENING") &
(.data$ONTRTFL == "Y" | .data$ADTM <= .data$TRTSDTM)
) %>%
dplyr::filter(.data$ATOXGR == max(as.numeric(as.character(.data$ATOXGR))))
} else if (apply_filter == FALSE) {
dplyr::filter(
.,
(.data$AVISIT != "BASELINE" & .data$AVISIT != "SCREENING") &
(.data$ONTRTFL == "Y" | .data$ADTM <= .data$TRTSDTM)
) %>%
dplyr::filter(.data$ATOXGR == min(as.numeric(as.character(.data$ATOXGR))))
} else {
dplyr::filter(
.,
.data$AVAL == min(.data$AVAL) &
(.data$AVISIT != "BASELINE" & .data$AVISIT != "SCREENING") &
(.data$ONTRTFL == "Y" | .data$ADTM <= .data$TRTSDTM)
)
}
} %>%
dplyr::slice(1) %>%
{
if (apply_mutate == TRUE) {
dplyr::mutate(., new_var = ifelse(is.na(.data$DTYPE), "Y", ""))
} else {
dplyr::mutate(., new_var = ifelse(is.na(.data$AVAL) == FALSE & is.na(.data$DTYPE), "Y", ""))
}
} %>%
dplyr::ungroup()
data_compare$new_var <- ifelse(data_compare$row_check %in% data$row_check, "Y", "")
data_compare <- data_compare[, -which(names(data_compare) %in% c("row_check"))]
return(data_compare)
}
adlb <- flag_variables(adlb, TRUE, "ELSE", FALSE) %>% dplyr::rename(WORS01FL = "new_var")
adlb <- flag_variables(adlb, FALSE, TRUE, TRUE) %>% dplyr::rename(WGRHIFL = "new_var")
adlb <- flag_variables(adlb, FALSE, FALSE, TRUE) %>% dplyr::rename(WGRLOFL = "new_var")
adlb <- flag_variables(adlb, TRUE, TRUE, TRUE) %>% dplyr::rename(WGRHIVFL = "new_var")
adlb <- flag_variables(adlb, TRUE, FALSE, TRUE) %>% dplyr::rename(WGRLOVFL = "new_var")
tern_ex_adlb <- adlb %>%
dplyr::mutate(
ANL01FL = ifelse(
(.data$ABLFL == "Y" | (.data$WORS01FL == "Y" & is.na(.data$DTYPE))) &
(.data$AVISIT != "SCREENING"),
"Y",
""
),
PARAM = as.factor(.data$PARAM)
) %>%
var_relabel(
PARAM = "Parameter",
AVISIT = "Analysis Visit",
AVAL = "Analysis Value",
LBCAT = "Category for Lab Test",
PARAMCD = "Parameter Code",
AVALU = "Analysis Value Unit",
AVISITN = "Analysis Visit (N)",
ABLFL2 = "Screening Record Flag",
ABLFL = "Baseline Record Flag",
BASE = "Baseline Value",
BASETYPE = "Baseline Type",
ANRIND = "Analysis Reference Range Indicator",
ANRLO = "Analysis Normal Range Lower Limit",
ANRHI = "Analysis Normal Range Upper Limit",
DTYPE = "Derivation Type",
ATOXGR = "Analysis Toxicity Grade",
BTOXGR = "Baseline Toxicity Grade",
ATOXDSCL = "Analysis Toxicity Description Low",
ATOXDSCH = "Analysis Toxicity Description High",
ADTM = "Analysis Datetime",
ASPID = "Analysis Sponsor Identifier",
LBSEQ = "Sequence Number",
ONTRTFL = "On Treatment Record Flag",
WORS01FL = "Worst Observation in Window Flag 01",
WGRHIFL = "Worst High Grade per Patient",
WGRLOFL = "Worst Low Grade per Patient",
WGRHIVFL = "Worst High Grade per Patient per Visit",
WGRLOVFL = "Worst Low Grade per Patient per Visit",
ANL01FL = "Analysis Flag 01 Baseline Post-Baseline"
)
save(tern_ex_adlb, file = "data/tern_ex_adlb.rda", compress = "xz")
}
ADPP
generate_adpp <- function(adsl = tern_ex_adsl) {
set.seed(1)
paramcd <- c(
"AUCIFO", "CMAX", "CLO", "RMAX", "TON",
"RENALCL", "RENALCLD", "RCAMINT", "RCPCINT"
)
param <- c(
"AUC Infinity Obs", "Max Conc", "Total CL Obs", "Time of Maximum Response",
"Time to Onset", "Renal CL", "Renal CL Norm by Dose",
"Amt Rec from T1 to T2", "Pct Rec from T1 to T2"
)
paramu <- c("day*ug/mL", "ug/mL", "ml/day/kg", "hr", "hr", "L/hr", "L/hr/mg", "mg", "%")
aval_mean <- c(200, 30, 5, 10, 3, 0.05, 0.005, 1.5613, 15.65)
# validate and initialize related variables
param_init_list <- relvar_init(param, paramcd)
unit_init_list <- relvar_init(param, paramu)
adpp <- expand.grid(
STUDYID = unique(adsl$STUDYID),
USUBJID = adsl$USUBJID,
PARAM = as.factor(param_init_list$relvar1),
stringsAsFactors = FALSE
)
adpp <- adpp %>%
dplyr::mutate(AVAL = stats::rnorm(nrow(adpp), mean = 1, sd = 0.2)) %>%
dplyr::left_join(data.frame(PARAM = param, ADJUST = aval_mean), by = "PARAM") %>%
dplyr::mutate(AVAL = .data$AVAL * .data$ADJUST) %>%
dplyr::select(-"ADJUST")
# assign related variable values: PARAMxPARAMCD are related
adpp$PARAMCD <- as.factor(rel_var(
df = adpp,
var_name = "PARAMCD",
var_values = param_init_list$relvar2,
related_var = "PARAM"
))
# assign related variable values: PARAMxPARAMU are related
adpp$AVALU <- as.factor(rel_var(
df = adpp,
var_name = "PARAMU",
var_values = unit_init_list$relvar2,
related_var = "PARAM"
))
tern_ex_adpp <- adsl %>%
dplyr::inner_join(adpp, by = c("STUDYID", "USUBJID"), multiple = "all") %>%
dplyr::filter(.data$ACTARM != "B: Placebo", !(.data$ACTARM == "A: Drug X")) %>%
var_relabel(
PARAM = "Parameter",
AVAL = "Analysis Value",
PARAMCD = "Parameter Code",
AVALU = "Analysis Value Unit"
)
save(tern_ex_adpp, file = "data/tern_ex_adpp.rda", compress = "xz")
}
ADRS
generate_adrs <- function(adsl = tern_ex_adsl) {
set.seed(1)
param_codes <- stats::setNames(1:5, c("CR", "PR", "SD", "PD", "NE"))
lookup_ars <- expand.grid(
ARM = c("A: Drug X", "B: Placebo", "C: Combination"),
AVALC = names(param_codes)
) %>% dplyr::mutate(
AVAL = param_codes[.data$AVALC],
p_scr = c(rep(0, 3), rep(0, 3), c(1, 1, 1), c(0, 0, 0), c(0, 0, 0)),
p_bsl = c(rep(0, 3), rep(0, 3), c(1, 1, 1), c(0, 0, 0), c(0, 0, 0)),
p_cycle = c(c(.4, .3, .5), c(.35, .25, .25), c(.1, .2, .08), c(.14, 0.15, 0.15), c(.01, 0.1, 0.02)),
p_eoi = c(c(.4, .3, .5), c(.35, .25, .25), c(.1, .2, .08), c(.14, 0.15, 0.15), c(.01, 0.1, 0.02)),
p_fu = c(c(.3, .2, .4), c(.2, .1, .3), c(.2, .2, .2), c(.3, .5, 0.1), rep(0, 3))
)
adrs <- split(adsl, adsl$USUBJID) %>%
lapply(function(pinfo) {
probs <- dplyr::filter(lookup_ars, .data$ARM == as.character(pinfo$ACTARM))
# screening
rsp_screen <- sample(probs$AVALC, 1, prob = probs$p_scr) %>% as.character()
# baseline
rsp_bsl <- sample(probs$AVALC, 1, prob = probs$p_bsl) %>% as.character()
# cycle
rsp_c2d1 <- sample(probs$AVALC, 1, prob = probs$p_cycle) %>% as.character()
rsp_c4d1 <- sample(probs$AVALC, 1, prob = probs$p_cycle) %>% as.character()
# end of induction
rsp_eoi <- sample(probs$AVALC, 1, prob = probs$p_eoi) %>% as.character()
# follow up
rsp_fu <- sample(probs$AVALC, 1, prob = probs$p_fu) %>% as.character()
best_rsp <- min(param_codes[c(rsp_screen, rsp_bsl, rsp_eoi, rsp_fu, rsp_c2d1, rsp_c4d1)])
best_rsp_i <- which.min(param_codes[c(rsp_screen, rsp_bsl, rsp_eoi, rsp_fu, rsp_c2d1, rsp_c4d1)])
avisit <- c("SCREENING", "BASELINE", "CYCLE 2 DAY 1", "CYCLE 4 DAY 1", "END OF INDUCTION", "FOLLOW UP")
# meaningful date information
trtsdt_int <- as.numeric(as.Date(pinfo$TRTSDTM))
trtedt_int <- ifelse(
!is.na(pinfo$TRTEDTM), as.numeric(as.Date(pinfo$TRTEDTM)),
floor(trtsdt_int + (pinfo$study_duration_secs) / 86400)
)
scr_date <- as.POSIXct(((trtsdt_int - 100) * 86400), origin = "1970-01-01")
bs_date <- pinfo$TRTSDTM
flu_date <- as.POSIXct((sample(trtsdt_int:trtedt_int, size = 1) * 86400), origin = "1970-01-01")
eoi_date <- as.POSIXct((sample(trtsdt_int:trtedt_int, size = 1) * 86400), origin = "1970-01-01")
c2d1_date <- as.POSIXct((sample(trtsdt_int:trtedt_int, size = 1) * 86400), origin = "1970-01-01")
c4d1_date <- min(c2d1_date + 60 * 86400, pinfo$TRTEDTM)
tibble::tibble(
STUDYID = pinfo$STUDYID,
USUBJID = pinfo$USUBJID,
PARAMCD = as.factor(c(rep("OVRINV", 6), "BESRSPI", "INVET")),
PARAM = as.factor(dplyr::recode(
.data$PARAMCD,
OVRINV = "Overall Response by Investigator - by visit",
OVRSPI = "Best Overall Response by Investigator (no confirmation required)",
BESRSPI = "Best Confirmed Overall Response by Investigator",
INVET = "Investigator End Of Induction Response"
)),
AVALC = c(
rsp_screen, rsp_bsl, rsp_c2d1, rsp_c4d1, rsp_eoi, rsp_fu,
names(param_codes)[best_rsp],
rsp_eoi
),
AVAL = param_codes[.data$AVALC],
AVISIT = factor(c(avisit, avisit[best_rsp_i], avisit[5]), levels = avisit)
) %>%
merge(
tibble::tibble(
AVISIT = avisit,
ADTM = c(scr_date, bs_date, c2d1_date, c4d1_date, eoi_date, flu_date),
AVISITN = c(-1, 0, 2, 4, 999, 999),
TRTSDTM = pinfo$TRTSDTM
) %>%
dplyr::select(-"TRTSDTM"),
by = "AVISIT"
)
}) %>%
Reduce(rbind, .) %>%
dplyr::mutate(
AVALC = factor(.data$AVALC, levels = names(param_codes)),
DTHFL = factor(sample(c("Y", "N"), nrow(.), replace = TRUE, prob = c(1, 0.8)))
)
# merge ADSL to be able to add RS date and study day variables
adrs <- dplyr::inner_join(
adsl,
adrs,
by = c("STUDYID", "USUBJID"),
multiple = "all"
)
tern_ex_adrs <- adrs %>%
dplyr::arrange(
.data$STUDYID,
.data$USUBJID,
.data$PARAMCD,
.data$AVISITN,
.data$ADTM
) %>%
var_relabel(
STUDYID = "Study Identifier",
USUBJID = "Unique Subject Identifier",
AVISIT = "Analysis Visit",
PARAMCD = "Parameter Code",
PARAM = "Parameter",
AVALC = "Analysis Value (C)",
AVAL = "Analysis Value",
ADTM = "Analysis Datetime",
AVISITN = "Analysis Visit (N)",
DTHFL = "Death Flag"
)
save(tern_ex_adrs, file = "data/tern_ex_adrs.rda", compress = "xz")
}
ADTTE
generate_adtte <- function(adsl = tern_ex_adsl) {
set.seed(1)
lookup_tte <- tibble::tribble(
~ARM, ~PARAMCD, ~PARAM, ~LAMBDA, ~CNSR_P,
"ARM A", "OS", "Overall Survival", log(2) / 610, 0.4,
"ARM B", "OS", "Overall Survival", log(2) / 490, 0.3,
"ARM C", "OS", "Overall Survival", log(2) / 365, 0.2,
"ARM A", "PFS", "Progression Free Survival", log(2) / 365, 0.4,
"ARM B", "PFS", "Progression Free Survival", log(2) / 305, 0.3,
"ARM C", "PFS", "Progression Free Survival", log(2) / 243, 0.2,
"ARM A", "EFS", "Event Free Survival", log(2) / 365, 0.4,
"ARM B", "EFS", "Event Free Survival", log(2) / 305, 0.3,
"ARM C", "EFS", "Event Free Survival", log(2) / 243, 0.2,
"ARM A", "CRSD", "Duration of Confirmed Response", log(2) / 305, 0.4,
"ARM B", "CRSD", "Duration of Confirmed Response", log(2) / 243, 0.3,
"ARM C", "CRSD", "Duration of Confirmed Response", log(2) / 182, 0.2
)
evntdescr_sel <- c(
"Death",
"Disease Progression",
"Last Tumor Assessment",
"Adverse Event",
"Last Date Known To Be Alive"
)
cnsdtdscr_sel <- c(
"Preferred Term",
"Clinical Cut Off",
"Completion or Discontinuation",
"End of AE Reporting Period"
)
adtte <- split(adsl, adsl$USUBJID) %>%
lapply(FUN = function(pinfo) {
lookup_tte %>%
dplyr::filter(.data$ARM == as.character(pinfo$ACTARMCD)) %>%
dplyr::rowwise() %>%
dplyr::mutate(
STUDYID = pinfo$STUDYID,
USUBJID = pinfo$USUBJID,
CNSR = sample(c(0, 1), 1, prob = c(1 - .data$CNSR_P, .data$CNSR_P)),
AVAL = stats::rexp(1, .data$LAMBDA),
AVALU = "DAYS"
) %>%
dplyr::select(-"LAMBDA", -"CNSR_P")
}) %>%
Reduce(rbind, .)
# merge ADSL to be able to add TTE date and study day variables
adtte <- dplyr::inner_join(
adsl,
dplyr::select(adtte, -"ARM"),
by = c("STUDYID", "USUBJID"),
multiple = "all"
) %>%
dplyr::rowwise() %>%
dplyr::mutate(trtsdt_int = as.numeric(as.Date(.data$TRTSDTM))) %>%
dplyr::mutate(trtedt_int = dplyr::case_when(
!is.na(TRTEDTM) ~ as.numeric(as.Date(TRTEDTM)),
is.na(TRTEDTM) ~ floor(trtsdt_int + (study_duration_secs) / 86400)
)) %>%
dplyr::mutate(ADTM = as.POSIXct(
(sample(.data$trtsdt_int:.data$trtedt_int, size = 1) * 86400),
origin = "1970-01-01"
)) %>%
dplyr::select(-"trtsdt_int", -"trtedt_int") %>%
dplyr::ungroup() %>%
dplyr::arrange(.data$STUDYID, .data$USUBJID, .data$ADTM)
adtte <- adtte %>%
dplyr::group_by(.data$USUBJID) %>%
dplyr::mutate(PARAM = as.factor(.data$PARAM)) %>%
dplyr::mutate(PARAMCD = as.factor(.data$PARAMCD)) %>%
dplyr::ungroup() %>%
dplyr::arrange(
.data$STUDYID,
.data$USUBJID,
.data$PARAMCD,
.data$ADTM
)
# adding adverse event counts and log follow-up time
tern_ex_adtte <- dplyr::bind_rows(
adtte,
data.frame(adtte %>%
dplyr::group_by(.data$USUBJID) %>%
dplyr::slice_head(n = 1) %>%
dplyr::mutate(
PARAMCD = "TNE",
PARAM = "Total Number of Exacerbations",
AVAL = stats::rpois(1, 3),
AVALU = "COUNT",
lgTMATRSK = log(stats::rexp(1, rate = 3)),
dplyr::across("ADTM", ~NA)
))
) %>%
dplyr::arrange(
.data$STUDYID,
.data$USUBJID,
.data$PARAMCD,
.data$ADTM
)
var_labels(tern_ex_adtte)[1:21] <- var_labels(adtte)[1:21]
tern_ex_adtte <- tern_ex_adtte %>%
var_relabel(
STUDYID = "Study Identifier",
USUBJID = "Unique Subject Identifier",
PARAMCD = "Parameter Code",
PARAM = "Parameter",
CNSR = "Censor",
AVAL = "Analysis Value",
AVALU = "Analysis Value Unit",
ADTM = "Analysis Datetime",
lgTMATRSK = "Log Time At Risk"
)
save(tern_ex_adtte, file = "data/tern_ex_adtte.rda", compress = "xz")
}
Generate Data
# Generate & load ADSL
generate_adsl()
load("data/tern_ex_adsl.rda")
# Generate other datasets
generate_adae()
generate_adlb()
generate_adpp()
generate_adrs()
generate_adtte()