Sys.setenv(OMP_THREAD_LIMIT = 1) # Reducing core use, to avoid accidental use of too many cores
library(Colossus)
library(data.table)
if (system.file(package = "survival") != "") {
library(survival)
}
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, unionOne alternative regression method in Colossus for
predicting the probability of an event in independent trials is logistic
regression. The theory is presented in the following section.
We start by describing the structure of the data. The logistic
regression models in Colossus assume the data can be
represented in terms of trials, events, and risk factors. Similar to a
Poisson model using person-years and number of events in each row, a
logistic model requires some measure of the number of independent trials
and events for each group. A special case is an analysis in which each
row represents one trial, and either 0 or 1 events are observed. The
likelihood (\(L\)) for a logistic model
is a function of the trials (\(N_i\)),
events (\(y_i\)), and the probability
of a success (\(p_i\)) for each row,
\(i\). In Colossus, the
log-likelihood (\(Ll\)) is
optimized.
\[ \begin{aligned} L = \prod_i \left [ p_i^{y_i} \times (1-p_i)^{N_i - y_i} \right ] \\ Ll = \sum_i \left [ y_i \ln(p_i) + (N_i - y_i) \ln(1-p_i) \right ] \\ \end{aligned} \]
Similar to other survival models in Colossus, a model is
written as a function of the various risk factors (\(f(\vec{\beta}, \vec{x})\)). This function
is related to the probability through a linking function. Four options
are currently available: odds, identity, complementary log, and probit
linking, the first three were based on the options present in 32-bit
Epicure. The last option, probit, uses the cumulative density function
of the standard normal distribution (\(\Phi\)).
\[ \begin{aligned} p_{odds} = \frac{f \left(\vec{\beta}, \vec{x} \right)}{1+f \left(\vec{\beta}, \vec{x} \right)} \\ p_{id} = f \left(\vec{\beta}, \vec{x} \right) \\ p_{comp} = e^{-f \left(\vec{\beta}, \vec{x} \right)} \\ p_{probit} = \Phi \left( f \left(\vec{\beta},\vec{x} \right) \right) \end{aligned} \]
Similar to other regression models in Colossus, the
first derivative vector and second derivative matrix are calculated to
optimize the model parameters.
\[ \begin{aligned} \frac{\partial Ll}{\partial \mu} = \sum_i \left [ y_i \frac{\partial p_i/\partial \mu}{p_i} - \left(N_i - y_i \right) \frac{\partial p_i/\partial \mu}{1-p_i} \right ] \\ \frac{\partial^2 Ll}{\partial \mu \partial \nu} = \sum_i \left [ y_i \left (\frac{\partial^2 p_i/\partial \mu \partial \nu}{p_i} - \frac{\partial p_i/\partial \mu}{p_i} \frac{\partial p_i/\partial \nu}{p_i} \right ) - \left(N_i - y_i \right) \left ( \frac{\partial^2 p_i/\partial \mu \partial \nu}{1-p_i} + \frac{\partial p_i/\partial \mu}{1-p_i} \frac{\partial p_i/\partial \nu}{1-p_i} \right ) \right ] \\ \end{aligned} \]
The use of a logistic model in Colossus closely follows
the other regression options. We start by loading and preparing the
data. In this example, we will be using the Veterans’ Administration
Lung Cancer study data from the survival package. Every row
has 1 trial and a binary event status column. Once again, our model uses
a trial column, an event column, and the “risk” equation that will be
converted to a probability. In this case, the number of trials is 1, so
the column can be omitted, but to illustrate, we will artificially
create a trial column named “trial”.
#
if (system.file(package = "survival") != "") {
veteran %>% setDT()
df <- copy(veteran)
} else {
karno <- c(60, 70, 60, 60, 70, 20, 40, 80, 50, 70, 60, 40, 30, 80, 70, 60, 60, 40, 80, 60, 40, 60, 60, 30, 80, 30, 50, 60, 80, 40, 20, 80, 30, 75, 70, 60, 30, 60, 80, 60, 70, 50, 50, 40, 40, 20, 70, 40, 80, 80, 50, 80, 30, 80, 50, 80, 50, 70, 60, 40, 80, 80, 70, 90, 90, 80, 80, 70, 60, 90, 80, 80, 50, 50, 70, 70, 20, 60, 90, 30, 20, 70, 90, 80, 50, 70, 60, 90, 50, 30, 70, 20, 30, 60, 40, 30, 20, 60, 70, 80, 85, 70, 70, 70, 50, 30, 40, 40, 40, 99, 80, 60, 60, 60, 60, 50, 70, 10, 40, 70, 90, 80, 50, 40, 40, 60, 70, 30, 60, 30, 60, 80, 75, 60, 70, 80, 30)
trt <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
cell <- c("squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "squamous", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "smallcell", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "adeno", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large", "large")
status <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
df <- data.table(
karno = karno,
trt = trt,
cell = cell,
status = status
)
}
# Make the same adjustments as Epicure example 6.5
karno <- df$karno
karno[93] <- 20
df$karno <- karno
df$trt <- df$trt - 1
df$trt <- as.integer(df$trt == 0)
cell_lvl <- c("large", "squamous", "smallcell", "adeno")
df$cell <- as.integer(factor(df$celltype, level = cell_lvl)) - 1
df$karno50 <- df$karno - 50
df$trial <- 1We will start with testing the effects of cell type. By default,
Colossus will always remove the first level of a factored
column. We can force Colossus to include every level by
defining the lowest level to an unused value, -1 in this case. This
gives the same final model as if we removed the baseline and added an
intercept column (CONST). In this case, the p-values are for the null
hypothesis that the cell type has no effect. If we removed the baseline
and added an intercept, the p-values would be for the null hypothesis
that the cell type has no effect above the baseline.
df$cell <- factor(df$cell, levels = c(-1, 0, 1, 2, 3))
model <- Logit(trial, status) ~ loglinear(cell)
model <- Logit(status) ~ loglinear(cell)
control <- list(verbose = 0, ncores = 1)
e <- LogisticRun(model, df, control = control)
print(e)
#> |--------------------------------------------------------------------------------|
#> Final Results
#> Covariate Subterm Central Estimate Standard Error 2-tail p-value
#> <char> <char> <num> <num> <num>
#> 1: cell_0 loglin 3.26 1.018 1.38e-03
#> 2: cell_1 loglin 2.05 0.531 1.16e-04
#> 3: cell_2 loglin 2.71 0.596 5.58e-06
#> 4: cell_3 loglin 3.26 1.018 1.38e-03
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#>
#> Logisitic Model Used
#> Odds Ratio Linking Function Used
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#> -2*Log-Likelihood: 64.429, Deviance: 64.429, AIC: 72.429, BIC: 84.109
#> Iterations run: 7
#> maximum step size: 6.739e-03, maximum first derivative: 2.208e-03
#> Last iteration improved the log-likelihood by: 8.753e-05
#> Analysis converged
#> Records Used: 137, Records Removed: 0
#> Run finished in 0.034 seconds
#> |--------------------------------------------------------------------------------|In this analysis, we were using the default of the odds ratio. The linking function can be changed by using the “link” option. The following examples show the impact of using each linking function. Note that the probability must be between 0-1, so the starting values are set to ensure that the initial probability estimates are valid. Because we are using categorical risk factors, every model converges to the same score and event probabilities; however, the standard error and p-values are affected. If we were using continuous variables, the linking functions would give a way to define further complicated risk models.
a_n <- c(0.1, 0.1, 0.1, 0.1)
e <- LogisticRun(model, df, control = control, a_n = a_n, link = "odds")
print(e)
#> |--------------------------------------------------------------------------------|
#> Final Results
#> Covariate Subterm Central Estimate Standard Error 2-tail p-value
#> <char> <char> <num> <num> <num>
#> 1: cell_0 loglin 3.26 1.018 1.38e-03
#> 2: cell_1 loglin 2.05 0.531 1.16e-04
#> 3: cell_2 loglin 2.71 0.596 5.58e-06
#> 4: cell_3 loglin 3.26 1.018 1.38e-03
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#>
#> Logisitic Model Used
#> Odds Ratio Linking Function Used
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#> -2*Log-Likelihood: 64.429, Deviance: 64.429, AIC: 72.429, BIC: 84.109
#> Iterations run: 7
#> maximum step size: 6.013e-03, maximum first derivative: 1.961e-03
#> Last iteration improved the log-likelihood by: 6.929e-05
#> Analysis converged
#> Records Used: 137, Records Removed: 0
#> Run finished in 0.024 seconds
#> |--------------------------------------------------------------------------------|
a_n <- c(-1, -1, -1, -1)
e <- LogisticRun(model, df, control = control, a_n = a_n, link = "ident")
print(e)
#> |--------------------------------------------------------------------------------|
#> Final Results
#> Covariate Subterm Central Estimate Standard Error 2-tail p-value
#> <char> <char> <num> <num> <num>
#> 1: cell_0 loglin -0.0377 0.0377 0.3173
#> 2: cell_1 loglin -0.1213 0.0607 0.0457
#> 3: cell_2 loglin -0.0645 0.0373 0.0834
#> 4: cell_3 loglin -0.0377 0.0377 0.3173
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#>
#> Logisitic Model Used
#> Identity Linking Function Used
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#> -2*Log-Likelihood: 64.429, Deviance: 64.429, AIC: 72.429, BIC: 84.109
#> Iterations run: 7
#> maximum step size: 1.209e-04, maximum first derivative: 2.863e-02
#> Last iteration improved the log-likelihood by: 1.173e-05
#> Analysis converged
#> Records Used: 137, Records Removed: 0
#> Run finished in 0.024 seconds
#> |--------------------------------------------------------------------------------|
a_n <- c(0.1, 0.1, 0.1, 0.1)
e <- LogisticRun(model, df, control = control, a_n = a_n, link = "loglink")
print(e)
#> |--------------------------------------------------------------------------------|
#> Final Results
#> Covariate Subterm Central Estimate Standard Error 2-tail p-value
#> <char> <char> <num> <num> <num>
#> 1: cell_0 loglin -3.28 1.000 1.05e-03
#> 2: cell_1 loglin -2.11 0.500 2.49e-05
#> 3: cell_2 loglin -2.74 0.577 2.07e-06
#> 4: cell_3 loglin -3.28 1.000 1.05e-03
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#>
#> Logisitic Model Used
#> Complementary Log Linking Function Used
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#> -2*Log-Likelihood: 64.429, Deviance: 64.429, AIC: 72.429, BIC: 84.109
#> Iterations run: 9
#> maximum step size: 2.008e-03, maximum first derivative: 7.219e-04
#> Last iteration improved the log-likelihood by: 8.184e-06
#> Analysis converged
#> Records Used: 137, Records Removed: 0
#> Run finished in 0.025 seconds
#> |--------------------------------------------------------------------------------|
a_n <- c(-1, -1, -1, -1)
e <- LogisticRun(model, df, control = control, a_n = a_n, link = "probit")
print(e)
#> |--------------------------------------------------------------------------------|
#> Final Results
#> Covariate Subterm Central Estimate Standard Error 2-tail p-value
#> <char> <char> <num> <num> <num>
#> 1: cell_0 loglin 0.581 0.251 0.0209
#> 2: cell_1 loglin 0.186 0.231 0.4218
#> 3: cell_2 loglin 0.428 0.185 0.0208
#> 4: cell_3 loglin 0.581 0.251 0.0209
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#>
#> Logisitic Model Used
#> Probability Unit (probit) Linking Function Used
#> |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
#> -2*Log-Likelihood: 64.429, Deviance: 64.429, AIC: 72.429, BIC: 84.109
#> Iterations run: 6
#> maximum step size: 1.818e-03, maximum first derivative: 9.635e-03
#> Last iteration improved the log-likelihood by: 9.895e-05
#> Analysis converged
#> Records Used: 137, Records Removed: 0
#> Run finished in 0.026 seconds
#> |--------------------------------------------------------------------------------|