|
18 | 18 | from skglm.utils.data import make_dummy_survival_data |
19 | 19 |
|
20 | 20 | n_samples, n_features = 500, 100 |
21 | | -tm, s, X = make_dummy_survival_data( |
| 21 | +X, y = make_dummy_survival_data( |
22 | 22 | n_samples, n_features, |
23 | 23 | normalize=True, |
24 | 24 | random_state=0 |
25 | 25 | ) |
26 | 26 |
|
| 27 | +tm, s = y[:, 0], y[:, 1] |
| 28 | + |
27 | 29 | # %% |
28 | 30 | # The synthetic data has the following properties: |
29 | 31 | # |
| 32 | +# * ``X`` is the matrix of predictors, generated using a standard normal distribution with Toeplitz covariance. |
30 | 33 | # * ``tm`` is the vector of occurrence times which follows a Weibull(1) distribution |
31 | 34 | # * ``s`` indicates the observations' censorship and follows a Bernoulli(0.5) distribution |
32 | | -# * ``X`` is the matrix of predictors, generated using standard normal distribution with Toeplitz covariance. |
33 | 35 | # |
34 | 36 | # Let's inspect the data quickly: |
35 | 37 | import matplotlib.pyplot as plt |
|
70 | 72 | datafit = compiled_clone(Cox()) |
71 | 73 | penalty = compiled_clone(L1(alpha)) |
72 | 74 |
|
73 | | -datafit.initialize(X, (tm, s)) |
| 75 | +datafit.initialize(X, y) |
74 | 76 |
|
75 | 77 | # init solver |
76 | 78 | solver = ProxNewton(fit_intercept=False, max_iter=50) |
77 | 79 |
|
78 | 80 | # solve the problem |
79 | | -w_sk = solver.solve(X, (tm, s), datafit, penalty)[0] |
| 81 | +w_sk = solver.solve(X, y, datafit, penalty)[0] |
80 | 82 |
|
81 | 83 | # %% |
82 | 84 | # For this data and regularization value, a relatively sparse solution is found: |
|
93 | 95 | from lifelines import CoxPHFitter |
94 | 96 |
|
95 | 97 | # format data |
96 | | -stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X)) |
97 | | -df = pd.DataFrame(stacked_tm_s_X) |
| 98 | +stacked_y_X = np.hstack((y, X)) |
| 99 | +df = pd.DataFrame(stacked_y_X) |
98 | 100 |
|
99 | 101 | # fit lifelines estimator |
100 | 102 | lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.).fit( |
|
106 | 108 |
|
107 | 109 | # %% |
108 | 110 | # Check that both solvers find solutions having the same objective value: |
109 | | -obj_sk = datafit.value((tm, s), w_sk, X @ w_sk) + penalty.value(w_sk) |
110 | | -obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll) |
| 111 | +obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk) |
| 112 | +obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll) |
111 | 113 |
|
112 | 114 | print(f"Objective skglm: {obj_sk:.6f}") |
113 | 115 | print(f"Objective lifelines: {obj_ll:.6f}") |
|
141 | 143 | solver.max_iter = n_iter |
142 | 144 |
|
143 | 145 | start = time.perf_counter() |
144 | | - w = solver.solve(X, (tm, s), datafit, penalty)[0] |
| 146 | + w = solver.solve(X, y, datafit, penalty)[0] |
145 | 147 | end = time.perf_counter() |
146 | 148 |
|
147 | 149 | records["skglm"]["objs"].append( |
148 | | - datafit.value((tm, s), w, X @ w) + penalty.value(w) |
| 150 | + datafit.value(y, w, X @ w) + penalty.value(w) |
149 | 151 | ) |
150 | 152 | records["skglm"]["times"].append(end - start) |
151 | 153 |
|
|
164 | 166 | w = lifelines_estimator.params_.values |
165 | 167 |
|
166 | 168 | records["lifelines"]["objs"].append( |
167 | | - datafit.value((tm, s), w, X @ w) + penalty.value(w) |
| 169 | + datafit.value(y, w, X @ w) + penalty.value(w) |
168 | 170 | ) |
169 | 171 | records["lifelines"]["times"].append(end - start) |
170 | 172 |
|
|
212 | 214 | # |
213 | 215 | # Let's start by generating data with tied observations. This can be achieved |
214 | 216 | # by passing ``with_ties=True`` to the ``make_dummy_survival_data`` function. |
215 | | -tm, s, X = make_dummy_survival_data( |
| 217 | +X, y = make_dummy_survival_data( |
216 | 218 | n_samples, n_features, |
217 | 219 | normalize=True, |
218 | 220 | with_ties=True, |
219 | 221 | random_state=0 |
220 | 222 | ) |
| 223 | +tm, s = y[:, 0], y[:, 1] |
221 | 224 |
|
222 | 225 | # check the data has tied observations |
223 | 226 | print(f"Number of unique times {len(np.unique(tm))} out of {n_samples}") |
|
228 | 231 |
|
229 | 232 | # ensure using Efron estimate |
230 | 233 | datafit = compiled_clone(Cox(use_efron=True)) |
231 | | -datafit.initialize(X, (tm, s)) |
| 234 | +datafit.initialize(X, y) |
232 | 235 |
|
233 | 236 | # solve the problem |
234 | 237 | solver = ProxNewton(fit_intercept=False, max_iter=50) |
235 | | -w_sk = solver.solve(X, (tm, s), datafit, penalty)[0] |
| 238 | +w_sk = solver.solve(X, y, datafit, penalty)[0] |
236 | 239 |
|
237 | 240 | # %% |
238 | 241 | # Again a relatively sparse solution is found: |
|
257 | 260 | w_ll = lifelines_estimator.params_.values |
258 | 261 |
|
259 | 262 | # Check that both solvers find solutions with the same objective value |
260 | | -obj_sk = datafit.value((tm, s), w_sk, X @ w_sk) + penalty.value(w_sk) |
261 | | -obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll) |
| 263 | +obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk) |
| 264 | +obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll) |
262 | 265 |
|
263 | 266 | print(f"Objective skglm: {obj_sk:.6f}") |
264 | 267 | print(f"Objective lifelines: {obj_ll:.6f}") |
|
272 | 275 |
|
273 | 276 | # time skglm |
274 | 277 | start = time.perf_counter() |
275 | | -solver.solve(X, (tm, s), datafit, penalty)[0] |
| 278 | +solver.solve(X, y, datafit, penalty)[0] |
276 | 279 | end = time.perf_counter() |
277 | 280 |
|
278 | 281 | total_time_skglm = end - start |
|
0 commit comments