|
4 | 4 | import pandas as pd |
5 | 5 | import scipy.stats as stats |
6 | 6 |
|
7 | | -from batchglm.api.models.glm_nb import Simulator |
8 | 7 | import diffxpy.api as de |
9 | 8 |
|
10 | 9 |
|
11 | | -class TestPairwiseNull(unittest.TestCase): |
| 10 | +class _TestPairwiseNull: |
12 | 11 |
|
13 | | - def test_null_distribution_ztest(self, n_cells: int = 2000, n_genes: int = 100, n_groups=2): |
14 | | - """ |
15 | | - Test if de.wald() generates a uniform p-value distribution |
16 | | - if it is given data simulated based on the null model. Returns the p-value |
17 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
18 | | - p-value distriubution and a uniform distribution. |
| 12 | + noise_model: str |
19 | 13 |
|
20 | | - :param n_cells: Number of cells to simulate (number of observations per test). |
21 | | - :param n_genes: Number of genes to simulate (number of tests). |
22 | | - """ |
23 | | - logging.getLogger("tensorflow").setLevel(logging.ERROR) |
24 | | - logging.getLogger("batchglm").setLevel(logging.WARNING) |
25 | | - logging.getLogger("diffxpy").setLevel(logging.WARNING) |
| 14 | + def _prepate_data( |
| 15 | + self, |
| 16 | + n_cells: int, |
| 17 | + n_genes: int, |
| 18 | + n_groups: int |
| 19 | + ): |
| 20 | + if self.noise_model == "nb": |
| 21 | + from batchglm.api.models.glm_nb import Simulator |
| 22 | + rand_fn_loc = lambda shape: np.random.uniform(0.1, 1, shape) |
| 23 | + rand_fn_scale = lambda shape: np.random.uniform(0.5, 1, shape) |
| 24 | + elif self.noise_model == "norm" or self.noise_model is None: |
| 25 | + from batchglm.api.models.glm_norm import Simulator |
| 26 | + rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) |
| 27 | + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) |
| 28 | + else: |
| 29 | + raise ValueError("noise model %s not recognized" % self.noise_model) |
26 | 30 |
|
27 | 31 | sim = Simulator(num_observations=n_cells, num_features=n_genes) |
28 | 32 | sim.generate_sample_description(num_batches=0, num_conditions=0) |
29 | | - sim.generate() |
| 33 | + sim.generate_params( |
| 34 | + rand_fn_loc=rand_fn_loc, |
| 35 | + rand_fn_scale=rand_fn_scale |
| 36 | + ) |
| 37 | + sim.generate_data() |
30 | 38 |
|
31 | 39 | random_sample_description = pd.DataFrame({ |
32 | 40 | "condition": np.random.randint(n_groups, size=sim.nobs) |
33 | 41 | }) |
34 | | - |
35 | | - test = de.test.pairwise( |
36 | | - data=sim.x, |
37 | | - grouping="condition", |
38 | | - test="z-test", |
39 | | - noise_model="nb", |
40 | | - sample_description=random_sample_description, |
41 | | - dtype="float64" |
42 | | - ) |
43 | | - summary = test.summary() |
44 | | - |
45 | | - # Compare p-value distribution under null model against uniform distribution. |
46 | | - pval_h0 = stats.kstest(test.pval[~np.eye(test.pval.shape[0]).astype(bool)].flatten(), 'uniform').pvalue |
47 | | - |
48 | | - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) |
49 | | - assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) |
50 | | - |
51 | | - return True |
52 | | - |
53 | | - def test_null_distribution_z_lazy(self, n_cells: int = 2000, n_genes: int = 100): |
| 42 | + return sim, random_sample_description |
| 43 | + |
| 44 | + def _test_null_distribution_basic( |
| 45 | + self, |
| 46 | + test: str, |
| 47 | + lazy: bool, |
| 48 | + quick_scale: bool = False, |
| 49 | + n_cells: int = 3000, |
| 50 | + n_genes: int = 200, |
| 51 | + n_groups: int = 3 |
| 52 | + ): |
54 | 53 | """ |
55 | | - Test if de.pairwise() generates a uniform p-value distribution for lazy z-tests |
| 54 | + Test if de.wald() generates a uniform p-value distribution |
56 | 55 | if it is given data simulated based on the null model. Returns the p-value |
57 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
| 56 | + of the two-side Kolmgorov-Smirnov test for equality of the observed |
58 | 57 | p-value distriubution and a uniform distribution. |
59 | 58 |
|
60 | 59 | :param n_cells: Number of cells to simulate (number of observations per test). |
61 | 60 | :param n_genes: Number of genes to simulate (number of tests). |
62 | 61 | """ |
63 | | - logging.getLogger("tensorflow").setLevel(logging.ERROR) |
64 | | - logging.getLogger("batchglm").setLevel(logging.WARNING) |
65 | | - logging.getLogger("diffxpy").setLevel(logging.WARNING) |
66 | | - |
67 | | - sim = Simulator(num_observations=n_cells, num_features=n_genes) |
68 | | - sim.generate_sample_description(num_batches=0, num_conditions=0) |
69 | | - sim.generate() |
70 | | - |
71 | | - random_sample_description = pd.DataFrame({ |
72 | | - "condition": np.random.randint(4, size=sim.nobs) |
73 | | - }) |
74 | | - |
| 62 | + sim, sample_description = self._prepate_data( |
| 63 | + n_cells=n_cells, |
| 64 | + n_genes=n_genes, |
| 65 | + n_groups=n_groups |
| 66 | + ) |
75 | 67 | test = de.test.pairwise( |
76 | | - data=sim.x, |
| 68 | + data=sim.input_data, |
| 69 | + sample_description=sample_description, |
77 | 70 | grouping="condition", |
78 | | - test='z-test', |
79 | | - lazy=True, |
80 | | - noise_model="nb", |
81 | | - pval_correction="global", |
82 | | - quick_scale=True, |
83 | | - sample_description=random_sample_description, |
84 | | - dtype="float64" |
| 71 | + test=test, |
| 72 | + lazy=lazy, |
| 73 | + quick_scale=quick_scale, |
| 74 | + noise_model=self.noise_model |
85 | 75 | ) |
| 76 | + _ = test.summary() |
86 | 77 |
|
87 | 78 | # Compare p-value distribution under null model against uniform distribution. |
88 | | - pvals = test.pval_pairs(groups0=0, groups1=1) |
89 | | - pval_h0 = stats.kstest(pvals.flatten(), 'uniform').pvalue |
| 79 | + if lazy: |
| 80 | + pval_h0 = stats.kstest(test.pval_pairs(groups0=0, groups1=1).flatten(), 'uniform').pvalue |
| 81 | + else: |
| 82 | + pval_h0 = stats.kstest(test.pval[0, 1, :].flatten(), 'uniform').pvalue |
90 | 83 |
|
91 | 84 | logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) |
92 | 85 | assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) |
93 | | - |
94 | 86 | return True |
95 | 87 |
|
96 | | - def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_groups=2): |
97 | | - """ |
98 | | - Test if de.wald() generates a uniform p-value distribution |
99 | | - if it is given data simulated based on the null model. Returns the p-value |
100 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
101 | | - p-value distriubution and a uniform distribution. |
102 | 88 |
|
103 | | - :param n_cells: Number of cells to simulate (number of observations per test). |
104 | | - :param n_genes: Number of genes to simulate (number of tests). |
105 | | - """ |
| 89 | +class TestPairwiseNullStandard(unittest.TestCase, _TestPairwiseNull): |
| 90 | + |
| 91 | + def test_null_distribution_ttest(self): |
106 | 92 | logging.getLogger("tensorflow").setLevel(logging.ERROR) |
107 | 93 | logging.getLogger("batchglm").setLevel(logging.WARNING) |
108 | 94 | logging.getLogger("diffxpy").setLevel(logging.WARNING) |
109 | 95 |
|
110 | | - sim = Simulator(num_observations=n_cells, num_features=n_genes) |
111 | | - sim.generate_sample_description(num_batches=0, num_conditions=0) |
112 | | - sim.generate() |
113 | | - |
114 | | - random_sample_description = pd.DataFrame({ |
115 | | - "condition": np.random.randint(n_groups, size=sim.nobs) |
116 | | - }) |
117 | | - |
118 | | - test = de.test.pairwise( |
119 | | - data=sim.x, |
120 | | - grouping="condition", |
121 | | - test="lrt", |
122 | | - noise_model="nb", |
123 | | - sample_description=random_sample_description, |
124 | | - dtype="float64" |
125 | | - ) |
126 | | - |
127 | | - # Compare p-value distribution under null model against uniform distribution. |
128 | | - pval_h0 = stats.kstest(test.pval[~np.eye(test.pval.shape[0]).astype(bool)].flatten(), 'uniform').pvalue |
| 96 | + np.random.seed(1) |
| 97 | + self.noise_model = None |
| 98 | + self._test_null_distribution_basic(test="t-test", lazy=False) |
129 | 99 |
|
130 | | - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) |
131 | | - assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) |
132 | | - |
133 | | - return True |
134 | | - |
135 | | - def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 10000, n_groups=2): |
136 | | - """ |
137 | | - Test if de.wald() generates a uniform p-value distribution |
138 | | - if it is given data simulated based on the null model. Returns the p-value |
139 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
140 | | - p-value distriubution and a uniform distribution. |
141 | | -
|
142 | | - :param n_cells: Number of cells to simulate (number of observations per test). |
143 | | - :param n_genes: Number of genes to simulate (number of tests). |
144 | | - """ |
| 100 | + def test_null_distribution_rank(self): |
145 | 101 | logging.getLogger("tensorflow").setLevel(logging.ERROR) |
146 | 102 | logging.getLogger("batchglm").setLevel(logging.WARNING) |
147 | 103 | logging.getLogger("diffxpy").setLevel(logging.WARNING) |
148 | 104 |
|
149 | | - sim = Simulator(num_observations=n_cells, num_features=n_genes) |
150 | | - sim.generate_sample_description(num_batches=0, num_conditions=0) |
151 | | - sim.generate() |
152 | | - |
153 | | - random_sample_description = pd.DataFrame({ |
154 | | - "condition": np.random.randint(n_groups, size=sim.nobs) |
155 | | - }) |
156 | | - |
157 | | - test = de.test.pairwise( |
158 | | - data=sim.x, |
159 | | - grouping="condition", |
160 | | - test="t-test", |
161 | | - sample_description=random_sample_description, |
162 | | - ) |
163 | | - summary = test.summary() |
164 | | - |
165 | | - # Compare p-value distribution under null model against uniform distribution. |
166 | | - pval_h0 = stats.kstest(test.pval[~np.eye(test.pval.shape[0]).astype(bool)].flatten(), 'uniform').pvalue |
167 | | - |
168 | | - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) |
169 | | - assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) |
| 105 | + np.random.seed(1) |
| 106 | + self.noise_model = None |
| 107 | + self._test_null_distribution_basic(test="rank", lazy=False) |
170 | 108 |
|
171 | | - return True |
172 | 109 |
|
173 | | - def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10000, n_groups=2): |
174 | | - """ |
175 | | - Test if de.wald() generates a uniform p-value distribution |
176 | | - if it is given data simulated based on the null model. Returns the p-value |
177 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
178 | | - p-value distriubution and a uniform distribution. |
| 110 | +class TestPairwiseNullNb(unittest.TestCase, _TestPairwiseNull): |
179 | 111 |
|
180 | | - :param n_cells: Number of cells to simulate (number of observations per test). |
181 | | - :param n_genes: Number of genes to simulate (number of tests). |
182 | | - """ |
| 112 | + def test_null_distribution_ztest(self): |
183 | 113 | logging.getLogger("tensorflow").setLevel(logging.ERROR) |
184 | 114 | logging.getLogger("batchglm").setLevel(logging.WARNING) |
185 | 115 | logging.getLogger("diffxpy").setLevel(logging.WARNING) |
186 | 116 |
|
187 | | - sim = Simulator(num_observations=n_cells, num_features=n_genes) |
188 | | - sim.generate_sample_description(num_batches=0, num_conditions=0) |
189 | | - sim.generate() |
190 | | - |
191 | | - random_sample_description = pd.DataFrame({ |
192 | | - "condition": np.random.randint(n_groups, size=sim.nobs) |
193 | | - }) |
194 | | - |
195 | | - test = de.test.pairwise( |
196 | | - data=sim.x, |
197 | | - grouping="condition", |
198 | | - test="wilcoxon", |
199 | | - sample_description=random_sample_description, |
200 | | - ) |
201 | | - summary = test.summary() |
202 | | - |
203 | | - # Compare p-value distribution under null model against uniform distribution. |
204 | | - pval_h0 = stats.kstest(test.pval[~np.eye(test.pval.shape[0]).astype(bool)].flatten(), 'uniform').pvalue |
| 117 | + np.random.seed(1) |
| 118 | + self.noise_model = "nb" |
| 119 | + self._test_null_distribution_basic(test="z-test", lazy=False, quick_scale=False) |
| 120 | + self._test_null_distribution_basic(test="z-test", lazy=False, quick_scale=True) |
205 | 121 |
|
206 | | - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) |
207 | | - assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) |
208 | | - |
209 | | - return True |
210 | | - |
211 | | - |
212 | | -class TestPairwiseDE(unittest.TestCase): |
213 | | - |
214 | | - def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500): |
215 | | - """ |
216 | | - Test if de.lrt() generates a uniform p-value distribution |
217 | | - if it is given data simulated based on the null model. Returns the p-value |
218 | | - of the two-side Kolmgorov-Smirnov test for equality of the observed |
219 | | - p-value distriubution and a uniform distribution. |
220 | | -
|
221 | | - :param n_cells: Number of cells to simulate (number of observations per test). |
222 | | - :param n_genes: Number of genes to simulate (number of tests). |
223 | | - """ |
| 122 | + def test_null_distribution_ztest_lazy(self): |
224 | 123 | logging.getLogger("tensorflow").setLevel(logging.ERROR) |
225 | 124 | logging.getLogger("batchglm").setLevel(logging.WARNING) |
226 | 125 | logging.getLogger("diffxpy").setLevel(logging.WARNING) |
227 | 126 |
|
228 | | - num_non_de = n_genes // 2 |
229 | | - sim = Simulator(num_observations=n_cells, num_features=n_genes) |
230 | | - sim.generate_sample_description(num_batches=0, num_conditions=2) |
231 | | - # simulate: coefficients ~ log(N(1, 0.5)). |
232 | | - # re-sample if N(1, 0.5) <= 0 |
233 | | - sim.generate_params(rand_fn=lambda shape: 1 + stats.truncnorm.rvs(-1 / 0.5, np.infty, scale=0.5, size=shape)) |
234 | | - sim.params["a"][1, :num_non_de] = 0 |
235 | | - sim.params["b"][1, :num_non_de] = 0 |
236 | | - sim.params["isDE"] = ("features",), np.arange(n_genes) >= num_non_de |
237 | | - sim.generate_data() |
| 127 | + np.random.seed(1) |
| 128 | + self.noise_model = "nb" |
| 129 | + self._test_null_distribution_basic(test="z-test", lazy=True, quick_scale=False) |
| 130 | + self._test_null_distribution_basic(test="z-test", lazy=True, quick_scale=True) |
238 | 131 |
|
239 | | - sample_description = sim.sample_description |
| 132 | + def test_null_distribution_wald(self): |
| 133 | + logging.getLogger("tensorflow").setLevel(logging.ERROR) |
| 134 | + logging.getLogger("batchglm").setLevel(logging.WARNING) |
| 135 | + logging.getLogger("diffxpy").setLevel(logging.WARNING) |
240 | 136 |
|
241 | | - test = de.test.pairwise( |
242 | | - data=sim.x, |
243 | | - grouping="condition", |
244 | | - test="z-test", |
245 | | - noise_model="nb", |
246 | | - sample_description=sample_description, |
247 | | - ) |
248 | | - summary = test.summary() |
| 137 | + np.random.seed(1) |
| 138 | + self.noise_model = "nb" |
| 139 | + self._test_null_distribution_basic(test="wald", lazy=False, quick_scale=False) |
| 140 | + self._test_null_distribution_basic(test="wald", lazy=False, quick_scale=True) |
249 | 141 |
|
250 | | - frac_nonde_sig = np.mean( |
251 | | - np.sum(test.qval[~np.eye(test.pval.shape[0]).astype(bool), :num_non_de] < 0.05) / |
252 | | - (2 * num_non_de) |
253 | | - ) |
254 | | - frac_de_sig = np.mean( |
255 | | - np.sum(test.qval[~np.eye(test.pval.shape[0]).astype(bool), num_non_de:] < 0.05) / |
256 | | - (2 * (n_genes - num_non_de)) |
257 | | - ) |
258 | | - logging.getLogger("diffxpy").info('fraction of non-DE genes with q-value < 0.05: %.1f%%' % |
259 | | - str(np.round(100. * frac_nonde_sig, 3))) |
260 | | - logging.getLogger("diffxpy").info('fraction of DE genes with q-value < 0.05: %.1f%%' % |
261 | | - str(np.round(100. * frac_de_sig, 3))) |
| 142 | + def test_null_distribution_lrt(self): |
| 143 | + logging.getLogger("tensorflow").setLevel(logging.ERROR) |
| 144 | + logging.getLogger("batchglm").setLevel(logging.WARNING) |
| 145 | + logging.getLogger("diffxpy").setLevel(logging.WARNING) |
262 | 146 |
|
263 | | - assert frac_de_sig > 0.5, "too many DE" |
264 | | - assert frac_nonde_sig < 0.5, "too many non-DE" |
265 | | - return True |
| 147 | + np.random.seed(1) |
| 148 | + self.noise_model = "nb" |
| 149 | + self._test_null_distribution_basic(test="lrt", lazy=False, quick_scale=False) |
266 | 150 |
|
267 | 151 |
|
268 | 152 | if __name__ == '__main__': |
|
0 commit comments