improved numerical stability of ztests

davidsebfischer · davidsebfischer · commit bc271bb10d40 · 2019-08-25T19:41:42.000+02:00
by catching division by zero if stdev is zero.
diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py
@@ -211,6 +211,7 @@ def wald_test(
         if theta_mle.shape[0] != theta0.shape[0]:
             raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
 
+    theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
     wald_statistic = np.abs(np.divide(theta_mle - theta0, theta_sd))
     pvals = 2 * (1 - scipy.stats.norm(loc=0, scale=1).cdf(wald_statistic))  # two-tailed test
     return pvals
@@ -313,7 +314,10 @@ def two_coef_z_test(
     if theta_mle0.shape[0] != theta_sd0.shape[0]:
         raise ValueError('stats.two_coef_z_test(): theta_mle0 and theta_sd0 have to contain the same number of entries')
 
-    z_statistic = np.abs((theta_mle0 - theta_mle1) / np.sqrt(np.square(theta_sd0) + np.square(theta_sd1)))
+    divisor = np.square(theta_sd0) + np.square(theta_sd1)
+    divisor = np.nextafter(0, np.inf, out=divisor, where=divisor < np.nextafter(0, np.inf))
+    divisor = np.sqrt(divisor)
+    z_statistic = np.abs((theta_mle0 - theta_mle1)) / divisor
     pvals = 2 * (1 - scipy.stats.norm(loc=0, scale=1).cdf(z_statistic))  # two-tailed test
     return pvals
 
diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py
@@ -2102,11 +2102,13 @@ def __init__(
         self.grouping = grouping
         self.groups = list(np.asarray(groups))
 
-        # values of parameter estimates: coefficients x genes array with one coefficient per group
+        # Values of parameter estimates: coefficients x genes array with one coefficient per group
         self._theta_mle = model_estim.a_var
-        # standard deviation of estimates: coefficients x genes array with one coefficient per group
-        # theta_sd = sqrt(diagonal(fisher_inv))
-        self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T
+        # Standard deviation of estimates: coefficients x genes array with one coefficient per group
+        # Need .copy() here because nextafter writes into `out`, which requires a mutable array.
+        theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy()
+        theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
+        self._theta_sd = np.sqrt(theta_sd)
         self._logfc = None
 
         # Call tests in constructor.
@@ -2307,11 +2309,13 @@ def __init__(
         else:
             self.groups = groups.tolist()
 
-        # values of parameter estimates: coefficients x genes array with one coefficient per group
+        # Values of parameter estimates: coefficients x genes array with one coefficient per group
         self._theta_mle = model_estim.a_var
-        # standard deviation of estimates: coefficients x genes array with one coefficient per group
-        # theta_sd = sqrt(diagonal(fisher_inv))
-        self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T
+        # Standard deviation of estimates: coefficients x genes array with one coefficient per group
+        # Need .copy() here because nextafter writes into `out`, which requires a mutable array.
+        theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy()
+        theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
+        self._theta_sd = np.sqrt(theta_sd)
 
     def _correction(self, pvals, method="fdr_bh") -> np.ndarray:
         """
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
@@ -819,8 +819,8 @@ def two_sample(
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if test in ['t-test', 'rank'] and noise_model is not None:
-        raise ValueError('base.two_sample(): Do not specify `noise_model` if using test t-test or rank_test: ' +
-                         'The t-test is based on a gaussian noise model and wilcoxon is model free.')
+        raise Warning('two_sample(): Do not specify `noise_model` if using test t-test or rank_test: ' +
+                      'The t-test is based on a gaussian noise model and the rank sum test is model free.')
 
     gene_names = parse_gene_names(data, gene_names)
     grouping = parse_grouping(data, sample_description, grouping)
@@ -906,11 +906,11 @@ def pairwise(
         data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
-        test: str = 'z-test',
-        lazy: bool = False,
+        test: str = "z-test",
+        lazy: bool = True,
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
-        noise_model: str = None,
+        noise_model: str = "nb",
         size_factors: np.ndarray = None,
         batch_size: int = None,
         training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",