theislab
diff --git a/diffxpy/api/__init__.py
Lines changed: 1 addition & 0 deletions b/diffxpy/api/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/diffxpy/api/stats.py
Lines changed: 1 addition & 1 deletion b/diffxpy/api/stats.py
Lines changed: 1 addition & 1 deletion
diff --git a/diffxpy/api/test.py
Lines changed: 3 additions & 11 deletions b/diffxpy/api/test.py
Lines changed: 3 additions & 11 deletions
diff --git a/diffxpy/enrichment/enrich.py
Lines changed: 76 additions & 41 deletions b/diffxpy/enrichment/enrich.py
Lines changed: 76 additions & 41 deletions
diff --git a/diffxpy/pkg_constants.py
Lines changed: 14 additions & 5 deletions b/diffxpy/pkg_constants.py
Lines changed: 14 additions & 5 deletions
diff --git a/diffxpy/stats/stats.py
Lines changed: 18 additions & 10 deletions b/diffxpy/stats/stats.py
Lines changed: 18 additions & 10 deletions
diff --git a/diffxpy/testing/__init__.py
Lines changed: 3 additions & 0 deletions b/diffxpy/testing/__init__.py
Lines changed: 3 additions & 0 deletions
@@ -5,3 +5,4 @@
 from . import enrich
 from . import stats
 from . import utils
+from .. import pkg_constants
@@ -3,7 +3,7 @@
 from diffxpy.stats.stats import wald_test
 from diffxpy.stats.stats import wald_test_chisq
 from diffxpy.stats.stats import two_coef_z_test
-from diffxpy.stats.stats import wilcoxon_test
+from diffxpy.stats.stats import mann_whitney_u_test
 from diffxpy.stats.stats import t_test_moments
 from diffxpy.stats.stats import t_test_raw
 
 
@@ -1,11 +1,3 @@
-from diffxpy.testing.base import two_sample
-from diffxpy.testing.base import lrt
-from diffxpy.testing.base import wald
-from diffxpy.testing.base import t_test
-from diffxpy.testing.base import wilcoxon
-from diffxpy.testing.base import partition
-from diffxpy.testing.base import pairwise
-from diffxpy.testing.base import versus_rest
-from diffxpy.testing.base import continuous_1d
-from diffxpy.testing.base import design_matrix
-from diffxpy.testing.base import coef_names
+from diffxpy.testing import lrt, wald, t_test, rank_test, two_sample, pairwise, \
+    versus_rest, partition, continuous_1d
+from diffxpy.testing import design_matrix, coef_names
@@ -1,12 +1,15 @@
+import logging
 import numpy as np
 import pandas as pd
+from typing import Union
 
 from ..stats import stats
 from ..testing import correction
-from ..testing.base import _DifferentialExpressionTest
+from ..testing.det import _DifferentialExpressionTest
 
+logger = logging.getLogger(__name__)
 
-class RefSets():
+class RefSets:
     """
     Class for a list of gene sets.
 
@@ -18,7 +21,7 @@ class RefSets():
     .gmt files can be downloaded from http://software.broadinstitute.org/gsea/msigdb/collections.jsp for example.
     """
 
-    class _Set():
+    class _Set:
         """ 
         Class for a single gene set.
         """
@@ -182,18 +185,19 @@ def overlap(self, enq_set: set, set_id=None):
             for x in self.sets:
                 x.intersect = x.genes.intersection(enq_set)
         else:
-            x.intersect = self.get_set(id).genes.intersection(enq_set)
+            x.intersect = self.get_set(id).genes.intersection(enq_set)  # bug
 
 
 def test(
-        RefSets: RefSets,
-        DETest: _DifferentialExpressionTest = None,
-        pval: np.array = None,
-        gene_ids: list = None,
+        ref: RefSets,
+        det: Union[_DifferentialExpressionTest, None] = None,
+        pval: Union[np.array, None] = None,
+        gene_ids: Union[list, None] = None,
         de_threshold=0.05,
+        incl_all_zero=False,
         all_ids=None,
-        clean_ref=True,
-        upper=False
+        clean_ref=False,
+        capital=True
 ):
     """ Perform gene set enrichment.
 
@@ -214,51 +218,61 @@ def test(
         which can be matched against the identifieres in the sets in RefSets.
     :param de_threshold:
         Significance threshold at which a differential test (a multiple-testing 
-        corrected p-value) is called siginficant. This 
+        corrected p-value) is called significant.
+    :param incl_all_zero:
+        Whether to include genes which were all zero in the gene universe.
     :param all_ids:
         Set of all gene identifiers, this is used as the background set in the
         hypergeometric test. Only supply this if not all genes were tested
         and are supplied above in DETest or gene_ids.
     :param clean_ref:
         Whether or not to only retain gene identifiers in RefSets that occur in 
         the background set of identifiers supplied here through all_ids.
-    :param upper:
+    :param capital:
         Make all gene IDs captial.
     """
     return Enrich(
-        RefSets=RefSets,
-        DETest=DETest,
+        ref=ref,
+        det=det,
         pval=pval,
         gene_ids=gene_ids,
         de_threshold=de_threshold,
+        incl_all_zero=incl_all_zero,
         all_ids=all_ids,
         clean_ref=clean_ref,
-        upper=upper)
+        capital=capital
+    )
 
 
-class Enrich():
+class Enrich:
     """
     """
 
     def __init__(
             self,
-            RefSets: RefSets,
-            DETest: _DifferentialExpressionTest = None,
-            pval: np.array = None,
-            gene_ids: list = None,
-            de_threshold=0.05,
-            all_ids=None,
-            clean_ref=True,
-            upper=False
+            ref: RefSets,
+            det: Union[_DifferentialExpressionTest, None],
+            pval: Union[np.array, None],
+            gene_ids: Union[list, None],
+            de_threshold,
+            incl_all_zero,
+            all_ids,
+            clean_ref,
+            capital
     ):
         self._n_overlaps = None
         self._pval_enrich = None
         self._qval_enrich = None
         # Load multiple-testing-corrected differential expression
         # p-values from differential expression output.
-        if DETest is not None:
-            self._qval_de = DETest.qval
-            self._gene_ids = DETest.gene_ids
+        if det is not None:
+            if incl_all_zero:
+                self._qval_de = det.qval
+                self._gene_ids = det.gene_ids
+            else:
+                idx_not_all_zero = np.where(np.logical_not(det.summary()["zero_mean"].values))[0]
+                self._qval_de = det.qval[idx_not_all_zero]
+                self._gene_ids = det.gene_ids[idx_not_all_zero]
         elif pval is not None and gene_ids is not None:
             self._qval_de = np.asarray(pval)
             self._gene_ids = gene_ids
@@ -268,8 +282,11 @@ def __init__(
         # Select significant genes based on user defined threshold.
         if any([x is np.nan for x in self._gene_ids]):
             idx_notnan = np.where([x is not np.nan for x in self._gene_ids])[0]
-            print('Discarded ' + str(len(self._gene_ids) - len(idx_notnan)) + ' nan gene ids, leaving ' +
-                  str(len(idx_notnan)) + ' genes.')
+            logger.info(
+                " Discarded %i nan gene ids, leaving %i genes.",
+                len(self._gene_ids) - len(idx_notnan),
+                len(idx_notnan)
+            )
             self._qval_de = self._qval_de[idx_notnan]
             self._gene_ids = self._gene_ids[idx_notnan]
 
@@ -280,25 +297,31 @@ def __init__(
         else:
             self._all_ids = set(self._gene_ids)
 
-        if upper == True:
+        if capital:
             self._gene_ids = [x.upper() for x in self._gene_ids]
             self._all_ids = set([x.upper() for x in self._all_ids])
+            self._significant_ids = set([x.upper() for x in self._significant_ids])
 
         # Generate diagnostic statistic of number of possible overlaps in total.
-        print(str(len(set(self._all_ids).intersection(set(RefSets._genes)))) +
-              ' overlaps found between refset (' + str(len(RefSets._genes)) +
-              ') and provided gene list (' + str(len(self._all_ids)) + ').')
-        self.missing_genes = list(set(RefSets._genes).difference(set(self._all_ids)))
+        logger.info(
+            " %i overlaps found between refset (%i) and provided gene list (%i).",
+            len(set(self._all_ids).intersection(set(ref._genes))),
+            len(ref._genes),
+            len(self._all_ids)
+        )
+        self.missing_genes = list(set(ref._genes).difference(set(self._all_ids)))
         # Clean reference set to only contains ids that were observed in
         # current study if required.
-        self.RefSets = RefSets
-        if clean_ref == True:
+        self.RefSets = ref
+        if clean_ref:
             self.RefSets.clean(self._all_ids)
         # Print if there are empty sets.
         idx_nonempty = np.where([len(x.genes) > 0 for x in self.RefSets.sets])[0]
         if len(self.RefSets.sets) - len(idx_nonempty) > 0:
-            print('Found ' + str(len(self.RefSets.sets) - len(idx_nonempty)) +
-                  ' empty sets, removing those.')
+            logger.info(
+                " Found %i empty sets, removing those.",
+                len(self.RefSets.sets) - len(idx_nonempty)
+            )
             self.RefSets = self.RefSets.subset(idx=idx_nonempty)
         elif len(idx_nonempty) == 0:
             raise ValueError('all RefSets were empty')
@@ -356,7 +379,7 @@ def grepv_sets(self, x):
         """
         return self.RefSets.grepv_sets(x)
 
-    def set(id):
+    def set(self, id):
         """ 
         Return the set with a given set identifier.
         """
@@ -374,9 +397,11 @@ def significant_set_ids(self, threshold=0.05) -> np.array:
         """
         return [self.RefSets._ids[i] for i in np.where(self.qval <= threshold)[0]]
 
-    def summary(self) -> pd.DataFrame:
+    def summary(self, sort=True) -> pd.DataFrame:
         """
         Summarize gene set enrichement analysis as an output table.
+
+        :param sort: Whether to sort table by p-value.
         """
         res = pd.DataFrame({
             "set": self.RefSets._ids,
@@ -388,5 +413,15 @@ def summary(self) -> pd.DataFrame:
             "background": len(self._all_ids)
         })
         # Sort by p-value
-        res = res.iloc[np.argsort(res['pval'].values), :]
+        if sort:
+            res = res.iloc[np.argsort(res['pval'].values), :]
         return res
+
+    def set_summary(self, id: str):
+        """
+        Summarize gene set enrichment analysis for a given set.
+        :param id: Gene set to enquire.
+
+        :return: Slice of summary table.
+        """
+        return self.summary(sort=False).iloc[self.RefSets._ids.index(id), :]
@@ -1,7 +1,16 @@
-BATCHGLM_OPTIM_GD = True
-BATCHGLM_OPTIM_ADAM = True
+DE_TREAT_ZEROVAR_TT_AS_SIG = True
+
+BATCHGLM_OPTIM_GD = False
+BATCHGLM_OPTIM_ADAM = False
 BATCHGLM_OPTIM_ADAGRAD = False
 BATCHGLM_OPTIM_RMSPROP = False
-BATCHGLM_OPTIM_NEWTON = True
-BATCHGLM_OPTIM_IRLS = True
-BATCHGLM_TERMINATION_TYPE = "by_feature"
+BATCHGLM_OPTIM_NEWTON = False
+BATCHGLM_OPTIM_NEWTON_TR = False
+BATCHGLM_OPTIM_IRLS = False
+BATCHGLM_OPTIM_IRLS_GD = False
+BATCHGLM_OPTIM_IRLS_TR = True
+BATCHGLM_OPTIM_IRLS_GD_TR = True
+
+BATCHGLM_PROVIDE_BATCHED = True
+BATCHGLM_PROVIDE_FIM = True
+BATCHGLM_PROVIDE_HESSIAN = False
@@ -1,7 +1,9 @@
+from typing import Union
+
 import numpy as np
 import numpy.linalg
 import scipy.stats
-from typing import Union
+import xarray as xr
 
 
 def likelihood_ratio_test(
@@ -37,7 +39,7 @@ def likelihood_ratio_test(
     return pvals
 
 
-def wilcoxon_test(
+def mann_whitney_u_test(
         x0: np.ndarray,
         x1: np.ndarray,
 ):
@@ -68,7 +70,8 @@ def wilcoxon_test(
         scipy.stats.mannwhitneyu(
             x=x0[:, i].flatten(),
             y=x1[:, i].flatten(),
-            alternative='two-sided'
+            use_continuity=True,
+            alternative="two-sided"
         ).pvalue for i in range(x0.shape[1])
     ])
     return pvals
@@ -152,7 +155,7 @@ def t_test_moments(
         out=s_delta
     )
 
-    t_statistic = np.abs((mu0 - mu1) / s_delta)
+    t_statistic = np.abs(mu0 - mu1) / s_delta
 
     divisor = (
             (np.square(var0 / n0) / (n0 - 1)) +
@@ -165,16 +168,15 @@ def t_test_moments(
         out=divisor
     )
 
-    with np.errstate(over='ignore'):
-        df = np.square((var0 / n0) + (var1 / n1)) / divisor
+    df = np.square((var0 / n0) + (var1 / n1)) / divisor
     np.clip(
         df,
         a_min=np.nextafter(0, np.inf, dtype=df.dtype),
         a_max=np.nextafter(np.inf, 0, dtype=df.dtype),
         out=df
     )
 
-    pval = 2 * (1 - scipy.stats.t(df).cdf(t_statistic))
+    pval = 2 * scipy.stats.t.sf(t_statistic, df)
     return pval
 
 
@@ -261,6 +263,9 @@ def wald_test_chisq(
             raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
 
     theta_diff = theta_mle - theta0
+    # Convert xarray DataArray to a plain numpy array to avoid gufunc error.
+    if isinstance(theta_diff, xr.DataArray):
+        theta_diff = theta_diff.values
     wald_statistic = np.array([
         np.matmul(
             np.matmul(
@@ -337,6 +342,9 @@ def hypergeom_test(
     :param background: int
         Size of background set.
     """
-    pvals = np.array([1 - scipy.stats.hypergeom(M=background, n=references[i], N=enquiry).cdf(x - 1) for i, x in
-                      enumerate(intersections)])
-    return (pvals)
+    pvals = np.array([1 - scipy.stats.hypergeom(
+        M=background,
+        n=references[i],
+        N=enquiry
+    ).cdf(x - 1) for i, x in enumerate(intersections)])
+    return pvals
@@ -0,0 +1,3 @@
+from .tests import lrt, wald, t_test, rank_test, two_sample, pairwise, \
+    versus_rest, partition, continuous_1d
+from .utils import design_matrix, coef_names
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .tests import lrt, wald, t_test, rank_test, two_sample, pairwise, \`
	`2`	`+ versus_rest, partition, continuous_1d`
	`3`	`+from .utils import design_matrix, coef_names`