77from ..testing import correction
88from ..testing .det import _DifferentialExpressionTest
99
10- logger = logging .getLogger (__name__ )
1110
1211class RefSets :
1312 """
@@ -42,15 +41,19 @@ def clean(self, ids):
4241
4342 def __init__ (self , sets = None , fn = None , type = 'gmt' ):
4443 if sets is not None :
45- self .load_sets (sets , type = type )
46- self ._genes = np .sort (np .unique (np .concatenate ([np .asarray (list (x .genes )) for x in self .sets ])))
44+ if len (sets ) > 0 :
45+ self .load_sets (sets , type = type )
46+ self ._genes = np .sort (np .unique (np .concatenate ([np .asarray (list (x .genes )) for x in self .sets ])))
47+ else :
48+ self .sets = []
49+ self ._genes = np .array ([])
4750 elif fn is not None :
4851 self .read_from_file (fn = fn , type = type )
4952 self ._genes = np .sort (np .unique (np .concatenate ([np .asarray (list (x .genes )) for x in self .sets ])))
5053 else :
5154 self .sets = []
5255 self ._genes = np .array ([])
53- self ._ids = [x .id for x in self .sets ]
56+ self ._ids = np . array ( [x .id for x in self .sets ])
5457 self ._set_lens = np .array ([x .len for x in self .sets ])
5558 self .genes_discarded = None
5659
@@ -113,7 +116,7 @@ def add(self, id: str, source: str, gene_ids: list):
113116 self .sets .append (self ._Set (id = id , source = source , gene_ids = gene_ids ))
114117 # Update summary variables:
115118 self ._genes = np .sort (np .unique (np .concatenate ([np .asarray (list (x .genes )) for x in self .sets ])))
116- self ._ids = [x .id for x in self .sets ]
119+ self ._ids = np . array ( [x .id for x in self .sets ])
117120 self ._set_lens = np .array ([x .len for x in self .sets ])
118121
119122 ## Processing functions.
@@ -165,7 +168,7 @@ def get_set(self, id):
165168 """
166169 Return the set with a given set identifier.
167170 """
168- return self .sets [self ._ids .index (id )]
171+ return self .sets [self ._ids .tolist (). index (id )]
169172
170173 ## Overlap functions.
171174
@@ -191,9 +194,9 @@ def overlap(self, enq_set: set, set_id=None):
191194def test (
192195 ref : RefSets ,
193196 det : Union [_DifferentialExpressionTest , None ] = None ,
194- pval : Union [np .array , None ] = None ,
197+ scores : Union [np .array , None ] = None ,
195198 gene_ids : Union [list , None ] = None ,
196- de_threshold = 0.05 ,
199+ threshold = 0.05 ,
197200 incl_all_zero = False ,
198201 all_ids = None ,
199202 clean_ref = False ,
@@ -205,38 +208,31 @@ def test(
205208 nice doc string and that the call to this is de.enrich.test which
206209 makes more sense to me than de.enrich.Enrich.
207210
208- :param RefSets:
209- The annotated gene sets against which enrichment is tested.
210- :param DETest:
211- The differential expression results object which is tested
211+ :param ref: The annotated gene sets against which enrichment is tested.
212+ :param det: The differential expression results object which is tested
212213 for enrichment in the gene sets.
213- :param pval:
214- Alternative to DETest, vector of p-values for differential expression.
215- :param gene_ids:
216- If pval was supplied instead of DETest, use gene_ids to supply the
214+ :param scores: Alternative to DETest, vector of scores (scalar per gene) which are then
215+ used to discretize gene list. This can for example be corrected p-values from a differential expression
216+ test, in that case the parameter threshold would be a significance threshold.
217+ :param gene_ids: If scores was supplied instead of det, use gene_ids to supply the
217218 vector of gene identifiers (strings) that correspond to the p-values
218- which can be matched against the identifieres in the sets in RefSets.
219- :param de_threshold:
220- Significance threshold at which a differential test (a multiple-testing
221- corrected p-value) is called siginficant. T
222- :param incl_all_zero:
223- Wehther to include genes in gene universe which were all zero.
224- :param all_ids:
225- Set of all gene identifiers, this is used as the background set in the
219+ which can be matched against the identifiers in the sets in RefSets.
220+ :param threshold: Threshold of parameter scores at which a gene is included as a hit: In the case
221+ of differential test p-values in scores, threshold is the significance threshold.
222+ :param incl_all_zero: Whether to include genes in gene universe which were all zero.
223+ :param all_ids: Set of all gene identifiers, this is used as the background set in the
226224 hypergeometric test. Only supply this if not all genes were tested
227225 and are supplied above in DETest or gene_ids.
228- :param clean_ref:
229- Whether or not to only retain gene identifiers in RefSets that occur in
226+ :param clean_ref: Whether or not to only retain gene identifiers in RefSets that occur in
230227 the background set of identifiers supplied here through all_ids.
231- :param capital:
232- Make all gene IDs captial.
228+ :param capital: Make all gene IDs capital.
233229 """
234230 return Enrich (
235231 ref = ref ,
236232 det = det ,
237- pval = pval ,
233+ scores = scores ,
238234 gene_ids = gene_ids ,
239- de_threshold = de_threshold ,
235+ threshold = threshold ,
240236 incl_all_zero = incl_all_zero ,
241237 all_ids = all_ids ,
242238 clean_ref = clean_ref ,
@@ -252,9 +248,9 @@ def __init__(
252248 self ,
253249 ref : RefSets ,
254250 det : Union [_DifferentialExpressionTest , None ],
255- pval : Union [np .array , None ],
256- gene_ids : Union [list , None ],
257- de_threshold ,
251+ scores : Union [np .array , None ],
252+ gene_ids : Union [list , np . ndarray , None ],
253+ threshold ,
258254 incl_all_zero ,
259255 all_ids ,
260256 clean_ref ,
@@ -263,6 +259,8 @@ def __init__(
263259 self ._n_overlaps = None
264260 self ._pval_enrich = None
265261 self ._qval_enrich = None
262+ if isinstance (gene_ids , list ):
263+ gene_ids = np .asarray (gene_ids )
266264 # Load multiple-testing-corrected differential expression
267265 # p-values from differential expression output.
268266 if det is not None :
@@ -273,24 +271,24 @@ def __init__(
273271 idx_not_all_zero = np .where (np .logical_not (det .summary ()["zero_mean" ].values ))[0 ]
274272 self ._qval_de = det .qval [idx_not_all_zero ]
275273 self ._gene_ids = det .gene_ids [idx_not_all_zero ]
276- elif pval is not None and gene_ids is not None :
277- self ._qval_de = np .asarray (pval )
274+ elif scores is not None and gene_ids is not None :
275+ self ._qval_de = np .asarray (scores )
278276 self ._gene_ids = gene_ids
279277 else :
280278 raise ValueError ('Supply either DETest or pval and gene_ids to Enrich().' )
281279 # Take out NA genes labels:
282280 # Select significant genes based on user defined threshold.
283281 if any ([x is np .nan for x in self ._gene_ids ]):
284282 idx_notnan = np .where ([x is not np .nan for x in self ._gene_ids ])[0 ]
285- logger .info (
283+ logging . getLogger ( "diffxpy" ) .info (
286284 " Discarded %i nan gene ids, leaving %i genes." ,
287285 len (self ._gene_ids ) - len (idx_notnan ),
288286 len (idx_notnan )
289287 )
290288 self ._qval_de = self ._qval_de [idx_notnan ]
291289 self ._gene_ids = self ._gene_ids [idx_notnan ]
292290
293- self ._significant_de = self ._qval_de <= de_threshold
291+ self ._significant_de = self ._qval_de <= threshold
294292 self ._significant_ids = set (self ._gene_ids [np .where (self ._significant_de )[0 ]])
295293 if all_ids is not None :
296294 self ._all_ids = set (all_ids )
@@ -303,7 +301,7 @@ def __init__(
303301 self ._significant_ids = set ([x .upper () for x in self ._significant_ids ])
304302
305303 # Generate diagnostic statistic of number of possible overlaps in total.
306- logger .info (
304+ logging . getLogger ( "diffxpy" ) .info (
307305 " %i overlaps found between refset (%i) and provided gene list (%i)." ,
308306 len (set (self ._all_ids ).intersection (set (ref ._genes ))),
309307 len (ref ._genes ),
@@ -318,7 +316,7 @@ def __init__(
318316 # Print if there are empty sets.
319317 idx_nonempty = np .where ([len (x .genes ) > 0 for x in self .RefSets .sets ])[0 ]
320318 if len (self .RefSets .sets ) - len (idx_nonempty ) > 0 :
321- logger .info (
319+ logging . getLogger ( "diffxpy" ) .info (
322320 " Found %i empty sets, removing those." ,
323321 len (self .RefSets .sets ) - len (idx_nonempty )
324322 )
@@ -389,7 +387,10 @@ def significant_sets(self, threshold=0.05) -> list:
389387 """
390388 Return significant sets from gene set enrichment analysis as an output table.
391389 """
392- return self .RefSets .subset (idx = np .where (self .qval <= threshold )[0 ])
390+ sig_sets = np .where (self .qval <= threshold )[0 ]
391+ if len (sig_sets ) == 0 :
392+ logging .getLogger ("diffxpy" ).info ("no significant sets found" )
393+ return self .RefSets .subset (idx = sig_sets )
393394
394395 def significant_set_ids (self , threshold = 0.05 ) -> np .array :
395396 """
@@ -424,4 +425,4 @@ def set_summary(self, id: str):
424425
425426 :return: Slice of summary table.
426427 """
427- return self .summary (sort = False ).iloc [self .RefSets ._ids .index (id ), :]
428+ return self .summary (sort = False ).iloc [self .RefSets ._ids .tolist (). index (id ), :]
0 commit comments