New regression and classification datasets for ontology pre-training #130
base: dev
Changes from 53 commits
New file (a unified focal loss implementation), 152 lines added (`@@ -0,0 +1,152 @@`):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# from https://github.com/itakurah/Focal-loss-PyTorch


class FocalLoss(nn.Module):
    def __init__(
        self,
        gamma=2,
        alpha=None,
        reduction="mean",
        task_type="binary",
        num_classes=None,
    ):
        """
        Unified Focal Loss class for binary, multi-class, and multi-label classification tasks.
        :param gamma: Focusing parameter, controls the strength of the modulating factor (1 - p_t)^gamma
        :param alpha: Balancing factor, can be a scalar or a tensor for class-wise weights. If None, no class balancing is used.
        :param reduction: Specifies the reduction method: 'none' | 'mean' | 'sum'
        :param task_type: Specifies the type of task: 'binary', 'multi-class', or 'multi-label'
        :param num_classes: Number of classes (only required for multi-class classification)
        """
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.task_type = task_type
        self.num_classes = num_classes

        # Handle alpha for class balancing in multi-class tasks
        if (
            task_type == "multi-class"
            and alpha is not None
            and isinstance(alpha, (list, torch.Tensor))
        ):
            assert (
                num_classes is not None
            ), "num_classes must be specified for multi-class classification"
            if isinstance(alpha, list):
                self.alpha = torch.Tensor(alpha)
            else:
                self.alpha = alpha

    def forward(self, inputs, targets):
        """
        Forward pass to compute the Focal Loss based on the specified task type.
        :param inputs: Predictions (logits) from the model.
                       Shape:
                       - binary/multi-label: (batch_size, num_classes)
                       - multi-class: (batch_size, num_classes)
        :param targets: Ground truth labels.
                        Shape:
                        - binary: (batch_size,)
                        - multi-label: (batch_size, num_classes)
                        - multi-class: (batch_size,)
        """
        if self.task_type == "binary":
            return self.binary_focal_loss(inputs, targets)
        elif self.task_type == "multi-class":
            return self.multi_class_focal_loss(inputs, targets)
        elif self.task_type == "multi-label":
            return self.multi_label_focal_loss(inputs, targets)
        else:
            raise ValueError(
                f"Unsupported task_type '{self.task_type}'. Use 'binary', 'multi-class', or 'multi-label'."
            )

    def binary_focal_loss(self, inputs, targets):
        """Focal loss for binary classification."""
        probs = torch.sigmoid(inputs)
        targets = targets.float()

        # Compute binary cross entropy
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

        # Compute focal weight
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma

        # Apply alpha if provided
        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            bce_loss = alpha_t * bce_loss

        # Apply focal loss weighting
        loss = focal_weight * bce_loss

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss

    def multi_class_focal_loss(self, inputs, targets):
        """Focal loss for multi-class classification."""
        if self.alpha is not None:
            alpha = self.alpha.to(inputs.device)

        # Convert logits to probabilities with softmax
        probs = F.softmax(inputs, dim=1)

        # One-hot encode the targets
        targets_one_hot = F.one_hot(targets, num_classes=self.num_classes).float()

        # Compute cross-entropy for each class
        ce_loss = -targets_one_hot * torch.log(probs)

        # Compute focal weight
        p_t = torch.sum(probs * targets_one_hot, dim=1)  # p_t for each sample
        focal_weight = (1 - p_t) ** self.gamma

        # Apply alpha if provided (per-class weighting)
        if self.alpha is not None:
            alpha_t = alpha.gather(0, targets)
            ce_loss = alpha_t.unsqueeze(1) * ce_loss

        # Apply focal loss weight
        loss = focal_weight.unsqueeze(1) * ce_loss

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss

    def multi_label_focal_loss(self, inputs, targets):
        """Focal loss for multi-label classification."""
        probs = torch.sigmoid(inputs)

        # Compute binary cross entropy
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

        # Compute focal weight
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma

        # Apply alpha if provided
        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            bce_loss = alpha_t * bce_loss

        # Apply focal loss weight
        loss = focal_weight * bce_loss

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss
```
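Note (not part of the diff): a minimal usage sketch of the `FocalLoss` class above. The batch size, label count, and alpha values are illustrative assumptions, and the sketch assumes the class defined above is in scope (its module path is not shown in this extract).

```python
import torch

# Multi-label setting: logits and {0, 1} float targets of shape (batch_size, num_classes).
logits = torch.randn(4, 10, requires_grad=True)
targets = torch.randint(0, 2, (4, 10)).float()

criterion = FocalLoss(gamma=2, alpha=0.25, task_type="multi-label", reduction="mean")
loss = criterion(logits, targets)  # scalar tensor
loss.backward()

# Multi-class setting: integer class targets, per-class alpha, num_classes required.
mc_criterion = FocalLoss(
    gamma=2, alpha=[0.5, 1.0, 2.0], task_type="multi-class", num_classes=3
)
mc_logits = torch.randn(4, 3)
mc_targets = torch.randint(0, 3, (4,))
mc_loss = mc_criterion(mc_logits, mc_targets)
```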
Changes to a second file (the base module's `__init__` and `_execute`):

```diff
@@ -41,7 +41,8 @@ def __init__(
         exclude_hyperparameter_logging: Optional[Iterable[str]] = None,
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+        # super().__init__()
         if exclude_hyperparameter_logging is None:
             exclude_hyperparameter_logging = tuple()
         self.criterion = criterion
@@ -264,7 +265,7 @@ def _execute(
         loss_kwargs = dict()
         if self.pass_loss_kwargs:
             loss_kwargs = loss_kwargs_candidates
-            loss_kwargs["current_epoch"] = self.trainer.current_epoch
+            # loss_kwargs["current_epoch"] = self.trainer.current_epoch
         loss = self.criterion(loss_data, loss_labels, **loss_kwargs)
         if isinstance(loss, tuple):
             unnamed_loss_index = 1
```
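Note (not part of the diff): a sketch of the calling convention the second hunk relies on. When `pass_loss_kwargs` is set, `_execute` forwards the collected keyword arguments (e.g. `non_null_labels` or `missing_labels`, which appear in the Electra changes below) directly to the criterion, so the criterion's `forward` must accept or absorb them. The `ToleratingBCE` class here is hypothetical, purely for illustration.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToleratingBCE(nn.Module):
    """Hypothetical criterion whose forward absorbs extra loss kwargs."""

    def forward(self, logits, labels, **loss_kwargs):
        # Keys such as "non_null_labels" can be consumed or ignored here;
        # after this change, "current_epoch" is no longer injected by _execute.
        return F.binary_cross_entropy_with_logits(logits, labels)


criterion = ToleratingBCE()
logits = torch.randn(4, 10)
labels = torch.randint(0, 2, (4, 10)).float()
loss = criterion(logits, labels, non_null_labels=None)  # extra kwarg is absorbed
```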
Changes to a third file (the Electra models):

```diff
@@ -19,7 +19,8 @@
 
 logging.getLogger("pysmiles").setLevel(logging.CRITICAL)
 
-from chebai.loss.semantic import DisjointLoss as ElectraChEBIDisjointLoss  # noqa
+# TODO: put back in before pull request
+# from chebai.loss.semantic import DisjointLoss as ElectraChEBIDisjointLoss  # noqa
 
 
 class ElectraPre(ChebaiBaseNet):
@@ -40,6 +41,7 @@ class ElectraPre(ChebaiBaseNet):
 
     def __init__(self, config: Dict[str, Any] = None, **kwargs: Any):
         super().__init__(config=config, **kwargs)
+
         self.generator_config = ElectraConfig(**config["generator"])
         self.generator = ElectraForMaskedLM(self.generator_config)
         self.discriminator_config = ElectraConfig(**config["discriminator"])
@@ -224,6 +226,7 @@ def __init__(
         config: Optional[Dict[str, Any]] = None,
         pretrained_checkpoint: Optional[str] = None,
         load_prefix: Optional[str] = None,
+        model_type="classification",
         **kwargs: Any,
     ):
         # Remove this property in order to prevent it from being stored as a
@@ -236,6 +239,8 @@ def __init__(
             config["num_labels"] = self.out_dim
         self.config = ElectraConfig(**config, output_attentions=True)
         self.word_dropout = nn.Dropout(config.get("word_dropout", 0))
+        self.model_type = model_type
+        self.pass_loss_kwargs = True
 
         in_d = self.config.hidden_size
         self.output = nn.Sequential(
@@ -262,6 +267,10 @@ def __init__(
         else:
             self.electra = ElectraModel(config=self.config)
 
+        # freeze parameters
+        # for param in self.electra.parameters():
+        #     param.requires_grad = False
+
     def _process_for_loss(
         self,
         model_output: Dict[str, Tensor],
@@ -280,9 +289,16 @@ def _process_for_loss(
             tuple: A tuple containing the processed model output, labels, and loss arguments.
         """
         kwargs_copy = dict(loss_kwargs)
+        output = model_output["logits"]
         if labels is not None:
             labels = labels.float()
-        return model_output["logits"], labels, kwargs_copy
+            if "missing_labels" in kwargs_copy:
+                missing_labels = kwargs_copy.pop("missing_labels")
+                output = output * (~missing_labels).int() - 10000 * missing_labels.int()
+                labels = labels * (~missing_labels).int()
+            if self.model_type == "classification":
+                assert ((labels <= torch.tensor(1.0)) & (labels >= torch.tensor(0.0))).all()
+        return output, labels, kwargs_copy
 
     def _get_prediction_and_labels(
         self, data: Dict[str, Any], labels: Tensor, model_output: Dict[str, Tensor]
@@ -303,7 +319,25 @@ def _get_prediction_and_labels(
         if "non_null_labels" in loss_kwargs:
             n = loss_kwargs["non_null_labels"]
             d = d[n]
-        return torch.sigmoid(d), labels.int() if labels is not None else None
+        if self.model_type == "classification":
+            # print(self.model_type, ' in electra 324')
+            # for multiclass here softmax instead of sigmoid
+            d = torch.sigmoid(
+                d
+            )  # changing this made a difference for the roc-auc but not the f1, why?
+            if "missing_labels" in loss_kwargs:
+                missing_labels = loss_kwargs["missing_labels"]
+                d = d * (~missing_labels).int().to(
+                    device=d.device
+                )  # we set the prob of missing labels to 0
+                labels = labels * (~missing_labels).int().to(
+                    device=d.device
+                )  # we set the labels of missing labels to 0
+            return d, labels.int() if labels is not None else None
+        elif self.model_type == "regression":
+            return d, labels
+        else:
+            raise ValueError("Please specify a valid model type in your model config.")
 
     def forward(self, data: Dict[str, Tensor], **kwargs: Any) -> Dict[str, Any]:
         """
```
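Note (not part of the diff): a standalone toy example of the `missing_labels` masking arithmetic used in `_process_for_loss` and `_get_prediction_and_labels` above; the tensor values are made up for illustration.

```python
import torch

# Boolean mask marking label positions whose ground truth is unknown.
missing_labels = torch.tensor([[False, True, False]])
logits = torch.tensor([[2.0, 0.5, -1.0]])
labels = torch.tensor([[1.0, 1.0, 0.0]])

# As in _process_for_loss: push masked logits to a large negative value and
# zero the corresponding labels, so those positions contribute almost nothing
# to a BCE-with-logits style loss.
masked_logits = logits * (~missing_labels).int() - 10000 * missing_labels.int()
masked_labels = labels * (~missing_labels).int()
print(masked_logits)  # tensor([[     2.0, -10000.0,     -1.0]])
print(masked_labels)  # tensor([[1., 0., 0.]])

# As in _get_prediction_and_labels: zero the predicted probabilities of
# missing labels after the sigmoid.
probs = torch.sigmoid(logits) * (~missing_labels).int()
```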
Review comment: why does `weight_epoch_dependent` appear twice here?