Commit 32ecb16
[QEff. Finetune] Updated handling of custom dataset in FT. Updated finetune.md readme file. (#520)
- Introduced handling of custom datasets via the --custom_dataset_config argument. This argument expects a JSON file containing the parameters needed to enable custom preprocessing for any dataset.
- Updated the docs to reflect the changes in the custom dataset usage interface.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
1 parent 7e0ad94 commit 32ecb16
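
For orientation, a minimal invocation with a custom dataset under the new interface might look like the sketch below. Only the --custom_dataset_config flag and the "custom_dataset" dataset name come from this commit; the entry-point module path, the --dataset flag spelling, and the remaining flags are assumptions pieced together from the docstring shown in QEfficient/cloud/finetune.py.

python -m QEfficient.cloud.finetune \
    --model_name "meta-llama/Llama-3.2-1B" \
    --dataset "custom_dataset" \
    --custom_dataset_config "./my_dataset_config.json" \
    --lr 5e-4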

File tree: 11 files changed, +268 -61 lines changed

QEfficient/cloud/finetune.py

Lines changed: 2 additions & 3 deletions

@@ -288,11 +288,10 @@ def main(**kwargs) -> None:
         --model_name "meta-llama/Llama-3.2-1B" \\
         --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)
 
     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)

QEfficient/finetune/configs/dataset_config.py

Lines changed: 0 additions & 2 deletions

@@ -41,7 +41,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""

Lines changed: 17 additions & 0 deletions (new file)

@@ -0,0 +1,17 @@
+{
+    "r": 32,
+    "lora_alpha": 64,
+    "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "up_proj",
+        "down_proj",
+        "gate_proj"
+    ],
+    "bias": "none",
+    "task_type": "CAUSAL_LM",
+    "lora_dropout": 0.05,
+    "inference_mode": false
+}
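
The JSON above mirrors the constructor fields of peft's LoraConfig (the same class config_utils.py below imports as PeftLoraConfig). As a hedged sketch, assuming the file is saved as lora_config.json (the on-disk path is not shown in this commit), it could be loaded like this:

import json

from peft import LoraConfig

# Hypothetical file name; the commit does not show where this JSON lives.
with open("lora_config.json") as f:
    cfg = json.load(f)

lora_config = LoraConfig(**cfg)
print(lora_config.r, lora_config.lora_alpha, lora_config.target_modules)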

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 45 additions & 14 deletions

@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import importlib
+import logging
 from pathlib import Path
 
 from QEfficient.finetune.utils.logging_utils import logger
@@ -26,51 +27,81 @@ def load_module_from_py_file(py_file: str) -> object:
 
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
-    else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+    if not hasattr(dataset_config, "preproc_file"):
+        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
+
+    if ":" not in dataset_config.preproc_file:
+        logger.raise_error(
+            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
+            RuntimeError,
+        )
+
+    module_path, func_name = dataset_config.preproc_file.split(":")
+    logger.log_rank_zero(
+        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
+        logging.DEBUG,
+    )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError:
         logger.raise_error(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()}).",
             AttributeError,
         )
 
 
 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "collate_file"):
+        logger.log_rank_zero(
+            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
+            logging.WARNING,
+        )
+        return None
+
+    if ":" not in dataset_config.collate_file:
+        logger.log_rank_zero(
+            "Can not find function name in 'collate_file' key in dataset_config "
+            "file. Using the default data collator function instead. If this is "
+            "not intended then change the format of the 'collate_file' key in "
+            "dataset_config file to follow the format: python_file_path:function_name",
+            logging.WARNING,
+        )
+        return None
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file.split(":")
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
+            logging.DEBUG,
+        )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
         logger.log_rank_zero(
-            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead."
         )
-        logger.log_rank_zero("Using the default data_collator instead.")
         return None
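
Taken together, the new contract is: get_custom_dataset imports the function named after the colon in 'preproc_file' and calls it as func(dataset_config, tokenizer, split, context_length), and get_data_collator optionally does the same for 'collate_file', calling func(dataset_processer). A minimal user-side sketch of a preprocessing file satisfying that contract is shown below; the file name my_preproc.py, the dataset, and the column name are hypothetical, not part of this commit.

# my_preproc.py -- hypothetical example of the preproc_file/collate_file contract.
import datasets
from transformers.data import DataCollatorForSeq2Seq


def get_preprocessed_my_dataset(dataset_config, tokenizer, split, context_length=None):
    # dataset_config is built from the custom dataset JSON, so any extra keys in
    # that JSON (e.g. a hypothetical "hf_dataset_name") are available as attributes.
    dataset = datasets.load_dataset(dataset_config.hf_dataset_name, split=split)

    def tokenize(sample):
        return tokenizer(sample["text"], max_length=context_length, truncation=True)

    return dataset.map(tokenize, remove_columns=list(dataset.features))


def my_collate_fn(dataset_processer):
    # dataset_processer is the tokenizer passed in by get_data_collator.
    return DataCollatorForSeq2Seq(dataset_processer)

The matching JSON would then set "preproc_file": "path/to/my_preproc.py:get_preprocessed_my_dataset" and, optionally, "collate_file": "path/to/my_preproc.py:my_collate_fn", as the sample config in the next file shows.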

Lines changed: 7 additions & 0 deletions (new file)

@@ -0,0 +1,7 @@
+{
+    "train_split": "train",
+    "test_split": "test",
+    "test_split_ratio": 0.15,
+    "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc",
+    "disc_style": "sarcasm_more"
+}

Lines changed: 87 additions & 0 deletions (new file)

@@ -0,0 +1,87 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import datasets
+from transformers.data import DataCollatorForSeq2Seq
+
+
+def get_data_collator(tokenizer):
+    return DataCollatorForSeq2Seq(tokenizer)
+
+
+def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
+    dataset = datasets.load_dataset("hallisky/DiSC")
+
+    # Considering 'train' split as this dataset has only one split.
+    dataset = dataset["train"]
+
+    test_split_ratio = dataset_config.test_split_ratio
+    disc_style = dataset_config.disc_style
+
+    # Only collect the samples for a given style.
+    available_styles = set(dataset["category"])
+    if disc_style not in available_styles:
+        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")
+
+    dataset = dataset.filter(lambda example: example["category"] == disc_style)
+
+    # Shuffle the dataset before splitting
+    dataset = dataset.shuffle(seed=42)
+
+    # Split the data in train and test split.
+    total_samples = len(dataset)
+    test_size = int(total_samples * test_split_ratio)
+    train_size = total_samples - test_size
+
+    if split == "test":
+        indices = range(train_size, total_samples)
+    else:
+        indices = range(0, train_size)
+
+    dataset = dataset.select(indices)
+
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # Below is the template of the DiSC dataset.
+    # <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
+    template = "### Original:{original} \n ### Rewrite: "
+
+    def apply_prompt_template(sample):
+        return {
+            "input": template.format(original=sample["original"]),
+            "label": sample["generation"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    def tokenize_add_label(sample):
+        input = tokenizer.encode(
+            tokenizer.bos_token + sample["input"],
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+        label = tokenizer.encode(
+            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+
+        sample = {
+            "input_ids": (input + label),
+            "attention_mask": [1] * (len(input) + len(label)),
+            "labels": [-100] * len(input) + label,
+        }
+
+        return sample
+
+    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))
+
+    return dataset

QEfficient/finetune/dataset/dataset_config.py

Lines changed: 1 addition & 2 deletions

@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-from functools import partial
 
 from QEfficient.finetune.dataset.alpaca_dataset import (
     InstructionDataset as get_alpaca_dataset,
@@ -23,7 +22,7 @@
 )
 
 DATASET_PREPROC = {
-    "alpaca_dataset": partial(get_alpaca_dataset),
+    "alpaca_dataset": get_alpaca_dataset,
     "grammar_dataset": get_grammar_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,

QEfficient/finetune/utils/config_utils.py

Lines changed: 21 additions & 4 deletions

@@ -8,13 +8,14 @@
 import inspect
 import json
 import os
+from collections import namedtuple
 from dataclasses import asdict
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import yaml
 from peft import LoraConfig as PeftLoraConfig
 
-import QEfficient.finetune.configs.dataset_config as datasets
+import QEfficient.finetune.configs.dataset_config as qeff_datasets
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
@@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any:
     return peft_config
 
 
-def generate_dataset_config(dataset_name: str) -> Any:
+def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
     """Generate a dataset configuration based on the specified dataset.
 
     Args:
         dataset_name (str): Name of the dataset to be used for finetuning.
+        custom_dataset_config (str): Dataset config json file for custom dataset.
+            This file contains dataset specific arguments to be used in dataset
+            preprocessing step.
 
     Returns:
         Any: A dataset configuration object.
@@ -101,7 +105,20 @@ def generate_dataset_config(dataset_name: str) -> Any:
     supported_datasets = DATASET_PREPROC.keys()
     assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
-    dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
+    dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
+    if dataset_name == "custom_dataset":
+        if custom_dataset_config is None:
+            logger.raise_error(
+                "For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.",
+                RuntimeError,
+            )
+        custom_dataset_dict = asdict(dataset_config)
+        custom_dataset_dict_override = load_config_file(custom_dataset_config)
+        # Override existing and add new params to dataset_config.
+        custom_dataset_dict.update(custom_dataset_dict_override)
+
+        custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
+        dataset_config = custom_dataset_class(**custom_dataset_dict)
     return dataset_config
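
The effect of the new custom_dataset branch is easiest to see in isolation: the dataclass defaults are converted to a dict, updated with the user's JSON (loaded via load_config_file), and frozen into a namedtuple so that extra keys become attributes. A hedged standalone sketch, with the override dict inlined instead of read from a file (values are illustrative):

from collections import namedtuple
from dataclasses import asdict, dataclass


@dataclass
class custom_dataset:
    dataset: str = "custom_dataset"
    train_split: str = "train"
    test_split: str = "validation"


# Defaults from the dataclass...
base = asdict(custom_dataset())
# ...overridden and extended by the user's JSON config.
override = {"test_split": "test", "test_split_ratio": 0.15, "disc_style": "sarcasm_more"}
base.update(override)

# Unknown JSON keys such as disc_style become regular attributes on the namedtuple.
config_cls = namedtuple("custom_dataset", base.keys())
dataset_config = config_cls(**base)
print(dataset_config.test_split, dataset_config.disc_style)  # test sarcasm_more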

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 3 additions & 2 deletions

@@ -64,8 +64,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
         kwargs["drop_last"] = False
     else:
         kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = False
-        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+        kwargs["drop_last"] = True
+        # todo: -100 should be changed to a variable. or tokenizer.pad_token_id
+        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer, label_pad_token_id=-100)
     return kwargs
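
The explicit label_pad_token_id=-100 matters because -100 is the default ignore_index of PyTorch's cross-entropy loss, so padded label positions are excluded from the loss. A small hedged illustration (the gpt2 tokenizer and the toy feature dicts are only for demonstration):

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=-100)
features = [
    {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [10, 11, 12]},
    {"input_ids": [4, 5], "attention_mask": [1, 1], "labels": [13, 14]},
]
batch = collator(features)
# The shorter example is padded; its extra label slot is filled with -100, not a token id.
print(batch["labels"])  # tensor([[ 10,  11,  12], [ 13,  14, -100]])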

QEfficient/finetune/utils/parser.py

Lines changed: 7 additions & 0 deletions

@@ -43,6 +43,13 @@ def get_finetune_parser():
         default=None,
         help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
     )
+    parser.add_argument(
+        "--custom_dataset_config",
+        "--custom-dataset-config",
+        type=str,
+        default=None,
+        help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.",
+    )
     parser.add_argument(
         "--run_validation",
         "--run-validation",
