1 change: 1 addition & 0 deletions rdagent/app/data_science/conf.py
@@ -41,6 +41,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
"""The recommend time limit for running on full data"""
full_timeout: int = 3600
"""The timeout limit for running on full data"""
ensemble_timeout: int = 18000
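"""The timeout limit for running ensemble experiments on full data"""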

### specific feature

6 changes: 3 additions & 3 deletions rdagent/components/coder/data_science/ensemble/prompts.yaml
@@ -2,7 +2,7 @@ ensemble_coder:
system: |-
You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science.
Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems.

## Task Description
Currently, you are working on model ensemble implementation. Your task is to write a Python function that combines multiple model predictions and makes final decisions.

@@ -105,7 +105,7 @@ ensemble_eval:
- Code should have no try-except blocks because they can hide errors.
- Check whether the code implements the scoring process using the given metric.
- The stdout includes the local variable values from the ensemble code execution. Check whether the validation score is calculated correctly.

Please respond with your feedback in the following JSON format and order
```json
{
@@ -115,7 +115,7 @@ ensemble_eval:
"final_decision": <true/false>
}
```
user: |-
--------- Ensemble test stdout ---------
{{ stdout }}
{% if workflow_stdout is not none %}
13 changes: 13 additions & 0 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -158,6 +158,19 @@ pipeline_coder:
You should be very careful about the number of label classes in debug mode. The label classes should be the same as in the full run even when you are in debug mode, because the number of classes is often used to build the model (see the sketch below).
{% endif %}
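A minimal sketch of the point above, assuming a hypothetical `train.csv` with a `label` column (file and column names are illustrative, not from the actual pipeline):
```python
import pandas as pd

# Derive the class set from the FULL training labels, not from the debug
# subsample, so the model's output dimension matches the full run.
labels = pd.read_csv("train.csv")["label"]   # hypothetical file/column names
classes = sorted(labels.unique())            # fixed class set
num_classes = len(classes)                   # identical in debug and full runs

# Even if a debug subsample misses some classes, build the model head with
# num_classes so label mappings stay consistent between debug and full runs.
debug_df = pd.read_csv("train.csv").sample(frac=0.1, random_state=0)
```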

## Ensemble Guidelines:
{% if task_desc == "Ensemble" %}
- The code should have no try-except blocks to ensure errors are exposed.
- Verify that the scoring uses the specified metric exactly and correctly.
- Validate that the prediction shapes and values are consistent and sensible.
- Confirm that the ensemble completes training and inference within expected time (no timeout or incomplete training).
- Critically, check that the base models maintain good quality and are **not deliberately degraded to save time**. For example:
- Avoid freezing large parts of the model that reduce learning capacity.
- Avoid replacing full models with simplistic embedding regressors.
- Avoid using tricks that severely impair model expressiveness just to reduce runtime.
{% endif %}


## General Guidelines
1. Code correctness is the top priority. Ensure your code is runnable and produces the expected output even if some task requirements are not fully met, because the task itself might contain errors such as a wrong package name or wrong package function names.
2. Use the print() function for all output; do not use the logging module.
9 changes: 7 additions & 2 deletions rdagent/scenarios/data_science/dev/runner/eval.py
@@ -78,14 +78,19 @@ def evaluate(
gt_implementation: FBWorkspace,
queried_knowledge: QueriedKnowledge = None,
**kwargs,
) -> DSRunnerFeedback:
) -> DSCoSTEEREvalFeedback:
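# Ensemble tasks are given a dedicated, longer timeout (ensemble_timeout in conf.py)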
if "Ensemble" in target_task.name:
running_timeout_period = DS_RD_SETTING.ensemble_timeout
else:
running_timeout_period = self.scen.real_full_timeout()

env = get_ds_env(
extra_volumes={
f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": T(
"scenarios.data_science.share:scen.input_path"
).r()
},
running_timeout_period=self.scen.real_full_timeout(),
running_timeout_period=running_timeout_period,
)

stdout = implementation.execute(
108 changes: 94 additions & 14 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -141,6 +141,8 @@ hypothesis_gen:
2. **Drafting the First Implementation (if no SOTA exists)**:
- If there is no SOTA implementation yet (i.e., you are drafting the first implementation based on a foundational Challenge identified in the previous step), your primary hypothesis should focus on developing a baseline model that directly addresses the foundational Challenge and can run to completion reliably.
- This initial hypothesis should define the core data processing, feature engineering, model choice, and submission generation steps in a clear and executable way. Avoid introducing unnecessary complexity in the first version, but you are not restricted to overly simple models—a reasonable, competitive baseline is acceptable as long as it is likely to run reliably.
3. **Actionable Changes**:
- If a Challenge involves underperforming models, propose specific actions like removing or replacing those models.
{% endif %}
{% if plan.draft is true %}3{% else %}2{% endif %}. **Actionable Changes**:
- If a Challenge involves underperforming models (e.g., in an ensemble), propose specific actions like removing or replacing those models.
@@ -182,7 +184,6 @@ hypothesis_gen:
- **`DataLoadSpec`**: Responsible for loading raw competition data, ensuring data is converted to the correct types, and potentially providing an initial exploratory data analysis (EDA) summary. (e.g., fixing `zipfile.BadZipFile` by improving loading logic).
- **`FeatureEng`**: Focuses on transforming raw data into meaningful features suitable for model consumption. Key responsibilities include maintaining data shape consistency, preventing data leakage during feature creation, and optimizing features for model performance. Feature engineering should be model-agnostic.
- **`Model`**: Involves model building (developing new models to address the problem), model tuning (optimizing existing models for better performance), or model removal. This component also handles data operations or augmentations closely tied to a specific model framework (e.g., PyTorch `Datasets` & `DataLoaders`, TensorFlow `tf.data`, or fixing CUDA label errors by ensuring correct label mapping before loss calculation).
- **`Ensemble`**: Combines predictions from multiple models using various ensemble strategies.
- **`Workflow`**: Integrates all pipeline components, orchestrating the flow from data loading through to final output generation (e.g., correcting `submission.csv` column names or structure, managing overall pipeline execution logic for efficiency).

2. **Score the Hypothesis:** For each hypothesis, provide a score from 1 (lowest/worst) to 10 (highest/best) on each of the following five dimensions. Base your scores on all provided information.
@@ -199,7 +200,7 @@ hypothesis_gen:
1. Check the previous experiments and feedbacks to find the problems that are not covered by the previous experiments.
2. Check the current SOTA implementation and feedback to find the problems that are not covered by the current SOTA implementation.
3. Think out of the box and explore hypotheses that are not covered by the previous experiments and feedbacks, but are reasonable and aligned with the identified problems.
4. Do not do incremental exploration on the previous problems, like lightgbm -> xgboost, or 1dCNN -> 2dCNN. Totally different hypotheses on the model/data/feature/ensemble/workflow level are welcome.
4. Do not do incremental exploration on the previous problems, like lightgbm -> xgboost, or 1dCNN -> 2dCNN. Totally different hypotheses on the model/data/feature/workflow level are welcome.
{% endif %}

{% if plan.suggest_model_architecture is true %}
Expand Down Expand Up @@ -231,6 +232,7 @@ hypothesis_gen:
# Identified Challenges{% if enable_idea_pool %} with Sampled Ideas{% endif %}
{{ problems }}


hypothesis_critique:
system: |-
{% include "scenarios.data_science.share:scen.role" %}
@@ -361,7 +363,62 @@ hypothesis_rewrite:
{{ rewrite_output_format }}
{% endif %}

user: |-

hypothesis_select:
system: |-
You are a Kaggle Grandmaster with deep expertise in model evaluation and decision making. Based on the given context, please select the most appropriate hypothesis from the candidates.
These hypotheses are sourced from `model/data/feature/workflow`. Choose the one that best matches the intent or logic of the prompt.
Alternatively, if you determine that an ensemble is the best option, you may propose an **ensemble hypothesis** (not present in the candidates), as long as it aligns with the runtime and training constraints.
You are given the following hypothesis candidates:
{{ hypothesis_candidates }}
If multiple hypotheses seem reasonable, select the one that is most robust or consistent with the Previous Experiments and Feedbacks; pay attention to the runtime of each loop.

If you believe that previous methods have reached their limits and the current setting only involves a single model, feel free to propose an ensemble solution. However, you **must** carefully allocate the training and runtime budget to ensure the **ensemble logic is well-executed and evaluated**, without compromising the performance of the previous models.

### 1. Ensemble Core Principle
Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
Please note: you are operating under a time budget of {{res_time}} seconds dedicated to ensemble training, and the maximum allowed time is {{ensemble_timeout}} seconds.
{{use_ratio}}% of the total ensemble time has been used.

{% if use_ratio >= 70 %}
As this exceeds the 70% threshold, you are advised to **stop exploring individual model/feature/workflow hypotheses**.
Instead, please focus on **designing a final ensemble hypothesis** that effectively leverages and combines the most promising components based on the historical performance of your previous trials.
Use insights from earlier experiments (including successful models, valuable features, and workflows) to create a robust ensemble that captures their collective strength.
{% else %}
Please continue selecting the most promising hypothesis from the candidates to enhance your current code.
{% endif %}

Please take the remaining {{res_time}} seconds to carefully consider and design the most effective ensemble hypothesis based on your current progress.
Assume training a single model takes about 1 hour. For example, if you have roughly twice that time left, you can train multiple models with different random seeds or data splits to use the time effectively.
If you have more time, you might consider training a multi-fold ensemble. Use your judgment to decide how many folds or seeds fit within your remaining time budget.

### 2. Training-Time Resource Allocation
- You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
- Avoid reducing base model quality just to save time. For example:
- Freezing large parts of the model (e.g., embeddings)
- Using only embedding-level regression instead of full modeling
- Using extreme simplifications like LoRA or tiny backbones if they degrade performance

### 3. Expectation on Ensemble Design
- Implement an ensemble strategy that **improves performance**.
This can be as simple as training the same model with different random seeds or data splits and averaging the outputs (see the sketch after this list).
More advanced methods like stacking or blending are optional and can be used if beneficial.
Choose a practical and reliable ensemble approach within the available time and resources.
- Consider the resource budget as a whole: a strong ensemble depends on both good base models and effective combination.
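A minimal sketch of the seed-averaging idea above, assuming a hypothetical `train_model(seed)` helper and validation matrix `X_val` (illustrative only, not part of the actual pipeline):
```python
import numpy as np

# Hypothetical seed-averaging ensemble: train the same model under several
# random seeds and average the predictions element-wise.
SEEDS = [0, 1, 2]

def seed_average(train_model, X_val):
    # train_model(seed=...) is an assumed helper returning a fitted model
    preds = [train_model(seed=s).predict(X_val) for s in SEEDS]
    return np.mean(preds, axis=0)  # averaged ensemble prediction
```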

### 4. Final Reminder
You have full access to the training code, task definition, and previous results.
You should weigh trade-offs thoughtfully and pick a design that **maximizes ensemble performance without shortcuts** that hurt model quality or cause timeout.
- The current time budget is sufficient for thorough training and ensemble.
- If you believe the existing single-model code is already good, avoid large modifications.
- Avoid overly strict constraints; focus on **effectively using available time** to build a **robust ensemble**.

{% if hypothesis_output_format is not none %}
## Final Output Format in JSON Schema:
{{ hypothesis_output_format }}
{% endif %}

user: |-
# Scenario Description
{{ scenario_desc }}

@@ -371,13 +428,6 @@
# Current SOTA Implementation
{{ sota_exp_desc }}

# Original Hypotheses and Their Critiques
{{ hypothesis_critique_pairs }}

{% if time_status is not none %}
# Time Status
{{ time_status }}
{% endif %}


task_gen:
@@ -483,12 +533,34 @@ task_gen:
- Double-check that validation scores are saved correctly to `scores.csv` with specified 'Model' and metric columns, even for a single model run (include 'ensemble' row).
8. **EDA improvement**: The user might provide you with some EDA improvement suggestions based on the previous EDA output. If so, you should also include the EDA improvement in your sketch.

# Guidelines for Ensemble Implementation
You must carefully allocate the training and runtime budget to ensure the **ensemble logic is well-executed and evaluated**, without compromising model performance.
### 1. Core Principle
Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
### 2. Training-Time Resource Allocation
- You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
- Avoid reducing base model quality just to save time. For example:
- Freezing large parts of the model (e.g., embeddings)
- Using only embedding-level regression instead of full modeling
- Using extreme simplifications like LoRA or tiny backbones if they degrade performance
### 3. Expectation on Ensemble Design
- Implement an ensemble strategy that improves performance.
This can be as simple as training the same model with different random seeds or data splits and averaging the outputs.
More advanced methods like stacking or blending are optional and can be used if beneficial (a short blending sketch follows this list).
Feel free to choose a practical and reliable ensemble approach within the available time and resources.
- Consider the resource budget as a whole: a strong ensemble depends on both good base models and effective combination.
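A minimal sketch of the blending option mentioned above, assuming two hypothetical base-model prediction arrays and validation labels (names and metric are illustrative, not from the actual pipeline):
```python
import numpy as np
from sklearn.metrics import mean_squared_error

# Hypothetical blending: grid-search a single weight on the validation split
# and keep the best-scoring combination of two base models' predictions.
def blend_weight(preds_a, preds_b, y_val):
    weights = np.linspace(0.0, 1.0, 11)
    scores = [mean_squared_error(y_val, w * preds_a + (1 - w) * preds_b)
              for w in weights]
    return weights[int(np.argmin(scores))]  # apply the same weight at test time
```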
### 4. Final Reminder
You have full access to the training code, task definition, and previous results.
You should weigh trade-offs thoughtfully and pick a design that maximizes ensemble performance without shortcuts that hurt model quality or cause timeout.
- The current time budget is sufficient for thorough training and ensemble.
- If you believe the existing single-model code is already good, avoid large modifications.
- Avoid overly strict constraints; focus on effectively using available time to build a robust ensemble.

# Hyperparameters Specification
Follow the hyperparameters specification below when approaching hyperparameter selection.
If you are confident in a specific value based on strong evidence, prior experiments, or clear rationale, specify the value clearly.
{% include "scenarios.data_science.share:spec.hyperparameter" %}


{% if task_output_format is not none %}

# Output Format
@@ -533,8 +605,6 @@ task_gen:
This sketch should implement the following hypotheses:

{% for hypothesis in hypotheses %}
## {{ hypothesis.problem_name }}
**Why:** {{ hypothesis.problem_desc }}
**Hypothesis:** {{ hypothesis.hypothesis }}

{% endfor %}
@@ -614,7 +684,7 @@ output_format:
"problem name 1 (should be exactly same as the problem name provided)": {
{% if enable_idea_pool %}"inspired": "True or False. Set to True if the hypothesis is inspired by the user provided ideas. Otherwise, set it to False.",{% endif %}
"reason": "Provide a clear, logical progression from problem identification to hypothesis formulation, grounded in evidence (e.g., trace history, domain principles, or competition constraints). Refer to the Hypothesis Guidelines for better understanding. Reason should be short with no more than two sentences.",
"component": "The component tag of the hypothesis. Must be one of ('DataLoadSpec', 'FeatureEng', 'Model', 'Ensemble', 'Workflow').",
"component": "The component tag of the hypothesis. Must be one of ('DataLoadSpec', 'FeatureEng', 'Model', 'Workflow').",
"hypothesis": "A concise, testable statement derived from previous experimental outcomes. Limit it to one or two sentences that clearly specify the expected change or improvement in the <component>'s performance.",
"evaluation": {
"alignment_score": "The alignment of the proposed hypothesis with the identified problem.",
@@ -666,4 +736,14 @@ output_format:
}
}

hypothesis_select_format: |-
Choose the best hypothesis from the provided hypothesis candidates {{ hypothesis_candidates }}.
You must return a dictionary in the following format **for each selected hypothesis**:
{
"hypothesis": "...",
"component": "..." // Must be one of: 'DataLoadSpec', 'FeatureEng', 'Model', 'Workflow'
}

The **"hypothesis"** must be selected **from the provided hypothesis candidates** (do not generate new ones),
**except** when you choose `"component": "Ensemble"`, in which case you should write your **own hypothesis**.
