diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 5d6d1b107..3fb6431a0 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -41,6 +41,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     """The recommend time limit for running on full data"""
     full_timeout: int = 3600
     """The timeout limit for running on full data"""
+    ensemble_timeout: int = 18000
 
     ### specific feature
diff --git a/rdagent/components/coder/data_science/ensemble/prompts.yaml b/rdagent/components/coder/data_science/ensemble/prompts.yaml
index b7db201b5..74c3dbbd1 100644
--- a/rdagent/components/coder/data_science/ensemble/prompts.yaml
+++ b/rdagent/components/coder/data_science/ensemble/prompts.yaml
@@ -2,7 +2,7 @@ ensemble_coder:
   system: |-
     You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems.
-    
+
     ## Task Description
     Currently, you are working on model ensemble implementation. Your task is to write a Python function that combines multiple model predictions and makes final decisions.
 
@@ -105,7 +105,7 @@ ensemble_eval:
       - Code should have no try-except blocks because they can hide errors.
       - Check whether the code implement the scoring process using the given metric.
       - The stdout includes the local variable values from the ensemble code execution. Check whether the validation score is calculated correctly.
-      
+
       Please respond with your feedback in the following JSON format and order
       ```json
      {
@@ -115,7 +115,7 @@ ensemble_eval:
          "final_decision": 
      }
      ```
-  user: |- 
+  user: |-
    --------- Ensemble test stdout ---------
    {{ stdout }}
    {% if workflow_stdout is not none %}
diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml
index d376db30a..d72e51add 100644
--- a/rdagent/components/coder/data_science/pipeline/prompts.yaml
+++ b/rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -158,6 +158,19 @@ pipeline_coder:
     You should be very careful about the label classes number in the debug mode. The label classes should be the same as the full run even when you are in the debug mode. The label classes number is often used to build the model.
     {% endif %}
 
+    ## Ensemble Guidelines
+    {% if task_desc == "Ensemble" %}
+    - The code should have no try-except blocks to ensure errors are exposed.
+    - Verify that the scoring uses the specified metric exactly and correctly.
+    - Validate that the prediction shapes and values are consistent and sensible.
+    - Confirm that the ensemble completes training and inference within the expected time (no timeout or incomplete training).
+    - Critically, check that the base models maintain good quality and are **not deliberately degraded to save time**. For example:
+      - Avoid freezing large parts of the model in a way that reduces learning capacity.
+      - Avoid replacing full models with simplistic embedding regressors.
+      - Avoid tricks that severely impair model expressiveness just to reduce runtime.
+    {% endif %}
+
+
     ## General Guidelines
     1. Code correctness is the top priority. Ensure your code is runnable and produces the expected output even if some task requirements are not fully met because the task itself might contain some errors like the wrong package name or wrong package function names.
     2. Use the print() function for all output; do not use the logging module.
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
index b355b91a3..76c8fa90e 100644
--- a/rdagent/scenarios/data_science/dev/runner/eval.py
+++ b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -78,14 +78,19 @@ def evaluate(
         gt_implementation: FBWorkspace,
         queried_knowledge: QueriedKnowledge = None,
         **kwargs,
-    ) -> DSRunnerFeedback:
+    ) -> DSCoSTEEREvalFeedback:
+        if "Ensemble" in target_task.name:
+            running_timeout_period = DS_RD_SETTING.ensemble_timeout
+        else:
+            running_timeout_period = self.scen.real_full_timeout()
+
         env = get_ds_env(
             extra_volumes={
                 f"{DS_RD_SETTING.local_data_path}/{self.scen.competition}": T(
                     "scenarios.data_science.share:scen.input_path"
                 ).r()
             },
-            running_timeout_period=self.scen.real_full_timeout(),
+            running_timeout_period=running_timeout_period,
         )
 
         stdout = implementation.execute(
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
index 9dced5cb1..45d033f3b 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -141,6 +141,8 @@ hypothesis_gen:
       2. **Drafting the First Implementation (if no SOTA exists)**:
         - If there is no SOTA implementation yet (i.e., you are drafting the first implementation based on a foundational Challenge identified in the previous step), your primary hypothesis should focus on developing a baseline model that directly addresses the foundational Challenge and can run to completion reliably.
         - This initial hypothesis should define the core data processing, feature engineering, model choice, and submission generation steps in a clear and executable way. Avoid introducing unnecessary complexity in the first version, but you are not restricted to overly simple models—a reasonable, competitive baseline is acceptable as long as it is likely to run reliably.
+      3. **Actionable Changes**:
+        - If a Challenge involves underperforming models, propose specific actions like removing or replacing those models.
       {% endif %}
       {% if plan.draft is true %}3{% else %}2{% endif %}. **Actionable Changes**:
         - If a Challenge involves underperforming models (e.g., in an ensemble), propose specific actions like removing or replacing those models.
@@ -182,7 +184,6 @@ hypothesis_gen:
       - **`DataLoadSpec`**: Responsible for loading raw competition data, ensuring data is converted to the correct types, and potentially providing an initial exploratory data analysis (EDA) summary. (e.g., fixing `zipfile.BadZipFile` by improving loading logic).
       - **`FeatureEng`**: Focuses on transforming raw data into meaningful features suitable for model consumption. Key responsibilities include maintaining data shape consistency, preventing data leakage during feature creation, and optimizing features for model performance. Feature engineering should be model-agnostic.
       - **`Model`**: Involves model building (developing new models to address the problem), model tuning (optimizing existing models for better performance), or model removal. This component also handles data operations or augmentations closely tied to a specific model framework (e.g., PyTorch `Datasets` & `DataLoaders`, TensorFlow `tf.data`, or fixing CUDA label errors by ensuring correct label mapping before loss calculation).
-      - **`Ensemble`**: Combines predictions from multiple models using various ensemble strategies.
       - **`Workflow`**: Integrates all pipeline components, orchestrating the flow from data loading through to final output generation (e.g., correcting `submission.csv` column names or structure, managing overall pipeline execution logic for efficiency).
 
     2. **Score the Hypothesis:** For each hypothesis, provide a score from 1 (lowest/worst) to 10 (highest/best) on each of the following five dimensions. Base your scores on all provided information.
@@ -199,7 +200,7 @@ hypothesis_gen:
     1. Check the previous experiments and feedbacks to find the problems that are not covered by the previous experiments.
     2. Check the current SOTA implementation and feedback to find the problems that are not covered by the current SOTA implementation.
     3. Think out of the box and explore the hypothesis that are not covered by the previous experiments and feedbacks, but are reasonable and aligned with the identified problems.
-    4. Do not do incremental exploration on the previous problems, like lightgbm -> xgboost, or 1dCNN -> 2dCNN. Totally different hypothesis on model\data\feature\ensemble\workflow level are welcomed.
+    4. Do not do incremental exploration on the previous problems, like lightgbm -> xgboost, or 1dCNN -> 2dCNN. Totally different hypotheses at the model\data\feature\workflow level are welcome.
     {% endif %}
 
     {% if plan.suggest_model_architecture is true %}
@@ -231,6 +232,7 @@ hypothesis_gen:
     # Identified Challenges{% if enable_idea_pool %} with Sampled Ideas{% endif %}
     {{ problems }}
 
+
 hypothesis_critique:
   system: |-
     {% include "scenarios.data_science.share:scen.role" %}
@@ -361,7 +363,62 @@ hypothesis_rewrite:
     {{ rewrite_output_format }}
     {% endif %}
 
-  user: |-
+
+hypothesis_select:
+  system: |-
+    You are a Kaggle Grandmaster with deep expertise in model evaluation and decision making. Based on the given information, please select the most appropriate hypothesis from the candidates.
+    These hypotheses are sourced from `model/data/feature/workflow`. Choose the one that best matches the intent or logic of the prompt.
+    Alternatively, if you determine that ensemble is the best option, you may propose an **ensemble hypothesis** (not present in the candidates), as long as it aligns with the runtime and training constraints.
+    You are given the following hypothesis candidates:
+    {{ hypothesis_candidates }}
+    If multiple hypotheses seem reasonable, select the one that is most robust or consistent with Previous Experiments and Feedbacks, and pay attention to the runtime of each loop.
+
+    If you believe that previous methods have reached their limits and the current setting only involves a single model, feel free to propose an ensemble solution. However, you **must** carefully allocate the training and runtime budget to ensure the **ensemble logic is well-executed and evaluated**, without compromising the performance of the previous models.
+
+    ### 1. Ensemble Core Principle
+    Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
+    Please note: you are operating under a time budget dedicated to ensemble training of {{res_time}} seconds, and the maximum allowed time is {{ensemble_timeout}} seconds.
+    {{use_ratio}}% of the total ensemble time has been used.
+
+    {% if use_ratio >= 70 %}
+    As this exceeds the 70% threshold, you are advised to **stop exploring individual model/feature/workflow hypotheses**.
+    Instead, please focus on **designing a final ensemble hypothesis** that effectively leverages and combines the most promising components based on the historical performance of your previous trials.
+    Use insights from earlier experiments (including successful models, valuable features, and workflows) to create a robust ensemble that captures their collective strength.
+    {% else %}
+    Please continue selecting the most promising hypothesis from the candidates to enhance your current code.
+    {% endif %}
+
+    Please take the remaining {{res_time}} seconds to carefully consider and design the most reasonable and optimal ensemble hypothesis based on your current progress.
+    Assume training a single model takes about 1 hour. For example, if you have roughly twice that time left, you can try training multiple models with different random seeds or data splits to use the remaining time effectively.
+    If you have more time, you might consider training a multi-fold ensemble. Use your judgment to decide how many folds or seeds fit within your remaining time budget.
+
+    ### 2. Training-Time Resource Allocation
+    - You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
+    - Avoid reducing base model quality just to save time. For example:
+      - Freezing large parts of the model (e.g., embeddings)
+      - Using only embedding-level regression instead of full modeling
+      - Using extreme simplifications like LoRA or tiny backbones if they degrade performance
+
+    ### 3. Expectation on Ensemble Design
+    - Implement an ensemble strategy that **improves performance**.
+      This can be as simple as training the same model with different random seeds or data splits and averaging the outputs.
+      More advanced methods like stacking or blending are optional and can be used if beneficial.
+      Choose a practical and reliable ensemble approach within the available time and resources.
+    - Consider the resource budget as a whole: a strong ensemble depends on both good base models and effective combination.
+
+    ### 4. Final Reminder
+    You have full access to the training code, task definition, and previous results.
+    You should weigh trade-offs thoughtfully and pick a design that **maximizes ensemble performance without shortcuts** that hurt model quality or cause timeouts.
+    - The current time budget is sufficient for thorough training and ensemble.
+    - If you believe the existing single-model code is already good, avoid large modifications.
+    - Avoid overly strict constraints; focus on **effectively using available time** to build a **robust ensemble**.
+
+    {% if hypothesis_output_format is not none %}
+    ## Final Output Format in JSON Schema:
+    {{ hypothesis_output_format }}
+    {% endif %}
+
+  user: |-
     # Scenario Description
     {{ scenario_desc }}
 
     # Previous Experiments and Feedbacks
     {{ exp_and_feedback_list_desc }}
 
@@ -371,13 +428,6 @@ hypothesis_rewrite:
     # Current SOTA Implementation
     {{ sota_exp_desc }}
 
-    # Original Hypotheses and Their Critiques
-    {{ hypothesis_critique_pairs }}
-
-    {% if time_status is not none %}
-    # Time Status
-    {{ time_status }}
-    {% endif %}
 
 
 task_gen:
@@ -483,12 +533,34 @@ task_gen:
        - Double-check that validation scores are saved correctly to `scores.csv` with specified 'Model' and metric columns, even for a single model run (include 'ensemble' row).
     8. **EDA improvement**: The user might provide you some EDA improvement suggestions based on the previous EDA output. If so, you should also include the EDA improvement in your sketch.
 
+    # Guidelines for Ensemble Implementation
+    You must carefully allocate the training and runtime budget to ensure the **ensemble logic is well-executed and evaluated**, without compromising model performance.
+    ### 1. Core Principle
+    Your goal is not just to tune individual models, but to build an **effective ensemble**. Make design decisions that lead to **strong overall ensemble performance**, not just strong base models.
+    ### 2. Training-Time Resource Allocation
+    - You may use **multiple folds** if justified, but you must **ensure the full pipeline completes within runtime limits**.
+    - Avoid reducing base model quality just to save time. For example:
+      - Freezing large parts of the model (e.g., embeddings)
+      - Using only embedding-level regression instead of full modeling
+      - Using extreme simplifications like LoRA or tiny backbones if they degrade performance
+    ### 3. Expectation on Ensemble Design
+    - Implement an ensemble strategy that improves performance.
+      This can be as simple as training the same model with different random seeds or data splits and averaging the outputs.
+      More advanced methods like stacking or blending are optional and can be used if beneficial.
+      Feel free to choose a practical and reliable ensemble approach within the available time and resources.
+    - Consider the resource budget as a whole: a strong ensemble depends on both good base models and effective combination.
+    ### 4. Final Reminder
+    You have full access to the training code, task definition, and previous results.
+    You should weigh trade-offs thoughtfully and pick a design that maximizes ensemble performance without shortcuts that hurt model quality or cause timeouts.
+    - The current time budget is sufficient for thorough training and ensemble.
+    - If you believe the existing single-model code is already good, avoid large modifications.
+    - Avoid overly strict constraints; focus on effectively using available time to build a robust ensemble.
+
     # Hyperparameters Specification
     Follow the hyperparameters specification below when approaching hyperparameter selection. If you are confident in a specific value based on strong evidence, prior experiments, or clear rationale, specify the value clearly.
     {% include "scenarios.data_science.share:spec.hyperparameter" %}
-
 
     {% if task_output_format is not none %}
     # Output Format
@@ -533,8 +605,6 @@ task_gen:
 
     This sketch should implement the following hypotheses:
     {% for hypothesis in hypotheses %}
-    ## {{ hypothesis.problem_name }}
-    **Why:** {{ hypothesis.problem_desc }}
     **Hypothesis:** {{ hypothesis.hypothesis }}
     {% endfor %}
 
@@ -614,7 +684,7 @@ output_format:
      "problem name 1 (should be exactly same as the problem name provided)": {
        {% if enable_idea_pool %}"inspired": "True or False. Set to True if the hypothesis is inspired by the user provided ideas. Otherwise, set it to False.",{% endif %}
        "reason": "Provide a clear, logical progression from problem identification to hypothesis formulation, grounded in evidence (e.g., trace history, domain principles, or competition constraints). Refer to the Hypothesis Guidelines for better understanding. Reason should be short with no more than two sentences.",
-        "component": "The component tag of the hypothesis. Must be one of ('DataLoadSpec', 'FeatureEng', 'Model', 'Ensemble', 'Workflow').",
+        "component": "The component tag of the hypothesis. Must be one of ('DataLoadSpec', 'FeatureEng', 'Model', 'Workflow').",
        "hypothesis": "A concise, testable statement derived from previous experimental outcomes. Limit it to one or two sentences that clearly specify the expected change or improvement in the 's performance.",
        "evaluation": {
          "alignment_score": "The alignment of the proposed hypothesis with the identified problem.",
@@ -666,4 +736,14 @@ output_format:
      }
    }
 
+  hypothesis_select_format: |-
+    Choose the best hypothesis from the provided hypothesis candidates {{ hypothesis_candidates }}.
+    You must return a dictionary in the following format **for each selected hypothesis**:
+    {
+      "hypothesis": "...",
+      "component": "..."  // Must be one of: 'DataLoadSpec', 'FeatureEng', 'Model', 'Workflow', or 'Ensemble'
+    }
+
+    The **"hypothesis"** must be selected **from the provided hypothesis candidates** (do not generate new ones),
+    **except** when you choose `"component": "Ensemble"`, in which case you should write your **own hypothesis**.
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 55a366649..cf827bc65 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -15,6 +15,7 @@
 from rdagent.core.proposal import ExpGen
 from rdagent.core.scenario import Scenario
 from rdagent.log import rdagent_logger as logger
+from rdagent.oai.backend.base import RD_Agent_TIMER_wrapper
 from rdagent.oai.llm_utils import APIBackend, md5_hash
 from rdagent.scenarios.data_science.dev.feedback import ExperimentFeedback
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
@@ -247,6 +248,13 @@ class HypothesisDetail(BaseModel):
     evaluation: HypothesisEvaluation = Field(description="Evaluate the quality of the hypothesis.")
 
 
+class HypothesisSimple(BaseModel):
+    hypothesis: str = Field(
+        description="The statement of the hypothesis. It could be a design of a new component, or a concise, testable statement derived from previous experimental outcomes."
+    )
+    component: HypothesisComponent = Field(description="The component tag of the hypothesis.")
+
+
 class HypothesisList(BaseModel):
     deduplicated_challenges: List[str] = Field(
         description="A list of deduplicated challenge captions. Each must retain its original wording. If multiple captions are semantically identical, keep the first one."
@@ -601,8 +609,6 @@ def hypothesis_gen(
         sys_prompt = T(".prompts_v2:hypothesis_gen.system").r(
             hypothesis_output_format=(
                 T(".prompts_v2:output_format.hypothesis").r(pipeline=pipeline, enable_idea_pool=enable_idea_pool)
-                if not self.supports_response_schema
-                else None
             ),
             pipeline=pipeline,
             enable_idea_pool=enable_idea_pool,
@@ -619,30 +625,10 @@ def hypothesis_gen(
         response = APIBackend().build_messages_and_create_chat_completion(
             user_prompt=user_prompt,
             system_prompt=sys_prompt,
-            response_format=HypothesisList if self.supports_response_schema else {"type": "json_object"},
-            json_target_type=(
-                Dict[str, Dict[str, str | Dict[str, str | int]]] if not self.supports_response_schema else None
-            ),
+            response_format={"type": "json_object"},
+            json_target_type=Dict[str, Dict[str, str | Dict[str, str | int]]],
         )
-        if self.supports_response_schema:
-            hypotheses = HypothesisList(**json.loads(response))
-            resp_dict = {
-                h.caption: {
-                    "reason": h.challenge,
-                    "component": h.component.value,
-                    "hypothesis": h.hypothesis,
-                    "evaluation": {
-                        "alignment_score": h.evaluation.alignment.score,
-                        "impact_score": h.evaluation.impact.score,
-                        "novelty_score": h.evaluation.novelty.score,
-                        "feasibility_score": h.evaluation.feasibility.score,
-                        "risk_reward_balance_score": h.evaluation.risk_reward_balance.score,
-                    },
-                }
-                for h in hypotheses.hypotheses
-            }
-        else:
-            resp_dict = json.loads(response)
+        resp_dict = json.loads(response)
 
         logger.info(f"Generated hypotheses:\n" + json.dumps(resp_dict, indent=2))
 
         # make sure the problem name is aligned
@@ -894,6 +880,52 @@ def hypothesis_rank(
             appendix=hypothesis_dict[max_score_problem_name].get("appendix", None),
         )
 
+    def hypothesis_select_with_llm(
+        self, scenario_desc: str, exp_feedback_list_desc: str, sota_exp_desc: str, hypothesis_candidates: dict
+    ):
+
+        # time_use_current = 0
+        # for exp, feedback in trace.hist:
+        #     if exp.running_info.running_time is not None:
+        #         time_use_current += exp.running_info.running_time
+        # res_time = 12*3600 - time_use_current
+        res_time = RD_Agent_TIMER_wrapper.timer.remain_time()
+        total_time = RD_Agent_TIMER_wrapper.timer.all_duration
+        use_time = round(total_time.total_seconds(), 2) - round(res_time.total_seconds(), 2)
+        use_ratio = 100 * use_time / round(total_time.total_seconds(), 2)
+        use_ratio = round(use_ratio, 2)
+
+        ensemble_timeout = DS_RD_SETTING.ensemble_timeout
+        hypothesis_candidates = str(json.dumps(hypothesis_candidates, indent=2))
+
+        sys_prompt = T(".prompts_v2:hypothesis_select.system").r(
+            hypothesis_candidates=hypothesis_candidates,
+            res_time=round(res_time.total_seconds(), 2),
+            ensemble_timeout=ensemble_timeout,
+            use_ratio=use_ratio,
+            hypothesis_output_format=T(".prompts_v2:output_format.hypothesis_select_format").r(
+                hypothesis_candidates=hypothesis_candidates
+            ),
+        )
+
+        user_prompt = T(".prompts_v2:hypothesis_select.user").r(
+            scenario_desc=scenario_desc,
+            exp_and_feedback_list_desc=exp_feedback_list_desc,
+            sota_exp_desc=sota_exp_desc,
+        )
+
+        response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            response_format=HypothesisSimple if self.supports_response_schema else {"type": "json_object"},
+            json_target_type=(
+                Dict[str, Dict[str, str | Dict[str, str | int]]] if not self.supports_response_schema else None
+            ),
+        )
+
+        response_dict = json.loads(response)
+        return response_dict
+
     def task_gen(
         self,
         component_desc: str,
@@ -1141,10 +1173,32 @@ def gen(
             improved_hypotheses_dict = hypothesis_dict.copy()  # Use original hypotheses directly
 
             # Step 3: Select the best hypothesis
-            pickled_problem_name, new_hypothesis = self.hypothesis_rank(
-                hypothesis_dict=improved_hypotheses_dict,
-                problem_dict=all_problems,
+            # pickled_problem_name, new_hypothesis = self.hypothesis_rank(
+            #     hypothesis_dict=hypothesis_dict,
+            #     problem_dict=all_problems,
+            # )
+
+            response_dict = self.hypothesis_select_with_llm(
+                scenario_desc=scenario_desc,
+                exp_feedback_list_desc=exp_feedback_list_desc,
+                sota_exp_desc=sota_exp_desc,
+                hypothesis_candidates=hypothesis_dict,
             )
+            component_map = {
+                "Model": HypothesisComponent.Model,
+                "Ensemble": HypothesisComponent.Ensemble,
+                "Workflow": HypothesisComponent.Workflow,
+                "FeatureEng": HypothesisComponent.FeatureEng,
+                "DataLoadSpec": HypothesisComponent.DataLoadSpec,
+            }
+
+            comp_str = response_dict.get("component")
+            hypo_str = response_dict.get("hypothesis")
+
+            if comp_str in component_map and hypo_str is not None:
+                new_hypothesis = DSHypothesis(component=component_map[comp_str], hypothesis=hypo_str)
+
+            pickled_problem_name = None
 
             # Step 3.5: Update knowledge base with the picked problem
             if DS_RD_SETTING.enable_knowledge_base:
                 trace.knowledge_base.update_pickled_problem(all_problems, pickled_problem_name)
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
index 3e61bb96a..b7ca2ba0b 100644
--- a/rdagent/scenarios/data_science/scen/__init__.py
+++ b/rdagent/scenarios/data_science/scen/__init__.py
@@ -177,6 +177,7 @@ def get_competition_full_desc(self) -> str:
             metric_direction=self.metric_direction,
             raw_description=self.raw_description,
             use_raw_description=DS_RD_SETTING.use_raw_description,
+            ensemble_limit=None,
             time_limit=None,
             recommend_time_limit=None,
             eda_output=None,
@@ -189,6 +190,7 @@ def get_scenario_all_desc(self, eda_output=None) -> str:
         """
         eda_output depends on dynamic .md files from current workspace, not fixed.
         """
+        ensemble_timeout = getattr(DS_RD_SETTING, "ensemble_timeout", None)
         return T(".prompts:scenario_description").r(
             background=self.background,
             submission_specifications=self.submission_specifications,
@@ -197,6 +199,7 @@ def get_scenario_all_desc(self, eda_output=None) -> str:
             metric_direction=self.metric_direction,
             raw_description=self.raw_description,
             use_raw_description=DS_RD_SETTING.use_raw_description,
+            ensemble_limit=f"{ensemble_timeout / 60 / 60:.2f} hours" if ensemble_timeout is not None else None,
             time_limit=f"{self.real_full_timeout() / 60 / 60 : .2f} hours" if DS_RD_SETTING.show_hard_limit else None,
             recommend_time_limit=(
                 f"{self.recommend_full_timeout() / 60 / 60 : .2f} hours" if DS_RD_SETTING.sample_data_by_LLM else None
diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml
index 77e926f66..2e9303ecd 100644
--- a/rdagent/scenarios/data_science/scen/prompts.yaml
+++ b/rdagent/scenarios/data_science/scen/prompts.yaml
@@ -36,7 +36,11 @@ scenario_description: |-
     ====== Time Limit On Full Code Execution ======
     Your full code's execution is limited to **{{ time_limit }}**. After this time limit, your code will be terminated and all time and resources are wasted. Always make sure your code will not run longer than this time limit. During this time limit, you have all the resources available to you. Please fully leverage all the computational resources(CPUs and GPUs) to achieve the best performance like choose a powerful model, use a large batch size, enable data sampler with big parallel.
-    {% endif %}{% if debug_time_limit is not none%}
+    {% endif %}
+    {% if ensemble_limit is not none %}
+    If your code involves ensemble training, note that the total time allowed for ensemble runs is **{{ ensemble_limit }}**. Make sure to plan your ensemble strategy wisely within this limit.
+    {% endif %}
+    {% if debug_time_limit is not none%}
     ====== Time Limit On Debug Mode Code Execution ======
     Your are also required to include a debug mode in your code, the debug code's execution is limited to **{{ debug_time_limit }}**. You should make sure 10 percent of the data training one epoch can be finished within this time limit. If not, your should propose a new debug strategy in your task.
     {% endif %}{% if recommend_time_limit is not none %}
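
For reviewers, here is a minimal standalone sketch (not part of the patch) of the time-budget arithmetic that the new `hypothesis_select_with_llm` helper performs before rendering the `hypothesis_select` prompt. It assumes, as the diff suggests, that the timer exposes `remain_time()` and `all_duration` as `timedelta` values; the function and field names below are illustrative only.

```python
from datetime import timedelta


def ensemble_budget_status(remain: timedelta, total: timedelta, threshold: float = 70.0) -> dict:
    """Mirror the use_ratio computation in hypothesis_select_with_llm.

    `remain` and `total` stand in for RD_Agent_TIMER_wrapper.timer.remain_time()
    and .all_duration; `threshold` matches the {% if use_ratio >= 70 %} branch in
    the hypothesis_select prompt. This is an illustrative sketch, not library API.
    """
    res_time = round(remain.total_seconds(), 2)
    total_time = round(total.total_seconds(), 2)
    use_time = total_time - res_time
    use_ratio = round(100 * use_time / total_time, 2)
    return {
        "res_time": res_time,        # seconds still available to the loop
        "use_ratio": use_ratio,      # percentage of the overall budget already spent
        "force_ensemble": use_ratio >= threshold,  # switch the prompt to ensemble-only guidance
    }


# Example: with a 12-hour total budget and 3 hours left,
# use_ratio is 75.0, so the prompt moves to its ensemble-only branch.
print(ensemble_budget_status(timedelta(hours=3), timedelta(hours=12)))
```

Once that branch is taken, the runner side of the patch caps "Ensemble"-named tasks at `ensemble_timeout` (18000 seconds, i.e. 5 hours by default) instead of the regular full-run timeout.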