diff --git a/.devcontainer/env b/.devcontainer/env
index 8282f374a..2f497fdfb 100644
--- a/.devcontainer/env
+++ b/.devcontainer/env
@@ -9,17 +9,19 @@ TIMEOUT_FAIL_LIMIT=100
 # CHAT_TEMPERATURE=0.7
 CHAT_STREAM=False
-CHAT_TEMPERATURE=1
 CHAT_MODEL=o1-preview
 SYSTEM_PROMPT_ROLE=user
-BACKEND=rdagent.oai.backend.LiteLLMAPIBackend
-OPENAI_API_KEY=sk-1234
-OPENAI_API_BASE=http://ep14.213428.xyz:38881
+BACKEND=rdagent.oai.backend.LiteLLMAPIBackend
+OPENAI_API_KEY=sk-1234
+OPENAI_API_BASE=http://10.150.240.117:38803
+EMBEDDING_MODEL=text-embedding-3-small
+CHAT_MODEL=gpt-5
+CHAT_TEMPERATURE=1
 
 # amc chat model configs:
-EMBEDDING_MODEL=text-embedding-ada-002
+#EMBEDDING_MODEL=text-embedding-ada-002
 
 # Cache Setting (Optional):
 DUMP_CHAT_CACHE=True
diff --git a/rdagent/app/agentic_sys/conf.py b/rdagent/app/agentic_sys/conf.py
new file mode 100644
index 000000000..8d0e76b98
--- /dev/null
+++ b/rdagent/app/agentic_sys/conf.py
@@ -0,0 +1,27 @@
+
+from pydantic_settings import SettingsConfigDict
+
+from rdagent.core.conf import ExtendedBaseSettings
+
+
+class AgenticSysSetting(ExtendedBaseSettings):
+    model_config = SettingsConfigDict(env_prefix="AS_", protected_namespaces=())
+
+    competition: str | None = None
+
+    # Main components
+    ## Scen
+    scen: str = "rdagent.scenarios.agentic_sys.scen.AgenticSysScen"
+    """
+    Scenario class for agentic system tasks.
+    - For Kaggle competitions, use: "rdagent.scenarios.data_science.scen.KaggleScen"
+    - For custom data science scenarios, use: "rdagent.scenarios.data_science.scen.DataScienceScen"
+    """
+    exp_gen: str = "rdagent.scenarios.agentic_sys.proposal.AgenticSysExpGen"
+    coder: str = "rdagent.scenarios.agentic_sys.dev.AgenticSysCoder"
+    runner: str = "rdagent.scenarios.agentic_sys.dev.AgenticSysRunner"
+
+    feedback: str = "rdagent.scenarios.agentic_sys.feedback.AgenticSysExp2Feedback"
+
+
+ASYS_RD_SETTING = AgenticSysSetting()
diff --git a/rdagent/app/agentic_sys/loop.py b/rdagent/app/agentic_sys/loop.py
new file mode 100644
index 000000000..de92d76ed
--- /dev/null
+++ b/rdagent/app/agentic_sys/loop.py
@@ -0,0 +1,50 @@
+import asyncio
+from pathlib import Path
+from typing import Optional
+
+import fire
+import typer
+from typing_extensions import Annotated
+
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+
+from rdagent.app.agentic_sys.conf import ASYS_RD_SETTING
+from rdagent.scenarios.agentic_sys.loop import AgenticSysRDLoop
+
+
+def main(
+    path: Optional[str] = None,
+    checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", "-c/-C")] = True,
+    checkout_path: Optional[str] = None,
+    step_n: Optional[int] = None,
+    loop_n: Optional[int] = None,
+    timeout: Optional[str] = None,
+    competition="deepresearch",
+    replace_timer=True,
+    exp_gen_cls: Optional[str] = None,
+):
+    if checkout_path is not None:
+        checkout = Path(checkout_path)
+
+    if competition is not None:
+        ASYS_RD_SETTING.competition = competition
+
+    if not ASYS_RD_SETTING.competition:
+        logger.error("Please specify competition name.")
+        return
+
+    if path is None:
+        agentic_sys_loop = AgenticSysRDLoop(ASYS_RD_SETTING)
+    else:
+        agentic_sys_loop: AgenticSysRDLoop = AgenticSysRDLoop.load(path, checkout=checkout, replace_timer=replace_timer)
+
+    # replace exp_gen if we have a new class
+    if exp_gen_cls is not None:
+        agentic_sys_loop.exp_gen = import_class(exp_gen_cls)(agentic_sys_loop.exp_gen.scen)
+
+    asyncio.run(agentic_sys_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
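# --- Illustrative usage sketch (annotation, not part of the patch) ---
# A minimal sketch of how the AS_-prefixed settings and the new loop entry point are
# expected to be driven, assuming ExtendedBaseSettings behaves like a standard
# pydantic BaseSettings; the environment values below are examples, not values
# defined by this patch.
import os

os.environ["AS_COMPETITION"] = "deepresearch"  # maps to AgenticSysSetting.competition
os.environ["AS_CODER"] = "rdagent.scenarios.agentic_sys.dev.AgenticSysCoder"  # component override

from rdagent.app.agentic_sys.conf import AgenticSysSetting

settings = AgenticSysSetting()  # pydantic-settings reads the AS_* variables at construction time
print(settings.competition, settings.coder)

# The loop itself is started from the command line via fire, e.g.:
#   python -m rdagent.app.agentic_sys.loop --competition deepresearch --loop_n 3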
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
index 964ce3683..679ed15f8 100644
--- a/rdagent/core/proposal.py
+++ b/rdagent/core/proposal.py
@@ -162,6 +162,9 @@ def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = Non
         self.knowledge_base: ASpecificKB | None = knowledge_base
         self.current_selection: tuple[int, ...] = (-1,)
 
+        # When multiple nodes in the trace run in parallel, nodes are not committed before they finish running.
+        self.uncommitted_experiments: dict[int, Experiment] = {}  # loop_id -> Experiment
+
     def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
         """Access the last experiment result, sub-task, and the corresponding hypothesis."""
         # TODO: The return value does not align with the signature.
@@ -240,6 +243,39 @@ def get_parents(self, child_idx: int) -> list[int]:
 
         return ancestors
 
+    def register_uncommitted_exp(self, exp: Experiment, loop_id: int):
+        self.uncommitted_experiments[loop_id] = exp
+
+    def deregister_uncommitted_exp(self, loop_id: int):
+        if loop_id in self.uncommitted_experiments:
+            del self.uncommitted_experiments[loop_id]
+
+    def sync_dag_parent_and_hist(
+        self,
+        exp_and_fb: tuple[Experiment, ExperimentFeedback],
+        cur_loop_id: int,
+    ) -> None:
+        """
+        Add the corresponding parent index to dag_parent whenever hist is about to change.
+        Should be called whenever hist is updated.
+        """
+
+        if len(self.hist) == 0 or len(self.get_current_selection()) == 0:
+            # the node we are going to add is the first node of hist / root node of a new sub-trace
+            self.dag_parent.append(())
+
+        else:
+            current_node_idx = self.current_selection[0]
+
+            if current_node_idx == -1:
+                # the current selection is the latest one
+                current_node_idx = len(self.hist) - 1
+
+            self.dag_parent.append((current_node_idx,))
+
+        self.hist.append(exp_and_fb)
+        self.idx2loop_id[len(self.hist) - 1] = cur_loop_id
+        self.deregister_uncommitted_exp(cur_loop_id)
+
 
 class CheckpointSelector:
     """
@@ -298,7 +334,7 @@ def __init__(self, scen: Scenario) -> None:
         self.scen = scen
 
     @abstractmethod
-    def gen(self, trace: Trace, plan: ExperimentPlan | None = None) -> Experiment:
+    def gen(self, trace: Trace) -> Experiment:
         """
         Generate the experiment based on the trace.
         Planning is part of gen, but since we may support multi-stage planning,
diff --git a/rdagent/scenarios/agentic_sys/dev.py b/rdagent/scenarios/agentic_sys/dev.py
new file mode 100644
index 000000000..cf98c5d4f
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/dev.py
@@ -0,0 +1,2467 @@
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from rdagent.core.developer import Developer
+from rdagent.core.experiment import Experiment, FBWorkspace
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.agentic_sys.env import get_agent_sys_env
+from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool
+
+# TODO: We only list the dummy coder and runner here.
+# If we want to implement a comprehensive agentic system R&D Agent, we need to implement it with CoSTEER.
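# --- Illustrative sketch (annotation, not part of the patch) ---
# A minimal sketch of how the new Trace bookkeeping above is expected to be used by a
# parallel R&D loop; `trace`, `loop_id`, `exp`, and `feedback` are placeholders, and the
# two hook names below are hypothetical, not APIs introduced by this patch.

def on_experiment_started(trace, loop_id, exp):
    # While an experiment is still running, it is tracked but not yet part of trace.hist.
    trace.register_uncommitted_exp(exp, loop_id)


def on_feedback_ready(trace, loop_id, exp, feedback):
    # Committing the result appends to hist, records the DAG parent taken from the current
    # selection, maps the new hist index to this loop id, and drops the uncommitted entry.
    trace.sync_dag_parent_and_hist((exp, feedback), cur_loop_id=loop_id)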
+
+
+class AgenticSysCoder(Developer[Experiment]):
+    # generate code for agentic system experiments
+    def __init__(self, scen):
+        self.scen = scen
+
+        # initialize LLM backend
+        self.api_backend = APIBackend()
+        logger.info("Initialized AgenticSysCoder with LLM backend")
+
+        # initialize web search tool
+        search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+        self.web_search_tool = None
+
+        try:
+            self.web_search_tool = create_web_search_tool(search_config_path)
+            logger.info("Initialized web search tool for external knowledge retrieval")
+        except Exception as e:
+            logger.warning(f"Failed to initialize web search tool: {e}")
+            self.web_search_tool = None
+
+    def develop(self, exp: Experiment) -> Experiment:
+        # TODO: implement the coder
+        """Generate code based on the experiment hypothesis."""
+        logger.info("Starting code generation for the experiment")
+
+        try:
+            # 1. Initialize workspace with FBWorkspace
+            exp.experiment_workspace = FBWorkspace()
+            ws_path = Path(exp.experiment_workspace.workspace_path)
+            ws_path.mkdir(parents=True, exist_ok=True)
+            logger.info(f"Initialized workspace at {ws_path}")
+
+            # 2. Prepare enhanced context with web search
+            context = self.prepare_enhanced_context(exp)
+
+            # 3. Generate code files using CoSTEER approach
+            code_artifacts = self.generate_code_with_costeer(exp)
+            exp.experiment_workspace.inject_files(**code_artifacts)
+            logger.info(f"Injected {len(code_artifacts)} files into workspace")
+
+            # 4. Prepare execution environment following the conf.py pattern
+            timeout = self.calculate_timeout(exp)
+            env = get_agent_sys_env(
+                # extra_volumes={str(ws_path): "/workspace"},
+                running_timeout_period=timeout,
+                enable_cache=True,
+            )
+            logger.info("Prepared execution environment")
+
+            # 5. Optional pre-run validation
+            try:
+                if self.should_validate_generation(exp):
+                    validation_result = self.validate_generated_code(env, ws_path)
+                    if not getattr(validation_result, 'success', False):
+                        logger.warning(f"Pre-run validation failed: {validation_result.message}")
+            except Exception as e_val:
+                logger.error(f"Validation step raised: {e_val}; continuing...")
+
+            # 6. Run the entrypoint inside the environment (use train.py as entry)
+            try:
+                logger.info("Running generated code inside the execution environment")
+                # run_res = env.run(
+                #     entry="bash",
+                #     cmd="cd /workspace && python train.py", timeout=timeout
+                # )
+                run_res = exp.experiment_workspace.run(env=env, entry="python train.py")
+                # collect run outputs
+                exp.run_returncode = getattr(run_res, 'returncode', None)
+                exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None))
+                exp.run_stderr = getattr(run_res, 'stderr', None)
+                logger.info("Run finished")
+            except Exception as e_run:
+                logger.error(f"Execution inside environment failed: {e_run}")
+                # keep the exception and let the caller decide; still return exp with workspace
+                exp.run_exception = e_run
+
+        except Exception as e:
+            logger.error(f"Code generation failed: {str(e)}")
+            exp.exception = e
+            if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace:
+                try:
+                    exp.experiment_workspace = self.create_fallback_workspace(exp)
+                except Exception as e_fallback:
+                    logger.warning(f"Fallback workspace creation failed: {e_fallback}")
+        return exp
+
+    def prepare_enhanced_context(self, exp: Experiment):
+        """
+        Prepare enhanced context with external knowledge from web search
+
+        Args:
+            exp: Current experiment
+
+        Returns:
+            Enhanced context dictionary
+        """
+        hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance')
+
+        # Base context
+        context = {
+            'hypothesis': hypothesis,
+            'scenario_desc': self.scen.get_scenario_all_desc(),
+            'success_criteria': self.scen.get_success_criteria(),
+            'task_id': getattr(exp, 'id', 'unknown'),
+            'task_domain': getattr(self.scen, 'domain', 'general'),
+        }
+
+        # Add web search results if available (NEW)
+        if self.web_search_tool:
+            try:
+                logger.info("Retrieving external knowledge via web search...")
+
+                # Check if search service is healthy
+                if not self.web_search_tool.client.health_check():
+                    logger.warning("Search service unavailable, skipping external search")
+                    context['external_sources'] = []
+                    return context
+
+                # Identify knowledge gaps
+                knowledge_gaps = self.identify_knowledge_gaps(exp, hypothesis)
+
+                # Prepare search context
+                search_context = {
+                    'methodology': self.extract_methodology(hypothesis),
+                    'complexity': self.assess_complexity(hypothesis)
+                }
+
+                # Perform web search
+                external_sources = self.web_search_tool.search_for_hypothesis(
+                    task_description=hypothesis,
+                    current_gaps=knowledge_gaps,
+                    context=search_context
+                )
+
+                context['external_sources'] = external_sources
+                logger.info(f"Retrieved {len(external_sources)} external sources")
+
+                # Add summary of external sources to context
+                if external_sources:
+                    context['external_knowledge_summary'] = self.summarize_external_sources(
+                        external_sources
+                    )
+
+            except Exception as e:
+                logger.error(f"Web search failed: {e}")
+                context['external_sources'] = []
+        else:
+            context['external_sources'] = []
+
+        return context
+
+    def identify_knowledge_gaps(self, exp, hypothesis):
+        """
+        Identify knowledge gaps from the hypothesis
+
+        Returns:
+            List of knowledge gap descriptions
+        """
+        gaps = []
+
+        # Extract keywords indicating knowledge needs
+        hypothesis_lower = hypothesis.lower()
+
+        # Common agentic system knowledge areas
+        knowledge_areas = {
+            'planning': ['plan', 'planning', 'strategy', 'approach'],
+            'reasoning': ['reason', 'reasoning', 'logic', 'inference'],
+            'learning': ['learn', 'learning', 'adapt', 'optimization'],
+            'memory': ['memory', 'context', 'history', 'recall'],
+            'tool_use': ['tool', 'api', 'external', 'integration'],
+            'evaluation':
['evaluate', 'assessment', 'metric', 'performance'], + 'communication': ['communicate', 'language', 'dialogue', 'interaction'] + } + + for area, keywords in knowledge_areas.items(): + if any(kw in hypothesis_lower for kw in keywords): + gaps.append(f"{area} techniques and best practices") + + # Add general gaps if none identified + if not gaps: + gaps.append("agentic system design patterns") + gaps.append("system implementation strategies") + + logger.info(f"Identified knowledge gaps: {gaps}") + return gaps[:5] # Limit to top 5 + + + def extract_methodology(self, hypothesis: str) -> str: + """Extract methodology from hypothesis""" + hypothesis_lower = hypothesis.lower() + + methodologies = { + 'reinforcement learning': ['rl', 'reinforcement', 'q-learning', 'policy'], + 'retrieval augmented generation': ['rag', 'retrieval', 'augmented'], + 'chain of thought': ['cot', 'chain of thought', 'reasoning chain'], + 'tree of thought': ['tot', 'tree of thought', 'reasoning tree'], + 'multi-agent': ['multi-agent', 'multiple agents', 'agent collaboration'], + 'iterative refinement': ['iterative', 'refinement', 'feedback loop'] + } + + for method, keywords in methodologies.items(): + if any(kw in hypothesis_lower for kw in keywords): + return method + + return 'general agentic approach' + + def assess_complexity(self, hypothesis: str) -> str: + """Assess hypothesis complexity""" + hypothesis_lower = hypothesis.lower() + + high_complexity_indicators = [ + 'complex', 'advanced', 'sophisticated', 'multi-stage', + 'distributed', 'parallel', 'optimization' + ] + + medium_complexity_indicators = [ + 'moderate', 'standard', 'typical', 'conventional' + ] + + if any(ind in hypothesis_lower for ind in high_complexity_indicators): + return 'high' + elif any(ind in hypothesis_lower for ind in medium_complexity_indicators): + return 'medium' + else: + return 'low' + + def summarize_external_sources(self, sources: List[Dict[str, Any]]) -> str: + """ + Summarize external sources for context injection + + Args: + sources: List of external source dictionaries + + Returns: + Formatted summary string + """ + if not sources: + return "No external sources available." + + summary_parts = [] + + # High credibility sources + high_cred = [s for s in sources if s.get('credibility_level') == 'High'] + if high_cred: + summary_parts.append( + f"High-credibility sources ({len(high_cred)}): " + + ", ".join(s['title'][:50] for s in high_cred[:3]) + ) + + # Key insights + key_insights = [] + for source in sources[:5]: + summary = source.get('summary', '') + if len(summary) > 50: + key_insights.append(summary[:100]) + + if key_insights: + summary_parts.append( + "Key insights: " + " | ".join(key_insights[:2]) + ) + + return "\n".join(summary_parts) + + + + def generate_code_with_costeer(self, exp) -> Dict[str, str]: + """ + Generate code artifacts using CoSTEER approach + """ + logger.info("Generating code using CoSTEER framework") + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + context = { + 'hypothesis': hypothesis, + 'scenario_desc': self.scen.get_scenario_all_desc(), + 'success_criteria': self.scen.get_success_criteria(), + } + # generate code artifacts + code_artifacts = {} + + #1. generate main agent implementation + agent_code = self.generate_agent_code(context) + code_artifacts['agent.py'] = agent_code + + #2. Generate execution script + train_code = self.generate_train_script(context) + code_artifacts['train.py'] = train_code + + #3. 
Generate requirements file + requirements = self.generate_requirements(context) + code_artifacts['requirements.txt'] = requirements + + #4. Generate configuration file if needed + if self.needs_config_file(context): + config_code = self.generate_config_file(context) + code_artifacts['config.py'] = config_code + + logger.info(f"Generated {len(code_artifacts)} code artifacts") + return code_artifacts + + def prepare_execution_environment(self, exp: Experiment, ws_path: Path): + """ + Prepare execution environment similar to DS CoSTEER approach + """ + try: + # Get environment configuration + extra_volumes = {str(ws_path): "/workspace"} + #Set timeout based on experiment complexity + timeout = self.calculate_timeout(exp) + #create environment using agent_sys specific configuration + env = get_agent_sys_env( + extra_volumes = extra_volumes, + running_timeout_period = timeout, + enable_cache=True + ) + logger.info("Prepared execution environment successfully") + return env + + except Exception as e: + logger.error(f"Failed to prepare execution environment: {str(e)}") + raise + + def calculate_timeout(self, exp: Experiment) -> int: + """ + Calculate appropriate timeout based on experiment characteristics + """ + base_timeout = 300 # default 5 minutes + #Adjust timeout based on hypothesis comnplexity + hypothesis = getattr(exp, 'hypothesis', '') + if 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower(): + return base_timeout * 2 #parallel tasks may need more time + elif 'optimisation' in hypothesis.lower(): + return base_timeout * 4 #learning/optimization may need more time + elif 'simple' in hypothesis.lower() or 'basic' in hypothesis.lower(): + return base_timeout #simple tasks + return base_timeout + + def should_validate_generation(self,exp: Experiment) -> bool: + """ + Determine if we should validate generated code before proceeding + + Validation is recommended when: + 1. It's the first experiment (no prior validation history) + 2. The hypothesis involves complex/risk operations + 3. Previous experiment has validation failures + 4. Configuration explicitly requires validation + + parameters: + exp: Experiment + The experiment to potentially + Returns: bool + True if validation should be performed + """ + #1. check global configuration flag + validation_config = getattr(self.scen, 'enable_code_validation', True) + if not validation_config: + logger.info("Code validation disabled by configuration") + return False + #2. always validate first experiment + if not hasattr(exp, 'iteration_number') or exp.iteration_number == 0: + logger.info("First experiment - validation enabled ") + return True + #3. check hypothesis complexity/risk indicators + hypothesis = getattr(exp, 'hypothesis', '').lower() + + #High risk keywords that suggest validation is needed + high_risk_keywords = [ + 'parallel', 'concurrent', 'multi-thread', 'async', # Concurrency risks + 'optimization', 'complex', 'advanced', # Complexity + 'distributed', 'network', 'remote', # Network operations + 'file system', 'database', 'io', # I/O operations + 'experimental', 'novel', 'new approach' # Unproven approaches + ] + + if any(keyword in hypothesis for keyword in high_risk_keywords): + logger.info(f"High risk hypothesis detected, validation enabled") + return True + + if hasattr(exp, 'previous_validation_failed') and exp.previous_validation_failed: + logger.info("Previous validation failed, re-enabling validation") + return True + + #5. 
skip validation for simple/proven approaches + simple_keywords = ['simple', 'basic', 'straightforward', 'minimal'] + if any(keyword in hypothesis for keyword in simple_keywords): + logger.info("Simple hypothesis detected, skipping validation") + return False + + # 6. Default behavior: validate every N experiments + validation_interval = getattr(self.scen, 'validation_interval', 3) + iteration = getattr(exp, 'iteration_number', 0) + + if iteration % validation_interval == 0: + logger.info(f"Periodic validation (interval={validation_interval})") + return True + + # 7. Default: skip validation for efficiency + logger.info("No validation triggers met - skipping validation") + return False + + + def validate_generated_code(self, env, ws_path: Path): + """ + Validate generated code by running basic checks + """ + class ValidationResult: + def __init__(self,success, message): + self.success = success + self.message = message + + try: + #Run basic syntax check + check_cmd = "python -m py_compile agent.py && python -m py_compile train.py" + result = env.run( + entry_point = "bash", + cmd = f'cd/workspace && {check_cmd}', + timeout = 30 + ) + if result.returncode == 0: + return ValidationResult(True, "syntax validation passed") + else: + return ValidationResult(False, f"Syntax validation failed: {result.stderr}") + + except Exception as e: + return ValidationResult(False, f"Validation error: {str(e)}") + + # def generate_agent_code(self,context): + # """ + # Generate agent code based on context + # """ + # hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + # #enhanced agent template with CoSTEER improvement + # return f''' + # """ + # Agentic System Implementation - CoSTEER enhanced + # Hypothesis: {hypothesis} + # Generated with intelligent code generation + # """ + # import time + # import logging + # import threading + # from typing import Dict, List, Any, Optional + # from concurrent.futures import ThreadPoolExecutor, as_completed + # from dataclasses import dataclass + # from enum import Enum + # import json + + # #Configurable logging + # logging.basicConfig(level = logging.INFO) + # logger = logging.getLogger("AgenticSystem") + + # class TaskStatus(Enum): + # PENDING = "pending" + # RUNNING = "running" + # COMPLETED = "completed" + # FAILED = "failed" + + # @dataclass + # class TaskResult: + # task_id: int + # success: bool + # execution_time: float + # error: Optional[str] = None + # data: Optional[Dict[str, Any]] = None + + # class AgenticSystem: + # """ + # Ehanced Agentic System with CoSTEER optimizations + # """ + # def __init__(self, config[Dict] = None): + # self.name = "CoSTEER_AgenticSystem" + # self.task_count = 0 + # self.config = config if config else self.get_default_config() + + # #Performance Tracking + # self.performance_metrics = {{"total_tasks": 0,"successful_tasks": 0,"failed_tasks": 0,"total_execution_time": 0}} + + # #thread safety + # self.lock = threading.Lock() + + # logger.info(f"Initialized {{self.name}} with config: {{self.config}}") + + # def get_default_config(self): + # """Get default configuration optimized for hypothesis""" + # return {{ + # "max_workers": 4, + # "task_timeout": 60, + # "enable_parallel": {'parallel' in hypothesis.lower()}, + # "enable_optimization": {'optimization' in hypothesis.lower()} + # }} + + # def run_task(self, task: Dict[str, Any]): + # """Execute single task with enhanced error handling and monitoring""" + # start_time = time.time() + # task_id = task.get('id', self.get_next_task_id()) + # try: + # 
logger.info(f"Starting task {{task_id}}") + # #Simulate intelligent task processing + # self.process_task_logic(task) + # execution_time = time.time() - start_time + # #update metrics + # with self.lock: + # self.metrics['total_tasks'] += 1 + # self.metrics['successful_tasks'] += 1 + # self.metrics['total_execution_time'] += execution_time + # result = TaskResult( + # task_id = task_id, + # status = TaskStatus.COMPLETED, + # execution_time = execution_time, + # success = True, + # data = {{'processed': True, 'task_type': task.get('type', 'unknown')}} + # ) + + # logger.info(f"Task {{task_id}} completed successfully in {{execution_time:.4f}}s") + # return result + + # except Exception as e: + # execution_time = time.time() - start_time + # with self.lock: + # self.metrics['total_tasks'] += 1 + # self.metrics['failed_tasks'] += 1 + # self.metrics['total_execution_time'] += execution_time + # result = TaskResult( + # task_id = task_id, + # status = TaskStatus.FAILED, + # execution_time = execution_time, + # success = False, + # error = str(e) + # ) + # logger.error(f"Task {{task_id}} failed: {{str(e)}}") + # return result + + # def get_next_task_id(self): + # "thread-safe task id generation" + # with self.lock: + # self.task_count += 1 + # return self.task_count + + # def process_task_logic(self, task): + # """Intelligent task processing based on hypothesis""" + # task_type = task.get('type', 'default') + # complexity = task.get('complexity', 1) + + # #Simulate processing time based on complexity + # base_time = 0.01 + # processing_time = base_time * complexity + + # #Add hypothesis-specific optimisation + # if complexity > 5 and not self.config.get('enable_optimization', False): + # # 10% error rate for high complexity tasks + # if time.time() % 10 < 1: + # raise RuntimeError(f"Simulated error for complex task {{task.get('id')}}") + + # def run_tasks(self, tasks): + # """ + # Execute multiple tasks with intelligent scheduling + # """ + # if tasks is None: + # tasks = self.generate_default_tasks() + # logger.info(f"Starting execution of {{len(tasks)}} tasks") + # batch_start_time = time.time() + + # if self.config.get('enable_parallel', True) and len(tasks) > 1: + # results = self.run_tasks_parallel(tasks) + # else: + # results = self.run_tasks_sequential(tasks) + + # #Calculate comprehensive metrics + # total_time = time.time() - batch_start_time + # success_count = sum(1 for r in results if r.success) + # avg_task_time = sum(r.execution_time for r in results) / len(results) if results else 0 + + # metrics = {{ + # "success_rate": success_count / len(results) if results else 0, + # "avg_task_time": avg_task_time, + # "error_count": len(results) - success_count, + # "total_tasks": len(results), + # "total_execution_time": total_time, + # "system_metrics": self.metrics.copy() + # }} + # logger.info(f"Batch execution completed: {{metrics}}") + # return metrics + + # def run_tasks_sequential(self, tasks): + # """Execute task sequentially""" + # results = [] + # for task in tasks: + # result = self.run_task(task) + # results.append(result) + # return results + + # def run_tasks_parallel(self, tasks): + # """Execute tasks in parallel using ThreadPoolExecutor""" + # results = [] + # max_workers = min(self.config.get('max_workers', 4), len(tasks)) + # with ThreadPoolExecutor(max_workers = max_workers) as executor: + # future_to_task = {{executor.submit(self.run_task, task): task for task in tasks}} + # for future in as_completed(future_to_task): + # try: + # result = future.result(timeout = 
self.config.get('task_timeout', 30)) + # results.append(result) + # except Exception as e: + # #Create error result for failed failure + # task = future_to_task[future] + # error_result = TaskResult( + # task_id = task.get('id', 0), + # status = TaskStatus.FAILED, + # execution_time = 0, + # success = False, + # error = f"Future execution failed: {{str(e)}}" + # ) + # results.append(error_result) + # return results + + # def generate_default_tasks(self): + # """Generate default tasks for testing""" + # return [ + # {{ + # "id": i, + # "type": "test", + # "data": f"sample_{{i}}", + # "complexity": (i % 5 ) + 1 + # }} for i in range(10) + # ] + + # def get_system_status(self): + # """Get current system status and metrics""" + # with self.lock: + # status = {{ + # 'name': self.name, + # 'config': self.config, + # 'metrics': self.metrics.copy(), + # 'success_rate': ( + # self.metrics['successful_tasks'] / self.metrics['total_tasks'] + # if self.metrics['total_tasks'] > 0 else 0 + # ) + # }} + # return status + # ''' + + def generate_agent_code(self, task_info): + """ + Generate agent code using LLM instead of templates + """ + system_prompt = """ + You are an expert in building AI research agents for complex analytical tasks. +Generate production-ready Python code implementing a multi-step research agent capable of: +- Breaking down complex research questions +- Gathering and synthesizing information +- Producing comprehensive, insightful answers +Follow best practices with proper error handling and documentation. + """ + user_prompt = f"""Generate a research agent for DeepResearch Bench evaluation. +**Research Task:** +- Domain: {task_info['domain']} +- Question: {task_info['question']} +- Sub-Question: {task_info['sub_question']} +- Required Capabilities: {task_info['required capabilities']} +- Complexity : {task_info['complexity']} + +**Agent Requirements:** + +1. Class: ResearchAgent with methods: + - __init__(config: Dict = None) + - research(question: str) -> Dict: Main research pipeline + - decompose_question(question: str) -> List[str]: Break into sub-questions + - gather_information(sub_q: str) -> Dict: Information gathering + - analyze_information(info_list: List[Dict]) -> Dict: Analysis + - synthesize_answer(analyses: List[Dict]) -> str: Final synthesis + +2. Research Pipeline: + Step 1: Question Decomposition - Break complex question into manageable sub-questions + Step 2: Information Gathering - For each sub-question, gather relevant information + Step 3: Analysis - Analyze gathered information with causal reasoning + Step 4: Synthesis - Integrate analyses into comprehensive answer + +3. Output Format (critical for evaluation): + {{ + "answer": str, # Main comprehensive answer + "sub_answers": [ # Answers to each sub-question + {{"question": str, "answer": str, "evidence": List[str]}} + ], + "reasoning": str, # Explanation of research approach and logic + "evidence": List[str], # Supporting citations/sources + "confidence": float, # 0-1 confidence score + "metadata": {{ # Additional research metadata + "domain": str, + "approach": str, + "limitations": List[str] + }} + }} + +4. Domain-Specific Research Strategies: + - Biology: Focus on mechanisms, evidence from studies + - Business: Market analysis, competitive factors, financial data + - Computer Science: Technical analysis, algorithmic reasoning + - General: Structured, logical approach + +5. 
Implementation Requirements: + - Use logging for tracking research process + - Handle edge cases and missing information gracefully + - Support different research strategies per domain + - Include reasoning traces for explainability + - Modular, extensible architecture + +Generate ONLY the Python code without markdown blocks or explanations.""" + + try: + logger.info("Calling LLM to generate research agent code...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self._clean_llm_code_response(response) + self._validate_python_syntax(code, "agent.py") + + logger.info(f"Generated research agent code ({len(code)} chars)") + return code + + except Exception as e: + logger.error(f"LLM research agent generation failed: {e}") + logger.warning("Using fallback research agent template") + return self._get_fallback_research_agent(task_info) + + + + # def generate_train_script(self, context): + # """ + # Generate enhanced training/execution script + # """ + # hypothesis = context.get('hypothesis', 'Improve agentic system performance') + # task_id = context.get('task_id', 'unknown') + # task_domain = context.get('task_domain', 'general') + # evaluation_criteria = context.get('evaluation_criteria', {}) + + # enable_parallel = 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower() + # enable_optimization = 'optimization' in hypothesis.lower() or 'optimize' in hypothesis.lower() + # max_workers = 8 if enable_parallel else 4 + # task_timeout = 60 if enable_optimization else 30 + + # return f'''""" + # CoSTEER-Enhanced Training/Execution Script for Agentic System + # Task ID: {task_id} + # Domain: {task_domain} + # Hypothesis: {hypothesis} + + # This script evaluates outputs according to DeepResearch Bench standards: + # - Comprehensiveness (0-10): Coverage and depth + # - Insight (0-10): Causal reasoning and originality + # - Instruction Following (0-10): Task compliance + # - Readability (0-10): Clarity and presentation + # """ + # import json + # import sys + # import time + # import traceback + # from pathlib import Path + # from typing import Dict, List, Any, Optional + # from dataclasses import dataclass, asdict + # from agent import AgenticSystem + + # @dataclass + # class EvaluationScore: + # """DeepResearch Bench evaluation score""" + # comprehensiveness: float = 0.0 # 0-10 + # insight: float = 0.0 # 0-10 + # instruction_following: float = 0.0 # 0-10 + # readability: float = 0.0 # 0-10 + # overall: float = 0.0 # Weighted average + + # # Dimension weights (customizable per task) + # weights: Dict[str, float] = None + + # def __post_init__(self): + # if self.weights is None: + # # Default equal weights + # self.weights = {{ + # 'comprehensiveness': 0.25, + # 'insight': 0.25, + # 'instruction_following': 0.25, + # 'readability': 0.25 + # }} + + # def calculate_overall(self) -> float: + # """Calculate weighted overall score""" + # self.overall = ( + # self.comprehensiveness * self.weights['comprehensiveness'] + + # self.insight * self.weights['insight'] + + # self.instruction_following * self.weights['instruction_following'] + + # self.readability * self.weights['readability'] + # ) + # return self.overall + + # def to_dict(self) -> Dict[str, Any]: + # """Convert to dictionary""" + # return {{ + # 'comprehensiveness': round(self.comprehensiveness, 2), + # 'insight': round(self.insight, 2), + # 'instruction_following': round(self.instruction_following, 2), + # 
'readability': round(self.readability, 2), + # 'overall': round(self.overall, 2), + # 'weights': self.weights + # }} + + # class ResearchOutputEvaluator: + # """Evaluate research outputs according to DeepResearch Bench standards""" + + # def __init__(self, task_domain: str = 'general'): + # self.task_domain = task_domain + # self.evaluation_log = [] + + # def evaluate_comprehensiveness(self, output: Dict[str, Any], task_requirements: Dict) -> float: + # """ + # Evaluate comprehensiveness (0-10) + # - Breadth and depth of content + # - Coverage of required subtopics + # - Evidence and data sources + # - Multiple perspectives + # """ + # score = 0.0 + # checks = [] + + # # Check 1: Coverage of key topics (0-3 points) + # required_topics = task_requirements.get('required_topics', []) + # if required_topics: + # covered = sum(1 for topic in required_topics + # if self._check_topic_coverage(output, topic)) + # coverage_score = min(3.0, (covered / len(required_topics)) * 3.0) + # score += coverage_score + # checks.append(f"Topic coverage: {{covered}}/{{len(required_topics)}} ({{coverage_score:.1f}}/3.0)") + # else: + # score += 2.0 # Default if no specific requirements + # checks.append("No specific topic requirements (default 2.0/3.0)") + + # # Check 2: Depth of analysis (0-3 points) + # depth_indicators = [ + # 'detailed analysis' in str(output).lower(), + # 'data' in output or 'evidence' in output, + # len(str(output)) > 500, # Substantial content + # 'methodology' in str(output).lower() or 'approach' in str(output).lower() + # ] + # depth_score = sum(depth_indicators) * 0.75 + # score += depth_score + # checks.append(f"Depth indicators: {{sum(depth_indicators)}}/4 ({{depth_score:.1f}}/3.0)") + + # # Check 3: Evidence and sources (0-2 points) + # evidence_score = 0.0 + # if 'references' in output or 'sources' in output: + # evidence_score += 1.0 + # if 'data' in output or 'statistics' in output: + # evidence_score += 1.0 + # score += evidence_score + # checks.append(f"Evidence & sources: {{evidence_score:.1f}}/2.0") + + # # Check 4: Multiple perspectives (0-2 points) + # perspective_keywords = ['advantage', 'disadvantage', 'trade-off', 'alternative', + # 'limitation', 'consideration'] + # perspectives_found = sum(1 for kw in perspective_keywords + # if kw in str(output).lower()) + # perspective_score = min(2.0, perspectives_found * 0.5) + # score += perspective_score + # checks.append(f"Multiple perspectives: {{perspectives_found}} keywords ({{perspective_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'comprehensiveness', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def evaluate_insight(self, output: Dict[str, Any], task_context: Dict) -> float: + # """ + # Evaluate insight (0-10) + # - Causal reasoning and why-think + # - Quantified analysis + # - Non-obvious implications + # - Novel synthesis + # """ + # score = 0.0 + # checks = [] + + # # Check 1: Causal reasoning (0-3 points) + # causal_indicators = [ + # 'because' in str(output).lower(), + # 'therefore' in str(output).lower(), + # 'as a result' in str(output).lower(), + # 'leads to' in str(output).lower(), + # 'causes' in str(output).lower(), + # 'impacts' in str(output).lower() + # ] + # causal_score = min(3.0, sum(causal_indicators) * 0.6) + # score += causal_score + # checks.append(f"Causal reasoning: {{sum(causal_indicators)}} indicators ({{causal_score:.1f}}/3.0)") + + # # Check 2: Quantified analysis (0-2 points) + # has_numbers = any(char.isdigit() for char in 
str(output)) + # has_metrics = any(word in str(output).lower() + # for word in ['percent', 'rate', 'ratio', 'metric', 'measure']) + # quant_score = (1.0 if has_numbers else 0) + (1.0 if has_metrics else 0) + # score += quant_score + # checks.append(f"Quantified analysis: numbers={{has_numbers}}, metrics={{has_metrics}} ({{quant_score:.1f}}/2.0)") + + # # Check 3: Non-obvious implications (0-3 points) + # insight_keywords = ['implication', 'insight', 'suggests', 'indicates', + # 'reveals', 'unexpected', 'surprisingly', 'notable'] + # insights_found = sum(1 for kw in insight_keywords if kw in str(output).lower()) + # implication_score = min(3.0, insights_found * 0.5) + # score += implication_score + # checks.append(f"Implications: {{insights_found}} keywords ({{implication_score:.1f}}/3.0)") + + # # Check 4: Novel synthesis (0-2 points) + # synthesis_indicators = [ + # 'framework' in str(output).lower(), + # 'model' in str(output).lower(), + # 'synthesis' in str(output).lower(), + # 'integration' in str(output).lower() + # ] + # synthesis_score = min(2.0, sum(synthesis_indicators) * 0.7) + # score += synthesis_score + # checks.append(f"Novel synthesis: {{sum(synthesis_indicators)}} indicators ({{synthesis_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'insight', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def evaluate_instruction_following(self, output: Dict[str, Any], + # task_requirements: Dict) -> float: + # """ + # Evaluate instruction following (0-10) + # - Answers all sub-questions + # - Respects scope and constraints + # - Required deliverables present + # - Avoids out-of-scope content + # """ + # score = 0.0 + # checks = [] + + # # Check 1: All required sections present (0-4 points) + # required_sections = task_requirements.get('required_sections', []) + # if required_sections: + # present = sum(1 for section in required_sections + # if self._check_section_present(output, section)) + # section_score = min(4.0, (present / len(required_sections)) * 4.0) + # score += section_score + # checks.append(f"Required sections: {{present}}/{{len(required_sections)}} ({{section_score:.1f}}/4.0)") + # else: + # score += 3.0 # Default if no specific requirements + # checks.append("No specific section requirements (default 3.0/4.0)") + + # # Check 2: Scope compliance (0-3 points) + # scope_violations = self._check_scope_violations(output, task_requirements) + # scope_score = max(0.0, 3.0 - len(scope_violations) * 0.5) + # score += scope_score + # if scope_violations: + # checks.append(f"Scope violations: {{len(scope_violations)}} ({{scope_score:.1f}}/3.0)") + # else: + # checks.append("No scope violations (3.0/3.0)") + + # # Check 3: Format compliance (0-2 points) + # format_requirements = task_requirements.get('format', {{}}) + # format_score = 2.0 # Default + # if format_requirements: + # format_checks = [ + # self._check_format_requirement(output, req, val) + # for req, val in format_requirements.items() + # ] + # format_score = min(2.0, sum(format_checks) * 0.5) + # score += format_score + # checks.append(f"Format compliance: ({{format_score:.1f}}/2.0)") + + # # Check 4: Completeness (0-1 point) + # completeness_score = 1.0 if len(str(output)) > 200 else 0.5 + # score += completeness_score + # checks.append(f"Completeness: ({{completeness_score:.1f}}/1.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'instruction_following', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def 
evaluate_readability(self, output: Dict[str, Any]) -> float: + # """ + # Evaluate readability (0-10) + # - Clear structure and organization + # - Fluent language + # - Effective data presentation + # - Proper formatting + # """ + # score = 0.0 + # checks = [] + + # output_str = str(output) + + # # Check 1: Structure and organization (0-3 points) + # structure_indicators = [ + # '\\n' in output_str, # Line breaks + # any(word in output_str for word in ['Summary', 'Introduction', 'Conclusion']), + # len(output_str.split('\\n')) > 5, # Multiple paragraphs + # ] + # structure_score = min(3.0, sum(structure_indicators) * 1.0) + # score += structure_score + # checks.append(f"Structure: {{sum(structure_indicators)}} indicators ({{structure_score:.1f}}/3.0)") + + # # Check 2: Language quality (0-3 points) + # # Simple heuristics for language quality + # avg_word_length = sum(len(word) for word in output_str.split()) / max(len(output_str.split()), 1) + # has_variety = len(set(output_str.lower().split())) / max(len(output_str.split()), 1) > 0.5 + + # language_score = 0.0 + # if 4 < avg_word_length < 7: # Reasonable word length + # language_score += 1.5 + # if has_variety: # Vocabulary variety + # language_score += 1.5 + + # score += language_score + # checks.append(f"Language quality: avg_word_len={{avg_word_length:.1f}}, variety={{has_variety}} ({{language_score:.1f}}/3.0)") + + # # Check 3: Data presentation (0-2 points) + # has_formatting = any(marker in output_str for marker in ['|', ':', '-', '*']) + # has_lists = output_str.count('\\n') > 3 + # presentation_score = (1.0 if has_formatting else 0) + (1.0 if has_lists else 0) + # score += presentation_score + # checks.append(f"Data presentation: formatting={{has_formatting}}, lists={{has_lists}} ({{presentation_score:.1f}}/2.0)") + + # # Check 4: Clarity (0-2 points) + # clarity_score = 2.0 + # # Penalize if too short or too verbose + # if len(output_str) < 100: + # clarity_score = 0.5 + # elif len(output_str) > 5000: + # clarity_score = 1.5 + + # score += clarity_score + # checks.append(f"Clarity: length={{len(output_str)}} chars ({{clarity_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'readability', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def _check_topic_coverage(self, output: Dict, topic: str) -> bool: + # """Check if topic is covered in output""" + # return topic.lower() in str(output).lower() + + # def _check_section_present(self, output: Dict, section: str) -> bool: + # """Check if required section is present""" + # return section.lower() in str(output).lower() + + # def _check_scope_violations(self, output: Dict, requirements: Dict) -> List[str]: + # """Check for scope violations""" + # violations = [] + # # Add specific violation checks based on requirements + # return violations + + # def _check_format_requirement(self, output: Dict, requirement: str, value: Any) -> bool: + # """Check specific format requirement""" + # # Implement format checking logic + # return True + + # def evaluate_all(self, output: Dict[str, Any], + # task_requirements: Dict, + # task_context: Dict, + # dimension_weights: Optional[Dict[str, float]] = None) -> EvaluationScore: + # """Evaluate all dimensions and calculate overall score""" + + # score = EvaluationScore(weights=dimension_weights) + + # score.comprehensiveness = self.evaluate_comprehensiveness(output, task_requirements) + # score.insight = self.evaluate_insight(output, task_context) + # score.instruction_following = 
self.evaluate_instruction_following(output, task_requirements) + # score.readability = self.evaluate_readability(output) + # score.calculate_overall() + + # return score + + # def main(): + # """Main execution function with DeepResearch Bench evaluation""" + # try: + # print("=" * 60) + # print("CoSTEER Agentic System Execution Started") + # print("Task ID: {task_id}") + # print("Domain: {task_domain}") + # print("=" * 60) + + # execution_start = time.time() + + # # Initialize agent with configuration + # config = {{ + # 'max_workers': {max_workers}, + # 'enable_parallel': {enable_parallel}, + # 'enable_optimization': {enable_optimization}, + # 'task_timeout': {task_timeout} + # }} + + # print(f"Configuration: {{json.dumps(config, indent=2)}}") + # agent = AgenticSystem(config) + # print(f"Initialized: {{agent.name}}") + + # # Run tasks and collect results + # print("\\nExecuting tasks...") + # results = agent.run_tasks() + + # # Prepare task requirements for evaluation + # task_requirements = {{ + # 'required_topics': ['task execution', 'performance metrics'], + # 'required_sections': ['results', 'metrics'], + # 'format': {{'type': 'json'}} + # }} + + # task_context = {{ + # 'domain': '{task_domain}', + # 'hypothesis': '{hypothesis}' + # }} + + # # Evaluate using DeepResearch Bench standards + # print("\\nEvaluating results...") + # evaluator = ResearchOutputEvaluator(task_domain='{task_domain}') + + # evaluation_score = evaluator.evaluate_all( + # output=results, + # task_requirements=task_requirements, + # task_context=task_context, + # dimension_weights={evaluation_criteria} if {evaluation_criteria} else None + # ) + + # # Prepare detailed results + # execution_time = time.time() - execution_start + + # detailed_results = {{ + # 'task_info': {{ + # 'task_id': '{task_id}', + # 'domain': '{task_domain}', + # 'hypothesis': '{hypothesis}' + # }}, + # 'execution_results': results, + # 'deepresearch_evaluation': evaluation_score.to_dict(), + # 'evaluation_log': evaluator.evaluation_log, + # 'system_status': agent.get_system_status(), + # 'execution_time': execution_time, + # 'timestamp': time.time() + # }} + + # # Save detailed results to file + # result_file = Path("result.json") + # result_file.write_text(json.dumps(detailed_results, indent=2)) + + # # Print structured output + # print("\\n" + "=" * 60) + # print("EXECUTION RESULTS") + # print("=" * 60) + # print(f"Success Rate: {{results.get('success_rate', 0):.2%}}") + # print(f"Average Task Time: {{results.get('avg_time', 0):.4f}}s") + # print(f"Error Count: {{results.get('error_count', 0)}}") + # print(f"Total Tasks: {{results.get('total_tasks', 0)}}") + # print(f"Total Execution Time: {{execution_time:.2f}}s") + + # print("\\n" + "=" * 60) + # print("DEEPRESEARCH BENCH EVALUATION") + # print("=" * 60) + # print(f"Comprehensiveness: {{evaluation_score.comprehensiveness:.2f}}/10.0") + # print(f"Insight: {{evaluation_score.insight:.2f}}/10.0") + # print(f"Instruction Following: {{evaluation_score.instruction_following:.2f}}/10.0") + # print(f"Readability: {{evaluation_score.readability:.2f}}/10.0") + # print(f"{{'-' * 60}}") + # print(f"Overall Score: {{evaluation_score.overall:.2f}}/10.0") + # print("=" * 60) + + # # Print evaluation details + # print("\\nEvaluation Details:") + # for log_entry in evaluator.evaluation_log: + # print(f"\\n{{log_entry['dimension'].upper()}}:") + # for check in log_entry['checks']: + # print(f" - {{check}}") + + # # JSON output for automated parsing + # print("\\n" + "=" * 60) + # 
print("JSON_RESULTS_START") + # print(json.dumps(detailed_results, indent=2)) + # print("JSON_RESULTS_END") + # print("=" * 60) + + # return 0 + + # except Exception as e: + # print(f"\\nERROR: Execution failed - {{str(e)}}", file=sys.stderr) + # print("\\nError Details:") + # traceback.print_exc() + + # error_result = {{ + # 'task_info': {{ + # 'task_id': '{task_id}', + # 'domain': '{task_domain}' + # }}, + # 'execution_results': {{ + # "success_rate": 0.0, + # "avg_time": float('inf'), + # "error_count": 1, + # "total_tasks": 0 + # }}, + # 'deepresearch_evaluation': {{ + # 'comprehensiveness': 0.0, + # 'insight': 0.0, + # 'instruction_following': 0.0, + # 'readability': 0.0, + # 'overall': 0.0 + # }}, + # "error_reason": str(e), + # "traceback": traceback.format_exc() + # }} + + # # Save error result + # try: + # error_file = Path("error_result.json") + # error_file.write_text(json.dumps(error_result, indent=2)) + # except: + # pass + + # return 1 + + # if __name__ == "__main__": + # exit_code = main() + # sys.exit(exit_code) + # ''' + +# def generate_train_script(self,context): +# """ +# Generate training script using LLM +# """ +# hypothesis = context.get('hypothesis', 'Improve agentic system performance') +# task_id = context.get('task_id', 'unknown') +# task_domain = context.get('task_domain','general') +# system_prompt = """You are an expert in creating experiment training scripts. Generate clean, executable +# python code that test AgenticSystem.""" +# user_prompt = f"""Generate a training script (train.py) for an agentic system experiment. +# **Context:** +# - Task ID: {task_id} +# - Domain: {task_domain} +# - Hypothesis: {hypothesis} + +# **Requirements:** +# 1. Import AgenticSystem from agent.py +# 2. Run tasks and collect results +# 3. Print results between JSON_RESULTS_START and JSON_RESULTS_END markers +# 4. Include DeepResearch Bench evaluation scores + +# Generate ONLY the Python code without markdown blocks.""" +# try: +# logger.info("Calling LLM to generate train script") +# response = self.api_backend.build_messages_and_create_chat_completion( +# user_prompt = user_prompt, +# system_prompt = system_prompt, +# json_mode = False +# ) +# code = self.clean_llm_response(response) +# self.validate_python_syntax(code, "train.py") +# logger.info(f"Successfully generated train script using LLM") +# return code +# except Exception as e: +# logger.error(f"LLM train script generation failed: {str(e)}") +# logger.warning("Falling back to template-based generation") +# return self.get_fallback_train_code(hypothesis, task_id) + + + def generate_execution_script(self, task_info): + """ + generate training script using LLM + """ + system_prompt = """You are an expert in creating research execution and evaluation scripts. +Generate Python code that runs a research agent and evaluates outputs using DeepResearch Bench standards.""" + user_prompt = f"""Generate a execution script (train.py) for an agentic system experiment task. +**Task**: +- Task ID: {task_info.get('task_id', 'unknown')} +- Domain: {task_info.get('task_domain', 'general')} +- Question: {task_info.get('question', 'N/A')} +- Competition: {task_info.get('competition', 'deepresearch')} + +**Script Requirements:** + +1. Import modules: + - from agent import ResearchAgent + - from evaluator import DeepResearchEvaluator + - Standard libraries: json, sys, time, pathlib + +2. 
Main execution flow: + a) Initialize ResearchAgent + b) Execute research on the question + c) Evaluate output using DeepResearchEvaluator + d) Save results to result.json + e) Print structured output + +3. DeepResearch Bench Evaluation: + Evaluate on 4 dimensions (0-10 each): + - Comprehensiveness: Coverage, depth, evidence, perspectives + - Insight: Causal reasoning, quantified analysis, non-obvious implications + - Instruction Following: Completeness, scope compliance + - Readability: Structure, language quality, clarity + +4. Output Format (between JSON_RESULTS_START/END markers): + {{ + "task_info": {{ + "task_id": str, + "domain": str, + "question": str + }}, + "research_output": {{ + "answer": str, + "sub_answers": List[Dict], + "reasoning": str, + "evidence": List[str], + "confidence": float + }}, + "evaluation_scores": {{ + "comprehensiveness": float, # 0-10 + "insight": float, # 0-10 + "instruction_following": float, # 0-10 + "readability": float, # 0-10 + "overall": float # Weighted average + }}, + "execution_time": float + }} + +5. Error Handling: + - Wrap in try-except + - Log errors to stderr + - Return appropriate exit codes + - Save error details to error_result.json + +Generate ONLY the Python code without markdown blocks.""" + + try: + logger.info("Calling LLM to generate execution script...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self.clean_llm_code_response(response) + self.validate_python_syntax(code, "train.py") + + logger.info("Generated research execution script") + return code + + except Exception as e: + logger.error(f"LLM execution script generation failed: {e}") + return self.get_fallback_execution_script(task_info) + + def generate_deepresearch_evaluator_code(self, task_info: Dict) -> str: + """Generate DeepResearch Bench evaluator""" + + system_prompt = """You are an expert in research quality evaluation. +Generate Python code implementing the DeepResearch Bench evaluation framework with detailed scoring rubrics.""" + + user_prompt = f"""Generate evaluator module (evaluator.py) for DeepResearch Bench. + +**Evaluator Requirements:** + +1. Class: DeepResearchEvaluator with methods: + - evaluate_comprehensiveness(output, requirements) -> float (0-10) + - evaluate_insight(output, context) -> float (0-10) + - evaluate_instruction_following(output, task) -> float (0-10) + - evaluate_readability(output) -> float (0-10) + - evaluate(output, task_requirements, task_context) -> EvaluationResult + +2. 
Scoring Rubrics: + + **Comprehensiveness (0-10):** + - Coverage of sub-topics (0-3): All required topics addressed + - Depth of analysis (0-3): Detailed, not superficial + - Evidence and sources (0-2): Citations, data, references + - Multiple perspectives (0-2): Diverse viewpoints + + **Insight (0-10):** + - Causal reasoning (0-3): Why-think, cause-effect relationships + - Quantified analysis (0-2): Numbers, metrics, measurements + - Non-obvious implications (0-3): Insights beyond surface level + - Novel synthesis (0-2): Original frameworks or connections + + **Instruction Following (0-10):** + - All sub-questions answered (0-4): Completeness check + - Scope compliance (0-3): Stays within bounds + - Format requirements (0-2): Structure, deliverables + - Completeness (0-1): Nothing major missing + + **Readability (0-10):** + - Clear structure (0-3): Organized, logical flow + - Language quality (0-3): Fluent, precise + - Data presentation (0-2): Tables, lists, formatting + - Clarity (0-2): Easy to understand + +3. EvaluationResult class: + - Scores for each dimension (float) + - Overall weighted score (float) + - Detailed feedback per criterion (List[str]) + - Suggestions for improvement (List[str]) + +4. Normalization (if reference provided): + - Scale relative to reference performance + - Adjust thresholds based on task difficulty + +Generate ONLY the Python code without markdown blocks.""" + + try: + logger.info("Calling LLM to generate evaluator...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self._clean_llm_code_response(response) + self._validate_python_syntax(code, "evaluator.py") + + logger.info("Generated DeepResearch evaluator") + return code + + except Exception as e: + logger.error(f"LLM evaluator generation failed: {e}") + return self._get_fallback_evaluator(task_info) + + def generate_research_requirements(self, task_info: Dict) -> str: + """Generate requirements for research agent""" + requirements = [ + "# DeepResearch Bench Requirements", + "", + "# No external dependencies required", + "# Use Python standard library only:", + "# - json: JSON serialization", + "# - logging: Logging research process", + "# - time: Timing measurements", + "# - typing: Type hints", + "# - dataclasses: Data structures", + "# - pathlib: File operations", + "" + ] + + # Optional: domain-specific suggestions (commented out) + domain = task_info['domain'] + if domain in ['biology', 'medicine']: + requirements.append("# Optional (if needed): biopython") + elif domain == 'data_analysis': + requirements.append("# Optional (if needed): numpy, pandas") + + return "\n".join(requirements) + + def parse_research_output(self, stdout: str) -> Dict: + """Parse DeepResearch Bench output""" + try: + import json + import re + + # Look for JSON_RESULTS block + pattern = r'JSON_RESULTS_START\s*(.*?)\s*JSON_RESULTS_END' + match = re.search(pattern, stdout, re.DOTALL) + + if match: + json_str = match.group(1) + result = json.loads(json_str) + + if self._validate_research_result_format(result): + logger.info("Successfully parsed DeepResearch output") + return result + + logger.warning("Could not parse DeepResearch output") + return self._create_default_research_result() + + except Exception as e: + logger.error(f"Failed to parse research output: {e}") + return self._create_default_research_result() + + def _validate_research_result_format(self, result: Dict) -> bool: + """Validate research result 
format""" + required_fields = ['task_info', 'research_output', 'evaluation_scores'] + + for field in required_fields: + if field not in result: + logger.warning(f"Missing field: {field}") + return False + + # Validate evaluation scores + eval_scores = result['evaluation_scores'] + required_scores = ['comprehensiveness', 'insight', 'instruction_following', + 'readability', 'overall'] + + for score_name in required_scores: + if score_name not in eval_scores: + logger.warning(f"Missing score: {score_name}") + return False + score = eval_scores[score_name] + if not isinstance(score, (int, float)) or not (0 <= score <= 10): + logger.warning(f"Invalid score for {score_name}: {score}") + return False + + return True + + def _create_default_research_result(self) -> Dict: + """Create default research result""" + return { + "task_info": {"task_id": "unknown", "domain": "unknown"}, + "research_output": { + "answer": "Research execution failed", + "sub_answers": [], + "reasoning": "No output generated", + "evidence": [], + "confidence": 0.0 + }, + "evaluation_scores": { + "comprehensiveness": 0.0, + "insight": 0.0, + "instruction_following": 0.0, + "readability": 0.0, + "overall": 0.0 + }, + "execution_time": 0.0 + } + + # Helper methods + def _clean_llm_code_response(self, response: str) -> str: + """Clean LLM response""" + import re + code = re.sub(r'^```python\s*\n', '', response, flags=re.MULTILINE) + code = re.sub(r'^```\s*\n', '', code, flags=re.MULTILINE) + code = re.sub(r'\n```\s*$', '', code, flags=re.MULTILINE) + return code.strip() + + def _validate_python_syntax(self, code: str, filename: str): + """Validate Python syntax""" + try: + compile(code, filename, 'exec') + logger.info(f"Syntax validation passed for {filename}") + except SyntaxError as e: + raise ValueError(f"Syntax error in {filename}: {e}") + + def _get_fallback_research_agent(self, task_info: Dict) -> str: + """Fallback research agent""" + return f'''""" +Research Agent for DeepResearch Bench (Fallback Template) +Domain: {task_info['domain']} +""" +import logging +from typing import Dict, List + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("ResearchAgent") + +class ResearchAgent: + def __init__(self, config: Dict = None): + self.config = config or {{}} + self.domain = "{task_info['domain']}" + logger.info(f"Initialized ResearchAgent for domain: {{self.domain}}") + + def research(self, question: str) -> Dict: + """Conduct research on question""" + logger.info(f"Starting research: {{question[:100]}}...") + + # Decompose + sub_questions = self.decompose_question(question) + logger.info(f"Decomposed into {{len(sub_questions)}} sub-questions") + + # Gather & analyze + sub_answers = [] + for sub_q in sub_questions: + info = self.gather_information(sub_q) + analysis = self.analyze_information([info]) + sub_answers.append({{ + "question": sub_q, + "answer": analysis.get("summary", "Analysis unavailable"), + "evidence": analysis.get("evidence", []) + }}) + + # Synthesize + final_answer = self.synthesize_answer(sub_answers) + + return {{ + "answer": final_answer, + "sub_answers": sub_answers, + "reasoning": "Multi-step research: decompose, gather, analyze, synthesize", + "evidence": [item for sa in sub_answers for item in sa.get("evidence", [])], + "confidence": 0.6, + "metadata": {{ + "domain": self.domain, + "approach": "structured_research", + "limitations": ["Limited information sources", "Basic analysis"] + }} + }} + + def decompose_question(self, question: str) -> List[str]: + """Break question into 
sub-questions""" + return [question] # Simplified: treat as single question + + def gather_information(self, sub_question: str) -> Dict: + """Gather information""" + return {{ + "question": sub_question, + "info": f"Simulated information for: {{sub_question[:50]}}...", + "sources": ["simulated_source_1"] + }} + + def analyze_information(self, info_list: List[Dict]) -> Dict: + """Analyze information""" + return {{ + "summary": f"Analysis of {{len(info_list)}} information pieces", + "evidence": [item.get("sources", ["unknown"])[0] for item in info_list] + }} + + def synthesize_answer(self, analyses: List[Dict]) -> str: + """Synthesize final answer""" + parts = [a.get("answer", "") for a in analyses] + return f"Synthesized answer based on {{len(parts)}} analyses: " + " ".join(parts[:3]) +''' + + def _get_fallback_execution_script(self, task_info: Dict) -> str: + """Fallback execution script""" + return f'''""" +Execution Script for DeepResearch Bench (Fallback) +""" +import json +import sys +import time +from pathlib import Path + +try: + from agent import ResearchAgent + from evaluator import DeepResearchEvaluator +except ImportError as e: + print(f"Import error: {{e}}", file=sys.stderr) + sys.exit(1) + +def main(): + try: + start_time = time.time() + + # Research + agent = ResearchAgent() + question = """{task_info['question']}""" + research_output = agent.research(question) + + # Evaluate + evaluator = DeepResearchEvaluator() + task_requirements = {{"required_sections": ["answer", "reasoning"]}} + task_context = {{"domain": "{task_info['domain']}"}} + + evaluation = evaluator.evaluate( + research_output, + task_requirements, + task_context + ) + + # Prepare results + results = {{ + "task_info": {{ + "task_id": "{task_info['task_id']}", + "domain": "{task_info['domain']}", + "question": question + }}, + "research_output": research_output, + "evaluation_scores": evaluation.to_dict(), + "execution_time": time.time() - start_time + }} + + # Save + Path("result.json").write_text(json.dumps(results, indent=2)) + + # Output + print("\\nJSON_RESULTS_START") + print(json.dumps(results, indent=2)) + print("JSON_RESULTS_END") + + return 0 + + except Exception as e: + print(f"Error: {{e}}", file=sys.stderr) + return 1 + +if __name__ == "__main__": + sys.exit(main()) +''' + + + + + + def needs_config_file(self, context): + """ + Determine if a configuration file is needed + """ + hypothesis = context.get('hypothesis', '') + return any(keyword in hypothesis.lower() for keyword in ['config', 'parameter', 'setting', 'tune']) + + def generate_config_file(self, context): + """ + Generate configuration file + """ + #acquire hypothesis from context and tune config accordingly + hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + #decide default config values based on hypothesis + enable_parallel = 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower() + enable_optimization = 'optimization' in hypothesis.lower() or 'optimize' in hypothesis.lower() + max_workers = 8 if enable_parallel else 4 + task_timeout = 60 if enable_optimization else 30 + batch_size = 20 if enable_optimization else 10 + retry_attempts = 5 if enable_optimization else 3 + + return ''' + """ + CoSTEER Generated Configuration + """ + import os + from dataclasses import dataclass + from typing import Dict, Any + + @dataclass + class AgentSystemConfig: + """Configuration for agentic system""" + #Execution settings + max_workers: int = {max_workers} + task_timeout: float = {task_timeout} + 
enable_parallel: bool = {enable_parallel} + enable_optimization: bool = {enable_optimization} + + # Performance settings + retry_attempts: int = {retry_attempts} + batch_size: int = {batch_size} + + # Logging settings + log_level: str = "INFO" + enable_detailed_logging: bool = True + + @classmethod + def from_env(cls) -> 'AgenticSystemConfig': + """Create config from environment variables""" + return cls( + max_workers = int(os.getenv('AGENT_MAX_WORKERS', '{max_workers}')), + task_timeout = float(os.getenv('AGENT_TASK_TIMEOUT', '{task_timeout}')), + enable_parallel = os.getenv('AGENT_ENABLE_PARALLEL', '{str(enable_parallel).lower()}').lower() == 'true', + enable_optimization = os.getenv('AGENT_ENABLE_OPTIMIZATION', '{str(enable_optimization).lower()}').lower() == 'true', + retry_attempts = int(os.getenv('AGENT_RETRY_ATTEMPTS', '{retry_attempts}')), + batch_size = int(os.getenv('AGENT_BATCH_SIZE', '{batch_size}')), + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert config to dictionary""" + return {{ + 'max_workers': self.max_workers, + 'task_timeout': self.task_timeout, + 'enable_parallel': self.enable_parallel, + 'enable_optimization': self.enable_optimization, + 'retry_attempts': self.retry_attempts, + 'batch_size': self.batch_size, + 'log_level': self.log_level, + 'enable_detailed_logging': self.enable_detailed_logging + }} + + # Default configuration instance + DEFAULT_CONFIG = AgenticSystemConfig() + + #Example Usage + #config = AgenticSystemConfig.from_hypothesis("{hypothesis}") + #config = AgenticSystemConfig.from_env() + ''' + + def create_fallback_workspace(self, exp: Experiment) -> FBWorkspace: + """Create a fallback worksapce in case of errors""" + logger.warning("create fallback workspace due to previous errors") + try: + workspace = FBWorkspace() + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + exp_id = getattr(exp, 'id', 'unknown') + + # Create minimal working files + minimal_files = { + "agent.py": self.get_minimal_agent_code(hypothesis), + "train.py": self.get_minimal_train_code(hypothesis,exp_id), + "requirements.txt": "# Minimal requirements\\n", + "README.md": f"# Fallback Workspace\nExperiment :{exp_id}\n Hypothesis: {hypothesis}\nThis is a fallback workspace with minimal working code." 
+ } + + workspace.inject_files(**minimal_files) + logger.info(f"Created fallback workspace for experiment {exp_id}") + return workspace + + except Exception as e: + logger.error(f"Failed to create fallback workspace: {e}") + raise + + def get_minimal_agent_code(self,hypothesis): + """Get minimal working agent code""" + return f''' + class AgenticSystem: + def __init__(self): + self.name = "MinimalFallbackAgent" + self.hypothesis = "{hypothesis}" + def run_tasks(self): + return {{ + "success_rate": 0.5, + "avg_time": 0.01, + "error_count": 0, + "total_tasks": 1, + "note": "Fallback implementation" + }} + ''' + + def get_minimal_train_code(self,hypothesis, exp_id): + """Get minimal working train code""" + return f''' + import json + from pathlib import Path + from agent import AgenticSystem + def main(): + print("Running fallback Implementation") + print(f"Experiment: {exp_id}") + print(f"Hypothesis: {hypothesis}") + agent = AgenticSystem() + results = agent.run_tasks() + + #Save results + result_file = Path("result.json") + result_file.write_text(json.dumps(results, indent=2)) + + #Print results + print(f"Success Rate: {{results['success_rate']}}") + print(f"Average Time: {{results['avg_time']}}") + print(f"Error Count: {{results['error_count']}}") + print(f"Total Tasks: {{results['total_tasks']}}") + print("=== Fallback Execution Completed ===") + return 0 + if __name__ == "__main__": + exit_code = main() + import sys + sys.exit(exit_code) + ''' + + # # begin drafting + # # NOTE: + # # We should implement CoSTEER here to improve high quality coding ability + # # 1) generate code + # # prompting + # exp.experiment_workspace = FBWorkspace() + # # exp.experiment_workspace.inject_files(**{"": }) + + # # 2) run code + # # prepare environment. + # env = get_agent_sys_env( + # extra_volumes={exp.experiment_workspace.workspace_path: "/....."}, + # # ..... + # ) + + # env.run(entry="", ...) + + # # Please refer to the following code for details. 
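+ # # A minimal sketch (an assumption, mirroring the dev_new.py flow later in this diff) of how
+ # # the drafted steps above could be wired together; `agent_code` / `train_code` are placeholder
+ # # strings produced by the prompting step, not existing variables:
+ # exp.experiment_workspace = FBWorkspace()
+ # exp.experiment_workspace.inject_files(**{"agent.py": agent_code, "train.py": train_code})
+ # env = get_agent_sys_env(running_timeout_period=600, enable_cache=True)
+ # run_res = exp.experiment_workspace.run(env=env, entry="python train.py")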
+ # # [[rdagent/components/coder/data_science/conf.py:41]] + + + # # end drafting + # try: + # #acquire workspace + # ws_path = self.get_workspace_path(exp) + # #create workspace directory + # ws_path.mkdir(parents=True, exist_ok=True) + # #generate code + # self.generate_files(ws_path, exp) + # logger.info(f"Code generation as workspace at {ws_path}") + + # except Exception as e: + # logger.error(f"Code generation failed: {str(e)}") + # exp.exception = e + + # return exp + + def get_workspace_path(self, exp: Experiment): + ''' + Get workspace path for the experiment + ''' + if hasattr(exp, 'experiment_workspace') and exp.experiment_workspace: + return Path(exp.experiment_workspace.workspace_path) + + base = Path("./workspace") + base.mkdir(exist_ok=True) + return base / f"exp_{exp.id}" + + def generate_files(self, ws_path, exp): + ''' + Generate necessary files for the agentic system experiment and write file to disk + ''' + # Dummy agent code + (ws_path / "agent.py").write_text( + self.get_agent_template(exp) + ) + + # train.py (execute entry point) + (ws_path / "train.py").write_text( + self.get_train_template() + ) + + #requirements.txt + (ws_path / "requirements.txt").write_text( + "#Add dependencies here\n" + ) + + def get_agent_template(self, exp): + "generate agent code template" + hypothesis = getattr(exp, 'hypothesis', 'Improve system performance') + return f''' + """ + Agentic System Implementation + Hypothesis: {hypothesis} + """ + import time + from typing import Dict, List, Any + + class AgenticSystem: + """Agentic System for task execution""" + + def __init__(self): + self.name = "AgenticSystem" + self.task_count = 0 + + def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]: + """Run a single task and return its result""" + start_time = time.time() + try: + task_id = task.get('id', self.task_count) + self.task_count += 1 + result = {{ + "task_id": task_id, + "success": True, + "time": time.time() - start_time, + "error": None + }} + except Exception as e: + result = {{ + "task_id": task_id, + "success": False, + "time": time.time() - start_time, + "error": str(e) + }} + return result + def run_tasks(self, tasks: List[Dict] = None): + """Run multiple tasks and collect results""" + if tasks is None: + tasks = [ + {{"id": i, "type": "test", "data": f"sample{{i}}"}} + for i in range(10) + ] + + results = [] + for task in tasks: + results.append(self.run_task(task)) + + # Calculate metrics + success_count = sum(1 for r in results if r["success"]) + total_time = sum(r["time"] for r in results) + error_count = sum(1 for r in results if r["error"]) + + return {{ + "success_rate": success_count / len(results) if results else 0, + "avg_time": total_time / len(results) if results else 0, + "error_count": error_count, + "total_tasks": len(results) + }} + ''' + + def get_train_template(self): + """generate execution template""" + return '''""" + Training/Execution script for Agentic System, this is the entry point + that will be executed by the runner. 
+ """ + import json + import sys + from pathlib import Path + from agent import AgenticSystem + + def main(): + """Main execution function""" + try: + print("Starting Agentic System execution...") + # Initialize agent + agent = AgenticSystem() + # Run tasks + results = agent.run_tasks() + + # Save results to file (for backup parsing) + result_file = Path("result.json") + result_file.write_text(json.dumps(results, indent = 2)) + + #Print for logging + print("execution completed") + print(f"Success Rate: {results['success_rate']}") + print(f"Average Time: {results['avg_time']}") + print(f"Error Count: {results['error_count']}") + print(f"Total Tasks: {results['total_tasks']}") + + return 0 + + except Exception as e: + print(f"Execution failed: {str(e)}") + import traceback + traceback.print_exc() + return 1 + + if __name__ == "__main__": + main() + ''' + + +class AgenticSysRunner(Developer[Experiment]): + """execute code generated by AgenticSysCoder""" + + def __init__(self, scen): + self.scen = scen + + def develop(self, exp: Experiment) -> Experiment: + # TODO: implement the runner + """ + execute the experiment + steps: + 1. acquire workspace + 2. execute test.py + 3. parse output + 4. collect performance metrics + 5. record logs + """ + logger.info("Starting experiment execution") + # try: + # # acquire workspace + # ws_path = self.get_workspace_path(exp) + # logger.info(f"Using workspace at {ws_path}") + # # validate necessary files + # self.validate_workspace(ws_path) + # #execute experiment + # stdout, stderr = self.execute_experiment(ws_path) + # #parse result + # result = self.parse_execution_output(stdout, stderr) + # exp.result = result + # # record execution logs + # self._log_execution_results(exp, result) + # logger.info("Experiment completed successfully") + # except Exception as e: + # logger.error(f"Experiment execution failed: {str(e)}") + # exp.exception = e + # exp.result = self.create_error_result(str(e)) + # return exp + try: + if not self.has_valid_workspace(exp): + logger.info("Workspace is not ready, calling coder to generate code") + coder = AgenticSysCoder(self.scen) + exp = coder.develop(exp) + #check if coder succeeded + if not self.has_valid_workspace(exp): + raise RuntimeError("Coder failed to generate valid workspace") + #1. acquire workspace + ws_path = self.get_workspace_path(exp) + logger.info(f"Using workspace at {ws_path}") + + #2. validate necessary files + self.validate_workspace(ws_path) + + #3. execute experiment + stdout, stderr = self.execute_experiment(ws_path) + + #4. parse result + result = self.parse_execution_output(stdout, stderr) + exp.result = result + + #5. 
record execution logs + self.log_execution_results(exp, result) + logger.info("Experiment completed successfully") + + except Exception as e: + logger.error(f"Experiment execution failed: {str(e)}") + exp.exception = e + exp.result = self.create_error_result(str(e)) + return exp + + def has_valid_workspace(self, exp: Experiment): + """check if experiment has valid workspace with required files""" + try: + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + return False + ws_path = Path(exp.experiment_workspace.workspace_path) + if not ws_path.exists(): + return False + #check for required files + required_files = ["train.py", "agent.py"] + for file_name in required_files: + if not (ws_path / file_name).exists(): + return False + return True + except Exception as e: + logger.warning(f"Error checking workspace validity : {(e)}") + return False + + def get_workspace_path(self, exp): + ''' + Get workspace path for the experiment + ''' + if hasattr(exp, 'experiment_workspace') and exp.experiment_workspace: + return Path(exp.experiment_workspace.workspace_path) + # Default workspace path + base = Path("./workspace") + return base / f"exp_{exp.id}" + + def validate_workspace(self, ws_path: Path): + """Validate necessary files in the workspace""" + if not ws_path.exists(): + raise FileNotFoundError(f"Workspace path {ws_path} does not exist.") + + # examine necessary files + required_files = ["train.py", "agent.py"] + missing_files = [] + + for file_name in required_files: + file_path = ws_path / file_name + if not file_path.exists(): + missing_files.append(file_name) + + if missing_files: + raise FileNotFoundError(f"Missing required files in workspace {ws_path}: {', '.join(missing_files)}") + + logger.info("workspace validation passed: {ws_path}") + + def execute_experiment(self, ws_path: Path, timeout: int = 300): + """Execute the experiment by running train.py""" + cmd = [sys.executable, "train.py"] + # use environment variables if necessary + env = self.prepare_environment() + + logger.info(f"Executing: {' '.join(cmd)} in {ws_path}") + + try: + # pass in environment variables if necessary + result = subprocess.run( + cmd, + cwd=str(ws_path), + capture_output=True, + text=True, + timeout=timeout, + env=env + ) + + logger.info(f"Process completed with return code: {result.returncode}") + + if result.returncode != 0: + logger.warning(f"Process exited with non-zero code: {result.returncode}") + + return result.stdout, result.stderr + + except subprocess.TimeoutExpired as e: + logger.error(f"Execution timed out after {timeout} seconds") + raise RuntimeError(f"Execution timeout: {timeout}s") from e + + except Exception as e: + logger.error(f"Execution failed with exception: {str(e)}") + raise RuntimeError(f"Execution error: {str(e)}") from e + + def prepare_environment(self): + """Prepare execution environment""" + import os + env = os.environ.copy() + # Add any necessary environment variables here + if 'PYTHONPATH' in env: + env['PYTHONPATH'] = f"{os.getcwd()}:{env['PYTHONPATH']}" + else: + env['PYTHONPATH'] = os.getcwd() + return env + + def parse_execution_output(self, stdout: str, stderr: str): + """Parse execution output including DeepResearch Bench evaluation scores""" + try: + # Method 1: Look for JSON block with evaluation scores + result = self.parse_json_results(stdout) + if result: + return result + + # Method 2: Look up result file + result = self.parse_result_file() + if result: + return result + + # Method 3: Parse from stdout text + result = 
self.parse_text_output(stdout) + if result: + return result + + logger.warning("Could not parse execution output, using default result") + return self.create_default_result( + success=False, + reason="Could not parse output" + ) + + except Exception as e: + logger.error(f"Failed to parse output: {e}") + return self.create_error_result(f"Parsing error: {e}") + + def parse_json_results(self, stdout: str): + """Parse JSON results block from stdout""" + try: + import json + import re + + # Look for JSON_RESULTS block + json_pattern = r'JSON_RESULTS_START\s*(.*?)\s*JSON_RESULTS_END' + match = re.search(json_pattern, stdout, re.DOTALL) + + if match: + json_str = match.group(1) + result = json.loads(json_str) + + # Validate and extract both execution and evaluation results + if self.validate_deepresearch_result(result): + logger.info("Successfully parsed DeepResearch Bench results") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse JSON results: {e}") + return None + + def parse_structured_output(self, stdout:str): + """Parse structured JSON output """ + try: + import json + import re + # Look for JSON blocks in stdout + json_pattern = r'\{[^{}]*"success_rate"[^{}]*\}' + matches = re.findall(json_pattern, stdout, re.DOTALL) + + for match in matches: + try: + result = json.loads(match) + # Validate result format + if self.validate_result_format(result): + logger.info("Successfully parsed structured output") + return result + except json.JSONDecodeError: + continue + + return None + + except Exception as e: + logger.warning(f"Failed to parse structured output: {e}") + return None + + def validate_deepresearch_result(self, result): + """validate DeepResearch Bench result format""" + try: + # Check execution results + if 'execution_results' not in result: + return False + + exec_results = result['execution_results'] + required_exec_fields = ['success_rate', 'avg_time', 'error_count'] + for field in required_exec_fields: + if field not in exec_results: + return False + + # Check evaluation scores + if 'deepresearch_evaluation' not in result: + return False + + eval_scores = result['deepresearch_evaluation'] + required_eval_fields = ['comprehensiveness', 'insight', + 'instruction_following', 'readability', 'overall'] + for field in required_eval_fields: + if field not in eval_scores: + return False + # Validate score range + score = eval_scores[field] + if not isinstance(score, (int, float)) or not (0 <= score <= 10): + return False + + return True + + except Exception: + return False + + + + def parse_text_output(self, stdout: str): + """Parse text output using regex""" + try: + import re + + # Extract metrics using regex + success_match = re.search(r'Success Rate:\s*([0-9.]+)', stdout, re.IGNORECASE) + time_match = re.search(r'Average Time:\s*([0-9.]+)', stdout, re.IGNORECASE) + error_match = re.search(r'Error Count:\s*([0-9]+)', stdout, re.IGNORECASE) + task_match = re.search(r'Total Tasks:\s*([0-9]+)', stdout, re.IGNORECASE) + + if success_match: + result = { + "success_rate": float(success_match.group(1)), + "avg_time": float(time_match.group(1)) if time_match else 0.0, + "error_count": int(error_match.group(1)) if error_match else 0, + "total_tasks": int(task_match.group(1)) if task_match else 0 + } + logger.info("Successfully parsed text output") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse text output: {e}") + return None + + def parse_result_file(self): + """Parse result from JSON file""" + try: + 
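+ # Expected result.json shape (illustrative example): + # {"success_rate": 0.9, "avg_time": 0.12, "error_count": 1, "total_tasks": 10} + # validate_result_format() below rejects files where success_rate falls outside [0, 1] or + # avg_time / error_count are negative; in that case the caller moves on to the next parsing + # strategy and eventually to create_default_result().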
import json + + possible_paths = ["result.json", "output.json", "results.json"] + + for file_name in possible_paths: + file_path = Path(file_name) + if file_path.exists(): + content = file_path.read_text(encoding='utf-8') + result = json.loads(content) + + if self.validate_result_format(result): + logger.info(f"Successfully parsed result file: {file_path}") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse result file: {e}") + return None + + def validate_result_format(self, result: dict) -> bool: + """Validate result format""" + required_fields = ["success_rate", "avg_time", "error_count"] + + for field in required_fields: + if field not in result: + return False + if not isinstance(result[field], (int, float)): + return False + + # Check value ranges + if not (0.0 <= result["success_rate"] <= 1.0): + return False + if result["avg_time"] < 0: + return False + if result["error_count"] < 0: + return False + + return True + + def create_default_result(self, success: bool = False, reason: str = "") -> dict: + """Create default result""" + return { + "success_rate": 1.0 if success else 0.0, + "avg_time": 0.0 if success else float('inf'), + "error_count": 0 if success else 1, + "total_tasks": 0, + "error_reason": reason + } + + def create_error_result(self, error_message: str) -> dict: + """Create error result""" + return { + "success_rate": 0.0, + "avg_time": float('inf'), + "error_count": 1, + "total_tasks": 0, + "error_reason": error_message + } + + def log_execution_results(self, exp: Experiment, result: dict): + """Log execution results including DeepResearch Bench evaluation""" + logger.info("=" * 60) + logger.info("EXECUTION RESULTS") + logger.info("=" * 60) + + # Log execution metrics + exec_results = result.get('execution_results', result) + logger.info(f"Success Rate: {exec_results.get('success_rate', 0):.2%}") + logger.info(f"Average Time: {exec_results.get('avg_time', 0):.4f}s") + logger.info(f"Error Count: {exec_results.get('error_count', 0)}") + logger.info(f"Total Tasks: {exec_results.get('total_tasks', 0)}") + + # Log DeepResearch Bench evaluation if available + if 'deepresearch_evaluation' in result: + logger.info("=" * 60) + logger.info("DEEPRESEARCH BENCH EVALUATION") + logger.info("=" * 60) + + eval_scores = result['deepresearch_evaluation'] + logger.info(f"Comprehensiveness: {eval_scores.get('comprehensiveness', 0):.2f}/10.0") + logger.info(f"Insight: {eval_scores.get('insight', 0):.2f}/10.0") + logger.info(f"Instruction Following: {eval_scores.get('instruction_following', 0):.2f}/10.0") + logger.info(f"Readability: {eval_scores.get('readability', 0):.2f}/10.0") + logger.info(f"{'-' * 60}") + logger.info(f"Overall Score: {eval_scores.get('overall', 0):.2f}/10.0") + + # Log evaluation details if available + if 'evaluation_log' in result: + logger.info("\\nEvaluation Details:") + for log_entry in result['evaluation_log']: + logger.info(f" {log_entry['dimension'].upper()}: {log_entry['score']:.2f}/10.0") + for check in log_entry.get('checks', []): + logger.info(f" - {check}") + + if 'error_reason' in result: + logger.warning(f"Error: {result['error_reason']}") + + logger.info("=" * 60) + + + + + + + + + + + diff --git a/rdagent/scenarios/agentic_sys/dev_new.py b/rdagent/scenarios/agentic_sys/dev_new.py new file mode 100644 index 000000000..380246b99 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/dev_new.py @@ -0,0 +1,734 @@ +""" +Developer for Agentic System Scenario +Generates code for agentic system experiments with optional web 
search enhancement +""" + +from pathlib import Path +from typing import Dict, Any, List, Optional + +from rdagent.core.developer import Developer +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.agentic_sys.env import get_agent_sys_env +from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool + + +class AgenticSysCoder(Developer[Experiment]): + """ + Code generator for agentic system experiments + + Features: + - CoSTEER-based code generation + - Optional web search tool integration + - Lazy initialization of external tools + - Intelligent context enhancement + """ + + def __init__(self, scen): + """ + Initialize AgenticSysCoder + + Args: + scen: Scenario instance containing task description and configuration + """ + self.scen = scen + self.api_backend = APIBackend() + + # Lazy initialization for web search tool + self._web_search_tool = None + + logger.info("Initialized AgenticSysCoder with LLM backend") + + @property + def web_search_tool(self): + """ + Lazy load web search tool when needed + + Returns: + WebSearchTool instance or None if unavailable + """ + if self._web_search_tool is None: + try: + search_config_path = Path(__file__).parent / "tools" / "search_config.yaml" + if search_config_path.exists(): + self._web_search_tool = create_web_search_tool(search_config_path) + logger.info("✓ Web search tool initialized successfully") + else: + logger.warning(f"Search config not found: {search_config_path}") + self._web_search_tool = False + except Exception as e: + logger.warning(f"Failed to initialize web search tool: {e}") + self._web_search_tool = False # Mark as failed to avoid retry + + return self._web_search_tool if self._web_search_tool is not False else None + + def develop(self, exp: Experiment) -> Experiment: + """ + Generate code for the experiment + + Workflow: + 1. Initialize workspace + 2. Prepare base context + 3. Optionally enhance with web search (tool call) + 4. Generate code artifacts + 5. 
Inject files and run + + Args: + exp: Experiment instance + + Returns: + Experiment with generated code and results + """ + logger.info(f"Starting code generation for experiment: {getattr(exp, 'id', 'unknown')}") + + try: + # Step 1: Initialize workspace + exp.experiment_workspace = FBWorkspace() + ws_path = Path(exp.experiment_workspace.workspace_path) + ws_path.mkdir(parents=True, exist_ok=True) + logger.info(f"✓ Initialized workspace at {ws_path}") + + # Step 2: Prepare base context + context = self._prepare_base_context(exp) + logger.info("✓ Prepared base context") + + # Step 3: Optionally enhance with web search (TOOL CALL) + if self._should_use_web_search(exp): + logger.info("→ Calling web search tool for context enhancement...") + context = self._enhance_context_with_web_search(context, exp) + else: + logger.info("→ Skipping web search (not needed)") + + # Step 4: Generate code artifacts + logger.info("→ Generating code with CoSTEER framework...") + code_artifacts = self._generate_code_artifacts(exp, context) + + # Step 5: Inject files into workspace + exp.experiment_workspace.inject_files(**code_artifacts) + logger.info(f"✓ Injected {len(code_artifacts)} files into workspace") + + # Step 6: Prepare environment and run + timeout = self._calculate_timeout(exp) + env = get_agent_sys_env( + running_timeout_period=timeout, + enable_cache=True + ) + + logger.info(f"→ Running generated code (timeout: {timeout}s)...") + run_res = exp.experiment_workspace.run(env=env, entry="python train.py") + + # Store results + exp.run_returncode = getattr(run_res, 'returncode', None) + exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None)) + exp.run_stderr = getattr(run_res, 'stderr', None) + + if exp.run_returncode == 0: + logger.info("✓ Experiment execution succeeded") + else: + logger.warning(f"⚠ Experiment execution failed with return code: {exp.run_returncode}") + + except Exception as e: + logger.error(f"❌ Code generation failed: {str(e)}", exc_info=True) + exp.exception = e + + # Try to create fallback workspace + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + try: + exp.experiment_workspace = self._create_fallback_workspace(exp) + logger.info("Created fallback workspace") + except Exception as e_fallback: + logger.error(f"Failed to create fallback workspace: {e_fallback}") + + return exp + + def _prepare_base_context(self, exp: Experiment) -> Dict[str, Any]: + """ + Prepare base context without web search + + Args: + exp: Current experiment + + Returns: + Base context dictionary + """ + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + + context = { + 'hypothesis': hypothesis, + 'scenario_desc': self.scen.get_scenario_all_desc(), + 'success_criteria': getattr(self.scen, 'success_criteria', 'High performance'), + 'task_id': getattr(exp, 'id', 'unknown'), + 'task_domain': getattr(self.scen, 'domain', 'general'), + 'iteration_number': getattr(exp, 'iteration_number', 0), + 'external_sources': [], # Will be filled by web search if used + 'external_knowledge_summary': '' # Will be filled by web search if used + } + + return context + + def _should_use_web_search(self, exp: Experiment) -> bool: + """ + Determine if web search should be used for this experiment + + Decision criteria: + 1. Web search tool is available + 2. Not explicitly disabled by configuration + 3. Hypothesis complexity requires external knowledge + 4. Early iterations (< 3) benefit from external knowledge + 5. 
Previous experiments show low performance + + Args: + exp: Current experiment + + Returns: + True if web search should be used + """ + # Check if web search is globally disabled + if getattr(self.scen, 'disable_web_search', False): + logger.info("Web search disabled by scenario configuration") + return False + + # Check if tool is available + if self.web_search_tool is None: + logger.info("Web search tool not available") + return False + + # Check if search service is healthy + if not self.web_search_tool.client.health_check(): + logger.warning("Web search service is not healthy, skipping") + return False + + hypothesis = getattr(exp, 'hypothesis', '').lower() + + # Use web search for research-heavy hypotheses + research_indicators = [ + 'research', 'investigate', 'explore', 'analyze', 'study', + 'compare', 'evaluate', 'survey', 'benchmark', 'baseline', + 'novel', 'innovative', 'advanced', 'state-of-art', 'sota', + 'improve', 'optimize', 'enhance', 'boost' + ] + + if any(indicator in hypothesis for indicator in research_indicators): + logger.info(f"Research-heavy hypothesis detected: '{hypothesis[:50]}...'") + return True + + # Use web search for early iterations + iteration = getattr(exp, 'iteration_number', 0) + if iteration < 3: + logger.info(f"Early iteration ({iteration}/3), enabling web search") + return True + + # Use web search if previous performance was low + if hasattr(exp, 'previous_performance_low') and exp.previous_performance_low: + logger.info("Previous performance low, enabling web search for improvement") + return True + + # Default: don't use web search for efficiency + logger.info("Web search not needed (simple task or late iteration)") + return False + + def _enhance_context_with_web_search( + self, + context: Dict[str, Any], + exp: Experiment + ) -> Dict[str, Any]: + """ + Enhance context with web search results (TOOL CALL) + + This is the main entry point for web search tool integration. + + Args: + context: Base context to enhance + exp: Current experiment + + Returns: + Enhanced context with external sources + """ + try: + hypothesis = context['hypothesis'] + + # Step 1: Identify knowledge gaps + knowledge_gaps = self._identify_knowledge_gaps(exp, hypothesis) + logger.info(f"Identified {len(knowledge_gaps)} knowledge gaps: {knowledge_gaps}") + + # Step 2: Prepare search context + search_context = { + 'methodology': self._extract_methodology(hypothesis), + 'complexity': self._assess_complexity(hypothesis), + 'iteration': context.get('iteration_number', 0), + 'domain': context.get('task_domain', 'general') + } + logger.info(f"Search context: {search_context}") + + # Step 3: TOOL CALL - Search for hypothesis + logger.info(f"Calling web search tool with task: '{hypothesis[:80]}...'") + external_sources = self.web_search_tool.search_for_hypothesis( + task_description=hypothesis, + current_gaps=knowledge_gaps, + context=search_context + ) + + # Step 4: Enhance context with results + context['external_sources'] = external_sources + logger.info(f"✓ Retrieved {len(external_sources)} external sources") + + # Step 5: Add summary for easy consumption + if external_sources: + context['external_knowledge_summary'] = self._summarize_external_sources( + external_sources + ) + logger.info("✓ Generated external knowledge summary") + + # Log top sources + for idx, source in enumerate(external_sources[:3], 1): + logger.info( + f" {idx}. [{source['credibility_level']}] {source['title'][:60]}..." 
+ ) + else: + logger.warning("No external sources found") + + except Exception as e: + logger.error(f"Web search enhancement failed: {e}", exc_info=True) + # Don't fail the entire process, just skip enhancement + context['external_sources'] = [] + context['external_knowledge_summary'] = '' + + return context + + def _identify_knowledge_gaps(self, exp: Experiment, hypothesis: str) -> List[str]: + """ + Identify knowledge gaps from hypothesis + + Args: + exp: Current experiment + hypothesis: Hypothesis string + + Returns: + List of knowledge gap descriptions (max 5) + """ + gaps = [] + hypothesis_lower = hypothesis.lower() + + # Common agentic system knowledge areas + knowledge_areas = { + 'planning': ['plan', 'planning', 'strategy', 'approach', 'roadmap'], + 'reasoning': ['reason', 'reasoning', 'logic', 'inference', 'think', 'thought'], + 'learning': ['learn', 'learning', 'adapt', 'optimization', 'train'], + 'memory': ['memory', 'context', 'history', 'recall', 'cache'], + 'tool_use': ['tool', 'api', 'external', 'integration', 'function'], + 'evaluation': ['evaluate', 'assessment', 'metric', 'performance', 'measure'], + 'communication': ['communicate', 'language', 'dialogue', 'interaction', 'conversation'], + 'retrieval': ['retrieval', 'search', 'rag', 'knowledge base', 'database'], + 'generation': ['generate', 'generation', 'create', 'synthesize', 'produce'] + } + + # Identify relevant areas + for area, keywords in knowledge_areas.items(): + if any(kw in hypothesis_lower for kw in keywords): + gaps.append(f"{area} techniques and best practices") + + # Add general gaps if none identified + if not gaps: + gaps.extend([ + "agentic system design patterns", + "system implementation strategies", + "performance optimization techniques" + ]) + + return gaps[:5] # Limit to top 5 gaps + + def _extract_methodology(self, hypothesis: str) -> str: + """ + Extract methodology from hypothesis + + Args: + hypothesis: Hypothesis string + + Returns: + Identified methodology + """ + hypothesis_lower = hypothesis.lower() + + methodologies = { + 'reinforcement learning': ['rl', 'reinforcement', 'q-learning', 'policy', 'reward'], + 'retrieval augmented generation': ['rag', 'retrieval', 'augmented', 'retrieve'], + 'chain of thought': ['cot', 'chain of thought', 'reasoning chain', 'step by step'], + 'tree of thought': ['tot', 'tree of thought', 'reasoning tree', 'branching'], + 'multi-agent': ['multi-agent', 'multiple agents', 'agent collaboration', 'swarm'], + 'iterative refinement': ['iterative', 'refinement', 'feedback loop', 'improve'], + 'prompt engineering': ['prompt', 'prompting', 'instruction', 'template'], + 'fine-tuning': ['fine-tune', 'fine-tuning', 'training', 'adapt model'] + } + + for method, keywords in methodologies.items(): + if any(kw in hypothesis_lower for kw in keywords): + return method + + return 'general agentic approach' + + def _assess_complexity(self, hypothesis: str) -> str: + """ + Assess hypothesis complexity + + Args: + hypothesis: Hypothesis string + + Returns: + Complexity level: 'high', 'medium', or 'low' + """ + hypothesis_lower = hypothesis.lower() + + high_complexity_indicators = [ + 'complex', 'advanced', 'sophisticated', 'multi-stage', 'multi-step', + 'distributed', 'parallel', 'optimization', 'novel', 'innovative', + 'state-of-art', 'cutting-edge', 'research' + ] + + medium_complexity_indicators = [ + 'moderate', 'standard', 'typical', 'conventional', 'improve', + 'enhance', 'optimize', 'refine' + ] + + low_complexity_indicators = [ + 'simple', 'basic', 'straightforward', 
'minimal', 'quick', + 'fix', 'patch', 'update' + ] + + if any(ind in hypothesis_lower for ind in high_complexity_indicators): + return 'high' + elif any(ind in hypothesis_lower for ind in medium_complexity_indicators): + return 'medium' + elif any(ind in hypothesis_lower for ind in low_complexity_indicators): + return 'low' + else: + return 'medium' # Default to medium + + def _summarize_external_sources(self, sources: List[Dict[str, Any]]) -> str: + """ + Summarize external sources for context injection + + Args: + sources: List of external source dictionaries + + Returns: + Formatted summary string + """ + if not sources: + return "No external sources available." + + summary_parts = [] + + # Count by credibility + high_cred = [s for s in sources if s.get('credibility_level') == 'High'] + medium_cred = [s for s in sources if s.get('credibility_level') == 'Medium'] + low_cred = [s for s in sources if s.get('credibility_level') == 'Low'] + + summary_parts.append( + f"Retrieved {len(sources)} sources: " + f"{len(high_cred)} high-credibility, " + f"{len(medium_cred)} medium-credibility, " + f"{len(low_cred)} low-credibility" + ) + + # High credibility sources + if high_cred: + summary_parts.append( + "\nHigh-credibility sources:\n" + + "\n".join(f" - {s['title'][:70]}" for s in high_cred[:3]) + ) + + # Key insights from top sources + key_insights = [] + for source in sources[:3]: + summary = source.get('summary', '') + if len(summary) > 50: + key_insights.append(f" • {summary[:150]}...") + + if key_insights: + summary_parts.append("\nKey insights:\n" + "\n".join(key_insights)) + + return "\n".join(summary_parts) + + def _generate_code_artifacts( + self, + exp: Experiment, + context: Dict[str, Any] + ) -> Dict[str, str]: + """ + Generate code artifacts using CoSTEER approach + + Args: + exp: Current experiment + context: Enhanced context (possibly with external knowledge) + + Returns: + Dictionary of code artifacts {filename: content} + """ + logger.info("Generating code artifacts with CoSTEER framework...") + + code_artifacts = {} + + # Extract task information + task_info = self._extract_task_info(context) + + # Generate main agent implementation + logger.info("→ Generating agent.py...") + agent_code = self._generate_agent_code(task_info, context) + code_artifacts['agent.py'] = agent_code + + # Generate evaluator + logger.info("→ Generating evaluator.py...") + evaluator_code = self._generate_evaluator_code(task_info) + code_artifacts['evaluator.py'] = evaluator_code + + # Generate execution script + logger.info("→ Generating train.py...") + train_code = self._generate_execution_script(task_info) + code_artifacts['train.py'] = train_code + + # Generate requirements + logger.info("→ Generating requirements.txt...") + requirements = self._generate_requirements(task_info) + code_artifacts['requirements.txt'] = requirements + + logger.info(f"✓ Generated {len(code_artifacts)} code artifacts") + return code_artifacts + + def _extract_task_info(self, context: Dict[str, Any]) -> Dict[str, Any]: + """ + Extract task information from context + + Args: + context: Context dictionary with external knowledge + + Returns: + Task information dictionary + """ + hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + task_info = { + 'task_id': context.get('task_id', 'unknown'), + 'domain': context.get('task_domain', 'general'), + 'hypothesis': hypothesis, + 'complexity': context.get('complexity', self._assess_complexity(hypothesis)), + 'methodology': 
self._extract_methodology(hypothesis), + 'external_sources': context.get('external_sources', []), + 'external_knowledge_summary': context.get('external_knowledge_summary', ''), + 'has_external_knowledge': len(context.get('external_sources', [])) > 0, + 'iteration_number': context.get('iteration_number', 0) + } + + return task_info + + def _generate_agent_code(self, task_info: Dict[str, Any], context: Dict[str, Any]) -> str: + """ + Generate agent implementation code + + Args: + task_info: Task information + context: Full context with external knowledge + + Returns: + Agent code as string + """ + # Simplified placeholder - in real implementation, use LLM with prompts + hypothesis = task_info['hypothesis'] + external_summary = task_info['external_knowledge_summary'] + + code = f'''""" +Agentic System Implementation +Generated for: {hypothesis} + +External Knowledge: +{external_summary if external_summary else "No external knowledge used"} +""" + +from typing import Dict, Any, List +import logging + +logger = logging.getLogger(__name__) + + +class AgenticSystem: + """ + Main agentic system implementation + Hypothesis: {hypothesis} + """ + + def __init__(self, config: Dict[str, Any]): + self.config = config + logger.info("Initialized AgenticSystem") + + def run(self, task: str) -> Dict[str, Any]: + """Execute the agentic system on a task""" + logger.info(f"Running task: {{task}}") + + # Implementation based on hypothesis + result = {{ + 'task': task, + 'status': 'completed', + 'output': 'Task completed successfully' + }} + + return result + + +def create_agent(config: Dict[str, Any]) -> AgenticSystem: + """Factory function to create agent""" + return AgenticSystem(config) +''' + return code + + def _generate_evaluator_code(self, task_info: Dict[str, Any]) -> str: + """Generate evaluator code""" + code = '''""" +Evaluator for Agentic System +""" + +from typing import Dict, Any + + +class AgenticSystemEvaluator: + """Evaluates agentic system performance""" + + def evaluate(self, results: Dict[str, Any]) -> Dict[str, float]: + """ + Evaluate system performance + + Returns: + Dictionary of metric scores + """ + scores = { + 'comprehensiveness': 7.0, + 'insight': 6.5, + 'instruction_following': 8.0, + 'readability': 7.5 + } + + return scores + + +def create_evaluator() -> AgenticSystemEvaluator: + """Factory function""" + return AgenticSystemEvaluator() +''' + return code + + def _generate_execution_script(self, task_info: Dict[str, Any]) -> str: + """Generate execution script""" + code = '''""" +Training/Execution script for agentic system +""" + +import logging +from agent import create_agent +from evaluator import create_evaluator + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + """Main execution""" + logger.info("Starting agentic system execution") + + # Create agent + config = {'model': 'gpt-4', 'temperature': 0.7} + agent = create_agent(config) + + # Run task + task = "Sample agentic task" + results = agent.run(task) + + # Evaluate + evaluator = create_evaluator() + scores = evaluator.evaluate(results) + + logger.info(f"Evaluation scores: {scores}") + logger.info("Execution completed") + + +if __name__ == '__main__': + main() +''' + return code + + def _generate_requirements(self, task_info: Dict[str, Any]) -> str: + """Generate requirements.txt""" + requirements = '''# Requirements for agentic system +openai>=1.0.0 +anthropic>=0.7.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +requests>=2.31.0 +''' + return requirements + + def 
_calculate_timeout(self, exp: Experiment) -> int: + """Calculate execution timeout based on complexity""" + complexity = getattr(exp, 'complexity', 'medium') + + timeout_map = { + 'low': 300, # 5 minutes + 'medium': 600, # 10 minutes + 'high': 1200 # 20 minutes + } + + return timeout_map.get(complexity, 600) + + def _create_fallback_workspace(self, exp: Experiment) -> FBWorkspace: + """Create fallback workspace on error""" + ws = FBWorkspace() + + # Create minimal agent.py + ws.inject_files(**{ + 'agent.py': '# Fallback agent implementation\nprint("Fallback mode")', + 'train.py': '# Fallback execution\nprint("Running in fallback mode")' + }) + + return ws + + +class AgenticSysRunner(Developer[Experiment]): + """ + Runner for agentic system experiments + Executes generated code and collects results + """ + + def __init__(self, scen): + self.scen = scen + logger.info("Initialized AgenticSysRunner") + + def develop(self, exp: Experiment) -> Experiment: + """ + Run the experiment + + Args: + exp: Experiment with generated code + + Returns: + Experiment with execution results + """ + logger.info(f"Running experiment: {getattr(exp, 'id', 'unknown')}") + + try: + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + raise ValueError("No workspace found in experiment") + + # Execute the code + env = get_agent_sys_env(running_timeout_period=600, enable_cache=True) + run_res = exp.experiment_workspace.run(env=env, entry="python train.py") + + # Store results + exp.run_returncode = getattr(run_res, 'returncode', None) + exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None)) + exp.run_stderr = getattr(run_res, 'stderr', None) + + logger.info(f"Execution completed with return code: {exp.run_returncode}") + + except Exception as e: + logger.error(f"Execution failed: {e}", exc_info=True) + exp.exception = e + + return exp \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/docker/Dockerfile b/rdagent/scenarios/agentic_sys/docker/Dockerfile new file mode 100644 index 000000000..b4e6928a6 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/docker/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.11-slim + +RUN apt-get clean && apt-get update && apt-get install -y \ + curl \ + vim \ + git \ + build-essential \ + && pip install --no-cache-dir uv \ + && rm -rf /var/lib/apt/lists/* + +# Copy entrypoint.sh script into the container workspace +COPY entrypoint.sh /workspace/entrypoint.sh +RUN chmod +x /workspace/entrypoint.sh + diff --git a/rdagent/scenarios/agentic_sys/docker/entrypoint.sh b/rdagent/scenarios/agentic_sys/docker/entrypoint.sh new file mode 100644 index 000000000..2d94b4ccf --- /dev/null +++ b/rdagent/scenarios/agentic_sys/docker/entrypoint.sh @@ -0,0 +1,14 @@ + + +mkdir -p /env +cd /env +uv sync +uv add pip +source .venv/bin/activate + +mkdir -p /workspace +cd /workspace + +git clone https://github.com/your-username/deep_research_bench.git +cd deep_research_bench +pip install -r requirements.txt diff --git a/rdagent/scenarios/agentic_sys/env.py b/rdagent/scenarios/agentic_sys/env.py new file mode 100644 index 000000000..b32d472f7 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/env.py @@ -0,0 +1,109 @@ + +from pathlib import Path +from pydantic_settings.main import SettingsConfigDict +from rdagent.utils.env import DockerConf, DockerEnv +from rdagent.app.data_science.conf import DS_RD_SETTING +import logging +import shutil + +logger = logging.getLogger(__name__) + + +class AgentSysDockerConf(DockerConf): + # TODO: change the content + 
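+ # With env_prefix="ASYS_DOCKER_" (standard pydantic-settings behaviour), each field of this + # config can be overridden via environment variables, e.g. (illustrative values, not defaults): + # ASYS_DOCKER_IMAGE=local_agentic_sys:dev + # ASYS_DOCKER_RUNNING_TIMEOUT_PERIOD=1200 + # ASYS_DOCKER_MEM_LIMIT=16g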
model_config = SettingsConfigDict(env_prefix="ASYS_DOCKER_") + + build_from_dockerfile: bool = True + + dockerfile_folder_path: Path = Path(__file__).parent / "docker" + image: str = "local_agentic_sys:latest" + + + #Mount and execution strategy + mount_path: str = "/workspace/rdagent-solution" + #mount_path: str = "/workspace" + + + default_entry: str = "python main.py" + #default_entry: str = "python train.py" + + running_timeout_period: int | None = 600 + mem_limit: str | None = ( + "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory + ) + +def sanitize_container_path(path): + p = path.replace("\\","/") + if ":" in p: + #remove drive letter + p = p.split(":",1)[-1] + if not p.startswith("/"): + p = "/" + p.lstrip("/") + return p + +def build_volume(ws_path, mount_path, extra): + """ + return Docker SDK volume mapping dict + """ + vols = {} + host_ws = str(ws_path.resolve()) + container_ws = sanitize_container_path(mount_path) + vols[host_ws] = {"bind": container_ws, "mode": "rw"} + if extra: + for host, container in extra.items(): + host_res = str(Path(host).resolve()) + container_res = sanitize_container_path(container) + vols[host_res] = {"bind": container_res, "mode": "rw"} + return vols + + + + +def get_agent_sys_env( + extra_volumes: dict = {}, + running_timeout_period: int | None = DS_RD_SETTING.debug_timeout, + enable_cache: bool | None = None, +) -> DockerEnv: + """ + create and prepare Docker environment for agentic system scenario + """ + conf = AgentSysDockerConf() + env = DockerEnv(conf=conf) + env.conf.extra_volumes = extra_volumes.copy() + env.conf.running_timeout_period = running_timeout_period + if enable_cache is not None: + env.conf.enable_cache = enable_cache + env.prepare() + return env + + +# def get_agent_sys_env( +# extra_volumes:dict = {}, +# running_timeout_period: int | None = DS_RD_SETTING.debug_timeout, +# enable_cache: bool | None = None, +# ) -> DockerEnv: +# """ +# create and prepare Docker environment for agentic system scenario +# """ +# conf = AgentSysDockerConf() +# env = DockerEnv(conf=conf) +# env.conf.extra_volumes = extra_volumes.copy() +# env.conf.running_timeout_period = running_timeout_period +# if enable_cache is not None: +# env.conf.enable_cache = enable_cache +# #inject correct volumes before preparation +# env.conf.mount_path = sanitize_container_path(env.conf.mount_path) + +# # 清理 extra_volumes 中的容器路径 +# if env.conf.extra_volumes: +# sanitized_extra = {} +# for host, container in env.conf.extra_volumes.items(): +# sanitized_extra[host] = sanitize_container_path(container) +# env.conf.extra_volumes = sanitized_extra + +# env.prepare() +# return env + + + + diff --git a/rdagent/scenarios/agentic_sys/evaluator.py b/rdagent/scenarios/agentic_sys/evaluator.py new file mode 100644 index 000000000..112e4499e --- /dev/null +++ b/rdagent/scenarios/agentic_sys/evaluator.py @@ -0,0 +1,500 @@ +""" +DeepResearch Bench Evaluator for Agentic System +Implements 4-dimension evaluation: Comprehensiveness, Insight, Instruction Following, Readability +""" +from typing import Dict, Any, Optional, List +from dataclasses import dataclass, field +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class DimensionScore: + """Score for a single dimension""" + score: float # 0-10 + checks: List[str] = field(default_factory=list) + details: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class EvaluationResult: + """Complete evaluation result with all dimensions""" + comprehensiveness: 
DimensionScore + insight: DimensionScore + instruction_following: DimensionScore + readability: DimensionScore + overall: float = 0.0 + weights: Dict[str, float] = field(default_factory=dict) + normalized_scores: Optional[Dict[str, float]] = None + + def __post_init__(self): + # dataclass-generated __init__ fills the fields; defaults and the overall score are derived here + if not self.weights: + # Default equal weights + self.weights = { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + self.calculate_overall() + + def calculate_overall(self) -> float: + """Calculate weighted overall score""" + self.overall = ( + self.comprehensiveness.score * self.weights['comprehensiveness'] + + self.insight.score * self.weights['insight'] + + self.instruction_following.score * self.weights['instruction_following'] + + self.readability.score * self.weights['readability'] + ) + return self.overall + + def normalize_against_reference(self, reference: 'EvaluationResult') -> Dict[str, float]: + """ + Pairwise normalization: target_normalized = target_score / (target_score + reference_score) + """ + normalized = {} + + dimensions = ['comprehensiveness', 'insight', 'instruction_following', 'readability'] + for dim in dimensions: + target_score = getattr(self, dim).score + ref_score = getattr(reference, dim).score + total = target_score + ref_score + normalized[dim] = target_score / total if total > 0 else 0.5 + + # Normalize overall score + total_overall = self.overall + reference.overall + normalized['overall'] = self.overall / total_overall if total_overall > 0 else 0.5 + + self.normalized_scores = normalized + return normalized + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization""" + result = { + 'scores': { + 'comprehensiveness': round(self.comprehensiveness.score, 2), + 'insight': round(self.insight.score, 2), + 'instruction_following': round(self.instruction_following.score, 2), + 'readability': round(self.readability.score, 2), + 'overall': round(self.overall, 2) + }, + 'weights': self.weights, + 'details': { + 'comprehensiveness': { + 'score': round(self.comprehensiveness.score, 2), + 'checks': self.comprehensiveness.checks, + 'details': self.comprehensiveness.details + }, + 'insight': { + 'score': round(self.insight.score, 2), + 'checks': self.insight.checks, + 'details': self.insight.details + }, + 'instruction_following': { + 'score': round(self.instruction_following.score, 2), + 'checks': self.instruction_following.checks, + 'details': self.instruction_following.details + }, + 'readability': { + 'score': round(self.readability.score, 2), + 'checks': self.readability.checks, + 'details': self.readability.details + } + } + } + + if self.normalized_scores: + result['normalized_scores'] = self.normalized_scores + + return result + + +class DeepResearchEvaluator: + """ + Evaluator implementing DeepResearch Bench 4-dimension rubric + + Scoring Anchors: + - 0-2: Poor/Missing core elements + - 4-6: Basic/Adequate with gaps + - 6-8: Good/Complete coverage + - 8-10: Excellent/Exhaustive + """ + + def __init__(self, dimension_weights: Optional[Dict[str, float]] = None): + """ + Initialize evaluator with optional custom weights + + Args: + dimension_weights: Custom weights for dimensions (must sum to 1.0) + """ + self.weights = dimension_weights or { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + + # Validate weights + total = sum(self.weights.values()) + if abs(total - 1.0) > 0.01: + logger.warning(f"Weights sum to {total}, normalizing to 1.0") + for k 
in self.weights: + self.weights[k] /= total + + def evaluate( + self, + output: Any, + task_requirements: Optional[Dict[str, Any]] = None, + task_context: Optional[Dict[str, Any]] = None, + reference_result: Optional[EvaluationResult] = None + ) -> EvaluationResult: + """ + Evaluate output against DeepResearch Bench criteria + + Args: + output: The agent's output to evaluate + task_requirements: Task requirements and constraints + task_context: Additional context about the task + reference_result: Optional reference for normalization + + Returns: + EvaluationResult with scores for all dimensions + """ + task_requirements = task_requirements or {} + task_context = task_context or {} + + # Evaluate each dimension + comp_score = self._evaluate_comprehensiveness(output, task_requirements) + insight_score = self._evaluate_insight(output, task_context) + following_score = self._evaluate_instruction_following(output, task_requirements) + read_score = self._evaluate_readability(output) + + # Create result + result = EvaluationResult( + comprehensiveness=comp_score, + insight=insight_score, + instruction_following=following_score, + readability=read_score, + weights=self.weights + ) + + # Normalize against reference if provided + if reference_result: + result.normalize_against_reference(reference_result) + + return result + + def evaluate_comprehensiveness( + self, + output: Any, + requirements: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Comprehensiveness (0-10) + - Required subtopics coverage (0-3 pts) + - Depth of analysis (0-3 pts) + - Evidence and sources (0-2 pts) + - Multiple perspectives (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. Required subtopics coverage (0-3 pts) + required_topics = requirements.get('required_topics', []) + if required_topics: + covered = sum(1 for topic in required_topics + if topic.lower() in output_str) + coverage_ratio = covered / len(required_topics) + coverage_score = min(3.0, coverage_ratio * 3.0) + score += coverage_score + checks.append(f"Topic coverage: {covered}/{len(required_topics)} ({coverage_score:.1f}/3.0)") + details['topic_coverage'] = { + 'required': len(required_topics), + 'covered': covered, + 'ratio': coverage_ratio + } + else: + score += 2.0 + checks.append("No specific topic requirements (default 2.0/3.0)") + + # 2. Depth of analysis (0-3 pts) + depth_indicators = { + 'detailed_analysis': 'detailed analysis' in output_str or 'in-depth' in output_str, + 'data_evidence': 'data' in output_str or 'evidence' in output_str, + 'substantial_content': len(str(output)) > 500, + 'methodology': 'methodology' in output_str or 'approach' in output_str + } + depth_score = sum(depth_indicators.values()) * 0.75 + score += depth_score + checks.append(f"Depth indicators: {sum(depth_indicators.values())}/4 ({depth_score:.1f}/3.0)") + details['depth_indicators'] = depth_indicators + + # 3. Evidence and sources (0-2 pts) + evidence_score = 0.0 + if 'references' in output_str or 'sources' in output_str or 'citation' in output_str: + evidence_score += 1.0 + if 'data' in output_str or 'statistics' in output_str or 'figure' in output_str: + evidence_score += 1.0 + score += evidence_score + checks.append(f"Evidence & sources: ({evidence_score:.1f}/2.0)") + details['evidence_score'] = evidence_score + + # 4. 
Multiple perspectives (0-2 pts) + perspective_keywords = [ + 'advantage', 'disadvantage', 'trade-off', 'alternative', + 'limitation', 'consideration', 'pros', 'cons' + ] + perspectives_found = sum(1 for kw in perspective_keywords if kw in output_str) + perspective_score = min(2.0, perspectives_found * 0.4) + score += perspective_score + checks.append(f"Multiple perspectives: {perspectives_found} keywords ({perspective_score:.1f}/2.0)") + details['perspectives_found'] = perspectives_found + + final_score = min(10.0, score) + checks.append(f"Total Comprehensiveness Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_insight( + self, + output: Any, + context: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Insight (0-10) + - Causal reasoning (0-3 pts) + - Quantified analysis (0-2 pts) + - Non-obvious implications (0-3 pts) + - Novel synthesis (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. Causal reasoning (0-3 pts) + causal_indicators = [ + 'because', 'therefore', 'as a result', 'leads to', + 'causes', 'impacts', 'due to', 'consequently' + ] + causal_found = sum(1 for indicator in causal_indicators if indicator in output_str) + causal_score = min(3.0, causal_found * 0.5) + score += causal_score + checks.append(f"Causal reasoning: {causal_found} indicators ({causal_score:.1f}/3.0)") + details['causal_indicators'] = causal_found + + # 2. Quantified analysis (0-2 pts) + has_numbers = any(char.isdigit() for char in str(output)) + metric_keywords = ['percent', '%', 'rate', 'ratio', 'metric', 'measure', 'score'] + has_metrics = any(kw in output_str for kw in metric_keywords) + quant_score = (1.0 if has_numbers else 0.0) + (1.0 if has_metrics else 0.0) + score += quant_score + checks.append(f"Quantified analysis: numbers={has_numbers}, metrics={has_metrics} ({quant_score:.1f}/2.0)") + details['quantification'] = {'has_numbers': has_numbers, 'has_metrics': has_metrics} + + # 3. Non-obvious implications (0-3 pts) + insight_keywords = [ + 'implication', 'insight', 'suggests', 'indicates', + 'reveals', 'unexpected', 'surprisingly', 'notable', 'interesting' + ] + insights_found = sum(1 for kw in insight_keywords if kw in output_str) + implication_score = min(3.0, insights_found * 0.6) + score += implication_score + checks.append(f"Implications: {insights_found} keywords ({implication_score:.1f}/3.0)") + details['insights_found'] = insights_found + + # 4. Novel synthesis (0-2 pts) + synthesis_indicators = [ + 'framework', 'model', 'synthesis', 'integration', + 'novel', 'innovative', 'unique', 'original' + ] + synthesis_found = sum(1 for kw in synthesis_indicators if kw in output_str) + synthesis_score = min(2.0, synthesis_found * 0.5) + score += synthesis_score + checks.append(f"Novel synthesis: {synthesis_found} indicators ({synthesis_score:.1f}/2.0)") + details['synthesis_indicators'] = synthesis_found + + final_score = min(10.0, score) + checks.append(f"Total Insight Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_instruction_following( + self, + output: Any, + requirements: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Instruction Following (0-10) + - Required sections present (0-4 pts) + - Scope compliance (0-3 pts) + - Format compliance (0-2 pts) + - Completeness (0-1 pt) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. 
Required sections present (0-4 pts) + required_sections = requirements.get('required_sections', []) + if required_sections: + present = sum(1 for section in required_sections + if section.lower() in output_str) + section_score = min(4.0, (present / len(required_sections)) * 4.0) + score += section_score + checks.append(f"Required sections: {present}/{len(required_sections)} ({section_score:.1f}/4.0)") + details['sections'] = { + 'required': len(required_sections), + 'present': present + } + else: + score += 3.0 + checks.append("No specific section requirements (default 3.0/4.0)") + + # 2. Scope compliance (0-3 pts) + scope_violations = self._check_scope_violations(output, requirements) + scope_score = max(0.0, 3.0 - len(scope_violations) * 0.5) + score += scope_score + if scope_violations: + checks.append(f"Scope violations: {len(scope_violations)} ({scope_score:.1f}/3.0)") + details['scope_violations'] = scope_violations + else: + checks.append("No scope violations (3.0/3.0)") + + # 3. Format compliance (0-2 pts) + format_reqs = requirements.get('format', {}) + format_score = 2.0 # Default + if format_reqs: + format_type = format_reqs.get('type', '').lower() + if format_type == 'json': + try: + import json + json.loads(str(output)) + format_score = 2.0 + except: + format_score = 0.5 + checks.append(f"Format compliance: {format_type} ({format_score:.1f}/2.0)") + else: + checks.append("Format compliance: (2.0/2.0)") + score += format_score + details['format_score'] = format_score + + # 4. Completeness (0-1 pt) + length = len(str(output)) + completeness_score = 1.0 if length > 200 else 0.5 + score += completeness_score + checks.append(f"Completeness: {length} chars ({completeness_score:.1f}/1.0)") + details['output_length'] = length + + final_score = min(10.0, score) + checks.append(f"Total Instruction Following Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_readability(self, output: Any) -> DimensionScore: + """ + Evaluate Readability (0-10) + - Structure and organization (0-3 pts) + - Language quality (0-3 pts) + - Data presentation (0-2 pts) + - Clarity (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output) + output_lower = output_str.lower() + + # 1. Structure and organization (0-3 pts) + structure_indicators = { + 'has_breaks': '\n' in output_str, + 'has_sections': any(word in output_lower for word in + ['summary', 'introduction', 'conclusion', 'results', 'method']), + 'multi_paragraph': output_str.count('\n') > 5 + } + structure_score = min(3.0, sum(structure_indicators.values()) * 1.0) + score += structure_score + checks.append(f"Structure: {sum(structure_indicators.values())}/3 indicators ({structure_score:.1f}/3.0)") + details['structure'] = structure_indicators + + # 2. Language quality (0-3 pts) + words = output_str.split() + if words: + avg_word_length = sum(len(word) for word in words) / len(words) + unique_ratio = len(set(output_lower.split())) / len(words) if words else 0 + + language_score = 0.0 + if 4 < avg_word_length < 7: # Reasonable word length + language_score += 1.5 + if unique_ratio > 0.5: # Vocabulary variety + language_score += 1.5 + + score += language_score + checks.append(f"Language quality: avg_len={avg_word_length:.1f}, variety={unique_ratio:.2f} ({language_score:.1f}/3.0)") + details['language'] = { + 'avg_word_length': avg_word_length, + 'unique_ratio': unique_ratio + } + else: + checks.append("Language quality: no content (0.0/3.0)") + + # 3. 
Data presentation (0-2 pts) + has_formatting = any(marker in output_str for marker in ['|', ':', '-', '*', '•']) + has_structure = output_str.count('\n') > 3 + presentation_score = (1.0 if has_formatting else 0.0) + (1.0 if has_structure else 0.0) + score += presentation_score + checks.append(f"Data presentation: formatting={has_formatting}, structure={has_structure} ({presentation_score:.1f}/2.0)") + details['presentation'] = { + 'has_formatting': has_formatting, + 'has_structure': has_structure + } + + # 4. Clarity (0-2 pts) + length = len(output_str) + clarity_score = 2.0 + if length < 100: + clarity_score = 0.5 + elif length > 5000: + clarity_score = 1.5 + + score += clarity_score + checks.append(f"Clarity: length={length} chars ({clarity_score:.1f}/2.0)") + details['clarity'] = {'length': length, 'score': clarity_score} + + final_score = min(10.0, score) + checks.append(f"Total Readability Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def _check_scope_violations( + self, + output: Any, + requirements: Dict[str, Any] + ) -> List[str]: + """Check for scope violations""" + violations = [] + output_lower = str(output).lower() + + # Check timeframe constraints + timeframe = requirements.get('timeframe') + if timeframe: + excluded_periods = requirements.get('excluded_periods', []) + for period in excluded_periods: + if period.lower() in output_lower: + violations.append(f"Out-of-scope timeframe: {period}") + + # Check topic constraints + excluded_topics = requirements.get('excluded_topics', []) + for topic in excluded_topics: + if topic.lower() in output_lower: + violations.append(f"Out-of-scope topic: {topic}") + + return violations \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/exp.py b/rdagent/scenarios/agentic_sys/exp.py new file mode 100644 index 000000000..a688c7ea8 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/exp.py @@ -0,0 +1,140 @@ +from pathlib import Path +from rdagent.core.experiment import Experiment +from typing import Any, List, Optional, Dict + +# convert code into executable experiment and output standard experiment result +class AgenticSysExperiment(Experiment): + def __init__(self, sub_tasks=None, based_experiments=None, experiment_workspace=None): + super().__init__(sub_tasks=sub_tasks, based_experiments=based_experiments) + if experiment_workspace is not None: + self.experiment_workspace = experiment_workspace + + # DeepResearch Bench evaluation scores + self.deepresearch_scores: Optional[Dict[str, float]] = None + self.evaluation_log: Optional[list] = None + + #web search related attributes (NEW) + self.used_web_search: bool = False + self.external_sources: List[Dict[str, Any]] = [] + self.external_knowledge_summary: str = "" + self.web_search_timestamp: Optional[str] = None + + # Existing attributes... + self.hypothesis: str = "" + self.iteration_number: int = 0 + self.complexity: str = "medium" + self.previous_performance_low: bool = False + + def run(self, code:str): + """ + Run the experiment with the given code. + Step: + 1. Prepare Experiment Environment + 2. Run Agent code + 3. Collect Performance Metrics + 4. 
record log + """ + code_path = self.workspace / "agent.py" + code_path.write_text(code) + + #construct running script + run_script = f""" + import sys + import json + try: + sys.path.insert(0, '{self.workspace}') + from agent import AgenticSystem + + agent = AgenticSystem() + results = agent.run_tasks() + + # output structured results + print("=== EXECUTION RESULTS ===") + print(f"Success Rate: {{results['success_rate']}}") + print(f"Average Time: {{results['avg_time']}}") + print(f"Error Count: {{results['error_count']}}") + print(f"Total Tasks: {{results['total_tasks']}}") + print("=== END RESULTS ===") + + # output JSON format for parsing + print("=== JSON RESULTS ===") + print(json.dumps(results)) + print("=== END JSON ===") + + except Exception as e: + print(f"ERROR: {{str(e)}}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + """ + + # use runner to execute + result = self.runner.run( + script = run_script, + timeout = 300, # 5 minutes timeout + capture_output = True + ) + + # parse output for metrics + metrics = self.parse_output(result.stdout) + return Experiment( + success = result.returncode == 0, + metrics = metrics, + logs = result.stdout, + errors = result.stderr + ) + + #parse all the metrics from stdout + def parse_output(self, stdout: str): + metrics = { + "success_rate": 0.0, + "avg_time": float('inf'), + "error_count": 1, + "total_tasks": 0 + } + + try: + # try to extract JSON block first + import json + import re + json_match = re.search(r'=== JSON RESULTS ===\n(.*?)\n=== END JSON ===', stdout, re.DOTALL) + if json_match: + result_data = json.loads(json_match.group(1)) + metrics.update(result_data) + return metrics + + # fallback to text parsing + for line in stdout.splitlines(): + if "Success Rate:" in line: + metrics["success_rate"] = float(line.split(":")[1].strip()) + elif "Average Time:" in line: + metrics["avg_time"] = float(line.split(":")[1].strip()) + elif "Error Count:" in line: + metrics["error_count"] = int(line.split(":")[1].strip()) + elif "Total Tasks:" in line: + metrics["total_tasks"] = int(line.split(":")[1].strip()) + + except Exception as e: + print(f"Failed to parse output: {e}") + + return metrics + + + def get_deepresearch_score(self, dimension: str = 'overall') -> float: + """Get DeepResearch Bench score for specific dimension""" + if not self.deepresearch_scores: + return 0.0 + return self.deepresearch_scores.get(dimension, 0.0) + + def get_evaluation_summary(self): + """Get comprehensive evaluation summary""" + if not hasattr(self, 'result') or not self.result: + return {'status': 'no_results'} + + summary = { + 'execution_metrics': self.result.get('execution_results', {}), + 'deepresearch_scores': self.result.get('deepresearch_evaluation', {}), + 'overall_score': self.get_deepresearch_score('overall') + } + + return summary \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/feedback.py b/rdagent/scenarios/agentic_sys/feedback.py new file mode 100644 index 000000000..69d9d9847 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/feedback.py @@ -0,0 +1,244 @@ +from asyncio.log import logger +from pathlib import Path +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import Experiment2Feedback, ExperimentFeedback, Trace +import re +import json + + +class AgenticSysExp2Feedback(Experiment2Feedback): + def generate_feedback(self, experiment: Experiment, trace: Trace) -> ExperimentFeedback: + + # BEGIN drafting + # read content from `expriment.workspace_path` + # END drafting + 
try: + if hasattr(experiment, 'experiment_workspace') and experiment.experiment_workspace: + ws_path = Path(experiment.experiment_workspace.workspace_path) + if ws_path.exists() and ws_path.is_dir(): + logger.info(f"Reading results from workspace: {ws_path}") + #Try to read result files in order of preference + result_files = [ + "result.json", + "detailed_result.json", + "output.json", + "error_result.json" + ] + for result_file in result_files: + result_path = ws_path / result_file + if result_path.exists(): + try: + content = result_path.read_text() + data = json.loads(content) + #Extract execution results if nested + if isinstance(data, dict): + if "execution_result" in data: + experiment.result = data["execution_result"] + else: + experiment.result = data + else: + experiment.result = data + break + except Exception as e: + logger.warning(f"Failed to parse {result_file}: {e}") + continue + #if no result file found, try parsing stdout/stderr from workspace + if not hasattr(experiment, 'result') or experiment.result is None: + self.try_parse_logs(experiment, ws_path) + except Exception as e: + logger.warning(f"Failed to read workspace contents: {e}") + + + + # 1. check whether experiment ran successfully + if not hasattr(experiment, 'result') or experiment.result is None: + return ExperimentFeedback( + reason = "Experiment did not complete execution.", + decision = False, + exception = getattr(experiment, 'exception', None) + ) + + #2. extract important metrics from experiment result + result = experiment.result + + #evaluation metrics + success_rate = result.get('success_rate', 0) + avg_time = result.get('avg_time', float('inf')) + error_count = result.get('error_count', 0) + + #3. formulate success criteria + MIN_SUCCESS_RATE = 0.7 + MAX_AVG_TIME = 30 + MAX_ERROR_COUNT = 2 + + is_successful = ( + success_rate >= MIN_SUCCESS_RATE and + avg_time <= MAX_AVG_TIME and + error_count <= MAX_ERROR_COUNT + ) + + #4. Compare with past experiments in the trace + historical_best = self.get_best_from_trace(trace) + is_improvement = False + + if historical_best: + best_success_rate = historical_best.get('success_rate', 0) + best_avg_time = historical_best.get('avg_time', float('inf')) + + is_improvement = ( + success_rate > best_success_rate or + (success_rate == best_success_rate and avg_time < best_avg_time) + ) + + else: + #first-time experiment, we should still accept it even if it is fail. + is_improvement = True + + #5. 
Generate detailed feedback + reason_parts = [] + reason_parts.append(f"Success Rate: {success_rate:.2f}") + reason_parts.append(f"Average Time: {avg_time:.2f}s") + if error_count > 0: + reason_parts.append(f"Errors Encountered: {error_count}") + if is_improvement: + reason_parts.append("This experiment shows improvement over past results.") + elif historical_best: + reason_parts.append( + f"No improvement (best: {historical_best.get('success_rate', 0)})" + ) + reason = "|".join(reason_parts) + return ExperimentFeedback( + reason = reason, + decision = is_improvement, + ) + + def try_parse_logs(self, experiment, ws_path): + """Try to parse result from workspace log files""" + try: + # look for commonn log files patterns + log_pattern = ["*.log", "*.out", "train_output.txt","execution.log"] + for pattern in log_pattern: + for log_file in ws_path.glob(pattern): + try: + content = log_file.read_text() + parsed = self.parse_stdout_for_metrics(content) + if parsed: + experiment.result = parsed + return + except Exception as e: + logger.warning(f"Failed to parse log file {log_file}: {e}") + continue + except Exception as e: + logger.warning(f"Failed to read workspace contents: {e}") + + + def parse_stdout_for_metrics(self, stdout): + """Parse metrics from stdout text""" + if not stdout: + return None + try: + # Method 1: Try to extract JSON block + json_pattern = r'=== JSON RESULTS ===\s*\n(.*?)\n=== END JSON ===' + match = re.search(json_pattern, stdout, re.DOTALL) + if match: + try: + return json.loads(match.group(1).strip()) + except json.JSONDecodeError: + pass + # Method 2: Try to find any JSON object + json_obj_pattern = r'\{[^{}]*"success_rate"[^{}]*\}' + match = re.search(json_obj_pattern, stdout) + if match: + try: + return json.loads(match.group(0)) + except json.JSONDecodeError: + pass + + # Method 3: Parse text patterns + metrics = {} + + success_match = re.search(r'Success Rate:\s*([0-9.]+)', stdout) + time_match = re.search(r'Average Time:\s*([0-9.]+)', stdout) + error_match = re.search(r'Error Count:\s*([0-9]+)', stdout) + tasks_match = re.search(r'Total Tasks:\s*([0-9]+)', stdout) + + if success_match: + metrics['success_rate'] = float(success_match.group(1)) + metrics['avg_time'] = float(time_match.group(1)) if time_match else 0.0 + metrics['error_count'] = int(error_match.group(1)) if error_match else 0 + metrics['total_tasks'] = int(tasks_match.group(1)) if tasks_match else 0 + return metrics + + except Exception as e: + logger.debug(f"Failed to parse stdout: {e}") + + return None + + + + + + def get_best_from_trace(self, trace:Trace): + # Extract the best experiment result from the trace + if not hasattr(trace, 'hist') or not trace.hist: + return None + best_result = None + best_success_rate = -1 + for exp, feedback in trace.hist: + if hasattr(exp, 'result') and exp.result: + success_rate = exp.result.get('success_rate', 0) + if success_rate > best_success_rate: + best_success_rate = success_rate + best_result = exp.result + return best_result + + def analyze_performance_issues(self, result): + # analyze performance issues based on result metrics + issues = [] + success_rate = result.get('success_rate', 0) + avg_time = result.get('avg_time', float('inf')) + error_count = result.get('error_count', 0) + + if success_rate < 0.3: + issues.append("Critical: Very low success rate - review core algorithm") + elif success_rate < 0.7: + issues.append("Warning: Success rate below target - optimize task handling") + + if avg_time > 10: + issues.append("Performance: High execution time - 
consider optimization") + + if error_count > 5: + issues.append("Stability: High error count - improve error handling") + + return issues + + def get_evaluation_summary(self, trace): + """Get summary of all experiments in trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return {"total": 0, "successful": 0, "average_success_rate": 0.0} + + total = len(trace.hist) + successful = 0 + success_rates = [] + + for exp, feedback in trace.hist: + if hasattr(exp, 'result') and exp.result: + success_rate = exp.result.get('success_rate', 0) + success_rates.append(success_rate) + if feedback and feedback.decision: + successful += 1 + + return { + "total": total, + "successful": successful, + "success_ratio": successful / total if total > 0 else 0, + "average_success_rate": sum(success_rates) / len(success_rates) if success_rates else 0, + "best_success_rate": max(success_rates) if success_rates else 0 + } + + + + + + + diff --git a/rdagent/scenarios/agentic_sys/loop.py b/rdagent/scenarios/agentic_sys/loop.py new file mode 100644 index 000000000..92d032288 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/loop.py @@ -0,0 +1,116 @@ +import asyncio +import shutil +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Any, Optional, Union + +from rdagent.app.agentic_sys.conf import AgenticSysSetting +from rdagent.components.workflow.conf import BasePropSetting +from rdagent.components.workflow.rd_loop import RDLoop +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.developer import Developer +from rdagent.core.exception import CoderError, PolicyError, RunnerError +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import Experiment2Feedback, ExperimentFeedback, ExpGen, Trace +from rdagent.core.scenario import Scenario +from rdagent.core.utils import import_class +from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.agentic_sys.exp import AgenticSysExperiment +from rdagent.core.proposal import ExpGen + + +class AgenticSysRDLoop(RDLoop): + # NOTE: we move the DataScienceRDLoop here to be easier to be imported + # Maintain experiment loop history and context + # support multi-iteration optimization + skip_loop_error = (CoderError, RunnerError) + withdraw_loop_error = (PolicyError,) + + def __init__(self, PROP_SETTING: AgenticSysSetting): + + scen = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) + self.scen: Scenario = scen + self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + + self.coder: Developer = import_class(PROP_SETTING.coder)(scen) + self.runner: Developer = import_class(PROP_SETTING.runner)(scen) + + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.feedback)(scen) + self.trace = Trace(scen=scen) + + #Store configuration + self.setting = PROP_SETTING + + super(RDLoop, self).__init__() + + logger.info(f"AgenticSysRDLoop initialized for competition: {PROP_SETTING.competition}") + + async def direct_exp_gen(self, prev_out: dict[str, Any]): + exp = await self.exp_gen.async_gen(self.trace, self) + return {"exp_gen": exp} + + def record(self, prev_out: dict[str, Any]): + cur_loop_id = prev_out[self.LOOP_IDX_KEY] + + if (e := prev_out.get(self.EXCEPTION_KEY, None)) is None: + exp = prev_out["running"] + self.trace.sync_dag_parent_and_hist((exp, prev_out["feedback"]), cur_loop_id) + else: + exp: DSExperiment = prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"] + self.trace.sync_dag_parent_and_hist( + ( + exp, + 
ExperimentFeedback.from_exception(e), + ), + cur_loop_id, + ) + + async def propose(self, prev_out): + """Propose hypothesis""" + #integrate web search tool + hypothesis = self.hypothesis_gen.gen(self.trace) + #record result + if hasattr(hypothesis, 'external_sources'): + logger.log_object( + hypothesis.external_sources, + tag = "research.external.sources" + ) + return hypothesis + + + async def develop(self, prev_out): + """ + Develop code with optional web search enhancement + + Args: + prev_out: Previous output containing hypothesis + + Returns: + Developed experiment + """ + logger.info("=" * 80) + logger.info("DEVELOP PHASE: Generating code") + logger.info("=" * 80) + + hypothesis = prev_out.get("hypothesis") + + exp = Experiment() + exp.hypothesis = hypothesis.hypothesis if hypothesis else "Default hypothesis" + exp.iteration_number = len(self.trace.hist) + + # Develop code (web search is called inside if needed) + exp = self.developer.develop(exp) + + # Track web search usage in development phase + if hasattr(self.developer, 'web_search_tool'): + web_tool = self.developer.web_search_tool + if web_tool is not None and hasattr(exp, 'used_web_search'): + if exp.used_web_search: + self.web_search_usage['total_calls'] += 1 + self.web_search_usage['successful_calls'] += 1 + + return exp + + + diff --git a/rdagent/scenarios/agentic_sys/prompts.yaml b/rdagent/scenarios/agentic_sys/prompts.yaml new file mode 100644 index 000000000..8212af61f --- /dev/null +++ b/rdagent/scenarios/agentic_sys/prompts.yaml @@ -0,0 +1,838 @@ +# ==================== Knowledge Retrieval and RAG ==================== + +KG_hypothesis_gen_RAG: + system: |- + You are an expert in agentic systems research with access to knowledge from previous research tasks and current experiments. + + user: |- + {% if insights %} + ====== Cross-Task Insights (Transferable Knowledge) ====== + {% for insight in insights %} + Insight {{ loop.index }}: + - Task Domain: {{ insight.domain }} + - Research Method: {{ insight.method }} + - Key Finding: {{ insight.finding }} + - Applicability: {{ insight.applicability }} + {% endfor %} + {% endif %} + + {% if experiences %} + ====== Current Task History ====== + {% for exp in experiences %} + Experiment {{ loop.index }}: + - Hypothesis: {{ exp.hypothesis }} + - Approach: {{ exp.approach }} + - Dimensions Improved: {{ exp.improved_dims }} + - Lessons Learned: {{ exp.lessons }} + {% endfor %} + {% endif %} + + {% if external_sources %} + ====== Retrieved External Sources ====== + {% for source in external_sources %} + Source {{ loop.index }}: {{ source.citation }} + - Relevance Score: {{ source.relevance }} + - Key Information: {{ source.summary }} + {% endfor %} + {% endif %} + +retrieval_query_generation: + system: |- + You are an expert in formulating effective search queries for research tasks. + + user: |- + Based on the current research task, generate search queries to retrieve relevant information. + + Task: {{ task_description }} + Current Knowledge Gaps: {{ knowledge_gaps }} + + Generate: + 1. **Primary Queries**: Core concepts and requirements + 2. **Exploratory Queries**: Adjacent topics and methodologies + 3. **Validation Queries**: Fact-checking and source verification + + Output Format: + { + "primary_queries": ["query1", "query2", ...], + "exploratory_queries": ["query1", "query2", ...], + "validation_queries": ["query1", "query2", ...] 
+ } + +# ==================== Scenario and Task Description ==================== + +scenario_description: + system: |- + You are an expert in agentic system design and DeepResearch benchmark evaluation. + + user: |- + {% if use_raw_description -%} + ====== Background of the Research Task ====== + {{ raw_description }} + {% else %} + ====== Background of the Research Task ====== + {{ background }} + {% endif %} + + {% if system_analysis_output is not none %} + ====== Current System Analysis ====== + The following is the analysis of the current agentic system implementation: + {{ system_analysis_output }} + {% endif %} + + ====== Task Requirements ====== + Your agentic system must address the following research task: + {{ task_requirements }} + + ====== System Specifications ====== + Please ensure your system adheres to the following specifications: + - **Architecture**: {{ architecture_requirements }} + - **Agent Communication**: {{ communication_protocol }} + - **Error Handling**: Graceful failure recovery and proper logging + - **Modularity**: Clear separation of concerns and reusable components + + ====== Evaluation Metrics ====== + Your system will be evaluated on **four dimensions** with scores from 0-10 (continuous): + + **1. Comprehensiveness (Coverage)** - Weight: {{ comprehensiveness_weight | default(0.25) }} + Intent: Breadth and depth of content; no major omissions. + - Coverage of all required subtopics and scope (time/geo/segments) + - Multiple data sources and evidence + - Balanced perspectives + Scoring Anchors: + - 0-2: Misses core parts; narrow, superficial + - 4-6: Covers basics; some gaps or shallow treatment + - 6-8: Covers all key areas with adequate depth and evidence + - 8-10: Exhaustive, balanced, well-evidenced, no meaningful gaps + + **2. Insight (Depth and Originality)** - Weight: {{ insight_weight | default(0.30) }} + Intent: Why-think, causality, synthesis, non-obvious implications. + - Causal chains and quantified reasoning + - Trade-offs and counterfactual analysis + - Novel synthesis and frameworks + - Acknowledges limitations + Scoring Anchors: + - 0-2: Descriptive only; platitudes + - 4-6: Some analysis; shallow drivers; limited originality + - 6-8: Clear causal logic; non-trivial implications; data-backed claims + - 8-10: Original frameworks; quantifies impact; anticipates edge cases + + **3. Instruction Following (Task Fit)** - Weight: {{ instruction_weight | default(0.25) }} + Intent: Strict adherence to task requirements and constraints. + - Answers all sub-questions + - Respects scope (topic/geo/time) + - Required deliverables and methods + - No out-of-scope content + Scoring Anchors: + - 0-2: Largely off-task; violates constraints + - 4-6: Partially compliant; missing notable requirements + - 6-8: Fully compliant with minor misses + - 8-10: Exact, complete, and precise compliance + + **4. Readability (Clarity and Presentation)** - Weight: {{ readability_weight | default(0.20) }} + Intent: Clear structure, fluent language, effective data presentation. 
+ - Logical outline with clear headings + - Cohesive and precise wording + - Concise tables/figures + - Defined terms and consistent formatting + Scoring Anchors: + - 0-2: Hard to follow; disorganized; errors + - 4-6: Understandable but clunky or poorly organized + - 6-8: Clear, well-structured, minimal friction + - 8-10: Publication-ready polish; visuals aid understanding + + {% if evaluation_details is not none %} + ====== Additional Evaluation Details ====== + {{ evaluation_details }} + {% endif %} + + ====== Scoring Method ====== + - **Per Criterion**: Evidence-based scoring; 5 = baseline adequate, adjust ± + - **Per Dimension**: Weighted average of its criteria + - **Overall Score**: Weighted sum of four dimension scores + - **Pairwise Normalization**: target_normalized = target_score / (target_score + reference_score) + + {% if time_limit is not none %} + ====== Time Limit On System Execution ====== + Your system's execution is limited to **{{ time_limit }}**. Ensure efficient implementation. + {% endif %} + + {% if runtime_environment is not none %} + ====== Runtime Environment ====== + {{ runtime_environment }} + {% endif %} + +task_description_template: + system: |- + You are an expert in agentic system design and research task analysis. + The user will provide a research task description, and you need to extract structured information. + Please answer in JSON format with the following schema: + { + "Task Type": "The type of research task, e.g., 'Literature Review', 'Multi-hop QA', 'Data Analysis', 'Code Generation', 'Scientific Research'", + "Domain": "The domain of the task, e.g., 'Scientific Research', 'Business Intelligence', 'Software Engineering', 'Healthcare'", + "Brief Description": "A brief description of the task (2-3 sentences)", + "Scope Requirements": { + "Temporal": "Time range if specified, e.g., '2020-2024'", + "Geographical": "Geographical scope if relevant, e.g., 'Global', 'US only'", + "Topical": "Topic boundaries and depth" + }, + "Required Deliverables": "List of expected outputs, e.g., ['Report', 'Data tables', 'Visualizations', 'Code']", + "Data Sources": "Expected or required data sources", + "Sub-questions": "List of sub-questions that must be answered", + "Constraints": "Any specific constraints or limitations", + "Evaluation Focus": { + "Comprehensiveness": "What aspects determine coverage completeness", + "Insight": "What constitutes deep analysis for this task", + "Instruction Following": "Key requirements that must be met", + "Readability": "Presentation format expectations" + }, + "Complexity Level": "Low/Medium/High based on research depth and multi-hop reasoning requirements" + } + + user: |- + Research Task Description: + {{ task_raw_description }} + + Additional Context: + {{ task_context }} + +task_background: + system: |- + You are a world-class AI researcher and system architect specializing in agentic systems for research automation. + + Your expertise includes: + - Multi-agent coordination and planning + - Information retrieval and synthesis + - Causal reasoning and analysis + - Research methodology and evaluation + - Large Language Model orchestration + + user: |- + The task type for this research scenario is **{{ task_type }}**. + Domain: **{{ domain }}**. + + Brief task description: {{ brief_description }}. + + Scope Requirements: + {{ scope_requirements }}. + + Required Deliverables: + {{ required_deliverables }}. + + The task will be evaluated on four dimensions: + 1. **Comprehensiveness**: {{ comprehensiveness_focus }} + 2. 
**Insight**: {{ insight_focus }} + 3. **Instruction Following**: {{ instruction_focus }} + 4. **Readability**: {{ readability_focus }} + +# ==================== Hypothesis Generation ==================== + +hypothesis_generation: + system: |- + You are an expert in agentic system optimization and research automation. + Your task is to propose hypotheses to improve the system's performance on DeepResearch evaluation dimensions. + + user: |- + You are proposing a hypothesis to improve the agentic system for research tasks. + + ====== Current System State ====== + {{ current_system_description }} + + ====== Performance on DeepResearch Dimensions ====== + Current Scores (0-10 scale): + - Comprehensiveness: {{ current_comprehensiveness | default("N/A") }} + - Insight: {{ current_insight | default("N/A") }} + - Instruction Following: {{ current_instruction_following | default("N/A") }} + - Readability: {{ current_readability | default("N/A") }} + + ====== Previous Experiments ====== + {{ experiment_history }} + + ====== Identified Weaknesses ====== + {{ performance_gaps }} + + ====== Task ====== + Propose a hypothesis for system improvement that targets one or more evaluation dimensions. + + Your hypothesis should: + 1. **Target Dimension(s)**: Which evaluation dimension(s) will this improve? + 2. **Current Gap**: What specific weakness does it address? + 3. **Proposed Change**: Concrete architectural or algorithmic modification + 4. **Expected Impact**: How will this improve the target dimension score(s)? + 5. **Trade-offs**: Any potential negative impacts on other dimensions? + 6. **Implementation Feasibility**: Complexity and resource requirements + + Format your response as: + **Hypothesis**: [One clear sentence] + **Target Dimensions**: [List with expected improvement, e.g., "Comprehensiveness (+1.5), Insight (+0.8)"] + **Rationale**: [Why this will work, with evidence from experiments] + **Implementation Plan**: [Step-by-step approach] + **Risk Mitigation**: [How to avoid hurting other dimensions] + +hypothesis_output_format: + system: |- + You must format your hypothesis according to the following JSON schema. + + user: |- + The output should follow JSON format with the following schema: + { + "action": "Choose from ['Information_Gathering', 'Analysis_Synthesis', 'Structure_Refinement', 'Compliance_Verification']. If 'hypothesis_specification' provides the action you need to take, please follow it. Otherwise, based on previous experimental results, suggest the action you believe is most appropriate.", + "hypothesis": "One clear sentence stating what improvement will be made", + "target_dimensions": [ + { + "name": "Comprehensiveness/Insight/Instruction_Following/Readability", + "current_score": 0.0, + "target_score": 0.0, + "expected_improvement": 0.0, + "confidence": "Low/Medium/High" + } + ], + "current_gap": "Specific weakness being addressed (one sentence)", + "rationale": "Why this hypothesis should work, with evidence from previous experiments or theoretical principles (2-3 sentences)", + "implementation_plan": { + "step_1": "First concrete step", + "step_2": "Second concrete step", + "step_3": "Third concrete step (if needed)" + }, + "risk_assessment": { + "potential_negative_impacts": [ + {"dimension": "dimension_name", "reason": "why it might be affected", "severity": "Low/Medium/High"} + ], + "mitigation_strategies": ["strategy1", "strategy2", ...] 
+ }, + "resource_requirements": { + "time_estimate": "Estimated time to implement and validate", + "external_tools": ["tool1", "tool2", ...], + "complexity": "Low/Medium/High", + "dependencies": ["dependency1", ...] + }, + "success_criteria": { + "primary": "Main success indicator (e.g., 'Comprehensiveness score increases by at least 1.0')", + "secondary": ["Additional indicators of success"], + "validation_method": "How to verify the improvement" + }, + "concise_knowledge": "One-line transferable principle using conditional grammar (e.g., 'If X, then Y'; 'When A, do B'). Must be clear and unambiguous without referencing 'previous hypothesis' or other context-dependent terms." + } + +hypothesis_and_feedback: + system: |- + You have access to the complete history of previous experiments and their results. + Analyze patterns and learn from past successes and failures. + + user: |- + ====== Recent Experiment History (Last {{ history_window | default(10) }} iterations) ====== + + {% for experiment, feedback in trace.hist[-history_window:] %} + ====== Iteration {{ loop.index }} ====== + **Hypothesis**: {{ experiment.hypothesis }} + **Action Type**: {{ experiment.action_type }} + **Target Dimensions**: {{ experiment.target_dimensions }} + + **Results**: + - Comprehensiveness: {{ feedback.comprehensiveness_score }} (Δ {{ feedback.comprehensiveness_delta }}) + - Insight: {{ feedback.insight_score }} (Δ {{ feedback.insight_delta }}) + - Instruction Following: {{ feedback.instruction_score }} (Δ {{ feedback.instruction_delta }}) + - Readability: {{ feedback.readability_score }} (Δ {{ feedback.readability_delta }}) + - Overall: {{ feedback.overall_score }} (Δ {{ feedback.overall_delta }}) + + **Observations**: {{ feedback.observations }} + **Decision**: {{ feedback.decision }} (Success/Partial/Failure) + **Reason**: {{ feedback.reason }} + **Lessons Learned**: {{ feedback.lessons }} + + {% endfor %} + + ====== Pattern Analysis ====== + - Most successful action type: {{ most_successful_action }} + - Most improved dimension: {{ most_improved_dimension }} + - Persistent weaknesses: {{ persistent_weaknesses }} + - Effective strategies: {{ effective_strategies }} + +# ==================== Action Type Specifications ==================== + +hypothesis_specification: + Information_Gathering: + system: |- + You are an expert in information gathering and comprehensive research methodologies. + + user: |- + Action: Information Gathering + + Focus: Comprehensive data collection and source validation + + Guidelines: + - Start with authoritative sources (peer-reviewed papers, official databases) + - Cover multiple perspectives and timeframes + - Verify facts through cross-referencing + - Document all sources with proper citations + + Evaluation Impact: + - Primary: **Comprehensiveness** (improved coverage and evidence) + - Secondary: **Instruction Following** (adherence to source requirements) + + Common Pitfalls: + - Relying on single sources + - Missing key subtopics + - Ignoring temporal or geographical constraints + + Output Format: + { + "sources": [ + { + "citation": "Author (Year). Title. Publisher.", + "relevance": "How this source addresses this task", + "key_information": "Summary of relevant content", + "credibility": "Assessment of source quality" + } + ], + "coverage_checklist": { + "temporal_scope": "Covered/Partial/Missing", + "geographical_scope": "Covered/Partial/Missing", + "subtopics": ["topic1: covered", "topic2: partial", ...] 
+ } + } + + Analysis_Synthesis: + system: |- + You are an expert in causal analysis, quantitative reasoning, and knowledge synthesis. + + user: |- + Action: Analysis and Synthesis + + Focus: Deep causal reasoning and novel insights + + Guidelines: + - Identify causal relationships (not just correlations) + - Quantify impacts where possible + - Consider counterfactuals and trade-offs + - Acknowledge limitations and uncertainties + - Propose original frameworks or synthesis + + Evaluation Impact: + - Primary: **Insight** (depth of analysis and originality) + - Secondary: **Comprehensiveness** (improved understanding of topic) + + Common Pitfalls: + - Descriptive summaries without analysis + - Correlation presented as causation + - Generic "pros and cons" without depth + - Ignoring edge cases + + Output Format: + { + "causal_chains": [ + { + "cause": "...", + "mechanism": "...", + "effect": "...", + "evidence": "...", + "quantification": "X% increase/decrease" + } + ], + "trade_offs": [ + { + "dimension1": "...", + "dimension2": "...", + "relationship": "...", + "implications": "..." + } + ], + "novel_insights": "...", + "limitations": "..." + } + + Structure_Refinement: + system: |- + You are an expert in technical writing, information architecture, and presentation design. + + user: |- + Action: Structure and Presentation Refinement + + Focus: Clear organization and effective communication + + Guidelines: + - Logical hierarchical structure + - Clear section headings + - Effective use of tables/figures + - Consistent terminology + - Smooth transitions between sections + + Evaluation Impact: + - Primary: **Readability** (clarity and presentation quality) + - Secondary: **Instruction Following** (meeting format requirements) + + Common Pitfalls: + - Walls of text without structure + - Undefined acronyms or jargon + - Inconsistent formatting + - Cluttered or unclear visualizations + + Output Format: + { + "structure": { + "sections": [ + { + "title": "...", + "subsections": [...], + "key_points": [...] + } + ] + }, + "visual_elements": [ + { + "type": "table/figure/chart", + "purpose": "...", + "data": "..." + } + ], + "terminology": { + "term1": "definition", + "term2": "definition" + } + } + + Compliance_Verification: + system: |- + You are an expert in requirement validation and compliance checking. + + user: |- + Action: Compliance and Requirement Verification + + Focus: Ensuring all task requirements are met + + Guidelines: + - Check all sub-questions are answered + - Verify scope adherence (time/geo/topic) + - Confirm all deliverables are provided + - Validate required methods are used + - Remove out-of-scope content + + Evaluation Impact: + - Primary: **Instruction Following** (requirement adherence) + + Common Pitfalls: + - Missing mandatory sections + - Scope creep + - Wrong timeframe or geography + - Ignoring format specifications + + Output Format: + { + "compliance_checklist": { + "sub_questions": [ + {"question": "...", "status": "answered/partial/missing"} + ], + "scope_verification": { + "temporal": "compliant/violated", + "geographical": "compliant/violated", + "topical": "compliant/violated" + }, + "deliverables": [ + {"required": "...", "status": "provided/missing"} + ] + }, + "violations": ["list of any violations"], + "corrective_actions": ["list of needed fixes"] + } + +# ==================== Code Generation ==================== + +code_generation: + system: |- + You are an expert software engineer specializing in agentic systems and research automation. 
+ You write clean, well-documented, evaluation-aware code. + + user: |- + You are implementing the following hypothesis: + {{ hypothesis }} + + Target Evaluation Dimensions: {{ target_dimensions }} + + ====== Current Codebase ====== + {{ current_code }} + + ====== Implementation Requirements ====== + Your implementation must: + 1. **Maintain/Improve Target Dimensions**: + {% for dim in target_dimensions %} + - {{ dim.name }}: Focus on {{ dim.focus_areas }} + {% endfor %} + + 2. **Code Quality Standards**: + - Clear documentation explaining how code improves target dimensions + - Error handling with informative messages + - Logging for debugging and analysis + - Modular design for maintainability + + 3. **Evaluation-Aware Design**: + - For **Comprehensiveness**: Ensure complete coverage of required topics + - For **Insight**: Include causal reasoning, quantification, synthesis logic + - For **Instruction Following**: Validate all requirements are met + - For **Readability**: Structure output clearly, use proper formatting + + ====== Implementation Guidelines ====== + {{ implementation_guidelines }} + + Please generate code with: + - Comments explaining dimension-specific improvements + - Docstrings describing evaluation impact + - Unit tests for critical functionality + +# ==================== Feedback and Analysis ==================== + +feedback_analysis: + system: |- + You are an expert in experimental analysis and performance evaluation for agentic systems. + Analyze results across multiple dimensions and provide actionable insights. + + user: |- + ====== Experiment Results ====== + Hypothesis: {{ hypothesis }} + Target Dimensions: {{ target_dimensions }} + + ====== Performance Metrics (0-10 scale) ====== + {% if metrics %} + Current vs. Baseline: + - Comprehensiveness: {{ metrics.comprehensiveness.current }} (Δ {{ metrics.comprehensiveness.delta }}) + - Insight: {{ metrics.insight.current }} (Δ {{ metrics.insight.delta }}) + - Instruction Following: {{ metrics.instruction_following.current }} (Δ {{ metrics.instruction_following.delta }}) + - Readability: {{ metrics.readability.current }} (Δ {{ metrics.readability.delta }}) + - Overall Score: {{ metrics.overall.current }} (Δ {{ metrics.overall.delta }}) + + Pairwise Normalized Score: {{ metrics.normalized_score }} + {% endif %} + + ====== Execution Logs ====== + {{ logs }} + + ====== Detailed Dimension Analysis ====== + {% if dimension_feedback %} + {{ dimension_feedback }} + {% endif %} + + ====== Analysis Task ====== + Provide a comprehensive analysis: + + 1. **Success Assessment** (Pass/Fail for each dimension): + - Did we improve target dimension(s)? + - Were there unexpected changes in non-target dimensions? + + 2. **Dimension-Specific Findings**: + For each dimension, explain: + - **Comprehensiveness**: Coverage gaps or improvements + - **Insight**: Quality of reasoning and originality + - **Instruction Following**: Compliance issues or successes + - **Readability**: Clarity and presentation quality + + 3. **Root Cause Analysis**: + - Why did improvements/regressions occur? + - What worked as expected vs. surprises? + + 4. **Trade-off Analysis**: + - Did improving one dimension hurt others? + - Is the trade-off acceptable? + + 5. **Next Steps**: + - Should we iterate on this hypothesis? + - New hypothesis directions based on learnings? + + 6. **Knowledge Update**: + - What general principles did we learn? + - What to avoid in future experiments? 
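For quick local inspection, the Jinja-style prompt entries above can also be rendered outside of RD-Agent's own T() helper. The sketch below is an illustration only: it assumes PyYAML and Jinja2 are installed, that it is run from the repository root, and the hypothesis, dimension list, and log text are made-up placeholder values.

from pathlib import Path

import yaml
from jinja2 import Template

# Load the prompt entries defined in this file and render the feedback_analysis user prompt.
prompts = yaml.safe_load(Path("rdagent/scenarios/agentic_sys/prompts.yaml").read_text())
user_prompt = Template(prompts["feedback_analysis"]["user"]).render(
    hypothesis="Add a source-verification step before synthesis",      # placeholder value
    target_dimensions=["Comprehensiveness", "Instruction Following"],  # placeholder value
    metrics=None,                 # skip the optional metrics block
    logs="(execution logs here)",
    dimension_feedback=None,      # skip the optional per-dimension block
)
print(user_prompt[:400])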
+ +# ==================== Evaluation Rubric ==================== + +evaluation_rubric: + system: |- + You are an expert evaluator trained on the DeepResearch Benchmark rubric. + Apply consistent, evidence-based scoring across all four dimensions. + + user: |- + ====== DeepResearch Benchmark Evaluation Rubric ====== + + Use this rubric to score outputs on each dimension (0-10 continuous): + + **Comprehensiveness (0-10)** + Check: + - [ ] All required subtopics covered + - [ ] Appropriate scope (time/geography/segments) + - [ ] Multiple data sources and evidence cited + - [ ] Balanced perspectives presented + - [ ] No major omissions + + Pitfalls to avoid: + - Ignoring time/geographic constraints + - One-sided coverage + - Missing data/evidence + - Superficial treatment of topics + + **Insight (0-10)** + Check: + - [ ] Causal chains explained (not just correlation) + - [ ] Quantified reasoning where possible + - [ ] Trade-offs and counterfactuals discussed + - [ ] Limitations acknowledged + - [ ] Novel synthesis or frameworks + + Pitfalls to avoid: + - Purely descriptive content + - Platitudes and generic statements + - Untested assertions + - Shallow "pros and cons" lists + + **Instruction Following (0-10)** + Check: + - [ ] All sub-questions answered + - [ ] Scope respected (topic/geo/time) + - [ ] Required deliverables provided + - [ ] Required methods used + - [ ] No out-of-scope content + + Pitfalls to avoid: + - Missing mandatory sections + - Scope drift + - Wrong timeframe or geography + - Ignoring format requirements + + **Readability (0-10)** + Check: + - [ ] Logical structure with clear headings + - [ ] Cohesive flow between sections + - [ ] Precise and concise wording + - [ ] Effective tables/figures + - [ ] Terms defined, formatting consistent + + Pitfalls to avoid: + - Walls of text + - Undefined acronyms + - Noisy or unclear visualizations + - Inconsistent terminology + +# ==================== UI and Display ==================== + +rich_style_description: + system: |- + You are describing the agentic system scenario for display purposes. + + user: |- + ### {{ name }} Agent: Automated Research System for DeepResearch Tasks + + #### [Overview](#_summary) + This scenario focuses on automated research and development of agentic systems + optimized for DeepResearch Benchmark evaluation criteria. + + #### {{ name }} Task Info + Current Task: {{ task_name }} + Task Type: {{ task_type }} + Domain: {{ domain }} + + #### [Evaluation Dimensions](#_metrics) + - **Comprehensiveness** ({{ comprehensiveness_weight }}): Coverage breadth and depth + - **Insight** ({{ insight_weight }}): Causal reasoning and originality + - **Instruction Following** ({{ instruction_weight }}): Task requirement adherence + - **Readability** ({{ readability_weight }}): Clarity and presentation quality + + #### [Automated R&D Loop](#_rdloops) + + - **[R (Research)](#_research)** + - Hypothesis generation targeting evaluation dimensions + - Analysis of dimension-specific performance gaps + - Knowledge construction from scored experiments + + - **[D (Development)](#_development)** + - Code evolution optimizing for target dimensions + - Multi-dimensional performance validation + - Trade-off analysis across dimensions + + #### [Objective](#_summary) + To automatically discover and implement system improvements that maximize + performance across all four DeepResearch evaluation dimensions through + autonomous, dimension-aware research and development cycles. 
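To make the scoring method in the rubric above concrete, here is a small worked example of the weighted overall score and the pairwise normalization. The four dimension scores are invented for illustration; the weights are the defaults quoted in scenario_description (0.25 / 0.30 / 0.25 / 0.20).

# Weighted overall score and pairwise normalization against a reference run.
weights = {
    "comprehensiveness": 0.25,
    "insight": 0.30,
    "instruction_following": 0.25,
    "readability": 0.20,
}
target = {"comprehensiveness": 7.0, "insight": 6.5, "instruction_following": 8.0, "readability": 7.5}
reference = {"comprehensiveness": 6.0, "insight": 7.0, "instruction_following": 7.0, "readability": 8.0}

overall_target = sum(weights[d] * target[d] for d in weights)        # 7.20
overall_reference = sum(weights[d] * reference[d] for d in weights)  # 6.95

# target_normalized = target_score / (target_score + reference_score)
normalized = overall_target / (overall_target + overall_reference)   # ~0.509
print(round(overall_target, 2), round(overall_reference, 2), round(normalized, 3))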
+ +system_prompt_template: + system: |- + You are an advanced agentic system designed for research tasks. + + Your core capabilities: + - Multi-hop reasoning and information synthesis + - Causal analysis and quantitative reasoning + - Structured output generation + - Source verification and citation + + user: |- + Evaluation awareness: + You will be evaluated on four dimensions (0-10 each): + 1. **Comprehensiveness**: Complete coverage, no gaps + 2. **Insight**: Deep analysis, causal thinking, originality + 3. **Instruction Following**: Strict requirement adherence + 4. **Readability**: Clear structure and presentation + + Always optimize for all four dimensions in your responses. + + +# ...existing code... + +# NEW: Hypothesis generation with external knowledge +hypothesis_gen_with_external_knowledge: + system: | + You are an expert AI researcher specializing in agentic systems. + Your task is to generate innovative hypotheses based on: + 1. The scenario description + 2. Previous experimental results + 3. External knowledge from research papers and best practices + + Generate a clear, specific, and testable hypothesis in JSON format. + + user: | + # Scenario + {{ scenario_desc }} + + # Previous Trials + {{ previous_trials }} + + {% if external_knowledge %} + # External Knowledge (from web search) + {% for source in external_knowledge %} + {{ loop.index }}. [{{ source.credibility_level }}] {{ source.title }} + Summary: {{ source.summary }} + URL: {{ source.url }} + {% endfor %} + {% endif %} + + # Task + Generate a hypothesis to improve the agentic system. + Consider the external knowledge and previous results. + + Output format: + { + "hypothesis": "Your hypothesis here", + "reasoning": "Why this hypothesis is promising", + "expected_improvement": "What improvements you expect", + "implementation_approach": "How to implement this", + "external_sources_used": ["List of URLs used"] + } + +# NEW: Code generation with external knowledge +code_gen_with_external_knowledge: + system: | + You are an expert software engineer specializing in agentic systems. + Generate production-quality code based on the hypothesis and external knowledge. + + user: | + # Hypothesis + {{ hypothesis }} + + # External Knowledge Summary + {{ external_knowledge_summary }} + + # High-Credibility Sources + {% for source in high_cred_sources %} + - {{ source.title }}: {{ source.url }} + {% endfor %} + + # Task + Generate complete, working code for: + 1. agent.py - Main agent implementation + 2. evaluator.py - Performance evaluator + 3. train.py - Execution script + + Follow best practices from the external sources. 
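The hypothesis_gen_with_external_knowledge prompt above asks the model for a JSON object with a fixed key set. Below is a minimal sketch of how a caller could validate such a response before turning it into a Hypothesis; the response text is fabricated for illustration and the required key list simply mirrors the output format requested in the prompt.

import json

REQUIRED_KEYS = {
    "hypothesis",
    "reasoning",
    "expected_improvement",
    "implementation_approach",
    "external_sources_used",
}

# Example LLM response (fabricated for illustration only).
response_text = """
{
  "hypothesis": "Adding a citation-verification step improves Comprehensiveness",
  "reasoning": "Low coverage scores correlated with unverified sources in earlier runs",
  "expected_improvement": "Comprehensiveness +1.0, Instruction Following +0.5",
  "implementation_approach": "Insert a verification agent between retrieval and synthesis",
  "external_sources_used": ["https://example.com/agentic-systems-survey"]
}
"""

data = json.loads(response_text)
missing = REQUIRED_KEYS - data.keys()
if missing:
    raise ValueError(f"LLM response is missing keys: {sorted(missing)}")
print(data["hypothesis"])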
\ No newline at end of file
diff --git a/rdagent/scenarios/agentic_sys/proposal.py b/rdagent/scenarios/agentic_sys/proposal.py
new file mode 100644
index 000000000..5a8f63eb1
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/proposal.py
@@ -0,0 +1,1019 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from rdagent.core.experiment import Task
+from rdagent.core.proposal import (
+    ExpGen,
+    Hypothesis,
+    HypothesisGen,
+    Trace,
+    Experiment2Feedback,
+)
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.agentic_sys.exp import AgenticSysExperiment
+from rdagent.scenarios.agentic_sys.scen import AgenticSysScen
+from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool
+from rdagent.utils.agent.tpl import T  # use the T template system to render prompts.yaml
+
+
+class AgenticSysHypothesisGen(HypothesisGen):
+    """
+    Generate hypotheses for agentic system improvements based on the DeepResearch evaluation dimensions.
+    Uses the T() template system to render prompts from prompts.yaml.
+    """
+
+    def __init__(self, scen: AgenticSysScen):
+        super().__init__(scen=scen)
+        self.scen = scen
+
+        # Initialize LLM backend
+        self.api_backend = APIBackend()
+
+        # Initialize web search tool
+        search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+        self.web_search = create_web_search_tool(config_path=search_config_path)
+        # Backing field for the lazy `web_search_tool` property (avoids AttributeError on first access)
+        self._web_search_tool = self.web_search
+
+        logger.info("AgenticSysHypothesisGen initialized with T() template system")
+
+    @property
+    def web_search_tool(self):
+        """Lazily load the web search tool when needed"""
+        if self._web_search_tool is None:
+            try:
+                search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+                if search_config_path.exists():
+                    self._web_search_tool = create_web_search_tool(search_config_path)
+                    logger.info("✓ Web search tool initialized in HypothesisGen")
+                else:
+                    logger.warning(f"Search config not found: {search_config_path}")
+                    self._web_search_tool = False
+            except Exception as e:
+                logger.warning(f"Failed to initialize web search tool: {e}")
+                self._web_search_tool = False
+        return self._web_search_tool if self._web_search_tool is not False else None
+
+    def gen(self, trace: Trace) -> Hypothesis:
+        """
+        Generate hypothesis based on trace history and evaluation dimensions.
+ + Args: + trace: Experiment trace containing history + + Returns: + Hypothesis object with structured hypothesis data + """ + logger.info("Generating hypothesis...") + + # Prepare base context + scenario_desc = trace.scen.get_scenario_all_desc() + previous_trials = self._extract_previous_trials(trace) + + # Optionally enhance with web search + external_knowledge = [] + if self._should_use_web_search(trace): + external_knowledge = self._retrieve_external_knowledge(trace) + + # Generate hypothesis using LLM + system_prompt = self._build_system_prompt() + user_prompt = self._build_user_prompt( + scenario_desc=scenario_desc, + previous_trials=previous_trials, + external_knowledge=external_knowledge + ) + + response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True + ) + + # Parse and return hypothesis + hypothesis = self._parse_hypothesis(response, trace) + + logger.info(f"Generated hypothesis: {hypothesis.hypothesis[:100]}...") + return hypothesis + + + def _should_use_web_search(self, trace: Trace) -> bool: + """Determine if web search should be used""" + # Check if tool is available + if self.web_search_tool is None: + return False + + # Check if service is healthy + if not self.web_search_tool.client.health_check(): + logger.warning("Web search service not healthy") + return False + + # Use for early iterations + iteration = len(trace.hist) + if iteration < 3: + logger.info(f"Early iteration ({iteration}/3), enabling web search") + return True + + # Use if previous performance is low + if trace.hist and hasattr(trace.hist[-1][1], 'overall_score'): + last_score = trace.hist[-1][1].overall_score + if last_score < 6.0: # Threshold for low performance + logger.info(f"Low previous score ({last_score}), enabling web search") + return True + + return False + + def _retrieve_external_knowledge(self, trace: Trace) -> list: + """ + Retrieve external knowledge using web search tool + + Args: + trace: Execution trace + + Returns: + List of external sources + """ + try: + scenario_desc = trace.scen.get_scenario_all_desc() + + # Identify knowledge gaps + knowledge_gaps = self._identify_knowledge_gaps(trace) + + # Prepare search context + search_context = { + 'iteration': len(trace.hist), + 'domain': getattr(trace.scen, 'domain', 'general') + } + + # Call web search tool + logger.info("Retrieving external knowledge via web search...") + external_sources = self.web_search_tool.search_for_hypothesis( + task_description=scenario_desc, + current_gaps=knowledge_gaps, + context=search_context + ) + + logger.info(f"Retrieved {len(external_sources)} external sources") + return external_sources + + except Exception as e: + logger.error(f"Failed to retrieve external knowledge: {e}") + return [] + + def _identify_knowledge_gaps(self, trace: Trace) -> list: + """Identify knowledge gaps from trace history""" + gaps = [] + + if trace.hist: + last_feedback = trace.hist[-1][1] + + # Check which dimensions performed poorly + if hasattr(last_feedback, 'dimension_feedback'): + for dim, feedback in last_feedback.dimension_feedback.items(): + if hasattr(feedback, 'score') and feedback.score < 6.0: + gaps.append(f"improve {dim}") + + # Default gaps if none identified + if not gaps: + gaps = [ + "agentic system best practices", + "system design patterns", + "performance optimization" + ] + + return gaps[:5] + + def _extract_previous_trials(self, trace: Trace) -> str: + """Extract previous trials from trace""" + if not trace.hist: + return "No 
previous trials" + + trials = [] + for exp, feedback in trace.hist[-3:]: # Last 3 trials + trial_summary = { + 'hypothesis': getattr(exp, 'hypothesis', 'N/A'), + 'result': getattr(feedback, 'decision', 'N/A'), + 'score': getattr(feedback, 'overall_score', 0.0) + } + trials.append(trial_summary) + + return str(trials) + + def _build_system_prompt(self) -> str: + """Build system prompt for hypothesis generation""" + return """You are an expert AI researcher specializing in agentic systems. +Your task is to generate innovative hypotheses for improving agentic system performance. + +Consider: +1. Previous experimental results +2. External knowledge from research papers and best practices +3. Novel approaches and methodologies +4. Feasibility and implementability + +Generate a clear, specific, and testable hypothesis.""" + + def _build_user_prompt( + self, + scenario_desc: str, + previous_trials: str, + external_knowledge: list + ) -> str: + """Build user prompt with all context""" + prompt = f"""# Scenario +{scenario_desc} + +# Previous Trials +{previous_trials} +""" + + if external_knowledge: + prompt += "\n# External Knowledge\n" + for idx, source in enumerate(external_knowledge[:5], 1): + prompt += f"\n{idx}. [{source['credibility_level']}] {source['title']}\n" + prompt += f" Summary: {source['summary'][:150]}...\n" + prompt += f" URL: {source['url']}\n" + + prompt += "\n# Task\nGenerate a hypothesis to improve the agentic system." + + return prompt + + def _parse_hypothesis(self, response: str, trace: Trace) -> Hypothesis: + """Parse LLM response into Hypothesis object""" + # Simplified parsing - in real implementation, use structured output + hypothesis_text = response.strip() + + hypothesis = Hypothesis( + hypothesis=hypothesis_text, + reason="Generated based on scenario and previous results", + concise_reason="Improve system performance", + concise_observation="", + concise_justification="", + concise_knowledge="" + ) + + return hypothesis + + def prepare_context(self, trace: Trace): + """ + Prepare context for hypothesis generation from trace history. + + KEY METHOD: Uses T() template system like Kaggle scenario + + Args: + trace: Experiment trace + + Returns: + Tuple of (context dictionary, is_first_experiment flag) + """ + is_first_experiment = not (hasattr(trace, 'hist') and trace.hist) + + # Use T() to render hypothesis_and_feedback prompt + hypothesis_and_feedback = ( + T("scenarios.agentic_sys.prompts:hypothesis_and_feedback").r( + trace=trace, + history_window=10, + most_successful_action=self._get_most_successful_action(trace), + most_improved_dimension=self._get_most_improved_dimension(trace), + persistent_weaknesses=self._get_persistent_weaknesses(trace), + effective_strategies=self._get_effective_strategies(trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." + ) + + context = { + "is_first_experiment": is_first_experiment, + "current_system_description": self._get_system_description(trace), + "experiment_history": hypothesis_and_feedback, # 使用渲染后的提示词 + "performance_gaps": self._identify_performance_gaps(trace), + "current_scores": self._extract_current_scores(trace), + } + + return context, is_first_experiment + + def prepare_rag_context(self, trace: Trace): + """ + Prepare RAG (Retrieval-Augmented Generation) context. + + Uses T() template system for RAG prompt rendering. 
+
+        Args:
+            trace: Experiment trace
+
+        Returns:
+            Dictionary with RAG context
+        """
+        # Retrieve knowledge sources
+        insights = self.retrieve_cross_task_insights()
+        experiences = self.retrieve_current_task_experiences(trace)
+        external_sources = self.retrieve_external_sources(trace)
+
+        # Render RAG prompt if sources are available
+        rag_prompt = ""
+        if insights or experiences or external_sources:
+            try:
+                rag_prompt = T("scenarios.agentic_sys.prompts:KG_hypothesis_gen_RAG").r(
+                    insights=insights,
+                    experiences=experiences,
+                    external_sources=external_sources
+                )
+            except Exception as e:
+                logger.warning(f"Failed to render KG_hypothesis_gen_RAG: {e}")
+
+        return {
+            "insights": insights,
+            "experiences": experiences,
+            "external_sources": external_sources,
+            "rag_prompt": rag_prompt  # rendered RAG prompt
+        }
+
+    def generate_hypothesis_with_llm(
+        self,
+        context: Dict[str, Any],
+        rag_context: Dict[str, Any],
+        trace: Trace
+    ) -> Dict[str, Any]:
+        """
+        Generate hypothesis using LLM with prompts from prompts.yaml.
+
+        Uses T() template system to render all prompts.
+
+        Args:
+            context: Context dictionary
+            rag_context: RAG context dictionary
+            trace: Experiment trace
+
+        Returns:
+            Parsed hypothesis data dictionary
+        """
+        # Step 1: Build system prompt using T()
+        try:
+            system_prompt = T("scenarios.agentic_sys.prompts:hypothesis_generation").s()
+            logger.info("Rendered hypothesis_generation system prompt")
+        except Exception as e:
+            logger.warning(f"Failed to render hypothesis_generation system prompt: {e}")
+            system_prompt = """You are an expert in agentic system optimization and research automation. Your task is to propose hypotheses to improve the system's performance on DeepResearch evaluation dimensions."""
+
+        # Step 2: Build user prompt using T()
+        user_prompt = self.build_user_prompt_with_t(context, rag_context, trace)
+
+        # Step 3: Call LLM
+        logger.info("Calling LLM for hypothesis generation...")
+        response = self.api_backend.build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=system_prompt,
+            json_mode=True
+        )
+
+        # Step 4: Parse JSON response
+        try:
+            hypothesis_data = json.loads(response)
+            logger.info("Successfully parsed hypothesis JSON")
+            return hypothesis_data
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse hypothesis JSON: {e}")
+            logger.error(f"Response: {response}")
+            return self.get_fallback_hypothesis(context)
+
+    def build_user_prompt_with_t(
+        self,
+        context: Dict[str, Any],
+        rag_context: Dict[str, Any],
+        trace: Trace
+    ) -> str:
+        """
+        Build user prompt using T() template system.
+
+        KEY METHOD: Shows how to use T() to render and combine multiple prompts.
+
+        Pattern:
+        1. T("path:prompt_name").r(**variables) - Render user part
+        2. T("path:prompt_name").s(**variables) - Render system part (if needed)
+        3.
Combine multiple rendered prompts with "\n\n" + + Args: + context: Context dictionary + rag_context: RAG context dictionary + trace: Experiment trace + + Returns: + Complete user prompt string + """ + prompt_parts = [] + + # Part 1: Task background (user part) + try: + task_bg_user = T("scenarios.agentic_sys.prompts:task_background").r( + task_type=getattr(self.scen, 'task_type', 'Research Automation'), + domain=getattr(self.scen, 'domain', 'Agentic Systems'), + brief_description=getattr(self.scen, 'description', 'Automated research system'), + scope_requirements=getattr(self.scen, 'scope', 'N/A'), + required_deliverables=getattr(self.scen, 'deliverables', 'N/A'), + comprehensiveness_focus=getattr(self.scen, 'comprehensiveness_focus', 'Complete coverage'), + insight_focus=getattr(self.scen, 'insight_focus', 'Deep analysis'), + instruction_focus=getattr(self.scen, 'instruction_focus', 'Strict adherence'), + readability_focus=getattr(self.scen, 'readability_focus', 'Clear presentation') + ) + prompt_parts.append(task_bg_user) + except Exception as e: + logger.warning(f"Failed to render task_background: {e}") + prompt_parts.append(f"""Task Type: {getattr(self.scen, 'task_type', 'Research Automation')} +Domain: {getattr(self.scen, 'domain', 'Agentic Systems')} +Brief Description: {getattr(self.scen, 'description', 'Automated research system')}""") + + # Part 2: RAG context (if available) + if rag_context.get("rag_prompt"): + prompt_parts.append(rag_context["rag_prompt"]) + logger.info("Added RAG context") + + # Part 3: Main hypothesis generation instruction + try: + hypothesis_gen = T("scenarios.agentic_sys.prompts:hypothesis_generation").r( + current_system_description=context["current_system_description"], + current_comprehensiveness=context["current_scores"]["comprehensiveness"], + current_insight=context["current_scores"]["insight"], + current_instruction_following=context["current_scores"]["instruction_following"], + current_readability=context["current_scores"]["readability"], + experiment_history=context["experiment_history"], + performance_gaps=context["performance_gaps"] + ) + prompt_parts.append(hypothesis_gen) + logger.info("Rendered hypothesis_generation user prompt") + except Exception as e: + logger.error(f"Failed to render hypothesis_generation: {e}") + raise + + # Part 4: Output format specification + try: + output_format = T("scenarios.agentic_sys.prompts:hypothesis_output_format").r() + prompt_parts.append(output_format) + except Exception as e: + logger.warning(f"Failed to render hypothesis_output_format: {e}") + + # Combine all parts + full_prompt = "\n\n".join(prompt_parts) + + return full_prompt + + # ==================== Helper Methods for Context Preparation ==================== + + def get_most_successful_action(self, trace: Trace) -> str: + """Get most successful action type from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + action_success = {} + for exp, feedback in trace.hist: + action_type = getattr(exp, 'action_type', 'Unknown') + if getattr(feedback, 'decision', False): + action_success[action_type] = action_success.get(action_type, 0) + 1 + + return max(action_success, key=action_success.get) if action_success else "N/A" + + def get_most_improved_dimension(self, trace: Trace) -> str: + """Get most improved dimension from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + dimension_improvements = { + "comprehensiveness": 0, + "insight": 0, + "instruction_following": 0, + "readability": 0 
+ } + + for exp, feedback in trace.hist: + for dim in dimension_improvements.keys(): + delta_attr = f"{dim}_delta" + if hasattr(feedback, delta_attr): + delta = getattr(feedback, delta_attr, 0) + if delta > 0: + dimension_improvements[dim] += delta + + return max(dimension_improvements, key=dimension_improvements.get) + + def get_persistent_weaknesses(self, trace: Trace) -> str: + """Identify persistent weaknesses from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + weaknesses = [] + if trace.hist: + _, last_feedback = trace.hist[-1] + for dim in ["comprehensiveness", "insight", "instruction_following", "readability"]: + score_attr = f"{dim}_score" + if hasattr(last_feedback, score_attr): + score = getattr(last_feedback, score_attr, 0) + if score < 6.0: + weaknesses.append(f"{dim} (score: {score:.1f})") + + return ", ".join(weaknesses) if weaknesses else "None identified" + + def get_effective_strategies(self, trace: Trace) -> str: + """Get effective strategies from trace history""" + most_successful = self._get_most_successful_action(trace) + if most_successful != "N/A": + return f"{most_successful} action type has been most successful" + return "No clear pattern yet" + + def get_system_description(self, trace: Trace) -> str: + """Get current system description from trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return "No previous system implementation. Starting from baseline." + + last_exp, last_feedback = trace.hist[-1] + + description = f"Current system status:\n" + description += f"- Last hypothesis: {getattr(last_exp, 'hypothesis', 'N/A')}\n" + description += f"- Last feedback: {getattr(last_feedback, 'reason', 'N/A')[:200]}\n" + description += f"- Success rate: {self._calculate_success_rate(trace):.1%}\n" + + return description + + def identify_performance_gaps(self, trace: Trace) -> str: + """Identify performance gaps from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "Initial baseline establishment needed. Focus on core functionality." + + gaps = [] + + # Analyze recent failures + failed_experiments = [ + (exp, fb) for exp, fb in trace.hist[-5:] + if not getattr(fb, 'decision', False) + ] + + if failed_experiments: + gaps.append(f"- {len(failed_experiments)} recent failures indicate instability") + + # Check success rate + success_rate = self._calculate_success_rate(trace) + if success_rate < 0.5: + gaps.append(f"- Low success rate ({success_rate:.1%}) requires fundamental improvements") + elif success_rate < 0.8: + gaps.append(f"- Moderate success rate ({success_rate:.1%}) suggests refinement opportunities") + + return "\n".join(gaps) if gaps else "System performing well. Focus on advanced optimizations." 
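The trace-analysis helpers above rely on an attribute convention for the feedback objects stored in trace.hist: a boolean decision plus per-dimension <dim>_score and <dim>_delta fields. The sketch below illustrates that contract with SimpleNamespace stand-ins; the concrete experiment/feedback classes are not shown in this patch, so the field values here are assumptions for illustration only.

```python
# Illustration only: fake trace entries that satisfy the attribute contract used above.
from types import SimpleNamespace

from rdagent.scenarios.agentic_sys.proposal import AgenticSysHypothesisGen

feedback = SimpleNamespace(
    decision=True,
    comprehensiveness_score=5.5, insight_score=7.0,
    instruction_score=8.0, readability_score=6.5,
    insight_delta=1.2, readability_delta=0.3,  # only improved dimensions carry deltas here
)
exp = SimpleNamespace(hypothesis="Add a reflection step", action_type="Information_Gathering")
trace = SimpleNamespace(hist=[(exp, feedback)])

# Bypass __init__ for the sketch; these helpers do not touch instance state.
gen = AgenticSysHypothesisGen.__new__(AgenticSysHypothesisGen)

print(gen.get_most_improved_dimension(trace))  # -> "insight"
print(gen.get_persistent_weaknesses(trace))    # -> "comprehensiveness (score: 5.5)"
```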
+ + def extract_current_scores(self, trace: Trace) -> Dict[str, Optional[float]]: + """Extract current dimension scores from latest feedback""" + if not hasattr(trace, 'hist') or not trace.hist: + return { + "comprehensiveness": None, + "insight": None, + "instruction_following": None, + "readability": None + } + + _, last_feedback = trace.hist[-1] + + return { + "comprehensiveness": getattr(last_feedback, 'comprehensiveness_score', None), + "insight": getattr(last_feedback, 'insight_score', None), + "instruction_following": getattr(last_feedback, 'instruction_score', None), + "readability": getattr(last_feedback, 'readability_score', None) + } + + def calculate_success_rate(self, trace: Trace) -> float: + """Calculate success rate from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return 0.0 + + success_count = sum( + 1 for _, fb in trace.hist + if getattr(fb, 'decision', False) + ) + + return success_count / len(trace.hist) + + def extract_concise_observation(self, trace: Trace) -> str: + """Extract concise observation from trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return "Starting baseline implementation" + + _, last_feedback = trace.hist[-1] + observations = getattr(last_feedback, 'observations', '') + + if observations: + first_sentence = observations.split('.')[0] + return first_sentence[:100] + "..." if len(first_sentence) > 100 else first_sentence + + return "Previous experiment completed" + + # ==================== RAG Methods ==================== + + def retrieve_cross_task_insights(self) -> List[Dict[str, Any]]: + """Retrieve insights from other similar tasks""" + # TODO: Implement actual knowledge base retrieval + return [] + + def retrieve_current_task_experiences(self, trace: Trace) -> List[Dict[str, Any]]: + """Retrieve relevant experiences from current task's trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return [] + + experiences = [] + for exp, fb in trace.hist[-5:]: + experiences.append({ + "hypothesis": getattr(exp, 'hypothesis', 'N/A'), + "approach": getattr(exp, 'action_type', 'N/A') if hasattr(exp, 'action_type') else 'N/A', + "improved_dims": self._extract_improved_dimensions(fb), + "lessons": getattr(fb, 'reason', 'N/A')[:200] + }) + + return experiences + + def retrieve_external_sources(self, trace: Trace) -> List[Dict[str, Any]]: + """Retrieve external sources + Args: + trace: Experiment trace + Returns: + List of external source dictionaries + """ + + #check if web search is available + if not self.web_search.client_health_check(): + logger.warning("SearxNG service unavailable. 
Skipping external search") + return [] + #prepare search content + task_description = getattr(self.scen, 'description', 'Automated research system') + knowledge_gaps = self._identify_performance_gaps(trace) + context = { + "weak_dimension": self._get_most_improved_dimension(trace), + "methodology": getattr(self.scen, 'task_type', '') + } + try: + #perform web search + external_sources = self.web_search.search_for_hypothesis( + task_description = task_description, + current_gaps = knowledge_gaps, + context = context, + ) + + logger.info(f"Retrieved {len(external_sources)} external sources") + return external_sources + except Exception as e: + logger.error(f"Failed to retrieve external sources: {e}") + return [] + + def identify_knowledge_gaps(self, trace): + """ + Identify knowledge gaps from trace history for external search + Args: + trace: Experiment trace + Returns: + List of knowledge gap descriptions + """ + gaps = [] + if not hasattr(trace, 'hist') or not trace.hist: + gaps.append("baseline system design") + gaps.append("evaluation metrics implementation") + return gaps + #analyze recent failures + for exp, feedback in trace.hist[-3:]: + if not getattr(feedback, 'decision', False): + reason = getattr(feedback, 'reason', '') + if 'error' in reason.lower(): + gaps.append("error handling strategies") + if 'coverage' in reason.lower(): + gaps.append("comprehensive task coverage techniques") + if 'insight' in reason.lower(): + gaps.append("methods to enhance insight generation") + + #check dimension scores + if hasattr(trace, 'hist') and trace.hist: + _, last_feedback = trace.hist[-1] + dimensions = { + 'comprehensiveness': getattr(last_feedback, 'comprehensiveness_score', 0), + 'insight': getattr(last_feedback, 'insight_score', 0), + 'instruction_following': getattr(last_feedback, 'instruction_score', 0), + 'readability': getattr(last_feedback, 'readability_score', 0) + } + #identify low scoring dimensions + for dim, score in dimensions.items(): + if score and score < 6.0: + gaps.append(f"improving {dim} techniques") + + return gaps if gaps else ["general agentic system optimization"] + + + def get_weak_dimension(self, trace): + """ + get the weakest evaluation dimension from trace history + """ + if not hasattr(trace, 'hist') or not trace.hist: + return None + _, last_feedback = trace.hist[-1] + dimensions = { + "comprehensiveness": getattr(last_feedback, 'comprehensiveness_score', 10), + "insight": getattr(last_feedback, 'insight_score', 10), + "instruction_following": getattr(last_feedback, 'instruction_score', 10), + "readability": getattr(last_feedback, 'readability_score', 10) + } + + if dimensions: + weakest = min(dimensions, key = lambda x: x[1]) + return weakest[0] + + return None + + + + + def extract_improved_dimensions(self, feedback) -> List[str]: + """Extract which dimensions improved from feedback""" + improved = [] + + for dim in ["comprehensiveness", "insight", "instruction_following", "readability"]: + delta_attr = f"{dim}_delta" + if hasattr(feedback, delta_attr) and getattr(feedback, delta_attr, 0) > 0: + improved.append(dim.replace("_", " ").title()) + + return improved if improved else ["None"] + + def get_fallback_hypothesis(self, context: Dict[str, Any]) -> Dict[str, Any]: + """Get fallback hypothesis when LLM parsing fails""" + return { + "action": "Information_Gathering", + "hypothesis": "Improve system based on previous feedback", + "target_dimensions": [ + { + "name": "Comprehensiveness", + "current_score": context["current_scores"]["comprehensiveness"] or 
0.0, + "target_score": (context["current_scores"]["comprehensiveness"] or 0.0) + 1.0, + "expected_improvement": 1.0, + "confidence": "Low" + } + ], + "current_gap": "Unable to generate structured hypothesis", + "rationale": "LLM response parsing failed. Using fallback hypothesis.", + "implementation_plan": { + "step_1": "Review previous feedback", + "step_2": "Implement basic improvements", + "step_3": "Validate changes" + }, + "risk_assessment": { + "potential_negative_impacts": [], + "mitigation_strategies": ["Incremental changes", "Thorough testing"] + }, + "success_criteria": { + "primary": "System runs without errors", + "secondary": ["Performance maintained or improved"], + "validation_method": "Manual verification" + }, + "concise_knowledge": "When LLM parsing fails, use incremental improvements" + } + + +class AgenticSysExpGen(ExpGen): + """Generate experiment based on hypothesis""" + + def __init__(self, scen: AgenticSysScen): + self.scen = scen + self.api_backend = APIBackend() + logger.info("AgenticSysExpGen initialized with T() template system") + + def gen(self, trace: Trace) -> AgenticSysExperiment: + """ + Generate experiment based on trace and hypothesis. + + Uses T() template system for task description generation. + + Args: + trace: Experiment trace + + Returns: + AgenticSysExperiment object + """ + logger.info("Generating experiment from hypothesis...") + + # Step 1: Get hypothesis from trace + hypothesis = self.get_latest_hypothesis(trace) + + # Step 2: Generate task description using T() + task_desc = self.generate_task_description_with_t(hypothesis, trace) + + # Step 3: Create experiment + main_task = Task(task_desc) + experiment = AgenticSysExperiment( + sub_tasks=[main_task] + ) + + # Step 4: Attach hypothesis and metadata + if hypothesis: + experiment.hypothesis = hypothesis.hypothesis + experiment.action_type = getattr(hypothesis, 'action_type', 'Information_Gathering') + experiment.target_dimensions = getattr(hypothesis, 'target_dimensions', []) + experiment.implementation_plan = getattr(hypothesis, 'implementation_plan', {}) + experiment.hypothesis_obj = hypothesis + else: + experiment.hypothesis = "Baseline implementation" + experiment.action_type = "Information_Gathering" + + logger.info(f"Generated experiment with action type: {experiment.action_type}") + + return experiment + + def get_latest_hypothesis(self, trace: Trace) -> Optional[Hypothesis]: + """Get the latest hypothesis from trace""" + if hasattr(trace, 'hypothesis') and trace.hypothesis: + return trace.hypothesis + + if hasattr(trace, 'hist') and trace.hist: + last_exp, _ = trace.hist[-1] + if hasattr(last_exp, 'hypothesis_obj'): + return last_exp.hypothesis_obj + + return None + + def generate_task_description_with_t( + self, + hypothesis: Optional[Hypothesis], + trace: Trace + ) -> str: + """ + Generate task description using T() template system. + + KEY METHOD: Shows how to use action-specific prompts with T(). 
+ + Args: + hypothesis: Hypothesis object + trace: Experiment trace + + Returns: + Task description string + """ + is_first_experiment = not (hasattr(trace, 'hist') and trace.hist) + + # First experiment: baseline task + if is_first_experiment: + return self.get_baseline_task() + + # No hypothesis: fallback + if not hypothesis: + return self.get_improvement_task_fallback(trace) + + # Generate task based on action type using T() + action_type = getattr(hypothesis, 'action_type', 'Information_Gathering') + + try: + # Use T() to render action-specific specification + action_spec = T(f"scenarios.agentic_sys.prompts:hypothesis_specification.{action_type}").r() + + # Build complete task description + task_desc = f"""Action: {action_type} + +Hypothesis: {hypothesis.hypothesis} + +Target Dimensions: +{self.format_target_dimensions(getattr(hypothesis, 'target_dimensions', []))} + +Implementation Plan: +{self.format_implementation_plan(getattr(hypothesis, 'implementation_plan', {}))} + +====== Action-Specific Guidelines ====== +{action_spec} + +====== Success Criteria ====== +{self.format_success_criteria(getattr(hypothesis, 'success_criteria', {}))} + +====== Risk Assessment ====== +{self.format_risk_assessment(getattr(hypothesis, 'risk_assessment', {}))} +""" + return task_desc + + except Exception as e: + logger.warning(f"Failed to use T() for action specification: {e}") + return self._get_improvement_task_fallback(trace) + + # ==================== Formatting Helper Methods ==================== + + def format_target_dimensions(self, target_dimensions: List[Dict]) -> str: + """Format target dimensions""" + if not target_dimensions: + return "- No specific dimension targets" + + lines = [] + for dim in target_dimensions: + name = dim.get('name', 'Unknown') + current = dim.get('current_score', 'N/A') + target = dim.get('target_score', 'N/A') + improvement = dim.get('expected_improvement', 'N/A') + confidence = dim.get('confidence', 'N/A') + + lines.append(f"- {name}: {current} → {target} (Δ{improvement}, confidence: {confidence})") + + return "\n".join(lines) + + def format_implementation_plan(self, plan: Dict) -> str: + """Format implementation plan""" + if not plan: + return "- No specific implementation plan" + + lines = [] + for key, value in plan.items(): + lines.append(f"- {key}: {value}") + + return "\n".join(lines) + + def format_success_criteria(self, criteria: Dict) -> str: + """Format success criteria""" + if not criteria: + return "- Complete implementation without errors" + + lines = [] + + primary = criteria.get('primary', None) + if primary: + lines.append(f"- Primary: {primary}") + + secondary = criteria.get('secondary', []) + if secondary: + lines.append("- Secondary:") + for criterion in secondary: + lines.append(f" * {criterion}") + + validation = criteria.get('validation_method', None) + if validation: + lines.append(f"- Validation: {validation}") + + return "\n".join(lines) if lines else "- Complete implementation without errors" + + def format_risk_assessment(self, risk_assessment: Dict) -> str: + """Format risk assessment""" + if not risk_assessment: + return "- No specific risks identified" + + lines = [] + + negative_impacts = risk_assessment.get('potential_negative_impacts', []) + if negative_impacts: + lines.append("Potential Negative Impacts:") + for impact in negative_impacts: + if isinstance(impact, dict): + dimension = impact.get('dimension', 'Unknown') + reason = impact.get('reason', 'N/A') + severity = impact.get('severity', 'N/A') + lines.append(f" - {dimension}: 
{reason} (Severity: {severity})") + else: + lines.append(f" - {impact}") + + mitigations = risk_assessment.get('mitigation_strategies', []) + if mitigations: + lines.append("\nMitigation Strategies:") + for strategy in mitigations: + lines.append(f" - {strategy}") + + return "\n".join(lines) if lines else "- No specific risks identified" + + def get_baseline_task(self) -> str: + """Get baseline task description for first experiment""" + competition = getattr(self.scen, "competition", 'general') if self.scen else 'general' + + return f"""Design and implement a baseline agentic system for {competition}. + +Requirements: +1. Create an AgenticSystem class for autonomous research task execution +2. Implement task execution with performance monitoring +3. Include metrics collection for DeepResearch dimensions: + - Comprehensiveness, Insight, Instruction Following, Readability +4. Add error handling and logging +5. Output results in structured JSON format + +Target Scores: Comprehensiveness ≥6.0, Insight ≥5.0, Instruction Following ≥7.0, Readability ≥6.0 +""" + + def get_improvement_task_fallback(self, trace: Trace) -> str: + """Fallback task generation when hypothesis unavailable""" + if not hasattr(trace, 'hist') or not trace.hist: + return self._get_baseline_task() + + last_exp, last_feedback = trace.hist[-1] + + decision = getattr(last_feedback, 'decision', None) + base_desc = "Enhance successful system" if decision else "Fix issues in previous implementation" + + feedback_reason = getattr(last_feedback, 'reason', 'No feedback')[:200] + + return f"""{base_desc} + +Previous feedback: {feedback_reason} + +Focus on improving lowest-scoring dimension. + +Current Scores: +{self.format_current_scores(last_feedback)} +""" + + def format_current_scores(self, feedback) -> str: + """Format current dimension scores""" + scores = { + "Comprehensiveness": getattr(feedback, 'comprehensiveness_score', 'N/A'), + "Insight": getattr(feedback, 'insight_score', 'N/A'), + "Instruction Following": getattr(feedback, 'instruction_score', 'N/A'), + "Readability": getattr(feedback, 'readability_score', 'N/A') + } + + return "\n".join(f"- {dim}: {score}" for dim, score in scores.items()) \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/research_task.py b/rdagent/scenarios/agentic_sys/research_task.py new file mode 100644 index 000000000..d2235f6f1 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/research_task.py @@ -0,0 +1,212 @@ +""" +DeepResearch Bench Dataset Loader for Agentic System +""" +import json +import logging +from pathlib import Path +from typing import Dict, List, Any, Optional +import requests +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +@dataclass +class ResearchTask: + """ + Research task from DeepResearch Bench + """ + task_id: str + title: str + description: str + domain: str + difficulty: str + evaluation_metrics: Dict[str, Any] + input_data: Optional[Dict] = None + expected_output: Optional[Dict] = None + metadata: Optional[Dict] = None + +class DeepResearchBenchLoader: + """ + Load and manage DeepResearch Bench loader + """ + def __init__(self, data_path, cache_dir): + """ + Initialize DeepResearch Bench Loader + + Args: + data_path: Path to local dataset (if already download) + cache_dir: Directory to cache downloaded data + """ + self.data_path = data_path + self.cache_dir = cache_dir + self.cache_dir.mkdir(parents = True, exist_ok = True) + self.tasks: List[ResearchTask] = [] + + def load_dataset(self, subset:str): + """ + Load 
DeepResearch Bench dataset + Args: + subset: Dataset subset to load (e.g., 'easy', 'medium', 'hard') + """ + logger.info(f"Loading DeepResearch Bench dataset (subset={subset})") + if self.data_path and self.data_path.exists(): + #load from local path + self.tasks = self.load_from_local(self.data_path, subset) + else: + #download from remote + self.tasks = self.download_and_load(subset) + logger.info(f"Loaded {len(self.tasks)} tasks from DeepResearch Bench (subset={subset})") + return self.tasks + + def load_from_local(self, data_path: Path, subset: str): + "load dataset from local path" + tasks = [] + #assume JSON format + json_files = list(data_path.glob(".json")) + + for json_file in json_files: + try: + with open(json_file, 'r') as f: + data = json.load(f) + + #Parse task data + if isinstance(data,list): + for item in data: + task = self.parse_task(item) + if self.matches_subset(task, subset): + tasks.append(task) + else: + task = self.parse_task(data) + if self.matches_subset(task, subset): + tasks.append(task) + except Exception as e: + logger.error(f"Failed to load task from {json_file}: {e}") + return tasks + + def download_and_load(self, subset): + "download dataset from DeepResearch Bench and load" + base_url = "https://github.com/Ayanami0730/deep_research_bench" + + tasks = [] + cache_file = self.cache_dir / f"tasks_{subset}.json" + + #Check cache first + if cache_file.exists(): + logger.info(f"Loading from cache: {cache_file}") + with open(cache_file, 'r') as f: + cached_data = json.load(f) + return [self.parse_task(item) for item in cached_data] + try: + #Download task list + logger.info(f"Downloading tasks from {base_url}") + response = requests.get(f"{base_url}/tree/main/data/{subset}_data/{subset}.jsonl",timeout=30) + response.raise_for_status() + data = response.json() + tasks_data = data.get('tasks', []) + + #Parse tasks + for item in tasks_data: + task = self.parse_task(item) + tasks.append(task) + + #Cache downloaded data + with open(cache_file, 'w') as f: + json.dump(tasks_data, f) + + logger.info(f"Downloaded and cached {len(tasks)} tasks") + except Exception as e: + logger.error(f"Failed to download dataset: {e}") + tasks = self.create_mock_tasks(subset) + + return tasks + + def parse_task(self, data: Dict) -> ResearchTask: + """Parse task data into ResearchTask object""" + return ResearchTask( + task_id=data.get('id', 'unknown'), + title=data.get('title', ''), + description=data.get('description', ''), + domain=data.get('domain', 'general'), + difficulty=data.get('difficulty', 'medium'), + evaluation_criteria=data.get('evaluation_criteria', {}), + input_data=data.get('input_data'), + expected_output=data.get('expected_output'), + metadata=data.get('metadata', {}) + ) + + def matches_subset(self, task: ResearchTask, subset: str) -> bool: + """Check if task matches requested subset""" + if subset == "all": + return True + return task.difficulty.lower() == subset.lower() + + def create_mock_tasks(self, subset: str) -> List[ResearchTask]: + """Create mock tasks for testing when download fails""" + logger.warning("Creating mock tasks for testing") + + mock_tasks = [ + { + 'id': 'mock_001', + 'title': 'Literature Review Synthesis', + 'description': 'Synthesize findings from multiple research papers', + 'domain': 'research_synthesis', + 'difficulty': 'medium', + 'evaluation_criteria': { + 'completeness': 0.3, + 'coherence': 0.3, + 'accuracy': 0.4 + }, + 'input_data': { + 'papers': ['paper1.pdf', 'paper2.pdf', 'paper3.pdf'], + 'query': 'What are the main findings on topic 
X?' + } + }, + { + 'id': 'mock_002', + 'title': 'Hypothesis Generation', + 'description': 'Generate research hypotheses based on existing literature', + 'domain': 'hypothesis_generation', + 'difficulty': 'hard', + 'evaluation_criteria': { + 'novelty': 0.4, + 'feasibility': 0.3, + 'clarity': 0.3 + } + }, + { + 'id': 'mock_003', + 'title': 'Experiment Design', + 'description': 'Design an experiment to test a given hypothesis', + 'domain': 'experiment_design', + 'difficulty': 'easy', + 'evaluation_criteria': { + 'validity': 0.4, + 'completeness': 0.3, + 'practicality': 0.3 + } + } + ] + + return [self._parse_task(task) for task in mock_tasks + if self._matches_subset(self._parse_task(task), subset)] + + def get_task_by_id(self, task_id: str) -> Optional[ResearchTask]: + """Get specific task by ID""" + for task in self.tasks: + if task.task_id == task_id: + return task + return None + + def get_tasks_by_domain(self, domain: str): + """Get tasks filtered by domain""" + return [task for task in self.tasks if task.domain == domain] + + def get_tasks_by_difficulty(self, difficulty: str): + """Get tasks filtered by difficulty""" + return [task for task in self.tasks + if task.difficulty.lower() == difficulty.lower()] + + + + + diff --git a/rdagent/scenarios/agentic_sys/scen.py b/rdagent/scenarios/agentic_sys/scen.py new file mode 100644 index 000000000..0b9fdecbb --- /dev/null +++ b/rdagent/scenarios/agentic_sys/scen.py @@ -0,0 +1,299 @@ +from typing import Any, Dict, Optional +from rdagent.core.experiment import Task +from rdagent.core.scenario import Scenario +from rdagent.scenarios.agentic_sys.evaluator import DeepResearchEvaluator, EvaluationResult + +#define experiment scenario +#scenario abstraction for agentic system development +#support different competition contexts +class AgenticSysScen(Scenario): + def __init__(self, competition: str,evaluation_weights: Optional[Dict[str, float]] = None) -> None: + self.competition = competition + + #Initialize DeepResearch Bench evaluator + self.evaluator = DeepResearchEvaluator(dimension_weights=evaluation_weights) + + # Set competition-specific evaluation weights + self.evaluation_weights = evaluation_weights or { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + + + # Implement dummy functions for the abstract methods in Scenario + @property + def background(self) -> str: + """Background information""" + background_template = { + "deepresearch": "Advanced AI agent research focusing on autonomous reasoning and complex problem solving", + "tool_usage": "Development of agents with sophisticated tool usage and API integration capabilities", + "multi_agent": "Multi-agent systems with coordination, communication, and collaborative task execution", + "planning": "Agent planning systems with strategic thinking and multi-step task decomposition", + "general": "General-purpose agentic system development with broad task handling capabilities" + } + base_desc = background_template.get(self.competition, f"Agentic system development for {self.competition}") + + evaluation_info = f""" + + Evaluation Framework: DeepResearch Bench Standards + - Comprehensiveness (weight: {self.evaluator.weights['comprehensiveness']:.2f}): Breadth and depth of coverage + - Insight (weight: {self.evaluator.weights['insight']:.2f}): Causal reasoning and originality + - Instruction Following (weight: {self.evaluator.weights['instruction_following']:.2f}): Task compliance + - Readability (weight: 
{self.evaluator.weights['readability']:.2f}): Clarity and presentation + """ + + return f"""Competition: {self.competition},Objective: {base_desc}, Focus: Create autonomous AI agents that can execute complex tasks with minimal human intervention. +Key requirements include task planning, execution monitoring, error handling, and performance optimization. {evaluation_info}""" + + + #running environment description and standards + def get_runtime_environment(self) -> str: + """Get the runtime environment information""" + return f"""Runtime Environment for competition {self.competition}: + Base Requirements: + - Python 3.8+ execution environment + - JSON serialization support for results + - File I/O capabilities for workspace management + - Standard Library access + + Agent Framework: + - Task execution and monitoring system + - Performance metrics collection module (success rate, average time, error count) + - Error handling and logging mechanisms + - Structured output format (JSON) + - DeepResearch Bench evaluation integration + + Execution Context: + - Isolated workspace directory + - Configurable timeout settings + - Resource monitoring (CPU, Memory usage) and cleanup + - Result validation and reporting + - Multi-dimensional quality assessment (Comprehensiveness, Insight, Instruction Following, Readability) + """ + + #task content analyze + def get_scenario_all_desc( + self, + task: Task | None = None, + filtered_tag: str | None = None, + simple_background: bool | None = None, + ) -> str: + """Combine all descriptions together""" + parts = [] + + #1. basic information processing + if simple_background: + parts.append(f"Competition: {self.competition}. Develop an autonomous agentic system.") + else: + parts.append(self.background) + parts.append(self.get_runtime_environment()) + + #2. 
task specific processing + if task: + parts.append(f"\n--- Current Task ---") + parts.append(task.description) + task_desc = task.description.lower() + if 'memory' in task_desc: + parts.append("Additional Focus: Memory management and state persistence.") + elif 'parallel' in task_desc: + parts.append("Additional Focus: Parallel execution and concurrency handling.") + elif 'planning' in task_desc: + parts.append("Additional Focus: Advanced planning and multi-step task decomposition.") + + # Add evaluation criteria for this task + parts.append(self.get_task_evaluation_criteria(task)) + + if filtered_tag: + parts.append(f"\n--- Filtered Tags: {filtered_tag} ---") + tag_guidance = self.get_tag_guidance(filtered_tag) + if tag_guidance: + parts.append(tag_guidance) + + if not simple_background: + parts.append(self.get_success_criteria()) + + return "\n".join(parts) + + def get_task_evaluation_criteria(self, task: Task) -> str: + """Get evaluation criteria specific to the task""" + + #extract task-specific information + task_desc = task.description.lower() if task and task.description else "" + task_domain = getattr(task, 'domain', 'general') if task else 'general' + + focus_areas = [] + emphasis_dimensions = {} + + #Analyze task description to adjust criteria emphasis + if 'comprehensive' in task_desc: + focus_areas.append("comprehensive coverage") + emphasis_dimensions['comprehensiveness'] = 'emphasized' + + if 'analyze' in task_desc or 'explain' in task_desc or 'reason' in task_desc: + focus_areas.append("analytical reasoning") + emphasis_dimensions['insight'] = 'emphasized' + + if 'follow' in task_desc or 'present' in task_desc or 'format' in task_desc: + focus_areas.append("strict instruction adherence") + emphasis_dimensions['instruction_following'] = 'emphasized' + + if 'report' in task_desc or 'present' in task_desc or 'clarity' in task_desc: + focus_areas.append("clear presentation") + emphasis_dimensions['readability'] = 'emphasized' + + #build focus statement + focus_statement = "" + if focus_areas: + focus_statement = f"\n**Task Focus**: This task particularly emphasizes {', '.join(focus_areas)}.\n" + else: + focus_statement = "\n**Task Focus**: Standard evaluation across all dimensions, including comprehensiveness, Insight, Instruction following and readability\n" + + #domain specific guidance + domain_guidance = self.get_domain_specific_guidance(task_domain) + + #build criteria with emphasis markers + comp_marker = emphasis_dimensions.get('comprehensiveness', '') + insight_marker = emphasis_dimensions.get('insight', '') + instruction_marker = emphasis_dimensions.get('instruction_following', '') + readability_marker = emphasis_dimensions.get('readability', '') + + return f""" +--- Evaluation Criteria (DeepResearch Bench) --- +{focus_statement} + +Your solution will be evaluated on four dimensions (0-10 scale each): + +1. Comprehensiveness ({self.evaluator.weights['comprehensiveness']:.0%} weight): + - Coverage of all required subtopics + - Depth of analysis with evidence + - Multiple perspectives considered + - No major omissions + +2. Insight ({self.evaluator.weights['insight']:.0%} weight): + - Causal reasoning and why-think + - Quantified analysis with data + - Non-obvious implications identified + - Novel synthesis or frameworks + +3. Instruction Following ({self.evaluator.weights['instruction_following']:.0%} weight): + - Answers all sub-questions + - Respects scope and constraints + - Required deliverables present + - Avoids out-of-scope content + +4. 
Readability ({self.evaluator.weights['readability']:.0%} weight): + - Clear structure and organization + - Fluent, precise language + - Effective data presentation + - Proper formatting + +Overall Score: Weighted sum of four dimensions +Target: >= 7.0/10.0 overall for success +""" + + def get_tag_guidance(self, tag): + """acquire specific guidance based on tag""" + tag_guidance = { + "performance": "Optimize for speed and resource efficiency. Evaluation: Focus on insight (efficiency analysis) and comprehensiveness (performance metrics).", + "robustness": "Focus on error handling and system stability. Evaluation: Emphasize comprehensiveness (edge cases) and instruction following (requirements).", + "scalability": "Design for handling larger and more complex tasks. Evaluation: Highlight insight (scalability analysis) and comprehensiveness (architectural depth).", + "planning": "Emphasize strategic thinking and multi-step execution. Evaluation: Prioritize insight (causal reasoning) and comprehensiveness (planning depth).", + "coordination": "Multi-agent communication and collaboration. Evaluation: Focus on comprehensiveness (interaction coverage) and readability (clear protocols)." + } + return tag_guidance.get(tag.lower(), f"Focus on {tag} aspects.") + + + + def get_success_criteria(self): + '''acquire success criteria with DeepResearch Bench standards''' + return f""" +--- Success Criteria --- + +Primary Metrics (Execution): +- Task Success Rate: >= 70% +- Average Execution Time: Within reasonable limits +- Error Rate: < 10% + +Quality Metrics (DeepResearch Bench): +- Comprehensiveness: >= 6.0/10.0 (adequate coverage) +- Insight: >= 6.0/10.0 (clear reasoning) +- Instruction Following: >= 7.0/10.0 (compliant) +- Readability: >= 6.0/10.0 (clear presentation) +- Overall Score: >= 7.0/10.0 + +Implementation Requirements: +- Clean, maintainable code structure +- Proper error handling and logging +- JSON-formatted result output with evaluation scores +- Autonomous task execution capability +- Documented reasoning and decision-making process + +Scoring Guidance: +- 0-2: Poor/Missing - Major issues +- 4-6: Basic/Adequate - Meets minimum requirements +- 6-8: Good/Complete - Solid implementation +- 8-10: Excellent/Exhaustive - Outstanding quality +""" + + def evaluate_output( + self, + output: Any, + task: Optional[Task] = None, + reference_output: Optional[Any] = None + ) -> EvaluationResult: + """ + Evaluate output using DeepResearch Bench standards + + Args: + output: The agent's output to evaluate + task: Optional task for context + reference_output: Optional reference for normalization + + Returns: + EvaluationResult with scores for all dimensions + """ + # Prepare task requirements + task_requirements = {} + task_context = {} + + if task: + task_context = { + 'task_description': task.description, + 'competition': self.competition + } + + # Extract requirements from task description + task_desc_lower = task.description.lower() + task_requirements['required_sections'] = [] + + if 'results' in task_desc_lower or 'output' in task_desc_lower: + task_requirements['required_sections'].append('results') + if 'analysis' in task_desc_lower or 'evaluate' in task_desc_lower: + task_requirements['required_sections'].append('analysis') + if 'metrics' in task_desc_lower or 'performance' in task_desc_lower: + task_requirements['required_sections'].append('metrics') + + # Evaluate using the evaluator + reference_result = None + if reference_output: + reference_result = self.evaluator.evaluate( + 
reference_output, + task_requirements, + task_context + ) + + result = self.evaluator.evaluate( + output, + task_requirements, + task_context, + reference_result + ) + + return result + + @property + def rich_style_description(self) -> str: + """Rich style description to present""" + return f"AgenticSysScen for competition: {self.competition} with DeepResearch Bench evaluation" \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh b/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh new file mode 100644 index 000000000..124b50961 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# filepath: /data/userdata/v-wangzhu/RD-Agent/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh + +# SearxNG Deployment Script +set -e + +SEARXNG_DIR="${HOME}/apps/searxng" +SEARXNG_PORT=8888 +CONTAINER_NAME="searxng" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +deploy() { + log_info "Deploying SearxNG..." + mkdir -p "${SEARXNG_DIR}/config" "${SEARXNG_DIR}/data" + + if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + log_warn "Container exists, removing..." + docker rm -f ${CONTAINER_NAME} + fi + + docker run --name ${CONTAINER_NAME} -d \ + -p ${SEARXNG_PORT}:8080 \ + -v "${SEARXNG_DIR}/config:/etc/searxng/" \ + -v "${SEARXNG_DIR}/data:/var/cache/searxng/" \ + --restart unless-stopped \ + docker.io/searxng/searxng:latest + + log_info "SearxNG deployed at http://localhost:${SEARXNG_PORT}" + sleep 5 +} + +update_config() { + log_info "Updating configuration..." + CONFIG_FILE="${SEARXNG_DIR}/config/settings.yml" + + local attempts=0 + while [ ! -f "$CONFIG_FILE" ] && [ $attempts -lt 10 ]; do + log_info "Waiting for config file... ($((attempts+1))/10)" + sleep 2 + attempts=$((attempts+1)) + done + + if [ ! -f "$CONFIG_FILE" ]; then + log_error "Config file not found!" + exit 1 + fi + + sudo chmod 777 -R "${SEARXNG_DIR}/config/" + + if ! command -v yq &> /dev/null; then + log_warn "Installing yq..." + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + yq -i '.search.formats = ["html", "json", "csv"]' "$CONFIG_FILE" + log_info "Configuration updated" + + restart +} + +restart() { + log_info "Restarting SearxNG..." 
+ docker restart ${CONTAINER_NAME} >/dev/null + log_info "Restarted successfully" + sleep 3 +} + +status() { + log_info "SearxNG Status:" + if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo -e "${GREEN} Status: Running${NC}" + echo " URL: http://localhost:${SEARXNG_PORT}" + else + echo -e "${RED} Status: Not running${NC}" + fi +} + +case "${1:-help}" in + deploy) deploy ;; + update_config) update_config ;; + restart) restart ;; + status) status ;; + *) echo "Usage: $0 {deploy|update_config|restart|status}" ;; +esac \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/how_to_use.md b/rdagent/scenarios/agentic_sys/tools/how_to_use.md new file mode 100644 index 000000000..441fc1839 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/how_to_use.md @@ -0,0 +1,19 @@ +from pathlib import Path +from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool + +# Initialize +config_path = Path(__file__).parent / "tools" / "search_config.yaml" +search_tool = create_web_search_tool(config_path) + +# Search for hypothesis +results = search_tool.search_for_hypothesis( + task_description="Improve agentic system", + current_gaps=["information gathering"], + context={'weak_dimension': 'comprehensiveness'} +) + +# Process results +for result in results: + print(f"Title: {result['title']}") + print(f"URL: {result['url']}") + print(f"Relevance: {result['relevance']}") \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/search_config.yaml b/rdagent/scenarios/agentic_sys/tools/search_config.yaml new file mode 100644 index 000000000..1216a09fc --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/search_config.yaml @@ -0,0 +1,38 @@ +# SearxNG Configuration +base_url: "http://localhost:8888" # Change to your SearxNG server +timeout: 30 +max_retries: 3 +default_format: "json" +relevance_threshold: 0.3 + +# Search Strategy +max_results_per_query: 5 +preferred_engines: + - google + - bing + - google_scholar + +# Credibility Scoring +credibility_weights: + edu_domain: 0.3 + gov_domain: 0.3 + org_domain: 0.2 + academic_engine: 0.4 + tech_blog: 0.1 + +# Query Generation +max_queries_per_search: 5 +query_templates: + gap_specific: "how to improve {gap}" + best_practice: "best practices for {gap}" + case_study: "{methodology} case studies" + optimization: "improve {dimension} in research systems" + +#Search categories +categories: + - general + - science + +default_language: "auto" + +safesearch: 0 # 0: off, 1: moderate, 2: strict \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/searxng_client.py b/rdagent/scenarios/agentic_sys/tools/searxng_client.py new file mode 100644 index 000000000..79566ecb8 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/searxng_client.py @@ -0,0 +1,220 @@ +""" +SearXNG client for web search. Based on the deployment script's search API +""" +import requests +import json +import csv +from io import StringIO +from pathlib import Path +from typing import List, Dict, Any, Optional +from rdagent.log import rdagent_logger as logger +import yaml + +class SearxNGClient: + """ + Client for SearxNG search engine with multi-format support. 
+
+    Features:
+    - Multiple output formats (json, csv, html)
+    - Result filtering by relevance score
+    - Error handling and retry logic
+    - Configurable via a YAML config file
+    """
+
+    def __init__(self, config_path=None):
+        """
+        Initialize SearxNG client
+        Args:
+            config_path (Optional[Path]): Path to the SearxNG configuration file
+        """
+        # load configuration
+        if config_path and config_path.exists():
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+        else:
+            # base configuration
+            config = {
+                'base_url': "http://localhost:8888",
+                'timeout': 30,
+                'max_retries': 3,
+                'default_format': 'json',
+                'relevance_threshold': 0.3
+            }
+
+        self.base_url = config.get('base_url', 'http://localhost:8888')
+        self.timeout = config.get('timeout', 30)
+        self.max_retries = config.get('max_retries', 3)
+        self.default_format = config.get('default_format', 'json')
+        self.relevance_threshold = config.get('relevance_threshold', 0.3)
+
+        logger.info(f"SearxNGClient initialized with base_url: {self.base_url}")
+
+    def search(self, query, format=None, categories=None, engines=None,
+               languages='auto', time_range=None, safesearch=0):
+        """Perform web search using SearxNG API."""
+        if not query or not query.strip():
+            logger.warning("Empty query provided to SearxNGClient.search")
+            return self.empty_result(query)
+        format = format or self.default_format
+
+        # build search parameters
+        params = {
+            'q': query,
+            'format': format
+        }
+        if categories:
+            params['categories'] = ','.join(categories)
+
+        if engines:
+            params['engines'] = ','.join(engines)
+
+        if languages != 'auto':
+            params['language'] = languages
+
+        if time_range:
+            params['time_range'] = time_range
+
+        if safesearch > 0:
+            params['safesearch'] = safesearch
+
+        # perform search with retry logic
+        for attempt in range(self.max_retries):
+            try:
+                logger.info(f"Searching SearxNG (attempt {attempt + 1}/{self.max_retries}): {query}")
+                response = requests.get(
+                    f"{self.base_url}/search",
+                    params=params,
+                    timeout=self.timeout
+                )
+                response.raise_for_status()
+
+                # Parse response based on format
+                if format == 'json':
+                    result = response.json()
+                elif format == 'csv':
+                    result = self.parse_csv_response(response.text, query)
+                elif format == 'html':
+                    result = self.parse_html_response(response.text, query)
+                else:
+                    raise ValueError(f"Unsupported format: {format}")
+                logger.info(f"Search completed: {len(result.get('results', []))} results")
+                return result
+
+            except requests.Timeout:
+                logger.warning("Search timeout")
+                if attempt == self.max_retries - 1:
+                    return self.empty_result(query, error="Timeout")
+
+            except requests.RequestException as e:
+                logger.error(f"Search request failed: {e}")
+                if attempt == self.max_retries - 1:
+                    return self.empty_result(query, error=str(e))
+
+            except Exception as e:
+                logger.error(f"Error processing search response: {e}")
+                return self.empty_result(query, error=str(e))
+        return self.empty_result(query)
+
+    def search_json(self, query, **kwargs):
+        """Search with JSON output"""
+        return self.search(query, format='json', **kwargs)
+
+    def search_with_filter(
+        self,
+        query,
+        min_score=None,
+        max_results=None,
+        **kwargs
+    ):
+        """Search and filter results by relevance score.
+
+        Args:
+            query: Search query
+            min_score: Minimum relevance score (0 to 1)
+            max_results: Maximum number of results to return
+            **kwargs: Additional search parameters forwarded to ``search``
+        Returns:
+            Filtered list of search results
+        """
+        min_score = min_score or self.relevance_threshold
+        result = self.search(query, format='json', **kwargs)
+
+        # Filter out low-scoring results
+        filtered = [
+            r for r in result.get('results', [])
+            if r.get('score', 0) >= min_score
+        ]
+
+        # Sort by score (descending)
+        filtered.sort(key=lambda r: r.get('score', 0), reverse=True)
+
+        # Limit the number of results
+        if max_results:
+            filtered = filtered[:max_results]
+
+        return filtered
+
+    def empty_result(self, query, error=None):
+        """Return an empty search result."""
+        result = {
+            'query': query,
+            'number_of_results': 0,
+            'results': [],
+            'answers': [],
+            'suggestions': [],
+            'corrections': [],
+            'infoboxes': [],
+            'unresponsive_engines': [],
+        }
+        if error:
+            result['error'] = error
+        return result
+
+    def parse_csv_response(self, csv_text, query):
+        """Parse a CSV response into the JSON result structure."""
+        results = []
+        reader = csv.DictReader(StringIO(csv_text))
+        for row in reader:
+            results.append({
+                'title': row.get('title', ''),
+                'url': row.get('url', ''),
+                'content': row.get('content', ''),
+                'score': 1 / (len(results) + 1),  # Simple score based on rank order
+            })
+        return {
+            'query': query,
+            'number_of_results': len(results),
+            'results': results,
+            'answers': [],
+            'suggestions': [],
+        }
+
+    def parse_html_response(self, html_text, query):
+        """Parse an HTML response (not implemented yet; returns an empty result set)."""
+        return {
+            'query': query,
+            'number_of_results': 0,
+            'results': [],
+            'answers': [],
+            'suggestions': [],
+        }
+
+
+def create_searxng_client(config_path=None):
+    """Factory function to create a SearxNG client."""
+    return SearxNGClient(config_path)
diff --git a/rdagent/scenarios/agentic_sys/tools/web_search.py b/rdagent/scenarios/agentic_sys/tools/web_search.py
new file mode 100644
index 000000000..b2297f373
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/tools/web_search.py
@@ -0,0 +1,226 @@
+"""
+Web search tool for the agentic system.
+Uses SearxNG for external knowledge retrieval.
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from rdagent.log import rdagent_logger as logger
+from rdagent.scenarios.agentic_sys.tools.searxng_client import SearxNGClient
+
+
+class WebSearchTool:
+    """
+    High-level web search tool for hypothesis generation.
+
+    Features:
+    - Query generation from context
+    - Multi-source search support
+    - Result ranking and filtering
+    - Source validation
+    - Knowledge extraction
+    """
+
+    def __init__(self, config_path: Optional[Path] = None):
+        """
+        Initialize WebSearchTool with a SearxNG client.
+
+        Args:
+            config_path (Optional[Path]): Path to the SearxNG configuration file
+        """
+        self.client = SearxNGClient(config_path)
+        # Search strategy configuration.
+        # Note: these defaults are hard-coded here; the preferred_engines list in
+        # search_config.yaml is not read by this class.
+        self.max_results_per_query = 5
+        self.min_relevance_score = 0.3
+        self.preferred_engines = ['duckduckgo', 'google', 'bing']
+        logger.info("WebSearchTool initialized with SearxNGClient")
+
+    def search_for_hypothesis(self, task_description, current_gaps, context):
+        """
+        Search for information to support hypothesis generation.
+
+        Args:
+            task_description: Description of the research task
+            current_gaps: List of identified knowledge gaps
+            context: Additional context
+        Returns:
+            List of relevant external sources with metadata
+        """
+        # Generate search queries
+        queries = self.generate_queries(task_description, current_gaps, context)
+
+        # Execute searches
+        all_results = []
+        for query in queries:
+            try:
+                results = self.client.search_with_filter(
+                    query=query,
+                    min_score=self.min_relevance_score,
+                    max_results=self.max_results_per_query,
+                    engines=self.preferred_engines,
+                )
+                all_results.extend(results)
+            except Exception as e:
+                logger.error(f"Error during search for query '{query}': {e}")
+                continue
+
+        # Deduplicate and rank results
+        ranked_results = self.deduplicate_results(all_results)
+
+        # Validate sources
+        validated_results = self.validate_sources(ranked_results)
+
+        # Extract key information
+        enriched = self.extract_knowledge(validated_results)
+
+        logger.info(f"Search completed with {len(enriched)} relevant sources found")
+        return enriched
+
+    def generate_queries(self, task_description, gaps, context):
+        """
+        Generate search queries based on the task and identified gaps.
+
+        Strategy:
+        1. Primary queries: direct task-related questions
+        2. Gap-specific queries: target identified knowledge gaps
+        3. Exploratory queries: adjacent topics and methodologies
+        """
+        queries = []
+        # Primary query
+        if task_description:
+            queries.append(task_description[:200])
+
+        # Gap-specific queries
+        for gap in gaps:
+            queries.append(f"how to improve {gap}")
+
+        # Context-based queries
+        if context:
+            # If previous experiments were weak in a specific dimension
+            if 'weak_dimension' in context:
+                dim = context['weak_dimension']
+                queries.append(f"improve {dim} in research system")
+                queries.append(f"{dim} optimization techniques")
+
+            # If a specific methodology is being used
+            if 'methodology' in context:
+                method = context['methodology']
+                queries.append(f"{method} case studies")
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_queries = []
+        for q in queries:
+            if q.lower() not in seen:
+                seen.add(q.lower())
+                unique_queries.append(q)
+        logger.info(f"Generated {len(unique_queries)} search queries")
+        return unique_queries
+
+    def deduplicate_results(self, results):
+        """
+        Remove duplicate results based on URL, keeping the highest-scored entry per URL.
+        """
+        seen_urls = set()
+        deduplicated = []
+
+        # Sort by score first so the best-scored duplicate wins
+        sorted_results = sorted(
+            results,
+            key=lambda x: x.get('score', 0), reverse=True
+        )
+        for result in sorted_results:
+            url = result.get('url')
+            if url and url not in seen_urls:
+                seen_urls.add(url)
+                deduplicated.append(result)
+        return deduplicated
+
+    def extract_knowledge(self, results):
+        """
+        Extract and structure key knowledge from search results.
+
+        Args:
+            results: Validated search results
+        Returns:
+            Enriched results with structured knowledge
+        """
+        enriched = []
+        for idx, result in enumerate(results):
+            enriched_result = {
+                'citation': f"{result.get('title', 'Untitled')} ({result.get('url', 'No URL')})",
+                'title': result.get('title', ''),
+                'url': result.get('url', ''),
+                'summary': result.get('content', '')[:300],  # First 300 chars
+                'relevance': result.get('score', 0),
+                'credibility': result.get('credibility', 0.5),
+                'credibility_level': result.get('credibility_level', 'Medium'),
+                'source_engine': result.get('engine', 'unknown'),
+                'rank': idx,
+            }
+            enriched.append(enriched_result)
+        return enriched
+
+    def validate_sources(self, results):
+        """
+        Validate source credibility.
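+
+        Args:
+            results: Deduplicated search results
+        Returns:
+            The same results annotated with ``credibility`` and ``credibility_level``,
+            sorted by credibility and score. Scores come from the heuristics in
+            ``calculate_credibility``; the ``credibility_weights`` section of
+            search_config.yaml is not consulted here.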
+ """ + validated = [] + for result in results: + url = result.get('url', '') + + #calculate credibility score + credibility = self.calculate_credibility(url, result) + + #Add credibility to result + result['credibility'] = credibility + result['credibility_level'] = self.credibility_level(credibility) + validated.append(result) + + validated.sort( + key = lambda r: (r.get('credibility', 0), r.get('score', 0)), + reverse=True + ) + return validated + + def calculate_credibility(self, url, result): + """ + Calculate source credibility score based on heuristics + """ + score = 0.5 # Baseline + + # Domain-based scoring + if any(domain in url.lower() for domain in ['.edu', '.gov', '.org']): + score += 0.3 + elif any(domain in url.lower() for domain in ['arxiv.org', 'scholar.google', 'pubmed']): + score += 0.4 # Academic sources + elif any(domain in url.lower() for domain in ['medium.com', 'towardsdatascience']): + score += 0.1 # Tech blogs + + # Title-based signals + title = result.get('title', '').lower() + if any(keyword in title for keyword in ['research', 'study', 'analysis', 'survey']): + score += 0.1 + + # Engine-based trust + engine = result.get('engine', '') + if engine in ['google_scholar', 'semantic_scholar']: + score += 0.2 + + # Normalize to [0, 1] + return min(1.0, score) + + def credibility_level(self, score): + """ + convert credibility score to qualitative label + """ + if score >= 0.8: + return 'High' + elif score >= 0.5: + return 'Medium' + else: + return 'Low' + +def create_web_search_tool(config_path): + """ + Factory function to create web search tool + """ + return WebSearchTool(config_path=config_path) \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 209effdae..4af528fd0 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -41,3 +41,19 @@ def is_ready_to_run(self) -> bool: def set_local_selection(self, local_selection: tuple[int, ...]) -> None: self.local_selection = local_selection + + +class ExperimentResult: + def __init__( + self, + success: bool, + metrics: dict[str, float] | pd.DataFrame | None = None, + logs: str | None = None, + errors: str | None = None, + metadata: dict | None = None, + ) -> None: + self.success = success + self.metrics = metrics + self.logs = logs + self.errors = errors + self.metadata = metadata if metadata is not None else {} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py index d94a054ac..ca5863cb9 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/base.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/base.py @@ -61,8 +61,6 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.sota_exp_to_submit: DSExperiment | None = None # grab the global best exp to submit - self.uncommitted_experiments: dict[int, DSExperiment] = {} # loop_id -> DSExperiment - def should_inject_diversity(self, current_selection: tuple[int, ...] | None = None) -> bool: """ Check if diversity context should be injected based on the current selection. @@ -78,13 +76,6 @@ def should_inject_diversity(self, current_selection: tuple[int, ...] 
| None = No COMPLETE_ORDER = ("DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow") - def register_uncommitted_exp(self, exp: DSExperiment, loop_id: int): - self.uncommitted_experiments[loop_id] = exp - - def deregister_uncommitted_exp(self, loop_id: int): - if loop_id in self.uncommitted_experiments: - del self.uncommitted_experiments[loop_id] - def set_sota_exp_to_submit(self, exp: DSExperiment) -> None: self.sota_exp_to_submit = exp @@ -136,32 +127,6 @@ def get_sibling_exps(self, current_selection: tuple[int, ...] | None = None): sibling_exps.append(self.hist[idx][0]) return sibling_exps - def sync_dag_parent_and_hist( - self, - exp_and_fb: tuple[Experiment, ExperimentFeedback], - cur_loop_id: int, - ) -> None: - """ - Adding corresponding parent index to the dag_parent when the hist is going to be changed. - Should be called when the hist is changed. - """ - - if len(self.hist) == 0 or len(self.get_current_selection()) == 0: - # the node we are going to add is the first node of hist / root node of a new sub-trace - self.dag_parent.append(()) - - else: - current_node_idx = self.current_selection[0] - - if current_node_idx == -1: - # the current selection is the latest one - current_node_idx = len(self.hist) - 1 - - self.dag_parent.append((current_node_idx,)) - self.hist.append(exp_and_fb) - self.idx2loop_id[len(self.hist) - 1] = cur_loop_id - self.deregister_uncommitted_exp(cur_loop_id) - def retrieve_search_list( self, search_type: Literal["all", "ancestors"] = "ancestors",