diff --git a/.devcontainer/env b/.devcontainer/env
index 8282f374a..2f497fdfb 100644
--- a/.devcontainer/env
+++ b/.devcontainer/env
@@ -9,17 +9,19 @@ TIMEOUT_FAIL_LIMIT=100
 # CHAT_TEMPERATURE=0.7
 CHAT_STREAM=False
-CHAT_TEMPERATURE=1
 CHAT_MODEL=o1-preview
 SYSTEM_PROMPT_ROLE=user
-BACKEND=rdagent.oai.backend.LiteLLMAPIBackend
-OPENAI_API_KEY=sk-1234
-OPENAI_API_BASE=http://ep14.213428.xyz:38881
+BACKEND=rdagent.oai.backend.LiteLLMAPIBackend
+OPENAI_API_KEY=sk-1234
+OPENAI_API_BASE=http://10.150.240.117:38803
+EMBEDDING_MODEL=text-embedding-3-small
+CHAT_MODEL=gpt-5
+CHAT_TEMPERATURE=1
 
 # amc chat model configs:
-EMBEDDING_MODEL=text-embedding-ada-002
+#EMBEDDING_MODEL=text-embedding-ada-002
 
 # Cache Setting (Optional):
 DUMP_CHAT_CACHE=True
diff --git a/rdagent/app/agentic_sys/conf.py b/rdagent/app/agentic_sys/conf.py
new file mode 100644
index 000000000..8d0e76b98
--- /dev/null
+++ b/rdagent/app/agentic_sys/conf.py
@@ -0,0 +1,27 @@
+
+from pydantic_settings import SettingsConfigDict
+
+from rdagent.core.conf import ExtendedBaseSettings
+
+
+class AgenticSysSetting(ExtendedBaseSettings):
+    model_config = SettingsConfigDict(env_prefix="AS_", protected_namespaces=())
+
+    competition: str | None = None
+
+    # Main components
+    ## Scen
+    scen: str = "rdagent.scenarios.agentic_sys.scen.AgenticSysScen"
+    """
+    Scenario class for agentic system tasks.
+    - For Kaggle competitions, use: "rdagent.scenarios.data_science.scen.KaggleScen"
+    - For custom data science scenarios, use: "rdagent.scenarios.data_science.scen.DataScienceScen"
+    """
+    exp_gen: str = "rdagent.scenarios.agentic_sys.proposal.AgenticSysExpGen"
+    coder: str = "rdagent.scenarios.agentic_sys.dev.AgenticSysCoder"
+    runner: str = "rdagent.scenarios.agentic_sys.dev.AgenticSysRunner"
+
+    feedback: str = "rdagent.scenarios.agentic_sys.feedback.AgenticSysExp2Feedback"
+
+
+ASYS_RD_SETTING = AgenticSysSetting()
diff --git a/rdagent/app/agentic_sys/loop.py b/rdagent/app/agentic_sys/loop.py
new file mode 100644
index 000000000..de92d76ed
--- /dev/null
+++ b/rdagent/app/agentic_sys/loop.py
@@ -0,0 +1,50 @@
+import asyncio
+from pathlib import Path
+from typing import Optional
+
+import fire
+import typer
+from typing_extensions import Annotated
+
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+
+from rdagent.app.agentic_sys.conf import ASYS_RD_SETTING
+from rdagent.scenarios.agentic_sys.loop import AgenticSysRDLoop
+
+
+def main(
+    path: Optional[str] = None,
+    checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", "-c/-C")] = True,
+    checkout_path: Optional[str] = None,
+    step_n: Optional[int] = None,
+    loop_n: Optional[int] = None,
+    timeout: Optional[str] = None,
+    competition="deepresearch",
+    replace_timer=True,
+    exp_gen_cls: Optional[str] = None,
+):
+    if checkout_path is not None:
+        checkout = Path(checkout_path)
+
+    if competition is not None:
+        ASYS_RD_SETTING.competition = competition
+
+    if not ASYS_RD_SETTING.competition:
+        logger.error("Please specify competition name.")
+        return
+
+    if path is None:
+        agentic_sys_loop = AgenticSysRDLoop(ASYS_RD_SETTING)
+    else:
+        agentic_sys_loop: AgenticSysRDLoop = AgenticSysRDLoop.load(path, checkout=checkout, replace_timer=replace_timer)
+
+    # replace exp_gen if we have a new class
+    if exp_gen_cls is not None:
+        agentic_sys_loop.exp_gen = import_class(exp_gen_cls)(agentic_sys_loop.exp_gen.scen)
+
+    asyncio.run(agentic_sys_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
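# --- Illustrative usage sketch (annotation, not part of the patch) ---
# A minimal sketch of how the AS_-prefixed settings and the new loop entry point are
# expected to be driven, assuming ExtendedBaseSettings behaves like a standard
# pydantic BaseSettings; the environment values below are examples, not values
# defined by this patch.
import os

os.environ["AS_COMPETITION"] = "deepresearch"  # maps to AgenticSysSetting.competition
os.environ["AS_CODER"] = "rdagent.scenarios.agentic_sys.dev.AgenticSysCoder"  # component override

from rdagent.app.agentic_sys.conf import AgenticSysSetting

settings = AgenticSysSetting()  # pydantic-settings reads the AS_* variables at construction time
print(settings.competition, settings.coder)

# The loop itself is started from the command line via fire, e.g.:
#   python -m rdagent.app.agentic_sys.loop --competition deepresearch --loop_n 3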
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
index 964ce3683..679ed15f8 100644
--- a/rdagent/core/proposal.py
+++ b/rdagent/core/proposal.py
@@ -162,6 +162,9 @@ def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = Non
         self.knowledge_base: ASpecificKB | None = knowledge_base
         self.current_selection: tuple[int, ...] = (-1,)
 
+        # When multiple nodes in the trace run in parallel, nodes are not committed before they finish running.
+        self.uncommitted_experiments: dict[int, Experiment] = {}  # loop_id -> Experiment
+
     def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
         """Access the last experiment result, sub-task, and the corresponding hypothesis."""
         # TODO: The return value does not align with the signature.
@@ -240,6 +243,39 @@ def get_parents(self, child_idx: int) -> list[int]:
 
         return ancestors
 
+    def register_uncommitted_exp(self, exp: Experiment, loop_id: int):
+        self.uncommitted_experiments[loop_id] = exp
+
+    def deregister_uncommitted_exp(self, loop_id: int):
+        if loop_id in self.uncommitted_experiments:
+            del self.uncommitted_experiments[loop_id]
+
+    def sync_dag_parent_and_hist(
+        self,
+        exp_and_fb: tuple[Experiment, ExperimentFeedback],
+        cur_loop_id: int,
+    ) -> None:
+        """
+        Add the corresponding parent index to dag_parent whenever hist is about to change.
+        Should be called whenever hist is updated.
+        """
+
+        if len(self.hist) == 0 or len(self.get_current_selection()) == 0:
+            # the node we are going to add is the first node of hist / root node of a new sub-trace
+            self.dag_parent.append(())
+
+        else:
+            current_node_idx = self.current_selection[0]
+
+            if current_node_idx == -1:
+                # the current selection is the latest one
+                current_node_idx = len(self.hist) - 1
+
+            self.dag_parent.append((current_node_idx,))
+
+        self.hist.append(exp_and_fb)
+        self.idx2loop_id[len(self.hist) - 1] = cur_loop_id
+        self.deregister_uncommitted_exp(cur_loop_id)
+
 
 class CheckpointSelector:
     """
@@ -298,7 +334,7 @@ def __init__(self, scen: Scenario) -> None:
         self.scen = scen
 
     @abstractmethod
-    def gen(self, trace: Trace, plan: ExperimentPlan | None = None) -> Experiment:
+    def gen(self, trace: Trace) -> Experiment:
         """
         Generate the experiment based on the trace.
         Planning is part of gen, but since we may support multi-stage planning,
diff --git a/rdagent/scenarios/agentic_sys/dev.py b/rdagent/scenarios/agentic_sys/dev.py
new file mode 100644
index 000000000..cf98c5d4f
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/dev.py
@@ -0,0 +1,2467 @@
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from rdagent.core.developer import Developer
+from rdagent.core.experiment import Experiment, FBWorkspace
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.agentic_sys.env import get_agent_sys_env
+from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool
+
+# TODO: We only list the dummy coder and runner here.
+# If we want to implement a comprehensive agentic system R&D Agent, we need to implement it with CoSTEER.
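# --- Illustrative sketch (annotation, not part of the patch) ---
# A minimal sketch of how the new Trace bookkeeping above is expected to be used by a
# parallel R&D loop; `trace`, `loop_id`, `exp`, and `feedback` are placeholders, and the
# two hook names below are hypothetical, not APIs introduced by this patch.

def on_experiment_started(trace, loop_id, exp):
    # While an experiment is still running, it is tracked but not yet part of trace.hist.
    trace.register_uncommitted_exp(exp, loop_id)


def on_feedback_ready(trace, loop_id, exp, feedback):
    # Committing the result appends to hist, records the DAG parent taken from the current
    # selection, maps the new hist index to this loop id, and drops the uncommitted entry.
    trace.sync_dag_parent_and_hist((exp, feedback), cur_loop_id=loop_id)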
+
+
+class AgenticSysCoder(Developer[Experiment]):
+    # generate code for agentic system experiments
+    def __init__(self, scen):
+        self.scen = scen
+
+        # initialize LLM backend
+        self.api_backend = APIBackend()
+        logger.info("Initialized AgenticSysCoder with LLM backend")
+
+        # initialize web search tool
+        search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+        self.web_search_tool = None
+
+        try:
+            self.web_search_tool = create_web_search_tool(search_config_path)
+            logger.info("Initialized web search tool for external knowledge retrieval")
+        except Exception as e:
+            logger.warning(f"Failed to initialize web search tool: {e}")
+            self.web_search_tool = None
+
+    def develop(self, exp: Experiment) -> Experiment:
+        # TODO: implement the coder
+        """Generate code based on the experiment hypothesis."""
+        logger.info("Starting code generation for the experiment")
+
+        try:
+            # 1. Initialize workspace with FBWorkspace
+            exp.experiment_workspace = FBWorkspace()
+            ws_path = Path(exp.experiment_workspace.workspace_path)
+            ws_path.mkdir(parents=True, exist_ok=True)
+            logger.info(f"Initialized workspace at {ws_path}")
+
+            # 2. Prepare enhanced context with web search
+            context = self.prepare_enhanced_context(exp)
+
+            # 3. Generate code files using CoSTEER approach
+            code_artifacts = self.generate_code_with_costeer(exp)
+            exp.experiment_workspace.inject_files(**code_artifacts)
+            logger.info(f"Injected {len(code_artifacts)} files into workspace")
+
+            # 4. Prepare execution environment following the conf.py pattern
+            timeout = self.calculate_timeout(exp)
+            env = get_agent_sys_env(
+                # extra_volumes={str(ws_path): "/workspace"},
+                running_timeout_period=timeout,
+                enable_cache=True,
+            )
+            logger.info("Prepared execution environment")
+
+            # 5. Optional pre-run validation
+            try:
+                if self.should_validate_generation(exp):
+                    validation_result = self.validate_generated_code(env, ws_path)
+                    if not getattr(validation_result, 'success', False):
+                        logger.warning(f"Pre-run validation failed: {validation_result.message}")
+            except Exception as e_val:
+                logger.error(f"Validation step raised: {e_val}; continuing...")
+
+            # 6. Run the entrypoint inside the environment (use train.py as entry)
+            try:
+                logger.info("Running generated code inside the execution environment")
+                # run_res = env.run(
+                #     entry="bash",
+                #     cmd="cd /workspace && python train.py", timeout=timeout
+                # )
+                run_res = exp.experiment_workspace.run(env=env, entry="python train.py")
+                # collect run outputs
+                exp.run_returncode = getattr(run_res, 'returncode', None)
+                exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None))
+                exp.run_stderr = getattr(run_res, 'stderr', None)
+                logger.info("Run finished")
+            except Exception as e_run:
+                logger.error(f"Execution inside environment failed: {e_run}")
+                # keep the exception and let the caller decide; still return exp with workspace
+                exp.run_exception = e_run
+
+        except Exception as e:
+            logger.error(f"Code generation failed: {str(e)}")
+            exp.exception = e
+            if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace:
+                try:
+                    exp.experiment_workspace = self.create_fallback_workspace(exp)
+                except Exception as e_fallback:
+                    logger.warning(f"Fallback workspace creation failed: {e_fallback}")
+        return exp
+
+    def prepare_enhanced_context(self, exp: Experiment):
+        """
+        Prepare enhanced context with external knowledge from web search
+
+        Args:
+            exp: Current experiment
+
+        Returns:
+            Enhanced context dictionary
+        """
+        hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance')
+
+        # Base context
+        context = {
+            'hypothesis': hypothesis,
+            'scenario_desc': self.scen.get_scenario_all_desc(),
+            'success_criteria': self.scen.get_success_criteria(),
+            'task_id': getattr(exp, 'id', 'unknown'),
+            'task_domain': getattr(self.scen, 'domain', 'general'),
+        }
+
+        # Add web search results if available (NEW)
+        if self.web_search_tool:
+            try:
+                logger.info("Retrieving external knowledge via web search...")
+
+                # Check if search service is healthy
+                if not self.web_search_tool.client.health_check():
+                    logger.warning("Search service unavailable, skipping external search")
+                    context['external_sources'] = []
+                    return context
+
+                # Identify knowledge gaps
+                knowledge_gaps = self.identify_knowledge_gaps(exp, hypothesis)
+
+                # Prepare search context
+                search_context = {
+                    'methodology': self.extract_methodology(hypothesis),
+                    'complexity': self.assess_complexity(hypothesis)
+                }
+
+                # Perform web search
+                external_sources = self.web_search_tool.search_for_hypothesis(
+                    task_description=hypothesis,
+                    current_gaps=knowledge_gaps,
+                    context=search_context
+                )
+
+                context['external_sources'] = external_sources
+                logger.info(f"Retrieved {len(external_sources)} external sources")
+
+                # Add summary of external sources to context
+                if external_sources:
+                    context['external_knowledge_summary'] = self.summarize_external_sources(
+                        external_sources
+                    )
+
+            except Exception as e:
+                logger.error(f"Web search failed: {e}")
+                context['external_sources'] = []
+        else:
+            context['external_sources'] = []
+
+        return context
+
+    def identify_knowledge_gaps(self, exp, hypothesis):
+        """
+        Identify knowledge gaps from the hypothesis
+
+        Returns:
+            List of knowledge gap descriptions
+        """
+        gaps = []
+
+        # Extract keywords indicating knowledge needs
+        hypothesis_lower = hypothesis.lower()
+
+        # Common agentic system knowledge areas
+        knowledge_areas = {
+            'planning': ['plan', 'planning', 'strategy', 'approach'],
+            'reasoning': ['reason', 'reasoning', 'logic', 'inference'],
+            'learning': ['learn', 'learning', 'adapt', 'optimization'],
+            'memory': ['memory', 'context', 'history', 'recall'],
+            'tool_use': ['tool', 'api', 'external', 'integration'],
+            'evaluation':
['evaluate', 'assessment', 'metric', 'performance'], + 'communication': ['communicate', 'language', 'dialogue', 'interaction'] + } + + for area, keywords in knowledge_areas.items(): + if any(kw in hypothesis_lower for kw in keywords): + gaps.append(f"{area} techniques and best practices") + + # Add general gaps if none identified + if not gaps: + gaps.append("agentic system design patterns") + gaps.append("system implementation strategies") + + logger.info(f"Identified knowledge gaps: {gaps}") + return gaps[:5] # Limit to top 5 + + + def extract_methodology(self, hypothesis: str) -> str: + """Extract methodology from hypothesis""" + hypothesis_lower = hypothesis.lower() + + methodologies = { + 'reinforcement learning': ['rl', 'reinforcement', 'q-learning', 'policy'], + 'retrieval augmented generation': ['rag', 'retrieval', 'augmented'], + 'chain of thought': ['cot', 'chain of thought', 'reasoning chain'], + 'tree of thought': ['tot', 'tree of thought', 'reasoning tree'], + 'multi-agent': ['multi-agent', 'multiple agents', 'agent collaboration'], + 'iterative refinement': ['iterative', 'refinement', 'feedback loop'] + } + + for method, keywords in methodologies.items(): + if any(kw in hypothesis_lower for kw in keywords): + return method + + return 'general agentic approach' + + def assess_complexity(self, hypothesis: str) -> str: + """Assess hypothesis complexity""" + hypothesis_lower = hypothesis.lower() + + high_complexity_indicators = [ + 'complex', 'advanced', 'sophisticated', 'multi-stage', + 'distributed', 'parallel', 'optimization' + ] + + medium_complexity_indicators = [ + 'moderate', 'standard', 'typical', 'conventional' + ] + + if any(ind in hypothesis_lower for ind in high_complexity_indicators): + return 'high' + elif any(ind in hypothesis_lower for ind in medium_complexity_indicators): + return 'medium' + else: + return 'low' + + def summarize_external_sources(self, sources: List[Dict[str, Any]]) -> str: + """ + Summarize external sources for context injection + + Args: + sources: List of external source dictionaries + + Returns: + Formatted summary string + """ + if not sources: + return "No external sources available." + + summary_parts = [] + + # High credibility sources + high_cred = [s for s in sources if s.get('credibility_level') == 'High'] + if high_cred: + summary_parts.append( + f"High-credibility sources ({len(high_cred)}): " + + ", ".join(s['title'][:50] for s in high_cred[:3]) + ) + + # Key insights + key_insights = [] + for source in sources[:5]: + summary = source.get('summary', '') + if len(summary) > 50: + key_insights.append(summary[:100]) + + if key_insights: + summary_parts.append( + "Key insights: " + " | ".join(key_insights[:2]) + ) + + return "\n".join(summary_parts) + + + + def generate_code_with_costeer(self, exp) -> Dict[str, str]: + """ + Generate code artifacts using CoSTEER approach + """ + logger.info("Generating code using CoSTEER framework") + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + context = { + 'hypothesis': hypothesis, + 'scenario_desc': self.scen.get_scenario_all_desc(), + 'success_criteria': self.scen.get_success_criteria(), + } + # generate code artifacts + code_artifacts = {} + + #1. generate main agent implementation + agent_code = self.generate_agent_code(context) + code_artifacts['agent.py'] = agent_code + + #2. Generate execution script + train_code = self.generate_train_script(context) + code_artifacts['train.py'] = train_code + + #3. 
Generate requirements file + requirements = self.generate_requirements(context) + code_artifacts['requirements.txt'] = requirements + + #4. Generate configuration file if needed + if self.needs_config_file(context): + config_code = self.generate_config_file(context) + code_artifacts['config.py'] = config_code + + logger.info(f"Generated {len(code_artifacts)} code artifacts") + return code_artifacts + + def prepare_execution_environment(self, exp: Experiment, ws_path: Path): + """ + Prepare execution environment similar to DS CoSTEER approach + """ + try: + # Get environment configuration + extra_volumes = {str(ws_path): "/workspace"} + #Set timeout based on experiment complexity + timeout = self.calculate_timeout(exp) + #create environment using agent_sys specific configuration + env = get_agent_sys_env( + extra_volumes = extra_volumes, + running_timeout_period = timeout, + enable_cache=True + ) + logger.info("Prepared execution environment successfully") + return env + + except Exception as e: + logger.error(f"Failed to prepare execution environment: {str(e)}") + raise + + def calculate_timeout(self, exp: Experiment) -> int: + """ + Calculate appropriate timeout based on experiment characteristics + """ + base_timeout = 300 # default 5 minutes + #Adjust timeout based on hypothesis comnplexity + hypothesis = getattr(exp, 'hypothesis', '') + if 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower(): + return base_timeout * 2 #parallel tasks may need more time + elif 'optimisation' in hypothesis.lower(): + return base_timeout * 4 #learning/optimization may need more time + elif 'simple' in hypothesis.lower() or 'basic' in hypothesis.lower(): + return base_timeout #simple tasks + return base_timeout + + def should_validate_generation(self,exp: Experiment) -> bool: + """ + Determine if we should validate generated code before proceeding + + Validation is recommended when: + 1. It's the first experiment (no prior validation history) + 2. The hypothesis involves complex/risk operations + 3. Previous experiment has validation failures + 4. Configuration explicitly requires validation + + parameters: + exp: Experiment + The experiment to potentially + Returns: bool + True if validation should be performed + """ + #1. check global configuration flag + validation_config = getattr(self.scen, 'enable_code_validation', True) + if not validation_config: + logger.info("Code validation disabled by configuration") + return False + #2. always validate first experiment + if not hasattr(exp, 'iteration_number') or exp.iteration_number == 0: + logger.info("First experiment - validation enabled ") + return True + #3. check hypothesis complexity/risk indicators + hypothesis = getattr(exp, 'hypothesis', '').lower() + + #High risk keywords that suggest validation is needed + high_risk_keywords = [ + 'parallel', 'concurrent', 'multi-thread', 'async', # Concurrency risks + 'optimization', 'complex', 'advanced', # Complexity + 'distributed', 'network', 'remote', # Network operations + 'file system', 'database', 'io', # I/O operations + 'experimental', 'novel', 'new approach' # Unproven approaches + ] + + if any(keyword in hypothesis for keyword in high_risk_keywords): + logger.info(f"High risk hypothesis detected, validation enabled") + return True + + if hasattr(exp, 'previous_validation_failed') and exp.previous_validation_failed: + logger.info("Previous validation failed, re-enabling validation") + return True + + #5. 
skip validation for simple/proven approaches + simple_keywords = ['simple', 'basic', 'straightforward', 'minimal'] + if any(keyword in hypothesis for keyword in simple_keywords): + logger.info("Simple hypothesis detected, skipping validation") + return False + + # 6. Default behavior: validate every N experiments + validation_interval = getattr(self.scen, 'validation_interval', 3) + iteration = getattr(exp, 'iteration_number', 0) + + if iteration % validation_interval == 0: + logger.info(f"Periodic validation (interval={validation_interval})") + return True + + # 7. Default: skip validation for efficiency + logger.info("No validation triggers met - skipping validation") + return False + + + def validate_generated_code(self, env, ws_path: Path): + """ + Validate generated code by running basic checks + """ + class ValidationResult: + def __init__(self,success, message): + self.success = success + self.message = message + + try: + #Run basic syntax check + check_cmd = "python -m py_compile agent.py && python -m py_compile train.py" + result = env.run( + entry_point = "bash", + cmd = f'cd/workspace && {check_cmd}', + timeout = 30 + ) + if result.returncode == 0: + return ValidationResult(True, "syntax validation passed") + else: + return ValidationResult(False, f"Syntax validation failed: {result.stderr}") + + except Exception as e: + return ValidationResult(False, f"Validation error: {str(e)}") + + # def generate_agent_code(self,context): + # """ + # Generate agent code based on context + # """ + # hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + # #enhanced agent template with CoSTEER improvement + # return f''' + # """ + # Agentic System Implementation - CoSTEER enhanced + # Hypothesis: {hypothesis} + # Generated with intelligent code generation + # """ + # import time + # import logging + # import threading + # from typing import Dict, List, Any, Optional + # from concurrent.futures import ThreadPoolExecutor, as_completed + # from dataclasses import dataclass + # from enum import Enum + # import json + + # #Configurable logging + # logging.basicConfig(level = logging.INFO) + # logger = logging.getLogger("AgenticSystem") + + # class TaskStatus(Enum): + # PENDING = "pending" + # RUNNING = "running" + # COMPLETED = "completed" + # FAILED = "failed" + + # @dataclass + # class TaskResult: + # task_id: int + # success: bool + # execution_time: float + # error: Optional[str] = None + # data: Optional[Dict[str, Any]] = None + + # class AgenticSystem: + # """ + # Ehanced Agentic System with CoSTEER optimizations + # """ + # def __init__(self, config[Dict] = None): + # self.name = "CoSTEER_AgenticSystem" + # self.task_count = 0 + # self.config = config if config else self.get_default_config() + + # #Performance Tracking + # self.performance_metrics = {{"total_tasks": 0,"successful_tasks": 0,"failed_tasks": 0,"total_execution_time": 0}} + + # #thread safety + # self.lock = threading.Lock() + + # logger.info(f"Initialized {{self.name}} with config: {{self.config}}") + + # def get_default_config(self): + # """Get default configuration optimized for hypothesis""" + # return {{ + # "max_workers": 4, + # "task_timeout": 60, + # "enable_parallel": {'parallel' in hypothesis.lower()}, + # "enable_optimization": {'optimization' in hypothesis.lower()} + # }} + + # def run_task(self, task: Dict[str, Any]): + # """Execute single task with enhanced error handling and monitoring""" + # start_time = time.time() + # task_id = task.get('id', self.get_next_task_id()) + # try: + # 
logger.info(f"Starting task {{task_id}}") + # #Simulate intelligent task processing + # self.process_task_logic(task) + # execution_time = time.time() - start_time + # #update metrics + # with self.lock: + # self.metrics['total_tasks'] += 1 + # self.metrics['successful_tasks'] += 1 + # self.metrics['total_execution_time'] += execution_time + # result = TaskResult( + # task_id = task_id, + # status = TaskStatus.COMPLETED, + # execution_time = execution_time, + # success = True, + # data = {{'processed': True, 'task_type': task.get('type', 'unknown')}} + # ) + + # logger.info(f"Task {{task_id}} completed successfully in {{execution_time:.4f}}s") + # return result + + # except Exception as e: + # execution_time = time.time() - start_time + # with self.lock: + # self.metrics['total_tasks'] += 1 + # self.metrics['failed_tasks'] += 1 + # self.metrics['total_execution_time'] += execution_time + # result = TaskResult( + # task_id = task_id, + # status = TaskStatus.FAILED, + # execution_time = execution_time, + # success = False, + # error = str(e) + # ) + # logger.error(f"Task {{task_id}} failed: {{str(e)}}") + # return result + + # def get_next_task_id(self): + # "thread-safe task id generation" + # with self.lock: + # self.task_count += 1 + # return self.task_count + + # def process_task_logic(self, task): + # """Intelligent task processing based on hypothesis""" + # task_type = task.get('type', 'default') + # complexity = task.get('complexity', 1) + + # #Simulate processing time based on complexity + # base_time = 0.01 + # processing_time = base_time * complexity + + # #Add hypothesis-specific optimisation + # if complexity > 5 and not self.config.get('enable_optimization', False): + # # 10% error rate for high complexity tasks + # if time.time() % 10 < 1: + # raise RuntimeError(f"Simulated error for complex task {{task.get('id')}}") + + # def run_tasks(self, tasks): + # """ + # Execute multiple tasks with intelligent scheduling + # """ + # if tasks is None: + # tasks = self.generate_default_tasks() + # logger.info(f"Starting execution of {{len(tasks)}} tasks") + # batch_start_time = time.time() + + # if self.config.get('enable_parallel', True) and len(tasks) > 1: + # results = self.run_tasks_parallel(tasks) + # else: + # results = self.run_tasks_sequential(tasks) + + # #Calculate comprehensive metrics + # total_time = time.time() - batch_start_time + # success_count = sum(1 for r in results if r.success) + # avg_task_time = sum(r.execution_time for r in results) / len(results) if results else 0 + + # metrics = {{ + # "success_rate": success_count / len(results) if results else 0, + # "avg_task_time": avg_task_time, + # "error_count": len(results) - success_count, + # "total_tasks": len(results), + # "total_execution_time": total_time, + # "system_metrics": self.metrics.copy() + # }} + # logger.info(f"Batch execution completed: {{metrics}}") + # return metrics + + # def run_tasks_sequential(self, tasks): + # """Execute task sequentially""" + # results = [] + # for task in tasks: + # result = self.run_task(task) + # results.append(result) + # return results + + # def run_tasks_parallel(self, tasks): + # """Execute tasks in parallel using ThreadPoolExecutor""" + # results = [] + # max_workers = min(self.config.get('max_workers', 4), len(tasks)) + # with ThreadPoolExecutor(max_workers = max_workers) as executor: + # future_to_task = {{executor.submit(self.run_task, task): task for task in tasks}} + # for future in as_completed(future_to_task): + # try: + # result = future.result(timeout = 
self.config.get('task_timeout', 30)) + # results.append(result) + # except Exception as e: + # #Create error result for failed failure + # task = future_to_task[future] + # error_result = TaskResult( + # task_id = task.get('id', 0), + # status = TaskStatus.FAILED, + # execution_time = 0, + # success = False, + # error = f"Future execution failed: {{str(e)}}" + # ) + # results.append(error_result) + # return results + + # def generate_default_tasks(self): + # """Generate default tasks for testing""" + # return [ + # {{ + # "id": i, + # "type": "test", + # "data": f"sample_{{i}}", + # "complexity": (i % 5 ) + 1 + # }} for i in range(10) + # ] + + # def get_system_status(self): + # """Get current system status and metrics""" + # with self.lock: + # status = {{ + # 'name': self.name, + # 'config': self.config, + # 'metrics': self.metrics.copy(), + # 'success_rate': ( + # self.metrics['successful_tasks'] / self.metrics['total_tasks'] + # if self.metrics['total_tasks'] > 0 else 0 + # ) + # }} + # return status + # ''' + + def generate_agent_code(self, task_info): + """ + Generate agent code using LLM instead of templates + """ + system_prompt = """ + You are an expert in building AI research agents for complex analytical tasks. +Generate production-ready Python code implementing a multi-step research agent capable of: +- Breaking down complex research questions +- Gathering and synthesizing information +- Producing comprehensive, insightful answers +Follow best practices with proper error handling and documentation. + """ + user_prompt = f"""Generate a research agent for DeepResearch Bench evaluation. +**Research Task:** +- Domain: {task_info['domain']} +- Question: {task_info['question']} +- Sub-Question: {task_info['sub_question']} +- Required Capabilities: {task_info['required capabilities']} +- Complexity : {task_info['complexity']} + +**Agent Requirements:** + +1. Class: ResearchAgent with methods: + - __init__(config: Dict = None) + - research(question: str) -> Dict: Main research pipeline + - decompose_question(question: str) -> List[str]: Break into sub-questions + - gather_information(sub_q: str) -> Dict: Information gathering + - analyze_information(info_list: List[Dict]) -> Dict: Analysis + - synthesize_answer(analyses: List[Dict]) -> str: Final synthesis + +2. Research Pipeline: + Step 1: Question Decomposition - Break complex question into manageable sub-questions + Step 2: Information Gathering - For each sub-question, gather relevant information + Step 3: Analysis - Analyze gathered information with causal reasoning + Step 4: Synthesis - Integrate analyses into comprehensive answer + +3. Output Format (critical for evaluation): + {{ + "answer": str, # Main comprehensive answer + "sub_answers": [ # Answers to each sub-question + {{"question": str, "answer": str, "evidence": List[str]}} + ], + "reasoning": str, # Explanation of research approach and logic + "evidence": List[str], # Supporting citations/sources + "confidence": float, # 0-1 confidence score + "metadata": {{ # Additional research metadata + "domain": str, + "approach": str, + "limitations": List[str] + }} + }} + +4. Domain-Specific Research Strategies: + - Biology: Focus on mechanisms, evidence from studies + - Business: Market analysis, competitive factors, financial data + - Computer Science: Technical analysis, algorithmic reasoning + - General: Structured, logical approach + +5. 
Implementation Requirements: + - Use logging for tracking research process + - Handle edge cases and missing information gracefully + - Support different research strategies per domain + - Include reasoning traces for explainability + - Modular, extensible architecture + +Generate ONLY the Python code without markdown blocks or explanations.""" + + try: + logger.info("Calling LLM to generate research agent code...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self._clean_llm_code_response(response) + self._validate_python_syntax(code, "agent.py") + + logger.info(f"Generated research agent code ({len(code)} chars)") + return code + + except Exception as e: + logger.error(f"LLM research agent generation failed: {e}") + logger.warning("Using fallback research agent template") + return self._get_fallback_research_agent(task_info) + + + + # def generate_train_script(self, context): + # """ + # Generate enhanced training/execution script + # """ + # hypothesis = context.get('hypothesis', 'Improve agentic system performance') + # task_id = context.get('task_id', 'unknown') + # task_domain = context.get('task_domain', 'general') + # evaluation_criteria = context.get('evaluation_criteria', {}) + + # enable_parallel = 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower() + # enable_optimization = 'optimization' in hypothesis.lower() or 'optimize' in hypothesis.lower() + # max_workers = 8 if enable_parallel else 4 + # task_timeout = 60 if enable_optimization else 30 + + # return f'''""" + # CoSTEER-Enhanced Training/Execution Script for Agentic System + # Task ID: {task_id} + # Domain: {task_domain} + # Hypothesis: {hypothesis} + + # This script evaluates outputs according to DeepResearch Bench standards: + # - Comprehensiveness (0-10): Coverage and depth + # - Insight (0-10): Causal reasoning and originality + # - Instruction Following (0-10): Task compliance + # - Readability (0-10): Clarity and presentation + # """ + # import json + # import sys + # import time + # import traceback + # from pathlib import Path + # from typing import Dict, List, Any, Optional + # from dataclasses import dataclass, asdict + # from agent import AgenticSystem + + # @dataclass + # class EvaluationScore: + # """DeepResearch Bench evaluation score""" + # comprehensiveness: float = 0.0 # 0-10 + # insight: float = 0.0 # 0-10 + # instruction_following: float = 0.0 # 0-10 + # readability: float = 0.0 # 0-10 + # overall: float = 0.0 # Weighted average + + # # Dimension weights (customizable per task) + # weights: Dict[str, float] = None + + # def __post_init__(self): + # if self.weights is None: + # # Default equal weights + # self.weights = {{ + # 'comprehensiveness': 0.25, + # 'insight': 0.25, + # 'instruction_following': 0.25, + # 'readability': 0.25 + # }} + + # def calculate_overall(self) -> float: + # """Calculate weighted overall score""" + # self.overall = ( + # self.comprehensiveness * self.weights['comprehensiveness'] + + # self.insight * self.weights['insight'] + + # self.instruction_following * self.weights['instruction_following'] + + # self.readability * self.weights['readability'] + # ) + # return self.overall + + # def to_dict(self) -> Dict[str, Any]: + # """Convert to dictionary""" + # return {{ + # 'comprehensiveness': round(self.comprehensiveness, 2), + # 'insight': round(self.insight, 2), + # 'instruction_following': round(self.instruction_following, 2), + # 
'readability': round(self.readability, 2), + # 'overall': round(self.overall, 2), + # 'weights': self.weights + # }} + + # class ResearchOutputEvaluator: + # """Evaluate research outputs according to DeepResearch Bench standards""" + + # def __init__(self, task_domain: str = 'general'): + # self.task_domain = task_domain + # self.evaluation_log = [] + + # def evaluate_comprehensiveness(self, output: Dict[str, Any], task_requirements: Dict) -> float: + # """ + # Evaluate comprehensiveness (0-10) + # - Breadth and depth of content + # - Coverage of required subtopics + # - Evidence and data sources + # - Multiple perspectives + # """ + # score = 0.0 + # checks = [] + + # # Check 1: Coverage of key topics (0-3 points) + # required_topics = task_requirements.get('required_topics', []) + # if required_topics: + # covered = sum(1 for topic in required_topics + # if self._check_topic_coverage(output, topic)) + # coverage_score = min(3.0, (covered / len(required_topics)) * 3.0) + # score += coverage_score + # checks.append(f"Topic coverage: {{covered}}/{{len(required_topics)}} ({{coverage_score:.1f}}/3.0)") + # else: + # score += 2.0 # Default if no specific requirements + # checks.append("No specific topic requirements (default 2.0/3.0)") + + # # Check 2: Depth of analysis (0-3 points) + # depth_indicators = [ + # 'detailed analysis' in str(output).lower(), + # 'data' in output or 'evidence' in output, + # len(str(output)) > 500, # Substantial content + # 'methodology' in str(output).lower() or 'approach' in str(output).lower() + # ] + # depth_score = sum(depth_indicators) * 0.75 + # score += depth_score + # checks.append(f"Depth indicators: {{sum(depth_indicators)}}/4 ({{depth_score:.1f}}/3.0)") + + # # Check 3: Evidence and sources (0-2 points) + # evidence_score = 0.0 + # if 'references' in output or 'sources' in output: + # evidence_score += 1.0 + # if 'data' in output or 'statistics' in output: + # evidence_score += 1.0 + # score += evidence_score + # checks.append(f"Evidence & sources: {{evidence_score:.1f}}/2.0") + + # # Check 4: Multiple perspectives (0-2 points) + # perspective_keywords = ['advantage', 'disadvantage', 'trade-off', 'alternative', + # 'limitation', 'consideration'] + # perspectives_found = sum(1 for kw in perspective_keywords + # if kw in str(output).lower()) + # perspective_score = min(2.0, perspectives_found * 0.5) + # score += perspective_score + # checks.append(f"Multiple perspectives: {{perspectives_found}} keywords ({{perspective_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'comprehensiveness', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def evaluate_insight(self, output: Dict[str, Any], task_context: Dict) -> float: + # """ + # Evaluate insight (0-10) + # - Causal reasoning and why-think + # - Quantified analysis + # - Non-obvious implications + # - Novel synthesis + # """ + # score = 0.0 + # checks = [] + + # # Check 1: Causal reasoning (0-3 points) + # causal_indicators = [ + # 'because' in str(output).lower(), + # 'therefore' in str(output).lower(), + # 'as a result' in str(output).lower(), + # 'leads to' in str(output).lower(), + # 'causes' in str(output).lower(), + # 'impacts' in str(output).lower() + # ] + # causal_score = min(3.0, sum(causal_indicators) * 0.6) + # score += causal_score + # checks.append(f"Causal reasoning: {{sum(causal_indicators)}} indicators ({{causal_score:.1f}}/3.0)") + + # # Check 2: Quantified analysis (0-2 points) + # has_numbers = any(char.isdigit() for char in 
str(output)) + # has_metrics = any(word in str(output).lower() + # for word in ['percent', 'rate', 'ratio', 'metric', 'measure']) + # quant_score = (1.0 if has_numbers else 0) + (1.0 if has_metrics else 0) + # score += quant_score + # checks.append(f"Quantified analysis: numbers={{has_numbers}}, metrics={{has_metrics}} ({{quant_score:.1f}}/2.0)") + + # # Check 3: Non-obvious implications (0-3 points) + # insight_keywords = ['implication', 'insight', 'suggests', 'indicates', + # 'reveals', 'unexpected', 'surprisingly', 'notable'] + # insights_found = sum(1 for kw in insight_keywords if kw in str(output).lower()) + # implication_score = min(3.0, insights_found * 0.5) + # score += implication_score + # checks.append(f"Implications: {{insights_found}} keywords ({{implication_score:.1f}}/3.0)") + + # # Check 4: Novel synthesis (0-2 points) + # synthesis_indicators = [ + # 'framework' in str(output).lower(), + # 'model' in str(output).lower(), + # 'synthesis' in str(output).lower(), + # 'integration' in str(output).lower() + # ] + # synthesis_score = min(2.0, sum(synthesis_indicators) * 0.7) + # score += synthesis_score + # checks.append(f"Novel synthesis: {{sum(synthesis_indicators)}} indicators ({{synthesis_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'insight', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def evaluate_instruction_following(self, output: Dict[str, Any], + # task_requirements: Dict) -> float: + # """ + # Evaluate instruction following (0-10) + # - Answers all sub-questions + # - Respects scope and constraints + # - Required deliverables present + # - Avoids out-of-scope content + # """ + # score = 0.0 + # checks = [] + + # # Check 1: All required sections present (0-4 points) + # required_sections = task_requirements.get('required_sections', []) + # if required_sections: + # present = sum(1 for section in required_sections + # if self._check_section_present(output, section)) + # section_score = min(4.0, (present / len(required_sections)) * 4.0) + # score += section_score + # checks.append(f"Required sections: {{present}}/{{len(required_sections)}} ({{section_score:.1f}}/4.0)") + # else: + # score += 3.0 # Default if no specific requirements + # checks.append("No specific section requirements (default 3.0/4.0)") + + # # Check 2: Scope compliance (0-3 points) + # scope_violations = self._check_scope_violations(output, task_requirements) + # scope_score = max(0.0, 3.0 - len(scope_violations) * 0.5) + # score += scope_score + # if scope_violations: + # checks.append(f"Scope violations: {{len(scope_violations)}} ({{scope_score:.1f}}/3.0)") + # else: + # checks.append("No scope violations (3.0/3.0)") + + # # Check 3: Format compliance (0-2 points) + # format_requirements = task_requirements.get('format', {{}}) + # format_score = 2.0 # Default + # if format_requirements: + # format_checks = [ + # self._check_format_requirement(output, req, val) + # for req, val in format_requirements.items() + # ] + # format_score = min(2.0, sum(format_checks) * 0.5) + # score += format_score + # checks.append(f"Format compliance: ({{format_score:.1f}}/2.0)") + + # # Check 4: Completeness (0-1 point) + # completeness_score = 1.0 if len(str(output)) > 200 else 0.5 + # score += completeness_score + # checks.append(f"Completeness: ({{completeness_score:.1f}}/1.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'instruction_following', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def 
evaluate_readability(self, output: Dict[str, Any]) -> float: + # """ + # Evaluate readability (0-10) + # - Clear structure and organization + # - Fluent language + # - Effective data presentation + # - Proper formatting + # """ + # score = 0.0 + # checks = [] + + # output_str = str(output) + + # # Check 1: Structure and organization (0-3 points) + # structure_indicators = [ + # '\\n' in output_str, # Line breaks + # any(word in output_str for word in ['Summary', 'Introduction', 'Conclusion']), + # len(output_str.split('\\n')) > 5, # Multiple paragraphs + # ] + # structure_score = min(3.0, sum(structure_indicators) * 1.0) + # score += structure_score + # checks.append(f"Structure: {{sum(structure_indicators)}} indicators ({{structure_score:.1f}}/3.0)") + + # # Check 2: Language quality (0-3 points) + # # Simple heuristics for language quality + # avg_word_length = sum(len(word) for word in output_str.split()) / max(len(output_str.split()), 1) + # has_variety = len(set(output_str.lower().split())) / max(len(output_str.split()), 1) > 0.5 + + # language_score = 0.0 + # if 4 < avg_word_length < 7: # Reasonable word length + # language_score += 1.5 + # if has_variety: # Vocabulary variety + # language_score += 1.5 + + # score += language_score + # checks.append(f"Language quality: avg_word_len={{avg_word_length:.1f}}, variety={{has_variety}} ({{language_score:.1f}}/3.0)") + + # # Check 3: Data presentation (0-2 points) + # has_formatting = any(marker in output_str for marker in ['|', ':', '-', '*']) + # has_lists = output_str.count('\\n') > 3 + # presentation_score = (1.0 if has_formatting else 0) + (1.0 if has_lists else 0) + # score += presentation_score + # checks.append(f"Data presentation: formatting={{has_formatting}}, lists={{has_lists}} ({{presentation_score:.1f}}/2.0)") + + # # Check 4: Clarity (0-2 points) + # clarity_score = 2.0 + # # Penalize if too short or too verbose + # if len(output_str) < 100: + # clarity_score = 0.5 + # elif len(output_str) > 5000: + # clarity_score = 1.5 + + # score += clarity_score + # checks.append(f"Clarity: length={{len(output_str)}} chars ({{clarity_score:.1f}}/2.0)") + + # self.evaluation_log.append({{ + # 'dimension': 'readability', + # 'score': score, + # 'checks': checks + # }}) + + # return min(10.0, score) + + # def _check_topic_coverage(self, output: Dict, topic: str) -> bool: + # """Check if topic is covered in output""" + # return topic.lower() in str(output).lower() + + # def _check_section_present(self, output: Dict, section: str) -> bool: + # """Check if required section is present""" + # return section.lower() in str(output).lower() + + # def _check_scope_violations(self, output: Dict, requirements: Dict) -> List[str]: + # """Check for scope violations""" + # violations = [] + # # Add specific violation checks based on requirements + # return violations + + # def _check_format_requirement(self, output: Dict, requirement: str, value: Any) -> bool: + # """Check specific format requirement""" + # # Implement format checking logic + # return True + + # def evaluate_all(self, output: Dict[str, Any], + # task_requirements: Dict, + # task_context: Dict, + # dimension_weights: Optional[Dict[str, float]] = None) -> EvaluationScore: + # """Evaluate all dimensions and calculate overall score""" + + # score = EvaluationScore(weights=dimension_weights) + + # score.comprehensiveness = self.evaluate_comprehensiveness(output, task_requirements) + # score.insight = self.evaluate_insight(output, task_context) + # score.instruction_following = 
self.evaluate_instruction_following(output, task_requirements) + # score.readability = self.evaluate_readability(output) + # score.calculate_overall() + + # return score + + # def main(): + # """Main execution function with DeepResearch Bench evaluation""" + # try: + # print("=" * 60) + # print("CoSTEER Agentic System Execution Started") + # print("Task ID: {task_id}") + # print("Domain: {task_domain}") + # print("=" * 60) + + # execution_start = time.time() + + # # Initialize agent with configuration + # config = {{ + # 'max_workers': {max_workers}, + # 'enable_parallel': {enable_parallel}, + # 'enable_optimization': {enable_optimization}, + # 'task_timeout': {task_timeout} + # }} + + # print(f"Configuration: {{json.dumps(config, indent=2)}}") + # agent = AgenticSystem(config) + # print(f"Initialized: {{agent.name}}") + + # # Run tasks and collect results + # print("\\nExecuting tasks...") + # results = agent.run_tasks() + + # # Prepare task requirements for evaluation + # task_requirements = {{ + # 'required_topics': ['task execution', 'performance metrics'], + # 'required_sections': ['results', 'metrics'], + # 'format': {{'type': 'json'}} + # }} + + # task_context = {{ + # 'domain': '{task_domain}', + # 'hypothesis': '{hypothesis}' + # }} + + # # Evaluate using DeepResearch Bench standards + # print("\\nEvaluating results...") + # evaluator = ResearchOutputEvaluator(task_domain='{task_domain}') + + # evaluation_score = evaluator.evaluate_all( + # output=results, + # task_requirements=task_requirements, + # task_context=task_context, + # dimension_weights={evaluation_criteria} if {evaluation_criteria} else None + # ) + + # # Prepare detailed results + # execution_time = time.time() - execution_start + + # detailed_results = {{ + # 'task_info': {{ + # 'task_id': '{task_id}', + # 'domain': '{task_domain}', + # 'hypothesis': '{hypothesis}' + # }}, + # 'execution_results': results, + # 'deepresearch_evaluation': evaluation_score.to_dict(), + # 'evaluation_log': evaluator.evaluation_log, + # 'system_status': agent.get_system_status(), + # 'execution_time': execution_time, + # 'timestamp': time.time() + # }} + + # # Save detailed results to file + # result_file = Path("result.json") + # result_file.write_text(json.dumps(detailed_results, indent=2)) + + # # Print structured output + # print("\\n" + "=" * 60) + # print("EXECUTION RESULTS") + # print("=" * 60) + # print(f"Success Rate: {{results.get('success_rate', 0):.2%}}") + # print(f"Average Task Time: {{results.get('avg_time', 0):.4f}}s") + # print(f"Error Count: {{results.get('error_count', 0)}}") + # print(f"Total Tasks: {{results.get('total_tasks', 0)}}") + # print(f"Total Execution Time: {{execution_time:.2f}}s") + + # print("\\n" + "=" * 60) + # print("DEEPRESEARCH BENCH EVALUATION") + # print("=" * 60) + # print(f"Comprehensiveness: {{evaluation_score.comprehensiveness:.2f}}/10.0") + # print(f"Insight: {{evaluation_score.insight:.2f}}/10.0") + # print(f"Instruction Following: {{evaluation_score.instruction_following:.2f}}/10.0") + # print(f"Readability: {{evaluation_score.readability:.2f}}/10.0") + # print(f"{{'-' * 60}}") + # print(f"Overall Score: {{evaluation_score.overall:.2f}}/10.0") + # print("=" * 60) + + # # Print evaluation details + # print("\\nEvaluation Details:") + # for log_entry in evaluator.evaluation_log: + # print(f"\\n{{log_entry['dimension'].upper()}}:") + # for check in log_entry['checks']: + # print(f" - {{check}}") + + # # JSON output for automated parsing + # print("\\n" + "=" * 60) + # 
print("JSON_RESULTS_START") + # print(json.dumps(detailed_results, indent=2)) + # print("JSON_RESULTS_END") + # print("=" * 60) + + # return 0 + + # except Exception as e: + # print(f"\\nERROR: Execution failed - {{str(e)}}", file=sys.stderr) + # print("\\nError Details:") + # traceback.print_exc() + + # error_result = {{ + # 'task_info': {{ + # 'task_id': '{task_id}', + # 'domain': '{task_domain}' + # }}, + # 'execution_results': {{ + # "success_rate": 0.0, + # "avg_time": float('inf'), + # "error_count": 1, + # "total_tasks": 0 + # }}, + # 'deepresearch_evaluation': {{ + # 'comprehensiveness': 0.0, + # 'insight': 0.0, + # 'instruction_following': 0.0, + # 'readability': 0.0, + # 'overall': 0.0 + # }}, + # "error_reason": str(e), + # "traceback": traceback.format_exc() + # }} + + # # Save error result + # try: + # error_file = Path("error_result.json") + # error_file.write_text(json.dumps(error_result, indent=2)) + # except: + # pass + + # return 1 + + # if __name__ == "__main__": + # exit_code = main() + # sys.exit(exit_code) + # ''' + +# def generate_train_script(self,context): +# """ +# Generate training script using LLM +# """ +# hypothesis = context.get('hypothesis', 'Improve agentic system performance') +# task_id = context.get('task_id', 'unknown') +# task_domain = context.get('task_domain','general') +# system_prompt = """You are an expert in creating experiment training scripts. Generate clean, executable +# python code that test AgenticSystem.""" +# user_prompt = f"""Generate a training script (train.py) for an agentic system experiment. +# **Context:** +# - Task ID: {task_id} +# - Domain: {task_domain} +# - Hypothesis: {hypothesis} + +# **Requirements:** +# 1. Import AgenticSystem from agent.py +# 2. Run tasks and collect results +# 3. Print results between JSON_RESULTS_START and JSON_RESULTS_END markers +# 4. Include DeepResearch Bench evaluation scores + +# Generate ONLY the Python code without markdown blocks.""" +# try: +# logger.info("Calling LLM to generate train script") +# response = self.api_backend.build_messages_and_create_chat_completion( +# user_prompt = user_prompt, +# system_prompt = system_prompt, +# json_mode = False +# ) +# code = self.clean_llm_response(response) +# self.validate_python_syntax(code, "train.py") +# logger.info(f"Successfully generated train script using LLM") +# return code +# except Exception as e: +# logger.error(f"LLM train script generation failed: {str(e)}") +# logger.warning("Falling back to template-based generation") +# return self.get_fallback_train_code(hypothesis, task_id) + + + def generate_execution_script(self, task_info): + """ + generate training script using LLM + """ + system_prompt = """You are an expert in creating research execution and evaluation scripts. +Generate Python code that runs a research agent and evaluates outputs using DeepResearch Bench standards.""" + user_prompt = f"""Generate a execution script (train.py) for an agentic system experiment task. +**Task**: +- Task ID: {task_info.get('task_id', 'unknown')} +- Domain: {task_info.get('task_domain', 'general')} +- Question: {task_info.get('question', 'N/A')} +- Competition: {task_info.get('competition', 'deepresearch')} + +**Script Requirements:** + +1. Import modules: + - from agent import ResearchAgent + - from evaluator import DeepResearchEvaluator + - Standard libraries: json, sys, time, pathlib + +2. 
Main execution flow: + a) Initialize ResearchAgent + b) Execute research on the question + c) Evaluate output using DeepResearchEvaluator + d) Save results to result.json + e) Print structured output + +3. DeepResearch Bench Evaluation: + Evaluate on 4 dimensions (0-10 each): + - Comprehensiveness: Coverage, depth, evidence, perspectives + - Insight: Causal reasoning, quantified analysis, non-obvious implications + - Instruction Following: Completeness, scope compliance + - Readability: Structure, language quality, clarity + +4. Output Format (between JSON_RESULTS_START/END markers): + {{ + "task_info": {{ + "task_id": str, + "domain": str, + "question": str + }}, + "research_output": {{ + "answer": str, + "sub_answers": List[Dict], + "reasoning": str, + "evidence": List[str], + "confidence": float + }}, + "evaluation_scores": {{ + "comprehensiveness": float, # 0-10 + "insight": float, # 0-10 + "instruction_following": float, # 0-10 + "readability": float, # 0-10 + "overall": float # Weighted average + }}, + "execution_time": float + }} + +5. Error Handling: + - Wrap in try-except + - Log errors to stderr + - Return appropriate exit codes + - Save error details to error_result.json + +Generate ONLY the Python code without markdown blocks.""" + + try: + logger.info("Calling LLM to generate execution script...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self.clean_llm_code_response(response) + self.validate_python_syntax(code, "train.py") + + logger.info("Generated research execution script") + return code + + except Exception as e: + logger.error(f"LLM execution script generation failed: {e}") + return self.get_fallback_execution_script(task_info) + + def generate_deepresearch_evaluator_code(self, task_info: Dict) -> str: + """Generate DeepResearch Bench evaluator""" + + system_prompt = """You are an expert in research quality evaluation. +Generate Python code implementing the DeepResearch Bench evaluation framework with detailed scoring rubrics.""" + + user_prompt = f"""Generate evaluator module (evaluator.py) for DeepResearch Bench. + +**Evaluator Requirements:** + +1. Class: DeepResearchEvaluator with methods: + - evaluate_comprehensiveness(output, requirements) -> float (0-10) + - evaluate_insight(output, context) -> float (0-10) + - evaluate_instruction_following(output, task) -> float (0-10) + - evaluate_readability(output) -> float (0-10) + - evaluate(output, task_requirements, task_context) -> EvaluationResult + +2. 
Scoring Rubrics: + + **Comprehensiveness (0-10):** + - Coverage of sub-topics (0-3): All required topics addressed + - Depth of analysis (0-3): Detailed, not superficial + - Evidence and sources (0-2): Citations, data, references + - Multiple perspectives (0-2): Diverse viewpoints + + **Insight (0-10):** + - Causal reasoning (0-3): Why-think, cause-effect relationships + - Quantified analysis (0-2): Numbers, metrics, measurements + - Non-obvious implications (0-3): Insights beyond surface level + - Novel synthesis (0-2): Original frameworks or connections + + **Instruction Following (0-10):** + - All sub-questions answered (0-4): Completeness check + - Scope compliance (0-3): Stays within bounds + - Format requirements (0-2): Structure, deliverables + - Completeness (0-1): Nothing major missing + + **Readability (0-10):** + - Clear structure (0-3): Organized, logical flow + - Language quality (0-3): Fluent, precise + - Data presentation (0-2): Tables, lists, formatting + - Clarity (0-2): Easy to understand + +3. EvaluationResult class: + - Scores for each dimension (float) + - Overall weighted score (float) + - Detailed feedback per criterion (List[str]) + - Suggestions for improvement (List[str]) + +4. Normalization (if reference provided): + - Scale relative to reference performance + - Adjust thresholds based on task difficulty + +Generate ONLY the Python code without markdown blocks.""" + + try: + logger.info("Calling LLM to generate evaluator...") + response = self.api_backend.build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=False + ) + + code = self._clean_llm_code_response(response) + self._validate_python_syntax(code, "evaluator.py") + + logger.info("Generated DeepResearch evaluator") + return code + + except Exception as e: + logger.error(f"LLM evaluator generation failed: {e}") + return self._get_fallback_evaluator(task_info) + + def generate_research_requirements(self, task_info: Dict) -> str: + """Generate requirements for research agent""" + requirements = [ + "# DeepResearch Bench Requirements", + "", + "# No external dependencies required", + "# Use Python standard library only:", + "# - json: JSON serialization", + "# - logging: Logging research process", + "# - time: Timing measurements", + "# - typing: Type hints", + "# - dataclasses: Data structures", + "# - pathlib: File operations", + "" + ] + + # Optional: domain-specific suggestions (commented out) + domain = task_info['domain'] + if domain in ['biology', 'medicine']: + requirements.append("# Optional (if needed): biopython") + elif domain == 'data_analysis': + requirements.append("# Optional (if needed): numpy, pandas") + + return "\n".join(requirements) + + def parse_research_output(self, stdout: str) -> Dict: + """Parse DeepResearch Bench output""" + try: + import json + import re + + # Look for JSON_RESULTS block + pattern = r'JSON_RESULTS_START\s*(.*?)\s*JSON_RESULTS_END' + match = re.search(pattern, stdout, re.DOTALL) + + if match: + json_str = match.group(1) + result = json.loads(json_str) + + if self._validate_research_result_format(result): + logger.info("Successfully parsed DeepResearch output") + return result + + logger.warning("Could not parse DeepResearch output") + return self._create_default_research_result() + + except Exception as e: + logger.error(f"Failed to parse research output: {e}") + return self._create_default_research_result() + + def _validate_research_result_format(self, result: Dict) -> bool: + """Validate research result 
format""" + required_fields = ['task_info', 'research_output', 'evaluation_scores'] + + for field in required_fields: + if field not in result: + logger.warning(f"Missing field: {field}") + return False + + # Validate evaluation scores + eval_scores = result['evaluation_scores'] + required_scores = ['comprehensiveness', 'insight', 'instruction_following', + 'readability', 'overall'] + + for score_name in required_scores: + if score_name not in eval_scores: + logger.warning(f"Missing score: {score_name}") + return False + score = eval_scores[score_name] + if not isinstance(score, (int, float)) or not (0 <= score <= 10): + logger.warning(f"Invalid score for {score_name}: {score}") + return False + + return True + + def _create_default_research_result(self) -> Dict: + """Create default research result""" + return { + "task_info": {"task_id": "unknown", "domain": "unknown"}, + "research_output": { + "answer": "Research execution failed", + "sub_answers": [], + "reasoning": "No output generated", + "evidence": [], + "confidence": 0.0 + }, + "evaluation_scores": { + "comprehensiveness": 0.0, + "insight": 0.0, + "instruction_following": 0.0, + "readability": 0.0, + "overall": 0.0 + }, + "execution_time": 0.0 + } + + # Helper methods + def _clean_llm_code_response(self, response: str) -> str: + """Clean LLM response""" + import re + code = re.sub(r'^```python\s*\n', '', response, flags=re.MULTILINE) + code = re.sub(r'^```\s*\n', '', code, flags=re.MULTILINE) + code = re.sub(r'\n```\s*$', '', code, flags=re.MULTILINE) + return code.strip() + + def _validate_python_syntax(self, code: str, filename: str): + """Validate Python syntax""" + try: + compile(code, filename, 'exec') + logger.info(f"Syntax validation passed for {filename}") + except SyntaxError as e: + raise ValueError(f"Syntax error in {filename}: {e}") + + def _get_fallback_research_agent(self, task_info: Dict) -> str: + """Fallback research agent""" + return f'''""" +Research Agent for DeepResearch Bench (Fallback Template) +Domain: {task_info['domain']} +""" +import logging +from typing import Dict, List + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("ResearchAgent") + +class ResearchAgent: + def __init__(self, config: Dict = None): + self.config = config or {{}} + self.domain = "{task_info['domain']}" + logger.info(f"Initialized ResearchAgent for domain: {{self.domain}}") + + def research(self, question: str) -> Dict: + """Conduct research on question""" + logger.info(f"Starting research: {{question[:100]}}...") + + # Decompose + sub_questions = self.decompose_question(question) + logger.info(f"Decomposed into {{len(sub_questions)}} sub-questions") + + # Gather & analyze + sub_answers = [] + for sub_q in sub_questions: + info = self.gather_information(sub_q) + analysis = self.analyze_information([info]) + sub_answers.append({{ + "question": sub_q, + "answer": analysis.get("summary", "Analysis unavailable"), + "evidence": analysis.get("evidence", []) + }}) + + # Synthesize + final_answer = self.synthesize_answer(sub_answers) + + return {{ + "answer": final_answer, + "sub_answers": sub_answers, + "reasoning": "Multi-step research: decompose, gather, analyze, synthesize", + "evidence": [item for sa in sub_answers for item in sa.get("evidence", [])], + "confidence": 0.6, + "metadata": {{ + "domain": self.domain, + "approach": "structured_research", + "limitations": ["Limited information sources", "Basic analysis"] + }} + }} + + def decompose_question(self, question: str) -> List[str]: + """Break question into 
sub-questions""" + return [question] # Simplified: treat as single question + + def gather_information(self, sub_question: str) -> Dict: + """Gather information""" + return {{ + "question": sub_question, + "info": f"Simulated information for: {{sub_question[:50]}}...", + "sources": ["simulated_source_1"] + }} + + def analyze_information(self, info_list: List[Dict]) -> Dict: + """Analyze information""" + return {{ + "summary": f"Analysis of {{len(info_list)}} information pieces", + "evidence": [item.get("sources", ["unknown"])[0] for item in info_list] + }} + + def synthesize_answer(self, analyses: List[Dict]) -> str: + """Synthesize final answer""" + parts = [a.get("answer", "") for a in analyses] + return f"Synthesized answer based on {{len(parts)}} analyses: " + " ".join(parts[:3]) +''' + + def _get_fallback_execution_script(self, task_info: Dict) -> str: + """Fallback execution script""" + return f'''""" +Execution Script for DeepResearch Bench (Fallback) +""" +import json +import sys +import time +from pathlib import Path + +try: + from agent import ResearchAgent + from evaluator import DeepResearchEvaluator +except ImportError as e: + print(f"Import error: {{e}}", file=sys.stderr) + sys.exit(1) + +def main(): + try: + start_time = time.time() + + # Research + agent = ResearchAgent() + question = """{task_info['question']}""" + research_output = agent.research(question) + + # Evaluate + evaluator = DeepResearchEvaluator() + task_requirements = {{"required_sections": ["answer", "reasoning"]}} + task_context = {{"domain": "{task_info['domain']}"}} + + evaluation = evaluator.evaluate( + research_output, + task_requirements, + task_context + ) + + # Prepare results + results = {{ + "task_info": {{ + "task_id": "{task_info['task_id']}", + "domain": "{task_info['domain']}", + "question": question + }}, + "research_output": research_output, + "evaluation_scores": evaluation.to_dict(), + "execution_time": time.time() - start_time + }} + + # Save + Path("result.json").write_text(json.dumps(results, indent=2)) + + # Output + print("\\nJSON_RESULTS_START") + print(json.dumps(results, indent=2)) + print("JSON_RESULTS_END") + + return 0 + + except Exception as e: + print(f"Error: {{e}}", file=sys.stderr) + return 1 + +if __name__ == "__main__": + sys.exit(main()) +''' + + + + + + def needs_config_file(self, context): + """ + Determine if a configuration file is needed + """ + hypothesis = context.get('hypothesis', '') + return any(keyword in hypothesis.lower() for keyword in ['config', 'parameter', 'setting', 'tune']) + + def generate_config_file(self, context): + """ + Generate configuration file + """ + #acquire hypothesis from context and tune config accordingly + hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + #decide default config values based on hypothesis + enable_parallel = 'parallel' in hypothesis.lower() or 'concurrent' in hypothesis.lower() + enable_optimization = 'optimization' in hypothesis.lower() or 'optimize' in hypothesis.lower() + max_workers = 8 if enable_parallel else 4 + task_timeout = 60 if enable_optimization else 30 + batch_size = 20 if enable_optimization else 10 + retry_attempts = 5 if enable_optimization else 3 + + return ''' + """ + CoSTEER Generated Configuration + """ + import os + from dataclasses import dataclass + from typing import Dict, Any + + @dataclass + class AgentSystemConfig: + """Configuration for agentic system""" + #Execution settings + max_workers: int = {max_workers} + task_timeout: float = {task_timeout} + 
enable_parallel: bool = {enable_parallel} + enable_optimization: bool = {enable_optimization} + + # Performance settings + retry_attempts: int = {retry_attempts} + batch_size: int = {batch_size} + + # Logging settings + log_level: str = "INFO" + enable_detailed_logging: bool = True + + @classmethod + def from_env(cls) -> 'AgenticSystemConfig': + """Create config from environment variables""" + return cls( + max_workers = int(os.getenv('AGENT_MAX_WORKERS', '{max_workers}')), + task_timeout = float(os.getenv('AGENT_TASK_TIMEOUT', '{task_timeout}')), + enable_parallel = os.getenv('AGENT_ENABLE_PARALLEL', '{str(enable_parallel).lower()}').lower() == 'true', + enable_optimization = os.getenv('AGENT_ENABLE_OPTIMIZATION', '{str(enable_optimization).lower()}').lower() == 'true', + retry_attempts = int(os.getenv('AGENT_RETRY_ATTEMPTS', '{retry_attempts}')), + batch_size = int(os.getenv('AGENT_BATCH_SIZE', '{batch_size}')), + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert config to dictionary""" + return {{ + 'max_workers': self.max_workers, + 'task_timeout': self.task_timeout, + 'enable_parallel': self.enable_parallel, + 'enable_optimization': self.enable_optimization, + 'retry_attempts': self.retry_attempts, + 'batch_size': self.batch_size, + 'log_level': self.log_level, + 'enable_detailed_logging': self.enable_detailed_logging + }} + + # Default configuration instance + DEFAULT_CONFIG = AgenticSystemConfig() + + #Example Usage + #config = AgenticSystemConfig.from_hypothesis("{hypothesis}") + #config = AgenticSystemConfig.from_env() + ''' + + def create_fallback_workspace(self, exp: Experiment) -> FBWorkspace: + """Create a fallback worksapce in case of errors""" + logger.warning("create fallback workspace due to previous errors") + try: + workspace = FBWorkspace() + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + exp_id = getattr(exp, 'id', 'unknown') + + # Create minimal working files + minimal_files = { + "agent.py": self.get_minimal_agent_code(hypothesis), + "train.py": self.get_minimal_train_code(hypothesis,exp_id), + "requirements.txt": "# Minimal requirements\\n", + "README.md": f"# Fallback Workspace\nExperiment :{exp_id}\n Hypothesis: {hypothesis}\nThis is a fallback workspace with minimal working code." 
+ } + + workspace.inject_files(**minimal_files) + logger.info(f"Created fallback workspace for experiment {exp_id}") + return workspace + + except Exception as e: + logger.error(f"Failed to create fallback workspace: {e}") + raise + + def get_minimal_agent_code(self,hypothesis): + """Get minimal working agent code""" + return f''' + class AgenticSystem: + def __init__(self): + self.name = "MinimalFallbackAgent" + self.hypothesis = "{hypothesis}" + def run_tasks(self): + return {{ + "success_rate": 0.5, + "avg_time": 0.01, + "error_count": 0, + "total_tasks": 1, + "note": "Fallback implementation" + }} + ''' + + def get_minimal_train_code(self,hypothesis, exp_id): + """Get minimal working train code""" + return f''' + import json + from pathlib import Path + from agent import AgenticSystem + def main(): + print("Running fallback Implementation") + print(f"Experiment: {exp_id}") + print(f"Hypothesis: {hypothesis}") + agent = AgenticSystem() + results = agent.run_tasks() + + #Save results + result_file = Path("result.json") + result_file.write_text(json.dumps(results, indent=2)) + + #Print results + print(f"Success Rate: {{results['success_rate']}}") + print(f"Average Time: {{results['avg_time']}}") + print(f"Error Count: {{results['error_count']}}") + print(f"Total Tasks: {{results['total_tasks']}}") + print("=== Fallback Execution Completed ===") + return 0 + if __name__ == "__main__": + exit_code = main() + import sys + sys.exit(exit_code) + ''' + + # # begin drafting + # # NOTE: + # # We should implement CoSTEER here to improve high quality coding ability + # # 1) generate code + # # prompting + # exp.experiment_workspace = FBWorkspace() + # # exp.experiment_workspace.inject_files(**{"": }) + + # # 2) run code + # # prepare environment. + # env = get_agent_sys_env( + # extra_volumes={exp.experiment_workspace.workspace_path: "/....."}, + # # ..... + # ) + + # env.run(entry="", ...) + + # # Please refer to the following code for details. 
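+ # # A minimal sketch (an assumption, mirroring the dev_new.py flow later in this diff) of how
+ # # the drafted steps above could be wired together; `agent_code` / `train_code` are placeholder
+ # # strings produced by the prompting step, not existing variables:
+ # exp.experiment_workspace = FBWorkspace()
+ # exp.experiment_workspace.inject_files(**{"agent.py": agent_code, "train.py": train_code})
+ # env = get_agent_sys_env(running_timeout_period=600, enable_cache=True)
+ # run_res = exp.experiment_workspace.run(env=env, entry="python train.py")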
+ # # [[rdagent/components/coder/data_science/conf.py:41]] + + + # # end drafting + # try: + # #acquire workspace + # ws_path = self.get_workspace_path(exp) + # #create workspace directory + # ws_path.mkdir(parents=True, exist_ok=True) + # #generate code + # self.generate_files(ws_path, exp) + # logger.info(f"Code generation as workspace at {ws_path}") + + # except Exception as e: + # logger.error(f"Code generation failed: {str(e)}") + # exp.exception = e + + # return exp + + def get_workspace_path(self, exp: Experiment): + ''' + Get workspace path for the experiment + ''' + if hasattr(exp, 'experiment_workspace') and exp.experiment_workspace: + return Path(exp.experiment_workspace.workspace_path) + + base = Path("./workspace") + base.mkdir(exist_ok=True) + return base / f"exp_{exp.id}" + + def generate_files(self, ws_path, exp): + ''' + Generate necessary files for the agentic system experiment and write file to disk + ''' + # Dummy agent code + (ws_path / "agent.py").write_text( + self.get_agent_template(exp) + ) + + # train.py (execute entry point) + (ws_path / "train.py").write_text( + self.get_train_template() + ) + + #requirements.txt + (ws_path / "requirements.txt").write_text( + "#Add dependencies here\n" + ) + + def get_agent_template(self, exp): + "generate agent code template" + hypothesis = getattr(exp, 'hypothesis', 'Improve system performance') + return f''' + """ + Agentic System Implementation + Hypothesis: {hypothesis} + """ + import time + from typing import Dict, List, Any + + class AgenticSystem: + """Agentic System for task execution""" + + def __init__(self): + self.name = "AgenticSystem" + self.task_count = 0 + + def run_task(self, task: Dict[str, Any]) -> Dict[str, Any]: + """Run a single task and return its result""" + start_time = time.time() + try: + task_id = task.get('id', self.task_count) + self.task_count += 1 + result = {{ + "task_id": task_id, + "success": True, + "time": time.time() - start_time, + "error": None + }} + except Exception as e: + result = {{ + "task_id": task_id, + "success": False, + "time": time.time() - start_time, + "error": str(e) + }} + return result + def run_tasks(self, tasks: List[Dict] = None): + """Run multiple tasks and collect results""" + if tasks is None: + tasks = [ + {{"id": i, "type": "test", "data": f"sample{{i}}"}} + for i in range(10) + ] + + results = [] + for task in tasks: + results.append(self.run_task(task)) + + # Calculate metrics + success_count = sum(1 for r in results if r["success"]) + total_time = sum(r["time"] for r in results) + error_count = sum(1 for r in results if r["error"]) + + return {{ + "success_rate": success_count / len(results) if results else 0, + "avg_time": total_time / len(results) if results else 0, + "error_count": error_count, + "total_tasks": len(results) + }} + ''' + + def get_train_template(self): + """generate execution template""" + return '''""" + Training/Execution script for Agentic System, this is the entry point + that will be executed by the runner. 
+ """ + import json + import sys + from pathlib import Path + from agent import AgenticSystem + + def main(): + """Main execution function""" + try: + print("Starting Agentic System execution...") + # Initialize agent + agent = AgenticSystem() + # Run tasks + results = agent.run_tasks() + + # Save results to file (for backup parsing) + result_file = Path("result.json") + result_file.write_text(json.dumps(results, indent = 2)) + + #Print for logging + print("execution completed") + print(f"Success Rate: {results['success_rate']}") + print(f"Average Time: {results['avg_time']}") + print(f"Error Count: {results['error_count']}") + print(f"Total Tasks: {results['total_tasks']}") + + return 0 + + except Exception as e: + print(f"Execution failed: {str(e)}") + import traceback + traceback.print_exc() + return 1 + + if __name__ == "__main__": + main() + ''' + + +class AgenticSysRunner(Developer[Experiment]): + """execute code generated by AgenticSysCoder""" + + def __init__(self, scen): + self.scen = scen + + def develop(self, exp: Experiment) -> Experiment: + # TODO: implement the runner + """ + execute the experiment + steps: + 1. acquire workspace + 2. execute test.py + 3. parse output + 4. collect performance metrics + 5. record logs + """ + logger.info("Starting experiment execution") + # try: + # # acquire workspace + # ws_path = self.get_workspace_path(exp) + # logger.info(f"Using workspace at {ws_path}") + # # validate necessary files + # self.validate_workspace(ws_path) + # #execute experiment + # stdout, stderr = self.execute_experiment(ws_path) + # #parse result + # result = self.parse_execution_output(stdout, stderr) + # exp.result = result + # # record execution logs + # self._log_execution_results(exp, result) + # logger.info("Experiment completed successfully") + # except Exception as e: + # logger.error(f"Experiment execution failed: {str(e)}") + # exp.exception = e + # exp.result = self.create_error_result(str(e)) + # return exp + try: + if not self.has_valid_workspace(exp): + logger.info("Workspace is not ready, calling coder to generate code") + coder = AgenticSysCoder(self.scen) + exp = coder.develop(exp) + #check if coder succeeded + if not self.has_valid_workspace(exp): + raise RuntimeError("Coder failed to generate valid workspace") + #1. acquire workspace + ws_path = self.get_workspace_path(exp) + logger.info(f"Using workspace at {ws_path}") + + #2. validate necessary files + self.validate_workspace(ws_path) + + #3. execute experiment + stdout, stderr = self.execute_experiment(ws_path) + + #4. parse result + result = self.parse_execution_output(stdout, stderr) + exp.result = result + + #5. 
record execution logs + self.log_execution_results(exp, result) + logger.info("Experiment completed successfully") + + except Exception as e: + logger.error(f"Experiment execution failed: {str(e)}") + exp.exception = e + exp.result = self.create_error_result(str(e)) + return exp + + def has_valid_workspace(self, exp: Experiment): + """check if experiment has valid workspace with required files""" + try: + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + return False + ws_path = Path(exp.experiment_workspace.workspace_path) + if not ws_path.exists(): + return False + #check for required files + required_files = ["train.py", "agent.py"] + for file_name in required_files: + if not (ws_path / file_name).exists(): + return False + return True + except Exception as e: + logger.warning(f"Error checking workspace validity : {(e)}") + return False + + def get_workspace_path(self, exp): + ''' + Get workspace path for the experiment + ''' + if hasattr(exp, 'experiment_workspace') and exp.experiment_workspace: + return Path(exp.experiment_workspace.workspace_path) + # Default workspace path + base = Path("./workspace") + return base / f"exp_{exp.id}" + + def validate_workspace(self, ws_path: Path): + """Validate necessary files in the workspace""" + if not ws_path.exists(): + raise FileNotFoundError(f"Workspace path {ws_path} does not exist.") + + # examine necessary files + required_files = ["train.py", "agent.py"] + missing_files = [] + + for file_name in required_files: + file_path = ws_path / file_name + if not file_path.exists(): + missing_files.append(file_name) + + if missing_files: + raise FileNotFoundError(f"Missing required files in workspace {ws_path}: {', '.join(missing_files)}") + + logger.info("workspace validation passed: {ws_path}") + + def execute_experiment(self, ws_path: Path, timeout: int = 300): + """Execute the experiment by running train.py""" + cmd = [sys.executable, "train.py"] + # use environment variables if necessary + env = self.prepare_environment() + + logger.info(f"Executing: {' '.join(cmd)} in {ws_path}") + + try: + # pass in environment variables if necessary + result = subprocess.run( + cmd, + cwd=str(ws_path), + capture_output=True, + text=True, + timeout=timeout, + env=env + ) + + logger.info(f"Process completed with return code: {result.returncode}") + + if result.returncode != 0: + logger.warning(f"Process exited with non-zero code: {result.returncode}") + + return result.stdout, result.stderr + + except subprocess.TimeoutExpired as e: + logger.error(f"Execution timed out after {timeout} seconds") + raise RuntimeError(f"Execution timeout: {timeout}s") from e + + except Exception as e: + logger.error(f"Execution failed with exception: {str(e)}") + raise RuntimeError(f"Execution error: {str(e)}") from e + + def prepare_environment(self): + """Prepare execution environment""" + import os + env = os.environ.copy() + # Add any necessary environment variables here + if 'PYTHONPATH' in env: + env['PYTHONPATH'] = f"{os.getcwd()}:{env['PYTHONPATH']}" + else: + env['PYTHONPATH'] = os.getcwd() + return env + + def parse_execution_output(self, stdout: str, stderr: str): + """Parse execution output including DeepResearch Bench evaluation scores""" + try: + # Method 1: Look for JSON block with evaluation scores + result = self.parse_json_results(stdout) + if result: + return result + + # Method 2: Look up result file + result = self.parse_result_file() + if result: + return result + + # Method 3: Parse from stdout text + result = 
self.parse_text_output(stdout) + if result: + return result + + logger.warning("Could not parse execution output, using default result") + return self.create_default_result( + success=False, + reason="Could not parse output" + ) + + except Exception as e: + logger.error(f"Failed to parse output: {e}") + return self.create_error_result(f"Parsing error: {e}") + + def parse_json_results(self, stdout: str): + """Parse JSON results block from stdout""" + try: + import json + import re + + # Look for JSON_RESULTS block + json_pattern = r'JSON_RESULTS_START\s*(.*?)\s*JSON_RESULTS_END' + match = re.search(json_pattern, stdout, re.DOTALL) + + if match: + json_str = match.group(1) + result = json.loads(json_str) + + # Validate and extract both execution and evaluation results + if self.validate_deepresearch_result(result): + logger.info("Successfully parsed DeepResearch Bench results") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse JSON results: {e}") + return None + + def parse_structured_output(self, stdout:str): + """Parse structured JSON output """ + try: + import json + import re + # Look for JSON blocks in stdout + json_pattern = r'\{[^{}]*"success_rate"[^{}]*\}' + matches = re.findall(json_pattern, stdout, re.DOTALL) + + for match in matches: + try: + result = json.loads(match) + # Validate result format + if self.validate_result_format(result): + logger.info("Successfully parsed structured output") + return result + except json.JSONDecodeError: + continue + + return None + + except Exception as e: + logger.warning(f"Failed to parse structured output: {e}") + return None + + def validate_deepresearch_result(self, result): + """validate DeepResearch Bench result format""" + try: + # Check execution results + if 'execution_results' not in result: + return False + + exec_results = result['execution_results'] + required_exec_fields = ['success_rate', 'avg_time', 'error_count'] + for field in required_exec_fields: + if field not in exec_results: + return False + + # Check evaluation scores + if 'deepresearch_evaluation' not in result: + return False + + eval_scores = result['deepresearch_evaluation'] + required_eval_fields = ['comprehensiveness', 'insight', + 'instruction_following', 'readability', 'overall'] + for field in required_eval_fields: + if field not in eval_scores: + return False + # Validate score range + score = eval_scores[field] + if not isinstance(score, (int, float)) or not (0 <= score <= 10): + return False + + return True + + except Exception: + return False + + + + def parse_text_output(self, stdout: str): + """Parse text output using regex""" + try: + import re + + # Extract metrics using regex + success_match = re.search(r'Success Rate:\s*([0-9.]+)', stdout, re.IGNORECASE) + time_match = re.search(r'Average Time:\s*([0-9.]+)', stdout, re.IGNORECASE) + error_match = re.search(r'Error Count:\s*([0-9]+)', stdout, re.IGNORECASE) + task_match = re.search(r'Total Tasks:\s*([0-9]+)', stdout, re.IGNORECASE) + + if success_match: + result = { + "success_rate": float(success_match.group(1)), + "avg_time": float(time_match.group(1)) if time_match else 0.0, + "error_count": int(error_match.group(1)) if error_match else 0, + "total_tasks": int(task_match.group(1)) if task_match else 0 + } + logger.info("Successfully parsed text output") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse text output: {e}") + return None + + def parse_result_file(self): + """Parse result from JSON file""" + try: + 
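+ # Expected result.json shape (illustrative example): + # {"success_rate": 0.9, "avg_time": 0.12, "error_count": 1, "total_tasks": 10} + # validate_result_format() below rejects files where success_rate falls outside [0, 1] or + # avg_time / error_count are negative; in that case the caller moves on to the next parsing + # strategy and eventually to create_default_result().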
import json + + possible_paths = ["result.json", "output.json", "results.json"] + + for file_name in possible_paths: + file_path = Path(file_name) + if file_path.exists(): + content = file_path.read_text(encoding='utf-8') + result = json.loads(content) + + if self.validate_result_format(result): + logger.info(f"Successfully parsed result file: {file_path}") + return result + + return None + + except Exception as e: + logger.warning(f"Failed to parse result file: {e}") + return None + + def validate_result_format(self, result: dict) -> bool: + """Validate result format""" + required_fields = ["success_rate", "avg_time", "error_count"] + + for field in required_fields: + if field not in result: + return False + if not isinstance(result[field], (int, float)): + return False + + # Check value ranges + if not (0.0 <= result["success_rate"] <= 1.0): + return False + if result["avg_time"] < 0: + return False + if result["error_count"] < 0: + return False + + return True + + def create_default_result(self, success: bool = False, reason: str = "") -> dict: + """Create default result""" + return { + "success_rate": 1.0 if success else 0.0, + "avg_time": 0.0 if success else float('inf'), + "error_count": 0 if success else 1, + "total_tasks": 0, + "error_reason": reason + } + + def create_error_result(self, error_message: str) -> dict: + """Create error result""" + return { + "success_rate": 0.0, + "avg_time": float('inf'), + "error_count": 1, + "total_tasks": 0, + "error_reason": error_message + } + + def log_execution_results(self, exp: Experiment, result: dict): + """Log execution results including DeepResearch Bench evaluation""" + logger.info("=" * 60) + logger.info("EXECUTION RESULTS") + logger.info("=" * 60) + + # Log execution metrics + exec_results = result.get('execution_results', result) + logger.info(f"Success Rate: {exec_results.get('success_rate', 0):.2%}") + logger.info(f"Average Time: {exec_results.get('avg_time', 0):.4f}s") + logger.info(f"Error Count: {exec_results.get('error_count', 0)}") + logger.info(f"Total Tasks: {exec_results.get('total_tasks', 0)}") + + # Log DeepResearch Bench evaluation if available + if 'deepresearch_evaluation' in result: + logger.info("=" * 60) + logger.info("DEEPRESEARCH BENCH EVALUATION") + logger.info("=" * 60) + + eval_scores = result['deepresearch_evaluation'] + logger.info(f"Comprehensiveness: {eval_scores.get('comprehensiveness', 0):.2f}/10.0") + logger.info(f"Insight: {eval_scores.get('insight', 0):.2f}/10.0") + logger.info(f"Instruction Following: {eval_scores.get('instruction_following', 0):.2f}/10.0") + logger.info(f"Readability: {eval_scores.get('readability', 0):.2f}/10.0") + logger.info(f"{'-' * 60}") + logger.info(f"Overall Score: {eval_scores.get('overall', 0):.2f}/10.0") + + # Log evaluation details if available + if 'evaluation_log' in result: + logger.info("\\nEvaluation Details:") + for log_entry in result['evaluation_log']: + logger.info(f" {log_entry['dimension'].upper()}: {log_entry['score']:.2f}/10.0") + for check in log_entry.get('checks', []): + logger.info(f" - {check}") + + if 'error_reason' in result: + logger.warning(f"Error: {result['error_reason']}") + + logger.info("=" * 60) + + + + + + + + + + + diff --git a/rdagent/scenarios/agentic_sys/dev_new.py b/rdagent/scenarios/agentic_sys/dev_new.py new file mode 100644 index 000000000..380246b99 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/dev_new.py @@ -0,0 +1,734 @@ +""" +Developer for Agentic System Scenario +Generates code for agentic system experiments with optional web 
search enhancement +""" + +from pathlib import Path +from typing import Dict, Any, List, Optional + +from rdagent.core.developer import Developer +from rdagent.core.experiment import Experiment, FBWorkspace +from rdagent.log import rdagent_logger as logger +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.agentic_sys.env import get_agent_sys_env +from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool + + +class AgenticSysCoder(Developer[Experiment]): + """ + Code generator for agentic system experiments + + Features: + - CoSTEER-based code generation + - Optional web search tool integration + - Lazy initialization of external tools + - Intelligent context enhancement + """ + + def __init__(self, scen): + """ + Initialize AgenticSysCoder + + Args: + scen: Scenario instance containing task description and configuration + """ + self.scen = scen + self.api_backend = APIBackend() + + # Lazy initialization for web search tool + self._web_search_tool = None + + logger.info("Initialized AgenticSysCoder with LLM backend") + + @property + def web_search_tool(self): + """ + Lazy load web search tool when needed + + Returns: + WebSearchTool instance or None if unavailable + """ + if self._web_search_tool is None: + try: + search_config_path = Path(__file__).parent / "tools" / "search_config.yaml" + if search_config_path.exists(): + self._web_search_tool = create_web_search_tool(search_config_path) + logger.info("✓ Web search tool initialized successfully") + else: + logger.warning(f"Search config not found: {search_config_path}") + self._web_search_tool = False + except Exception as e: + logger.warning(f"Failed to initialize web search tool: {e}") + self._web_search_tool = False # Mark as failed to avoid retry + + return self._web_search_tool if self._web_search_tool is not False else None + + def develop(self, exp: Experiment) -> Experiment: + """ + Generate code for the experiment + + Workflow: + 1. Initialize workspace + 2. Prepare base context + 3. Optionally enhance with web search (tool call) + 4. Generate code artifacts + 5. 
Inject files and run + + Args: + exp: Experiment instance + + Returns: + Experiment with generated code and results + """ + logger.info(f"Starting code generation for experiment: {getattr(exp, 'id', 'unknown')}") + + try: + # Step 1: Initialize workspace + exp.experiment_workspace = FBWorkspace() + ws_path = Path(exp.experiment_workspace.workspace_path) + ws_path.mkdir(parents=True, exist_ok=True) + logger.info(f"✓ Initialized workspace at {ws_path}") + + # Step 2: Prepare base context + context = self._prepare_base_context(exp) + logger.info("✓ Prepared base context") + + # Step 3: Optionally enhance with web search (TOOL CALL) + if self._should_use_web_search(exp): + logger.info("→ Calling web search tool for context enhancement...") + context = self._enhance_context_with_web_search(context, exp) + else: + logger.info("→ Skipping web search (not needed)") + + # Step 4: Generate code artifacts + logger.info("→ Generating code with CoSTEER framework...") + code_artifacts = self._generate_code_artifacts(exp, context) + + # Step 5: Inject files into workspace + exp.experiment_workspace.inject_files(**code_artifacts) + logger.info(f"✓ Injected {len(code_artifacts)} files into workspace") + + # Step 6: Prepare environment and run + timeout = self._calculate_timeout(exp) + env = get_agent_sys_env( + running_timeout_period=timeout, + enable_cache=True + ) + + logger.info(f"→ Running generated code (timeout: {timeout}s)...") + run_res = exp.experiment_workspace.run(env=env, entry="python train.py") + + # Store results + exp.run_returncode = getattr(run_res, 'returncode', None) + exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None)) + exp.run_stderr = getattr(run_res, 'stderr', None) + + if exp.run_returncode == 0: + logger.info("✓ Experiment execution succeeded") + else: + logger.warning(f"⚠ Experiment execution failed with return code: {exp.run_returncode}") + + except Exception as e: + logger.error(f"❌ Code generation failed: {str(e)}", exc_info=True) + exp.exception = e + + # Try to create fallback workspace + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + try: + exp.experiment_workspace = self._create_fallback_workspace(exp) + logger.info("Created fallback workspace") + except Exception as e_fallback: + logger.error(f"Failed to create fallback workspace: {e_fallback}") + + return exp + + def _prepare_base_context(self, exp: Experiment) -> Dict[str, Any]: + """ + Prepare base context without web search + + Args: + exp: Current experiment + + Returns: + Base context dictionary + """ + hypothesis = getattr(exp, 'hypothesis', 'Improve agentic system performance') + + context = { + 'hypothesis': hypothesis, + 'scenario_desc': self.scen.get_scenario_all_desc(), + 'success_criteria': getattr(self.scen, 'success_criteria', 'High performance'), + 'task_id': getattr(exp, 'id', 'unknown'), + 'task_domain': getattr(self.scen, 'domain', 'general'), + 'iteration_number': getattr(exp, 'iteration_number', 0), + 'external_sources': [], # Will be filled by web search if used + 'external_knowledge_summary': '' # Will be filled by web search if used + } + + return context + + def _should_use_web_search(self, exp: Experiment) -> bool: + """ + Determine if web search should be used for this experiment + + Decision criteria: + 1. Web search tool is available + 2. Not explicitly disabled by configuration + 3. Hypothesis complexity requires external knowledge + 4. Early iterations (< 3) benefit from external knowledge + 5. 
Previous experiments show low performance + + Args: + exp: Current experiment + + Returns: + True if web search should be used + """ + # Check if web search is globally disabled + if getattr(self.scen, 'disable_web_search', False): + logger.info("Web search disabled by scenario configuration") + return False + + # Check if tool is available + if self.web_search_tool is None: + logger.info("Web search tool not available") + return False + + # Check if search service is healthy + if not self.web_search_tool.client.health_check(): + logger.warning("Web search service is not healthy, skipping") + return False + + hypothesis = getattr(exp, 'hypothesis', '').lower() + + # Use web search for research-heavy hypotheses + research_indicators = [ + 'research', 'investigate', 'explore', 'analyze', 'study', + 'compare', 'evaluate', 'survey', 'benchmark', 'baseline', + 'novel', 'innovative', 'advanced', 'state-of-art', 'sota', + 'improve', 'optimize', 'enhance', 'boost' + ] + + if any(indicator in hypothesis for indicator in research_indicators): + logger.info(f"Research-heavy hypothesis detected: '{hypothesis[:50]}...'") + return True + + # Use web search for early iterations + iteration = getattr(exp, 'iteration_number', 0) + if iteration < 3: + logger.info(f"Early iteration ({iteration}/3), enabling web search") + return True + + # Use web search if previous performance was low + if hasattr(exp, 'previous_performance_low') and exp.previous_performance_low: + logger.info("Previous performance low, enabling web search for improvement") + return True + + # Default: don't use web search for efficiency + logger.info("Web search not needed (simple task or late iteration)") + return False + + def _enhance_context_with_web_search( + self, + context: Dict[str, Any], + exp: Experiment + ) -> Dict[str, Any]: + """ + Enhance context with web search results (TOOL CALL) + + This is the main entry point for web search tool integration. + + Args: + context: Base context to enhance + exp: Current experiment + + Returns: + Enhanced context with external sources + """ + try: + hypothesis = context['hypothesis'] + + # Step 1: Identify knowledge gaps + knowledge_gaps = self._identify_knowledge_gaps(exp, hypothesis) + logger.info(f"Identified {len(knowledge_gaps)} knowledge gaps: {knowledge_gaps}") + + # Step 2: Prepare search context + search_context = { + 'methodology': self._extract_methodology(hypothesis), + 'complexity': self._assess_complexity(hypothesis), + 'iteration': context.get('iteration_number', 0), + 'domain': context.get('task_domain', 'general') + } + logger.info(f"Search context: {search_context}") + + # Step 3: TOOL CALL - Search for hypothesis + logger.info(f"Calling web search tool with task: '{hypothesis[:80]}...'") + external_sources = self.web_search_tool.search_for_hypothesis( + task_description=hypothesis, + current_gaps=knowledge_gaps, + context=search_context + ) + + # Step 4: Enhance context with results + context['external_sources'] = external_sources + logger.info(f"✓ Retrieved {len(external_sources)} external sources") + + # Step 5: Add summary for easy consumption + if external_sources: + context['external_knowledge_summary'] = self._summarize_external_sources( + external_sources + ) + logger.info("✓ Generated external knowledge summary") + + # Log top sources + for idx, source in enumerate(external_sources[:3], 1): + logger.info( + f" {idx}. [{source['credibility_level']}] {source['title'][:60]}..." 
+ ) + else: + logger.warning("No external sources found") + + except Exception as e: + logger.error(f"Web search enhancement failed: {e}", exc_info=True) + # Don't fail the entire process, just skip enhancement + context['external_sources'] = [] + context['external_knowledge_summary'] = '' + + return context + + def _identify_knowledge_gaps(self, exp: Experiment, hypothesis: str) -> List[str]: + """ + Identify knowledge gaps from hypothesis + + Args: + exp: Current experiment + hypothesis: Hypothesis string + + Returns: + List of knowledge gap descriptions (max 5) + """ + gaps = [] + hypothesis_lower = hypothesis.lower() + + # Common agentic system knowledge areas + knowledge_areas = { + 'planning': ['plan', 'planning', 'strategy', 'approach', 'roadmap'], + 'reasoning': ['reason', 'reasoning', 'logic', 'inference', 'think', 'thought'], + 'learning': ['learn', 'learning', 'adapt', 'optimization', 'train'], + 'memory': ['memory', 'context', 'history', 'recall', 'cache'], + 'tool_use': ['tool', 'api', 'external', 'integration', 'function'], + 'evaluation': ['evaluate', 'assessment', 'metric', 'performance', 'measure'], + 'communication': ['communicate', 'language', 'dialogue', 'interaction', 'conversation'], + 'retrieval': ['retrieval', 'search', 'rag', 'knowledge base', 'database'], + 'generation': ['generate', 'generation', 'create', 'synthesize', 'produce'] + } + + # Identify relevant areas + for area, keywords in knowledge_areas.items(): + if any(kw in hypothesis_lower for kw in keywords): + gaps.append(f"{area} techniques and best practices") + + # Add general gaps if none identified + if not gaps: + gaps.extend([ + "agentic system design patterns", + "system implementation strategies", + "performance optimization techniques" + ]) + + return gaps[:5] # Limit to top 5 gaps + + def _extract_methodology(self, hypothesis: str) -> str: + """ + Extract methodology from hypothesis + + Args: + hypothesis: Hypothesis string + + Returns: + Identified methodology + """ + hypothesis_lower = hypothesis.lower() + + methodologies = { + 'reinforcement learning': ['rl', 'reinforcement', 'q-learning', 'policy', 'reward'], + 'retrieval augmented generation': ['rag', 'retrieval', 'augmented', 'retrieve'], + 'chain of thought': ['cot', 'chain of thought', 'reasoning chain', 'step by step'], + 'tree of thought': ['tot', 'tree of thought', 'reasoning tree', 'branching'], + 'multi-agent': ['multi-agent', 'multiple agents', 'agent collaboration', 'swarm'], + 'iterative refinement': ['iterative', 'refinement', 'feedback loop', 'improve'], + 'prompt engineering': ['prompt', 'prompting', 'instruction', 'template'], + 'fine-tuning': ['fine-tune', 'fine-tuning', 'training', 'adapt model'] + } + + for method, keywords in methodologies.items(): + if any(kw in hypothesis_lower for kw in keywords): + return method + + return 'general agentic approach' + + def _assess_complexity(self, hypothesis: str) -> str: + """ + Assess hypothesis complexity + + Args: + hypothesis: Hypothesis string + + Returns: + Complexity level: 'high', 'medium', or 'low' + """ + hypothesis_lower = hypothesis.lower() + + high_complexity_indicators = [ + 'complex', 'advanced', 'sophisticated', 'multi-stage', 'multi-step', + 'distributed', 'parallel', 'optimization', 'novel', 'innovative', + 'state-of-art', 'cutting-edge', 'research' + ] + + medium_complexity_indicators = [ + 'moderate', 'standard', 'typical', 'conventional', 'improve', + 'enhance', 'optimize', 'refine' + ] + + low_complexity_indicators = [ + 'simple', 'basic', 'straightforward', 
'minimal', 'quick', + 'fix', 'patch', 'update' + ] + + if any(ind in hypothesis_lower for ind in high_complexity_indicators): + return 'high' + elif any(ind in hypothesis_lower for ind in medium_complexity_indicators): + return 'medium' + elif any(ind in hypothesis_lower for ind in low_complexity_indicators): + return 'low' + else: + return 'medium' # Default to medium + + def _summarize_external_sources(self, sources: List[Dict[str, Any]]) -> str: + """ + Summarize external sources for context injection + + Args: + sources: List of external source dictionaries + + Returns: + Formatted summary string + """ + if not sources: + return "No external sources available." + + summary_parts = [] + + # Count by credibility + high_cred = [s for s in sources if s.get('credibility_level') == 'High'] + medium_cred = [s for s in sources if s.get('credibility_level') == 'Medium'] + low_cred = [s for s in sources if s.get('credibility_level') == 'Low'] + + summary_parts.append( + f"Retrieved {len(sources)} sources: " + f"{len(high_cred)} high-credibility, " + f"{len(medium_cred)} medium-credibility, " + f"{len(low_cred)} low-credibility" + ) + + # High credibility sources + if high_cred: + summary_parts.append( + "\nHigh-credibility sources:\n" + + "\n".join(f" - {s['title'][:70]}" for s in high_cred[:3]) + ) + + # Key insights from top sources + key_insights = [] + for source in sources[:3]: + summary = source.get('summary', '') + if len(summary) > 50: + key_insights.append(f" • {summary[:150]}...") + + if key_insights: + summary_parts.append("\nKey insights:\n" + "\n".join(key_insights)) + + return "\n".join(summary_parts) + + def _generate_code_artifacts( + self, + exp: Experiment, + context: Dict[str, Any] + ) -> Dict[str, str]: + """ + Generate code artifacts using CoSTEER approach + + Args: + exp: Current experiment + context: Enhanced context (possibly with external knowledge) + + Returns: + Dictionary of code artifacts {filename: content} + """ + logger.info("Generating code artifacts with CoSTEER framework...") + + code_artifacts = {} + + # Extract task information + task_info = self._extract_task_info(context) + + # Generate main agent implementation + logger.info("→ Generating agent.py...") + agent_code = self._generate_agent_code(task_info, context) + code_artifacts['agent.py'] = agent_code + + # Generate evaluator + logger.info("→ Generating evaluator.py...") + evaluator_code = self._generate_evaluator_code(task_info) + code_artifacts['evaluator.py'] = evaluator_code + + # Generate execution script + logger.info("→ Generating train.py...") + train_code = self._generate_execution_script(task_info) + code_artifacts['train.py'] = train_code + + # Generate requirements + logger.info("→ Generating requirements.txt...") + requirements = self._generate_requirements(task_info) + code_artifacts['requirements.txt'] = requirements + + logger.info(f"✓ Generated {len(code_artifacts)} code artifacts") + return code_artifacts + + def _extract_task_info(self, context: Dict[str, Any]) -> Dict[str, Any]: + """ + Extract task information from context + + Args: + context: Context dictionary with external knowledge + + Returns: + Task information dictionary + """ + hypothesis = context.get('hypothesis', 'Improve agentic system performance') + + task_info = { + 'task_id': context.get('task_id', 'unknown'), + 'domain': context.get('task_domain', 'general'), + 'hypothesis': hypothesis, + 'complexity': context.get('complexity', self._assess_complexity(hypothesis)), + 'methodology': 
self._extract_methodology(hypothesis), + 'external_sources': context.get('external_sources', []), + 'external_knowledge_summary': context.get('external_knowledge_summary', ''), + 'has_external_knowledge': len(context.get('external_sources', [])) > 0, + 'iteration_number': context.get('iteration_number', 0) + } + + return task_info + + def _generate_agent_code(self, task_info: Dict[str, Any], context: Dict[str, Any]) -> str: + """ + Generate agent implementation code + + Args: + task_info: Task information + context: Full context with external knowledge + + Returns: + Agent code as string + """ + # Simplified placeholder - in real implementation, use LLM with prompts + hypothesis = task_info['hypothesis'] + external_summary = task_info['external_knowledge_summary'] + + code = f'''""" +Agentic System Implementation +Generated for: {hypothesis} + +External Knowledge: +{external_summary if external_summary else "No external knowledge used"} +""" + +from typing import Dict, Any, List +import logging + +logger = logging.getLogger(__name__) + + +class AgenticSystem: + """ + Main agentic system implementation + Hypothesis: {hypothesis} + """ + + def __init__(self, config: Dict[str, Any]): + self.config = config + logger.info("Initialized AgenticSystem") + + def run(self, task: str) -> Dict[str, Any]: + """Execute the agentic system on a task""" + logger.info(f"Running task: {{task}}") + + # Implementation based on hypothesis + result = {{ + 'task': task, + 'status': 'completed', + 'output': 'Task completed successfully' + }} + + return result + + +def create_agent(config: Dict[str, Any]) -> AgenticSystem: + """Factory function to create agent""" + return AgenticSystem(config) +''' + return code + + def _generate_evaluator_code(self, task_info: Dict[str, Any]) -> str: + """Generate evaluator code""" + code = '''""" +Evaluator for Agentic System +""" + +from typing import Dict, Any + + +class AgenticSystemEvaluator: + """Evaluates agentic system performance""" + + def evaluate(self, results: Dict[str, Any]) -> Dict[str, float]: + """ + Evaluate system performance + + Returns: + Dictionary of metric scores + """ + scores = { + 'comprehensiveness': 7.0, + 'insight': 6.5, + 'instruction_following': 8.0, + 'readability': 7.5 + } + + return scores + + +def create_evaluator() -> AgenticSystemEvaluator: + """Factory function""" + return AgenticSystemEvaluator() +''' + return code + + def _generate_execution_script(self, task_info: Dict[str, Any]) -> str: + """Generate execution script""" + code = '''""" +Training/Execution script for agentic system +""" + +import logging +from agent import create_agent +from evaluator import create_evaluator + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + """Main execution""" + logger.info("Starting agentic system execution") + + # Create agent + config = {'model': 'gpt-4', 'temperature': 0.7} + agent = create_agent(config) + + # Run task + task = "Sample agentic task" + results = agent.run(task) + + # Evaluate + evaluator = create_evaluator() + scores = evaluator.evaluate(results) + + logger.info(f"Evaluation scores: {scores}") + logger.info("Execution completed") + + +if __name__ == '__main__': + main() +''' + return code + + def _generate_requirements(self, task_info: Dict[str, Any]) -> str: + """Generate requirements.txt""" + requirements = '''# Requirements for agentic system +openai>=1.0.0 +anthropic>=0.7.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +requests>=2.31.0 +''' + return requirements + + def 
_calculate_timeout(self, exp: Experiment) -> int: + """Calculate execution timeout based on complexity""" + complexity = getattr(exp, 'complexity', 'medium') + + timeout_map = { + 'low': 300, # 5 minutes + 'medium': 600, # 10 minutes + 'high': 1200 # 20 minutes + } + + return timeout_map.get(complexity, 600) + + def _create_fallback_workspace(self, exp: Experiment) -> FBWorkspace: + """Create fallback workspace on error""" + ws = FBWorkspace() + + # Create minimal agent.py + ws.inject_files(**{ + 'agent.py': '# Fallback agent implementation\nprint("Fallback mode")', + 'train.py': '# Fallback execution\nprint("Running in fallback mode")' + }) + + return ws + + +class AgenticSysRunner(Developer[Experiment]): + """ + Runner for agentic system experiments + Executes generated code and collects results + """ + + def __init__(self, scen): + self.scen = scen + logger.info("Initialized AgenticSysRunner") + + def develop(self, exp: Experiment) -> Experiment: + """ + Run the experiment + + Args: + exp: Experiment with generated code + + Returns: + Experiment with execution results + """ + logger.info(f"Running experiment: {getattr(exp, 'id', 'unknown')}") + + try: + if not hasattr(exp, 'experiment_workspace') or not exp.experiment_workspace: + raise ValueError("No workspace found in experiment") + + # Execute the code + env = get_agent_sys_env(running_timeout_period=600, enable_cache=True) + run_res = exp.experiment_workspace.run(env=env, entry="python train.py") + + # Store results + exp.run_returncode = getattr(run_res, 'returncode', None) + exp.run_stdout = getattr(run_res, 'stdout', getattr(run_res, 'logs', None)) + exp.run_stderr = getattr(run_res, 'stderr', None) + + logger.info(f"Execution completed with return code: {exp.run_returncode}") + + except Exception as e: + logger.error(f"Execution failed: {e}", exc_info=True) + exp.exception = e + + return exp \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/docker/Dockerfile b/rdagent/scenarios/agentic_sys/docker/Dockerfile new file mode 100644 index 000000000..b4e6928a6 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/docker/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.11-slim + +RUN apt-get clean && apt-get update && apt-get install -y \ + curl \ + vim \ + git \ + build-essential \ + && pip install --no-cache-dir uv \ + && rm -rf /var/lib/apt/lists/* + +# Copy entrypoint.sh script into the container workspace +COPY entrypoint.sh /workspace/entrypoint.sh +RUN chmod +x /workspace/entrypoint.sh + diff --git a/rdagent/scenarios/agentic_sys/docker/entrypoint.sh b/rdagent/scenarios/agentic_sys/docker/entrypoint.sh new file mode 100644 index 000000000..2d94b4ccf --- /dev/null +++ b/rdagent/scenarios/agentic_sys/docker/entrypoint.sh @@ -0,0 +1,14 @@ + + +mkdir -p /env +cd /env +uv sync +uv add pip +source .venv/bin/activate + +mkdir -p /workspace +cd /workspace + +git clone https://github.com/your-username/deep_research_bench.git +cd deep_research_bench +pip install -r requirements.txt diff --git a/rdagent/scenarios/agentic_sys/env.py b/rdagent/scenarios/agentic_sys/env.py new file mode 100644 index 000000000..b32d472f7 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/env.py @@ -0,0 +1,109 @@ + +from pathlib import Path +from pydantic_settings.main import SettingsConfigDict +from rdagent.utils.env import DockerConf, DockerEnv +from rdagent.app.data_science.conf import DS_RD_SETTING +import logging +import shutil + +logger = logging.getLogger(__name__) + + +class AgentSysDockerConf(DockerConf): + # TODO: change the content + 
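+ # With env_prefix="ASYS_DOCKER_" (standard pydantic-settings behaviour), each field of this + # config can be overridden via environment variables, e.g. (illustrative values, not defaults): + # ASYS_DOCKER_IMAGE=local_agentic_sys:dev + # ASYS_DOCKER_RUNNING_TIMEOUT_PERIOD=1200 + # ASYS_DOCKER_MEM_LIMIT=16g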
model_config = SettingsConfigDict(env_prefix="ASYS_DOCKER_") + + build_from_dockerfile: bool = True + + dockerfile_folder_path: Path = Path(__file__).parent / "docker" + image: str = "local_agentic_sys:latest" + + + #Mount and execution strategy + mount_path: str = "/workspace/rdagent-solution" + #mount_path: str = "/workspace" + + + default_entry: str = "python main.py" + #default_entry: str = "python train.py" + + running_timeout_period: int | None = 600 + mem_limit: str | None = ( + "48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory + ) + +def sanitize_container_path(path): + p = path.replace("\\","/") + if ":" in p: + #remove drive letter + p = p.split(":",1)[-1] + if not p.startswith("/"): + p = "/" + p.lstrip("/") + return p + +def build_volume(ws_path, mount_path, extra): + """ + return Docker SDK volume mapping dict + """ + vols = {} + host_ws = str(ws_path.resolve()) + container_ws = sanitize_container_path(mount_path) + vols[host_ws] = {"bind": container_ws, "mode": "rw"} + if extra: + for host, container in extra.items(): + host_res = str(Path(host).resolve()) + container_res = sanitize_container_path(container) + vols[host_res] = {"bind": container_res, "mode": "rw"} + return vols + + + + +def get_agent_sys_env( + extra_volumes: dict = {}, + running_timeout_period: int | None = DS_RD_SETTING.debug_timeout, + enable_cache: bool | None = None, +) -> DockerEnv: + """ + create and prepare Docker environment for agentic system scenario + """ + conf = AgentSysDockerConf() + env = DockerEnv(conf=conf) + env.conf.extra_volumes = extra_volumes.copy() + env.conf.running_timeout_period = running_timeout_period + if enable_cache is not None: + env.conf.enable_cache = enable_cache + env.prepare() + return env + + +# def get_agent_sys_env( +# extra_volumes:dict = {}, +# running_timeout_period: int | None = DS_RD_SETTING.debug_timeout, +# enable_cache: bool | None = None, +# ) -> DockerEnv: +# """ +# create and prepare Docker environment for agentic system scenario +# """ +# conf = AgentSysDockerConf() +# env = DockerEnv(conf=conf) +# env.conf.extra_volumes = extra_volumes.copy() +# env.conf.running_timeout_period = running_timeout_period +# if enable_cache is not None: +# env.conf.enable_cache = enable_cache +# #inject correct volumes before preparation +# env.conf.mount_path = sanitize_container_path(env.conf.mount_path) + +# # 清理 extra_volumes 中的容器路径 +# if env.conf.extra_volumes: +# sanitized_extra = {} +# for host, container in env.conf.extra_volumes.items(): +# sanitized_extra[host] = sanitize_container_path(container) +# env.conf.extra_volumes = sanitized_extra + +# env.prepare() +# return env + + + + diff --git a/rdagent/scenarios/agentic_sys/evaluator.py b/rdagent/scenarios/agentic_sys/evaluator.py new file mode 100644 index 000000000..112e4499e --- /dev/null +++ b/rdagent/scenarios/agentic_sys/evaluator.py @@ -0,0 +1,500 @@ +""" +DeepResearch Bench Evaluator for Agentic System +Implements 4-dimension evaluation: Comprehensiveness, Insight, Instruction Following, Readability +""" +from typing import Dict, Any, Optional, List +from dataclasses import dataclass, field +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class DimensionScore: + """Score for a single dimension""" + score: float # 0-10 + checks: List[str] = field(default_factory=list) + details: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class EvaluationResult: + """Complete evaluation result with all dimensions""" + comprehensiveness: 
DimensionScore + insight: DimensionScore + instruction_following: DimensionScore + readability: DimensionScore + overall: float = 0.0 + weights: Dict[str, float] = field(default_factory=dict) + normalized_scores: Optional[Dict[str, float]] = None + + def __post_init__(self): + # dataclass-generated __init__ fills the fields; defaults and the overall score are derived here + if not self.weights: + # Default equal weights + self.weights = { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + self.calculate_overall() + + def calculate_overall(self) -> float: + """Calculate weighted overall score""" + self.overall = ( + self.comprehensiveness.score * self.weights['comprehensiveness'] + + self.insight.score * self.weights['insight'] + + self.instruction_following.score * self.weights['instruction_following'] + + self.readability.score * self.weights['readability'] + ) + return self.overall + + def normalize_against_reference(self, reference: 'EvaluationResult') -> Dict[str, float]: + """ + Pairwise normalization: target_normalized = target_score / (target_score + reference_score) + """ + normalized = {} + + dimensions = ['comprehensiveness', 'insight', 'instruction_following', 'readability'] + for dim in dimensions: + target_score = getattr(self, dim).score + ref_score = getattr(reference, dim).score + total = target_score + ref_score + normalized[dim] = target_score / total if total > 0 else 0.5 + + # Normalize overall score + total_overall = self.overall + reference.overall + normalized['overall'] = self.overall / total_overall if total_overall > 0 else 0.5 + + self.normalized_scores = normalized + return normalized + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization""" + result = { + 'scores': { + 'comprehensiveness': round(self.comprehensiveness.score, 2), + 'insight': round(self.insight.score, 2), + 'instruction_following': round(self.instruction_following.score, 2), + 'readability': round(self.readability.score, 2), + 'overall': round(self.overall, 2) + }, + 'weights': self.weights, + 'details': { + 'comprehensiveness': { + 'score': round(self.comprehensiveness.score, 2), + 'checks': self.comprehensiveness.checks, + 'details': self.comprehensiveness.details + }, + 'insight': { + 'score': round(self.insight.score, 2), + 'checks': self.insight.checks, + 'details': self.insight.details + }, + 'instruction_following': { + 'score': round(self.instruction_following.score, 2), + 'checks': self.instruction_following.checks, + 'details': self.instruction_following.details + }, + 'readability': { + 'score': round(self.readability.score, 2), + 'checks': self.readability.checks, + 'details': self.readability.details + } + } + } + + if self.normalized_scores: + result['normalized_scores'] = self.normalized_scores + + return result + + +class DeepResearchEvaluator: + """ + Evaluator implementing DeepResearch Bench 4-dimension rubric + + Scoring Anchors: + - 0-2: Poor/Missing core elements + - 4-6: Basic/Adequate with gaps + - 6-8: Good/Complete coverage + - 8-10: Excellent/Exhaustive + """ + + def __init__(self, dimension_weights: Optional[Dict[str, float]] = None): + """ + Initialize evaluator with optional custom weights + + Args: + dimension_weights: Custom weights for dimensions (must sum to 1.0) + """ + self.weights = dimension_weights or { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + + # Validate weights + total = sum(self.weights.values()) + if abs(total - 1.0) > 0.01: + logger.warning(f"Weights sum to {total}, normalizing to 1.0") + for k 
in self.weights: + self.weights[k] /= total + + def evaluate( + self, + output: Any, + task_requirements: Optional[Dict[str, Any]] = None, + task_context: Optional[Dict[str, Any]] = None, + reference_result: Optional[EvaluationResult] = None + ) -> EvaluationResult: + """ + Evaluate output against DeepResearch Bench criteria + + Args: + output: The agent's output to evaluate + task_requirements: Task requirements and constraints + task_context: Additional context about the task + reference_result: Optional reference for normalization + + Returns: + EvaluationResult with scores for all dimensions + """ + task_requirements = task_requirements or {} + task_context = task_context or {} + + # Evaluate each dimension + comp_score = self._evaluate_comprehensiveness(output, task_requirements) + insight_score = self._evaluate_insight(output, task_context) + following_score = self._evaluate_instruction_following(output, task_requirements) + read_score = self._evaluate_readability(output) + + # Create result + result = EvaluationResult( + comprehensiveness=comp_score, + insight=insight_score, + instruction_following=following_score, + readability=read_score, + weights=self.weights + ) + + # Normalize against reference if provided + if reference_result: + result.normalize_against_reference(reference_result) + + return result + + def evaluate_comprehensiveness( + self, + output: Any, + requirements: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Comprehensiveness (0-10) + - Required subtopics coverage (0-3 pts) + - Depth of analysis (0-3 pts) + - Evidence and sources (0-2 pts) + - Multiple perspectives (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. Required subtopics coverage (0-3 pts) + required_topics = requirements.get('required_topics', []) + if required_topics: + covered = sum(1 for topic in required_topics + if topic.lower() in output_str) + coverage_ratio = covered / len(required_topics) + coverage_score = min(3.0, coverage_ratio * 3.0) + score += coverage_score + checks.append(f"Topic coverage: {covered}/{len(required_topics)} ({coverage_score:.1f}/3.0)") + details['topic_coverage'] = { + 'required': len(required_topics), + 'covered': covered, + 'ratio': coverage_ratio + } + else: + score += 2.0 + checks.append("No specific topic requirements (default 2.0/3.0)") + + # 2. Depth of analysis (0-3 pts) + depth_indicators = { + 'detailed_analysis': 'detailed analysis' in output_str or 'in-depth' in output_str, + 'data_evidence': 'data' in output_str or 'evidence' in output_str, + 'substantial_content': len(str(output)) > 500, + 'methodology': 'methodology' in output_str or 'approach' in output_str + } + depth_score = sum(depth_indicators.values()) * 0.75 + score += depth_score + checks.append(f"Depth indicators: {sum(depth_indicators.values())}/4 ({depth_score:.1f}/3.0)") + details['depth_indicators'] = depth_indicators + + # 3. Evidence and sources (0-2 pts) + evidence_score = 0.0 + if 'references' in output_str or 'sources' in output_str or 'citation' in output_str: + evidence_score += 1.0 + if 'data' in output_str or 'statistics' in output_str or 'figure' in output_str: + evidence_score += 1.0 + score += evidence_score + checks.append(f"Evidence & sources: ({evidence_score:.1f}/2.0)") + details['evidence_score'] = evidence_score + + # 4. 
Multiple perspectives (0-2 pts) + perspective_keywords = [ + 'advantage', 'disadvantage', 'trade-off', 'alternative', + 'limitation', 'consideration', 'pros', 'cons' + ] + perspectives_found = sum(1 for kw in perspective_keywords if kw in output_str) + perspective_score = min(2.0, perspectives_found * 0.4) + score += perspective_score + checks.append(f"Multiple perspectives: {perspectives_found} keywords ({perspective_score:.1f}/2.0)") + details['perspectives_found'] = perspectives_found + + final_score = min(10.0, score) + checks.append(f"Total Comprehensiveness Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_insight( + self, + output: Any, + context: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Insight (0-10) + - Causal reasoning (0-3 pts) + - Quantified analysis (0-2 pts) + - Non-obvious implications (0-3 pts) + - Novel synthesis (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. Causal reasoning (0-3 pts) + causal_indicators = [ + 'because', 'therefore', 'as a result', 'leads to', + 'causes', 'impacts', 'due to', 'consequently' + ] + causal_found = sum(1 for indicator in causal_indicators if indicator in output_str) + causal_score = min(3.0, causal_found * 0.5) + score += causal_score + checks.append(f"Causal reasoning: {causal_found} indicators ({causal_score:.1f}/3.0)") + details['causal_indicators'] = causal_found + + # 2. Quantified analysis (0-2 pts) + has_numbers = any(char.isdigit() for char in str(output)) + metric_keywords = ['percent', '%', 'rate', 'ratio', 'metric', 'measure', 'score'] + has_metrics = any(kw in output_str for kw in metric_keywords) + quant_score = (1.0 if has_numbers else 0.0) + (1.0 if has_metrics else 0.0) + score += quant_score + checks.append(f"Quantified analysis: numbers={has_numbers}, metrics={has_metrics} ({quant_score:.1f}/2.0)") + details['quantification'] = {'has_numbers': has_numbers, 'has_metrics': has_metrics} + + # 3. Non-obvious implications (0-3 pts) + insight_keywords = [ + 'implication', 'insight', 'suggests', 'indicates', + 'reveals', 'unexpected', 'surprisingly', 'notable', 'interesting' + ] + insights_found = sum(1 for kw in insight_keywords if kw in output_str) + implication_score = min(3.0, insights_found * 0.6) + score += implication_score + checks.append(f"Implications: {insights_found} keywords ({implication_score:.1f}/3.0)") + details['insights_found'] = insights_found + + # 4. Novel synthesis (0-2 pts) + synthesis_indicators = [ + 'framework', 'model', 'synthesis', 'integration', + 'novel', 'innovative', 'unique', 'original' + ] + synthesis_found = sum(1 for kw in synthesis_indicators if kw in output_str) + synthesis_score = min(2.0, synthesis_found * 0.5) + score += synthesis_score + checks.append(f"Novel synthesis: {synthesis_found} indicators ({synthesis_score:.1f}/2.0)") + details['synthesis_indicators'] = synthesis_found + + final_score = min(10.0, score) + checks.append(f"Total Insight Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_instruction_following( + self, + output: Any, + requirements: Dict[str, Any] + ) -> DimensionScore: + """ + Evaluate Instruction Following (0-10) + - Required sections present (0-4 pts) + - Scope compliance (0-3 pts) + - Format compliance (0-2 pts) + - Completeness (0-1 pt) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output).lower() + + # 1. 
Required sections present (0-4 pts) + required_sections = requirements.get('required_sections', []) + if required_sections: + present = sum(1 for section in required_sections + if section.lower() in output_str) + section_score = min(4.0, (present / len(required_sections)) * 4.0) + score += section_score + checks.append(f"Required sections: {present}/{len(required_sections)} ({section_score:.1f}/4.0)") + details['sections'] = { + 'required': len(required_sections), + 'present': present + } + else: + score += 3.0 + checks.append("No specific section requirements (default 3.0/4.0)") + + # 2. Scope compliance (0-3 pts) + scope_violations = self._check_scope_violations(output, requirements) + scope_score = max(0.0, 3.0 - len(scope_violations) * 0.5) + score += scope_score + if scope_violations: + checks.append(f"Scope violations: {len(scope_violations)} ({scope_score:.1f}/3.0)") + details['scope_violations'] = scope_violations + else: + checks.append("No scope violations (3.0/3.0)") + + # 3. Format compliance (0-2 pts) + format_reqs = requirements.get('format', {}) + format_score = 2.0 # Default + if format_reqs: + format_type = format_reqs.get('type', '').lower() + if format_type == 'json': + try: + import json + json.loads(str(output)) + format_score = 2.0 + except: + format_score = 0.5 + checks.append(f"Format compliance: {format_type} ({format_score:.1f}/2.0)") + else: + checks.append("Format compliance: (2.0/2.0)") + score += format_score + details['format_score'] = format_score + + # 4. Completeness (0-1 pt) + length = len(str(output)) + completeness_score = 1.0 if length > 200 else 0.5 + score += completeness_score + checks.append(f"Completeness: {length} chars ({completeness_score:.1f}/1.0)") + details['output_length'] = length + + final_score = min(10.0, score) + checks.append(f"Total Instruction Following Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def evaluate_readability(self, output: Any) -> DimensionScore: + """ + Evaluate Readability (0-10) + - Structure and organization (0-3 pts) + - Language quality (0-3 pts) + - Data presentation (0-2 pts) + - Clarity (0-2 pts) + """ + score = 0.0 + checks = [] + details = {} + + output_str = str(output) + output_lower = output_str.lower() + + # 1. Structure and organization (0-3 pts) + structure_indicators = { + 'has_breaks': '\n' in output_str, + 'has_sections': any(word in output_lower for word in + ['summary', 'introduction', 'conclusion', 'results', 'method']), + 'multi_paragraph': output_str.count('\n') > 5 + } + structure_score = min(3.0, sum(structure_indicators.values()) * 1.0) + score += structure_score + checks.append(f"Structure: {sum(structure_indicators.values())}/3 indicators ({structure_score:.1f}/3.0)") + details['structure'] = structure_indicators + + # 2. Language quality (0-3 pts) + words = output_str.split() + if words: + avg_word_length = sum(len(word) for word in words) / len(words) + unique_ratio = len(set(output_lower.split())) / len(words) if words else 0 + + language_score = 0.0 + if 4 < avg_word_length < 7: # Reasonable word length + language_score += 1.5 + if unique_ratio > 0.5: # Vocabulary variety + language_score += 1.5 + + score += language_score + checks.append(f"Language quality: avg_len={avg_word_length:.1f}, variety={unique_ratio:.2f} ({language_score:.1f}/3.0)") + details['language'] = { + 'avg_word_length': avg_word_length, + 'unique_ratio': unique_ratio + } + else: + checks.append("Language quality: no content (0.0/3.0)") + + # 3. 
Data presentation (0-2 pts) + has_formatting = any(marker in output_str for marker in ['|', ':', '-', '*', '•']) + has_structure = output_str.count('\n') > 3 + presentation_score = (1.0 if has_formatting else 0.0) + (1.0 if has_structure else 0.0) + score += presentation_score + checks.append(f"Data presentation: formatting={has_formatting}, structure={has_structure} ({presentation_score:.1f}/2.0)") + details['presentation'] = { + 'has_formatting': has_formatting, + 'has_structure': has_structure + } + + # 4. Clarity (0-2 pts) + length = len(output_str) + clarity_score = 2.0 + if length < 100: + clarity_score = 0.5 + elif length > 5000: + clarity_score = 1.5 + + score += clarity_score + checks.append(f"Clarity: length={length} chars ({clarity_score:.1f}/2.0)") + details['clarity'] = {'length': length, 'score': clarity_score} + + final_score = min(10.0, score) + checks.append(f"Total Readability Score: {final_score:.2f}/10.0") + + return DimensionScore(score=final_score, checks=checks, details=details) + + def _check_scope_violations( + self, + output: Any, + requirements: Dict[str, Any] + ) -> List[str]: + """Check for scope violations""" + violations = [] + output_lower = str(output).lower() + + # Check timeframe constraints + timeframe = requirements.get('timeframe') + if timeframe: + excluded_periods = requirements.get('excluded_periods', []) + for period in excluded_periods: + if period.lower() in output_lower: + violations.append(f"Out-of-scope timeframe: {period}") + + # Check topic constraints + excluded_topics = requirements.get('excluded_topics', []) + for topic in excluded_topics: + if topic.lower() in output_lower: + violations.append(f"Out-of-scope topic: {topic}") + + return violations \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/exp.py b/rdagent/scenarios/agentic_sys/exp.py new file mode 100644 index 000000000..a688c7ea8 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/exp.py @@ -0,0 +1,140 @@ +from pathlib import Path +from rdagent.core.experiment import Experiment +from typing import Any, List, Optional, Dict + +# convert code into executable experiment and output standard experiment result +class AgenticSysExperiment(Experiment): + def __init__(self, sub_tasks=None, based_experiments=None, experiment_workspace=None): + super().__init__(sub_tasks=sub_tasks, based_experiments=based_experiments) + if experiment_workspace is not None: + self.experiment_workspace = experiment_workspace + + # DeepResearch Bench evaluation scores + self.deepresearch_scores: Optional[Dict[str, float]] = None + self.evaluation_log: Optional[list] = None + + #web search related attributes (NEW) + self.used_web_search: bool = False + self.external_sources: List[Dict[str, Any]] = [] + self.external_knowledge_summary: str = "" + self.web_search_timestamp: Optional[str] = None + + # Existing attributes... + self.hypothesis: str = "" + self.iteration_number: int = 0 + self.complexity: str = "medium" + self.previous_performance_low: bool = False + + def run(self, code:str): + """ + Run the experiment with the given code. + Step: + 1. Prepare Experiment Environment + 2. Run Agent code + 3. Collect Performance Metrics + 4. 
record log + """ + code_path = self.workspace / "agent.py" + code_path.write_text(code) + + #construct running script + run_script = f""" + import sys + import json + try: + sys.path.insert(0, '{self.workspace}') + from agent import AgenticSystem + + agent = AgenticSystem() + results = agent.run_tasks() + + # output structured results + print("=== EXECUTION RESULTS ===") + print(f"Success Rate: {{results['success_rate']}}") + print(f"Average Time: {{results['avg_time']}}") + print(f"Error Count: {{results['error_count']}}") + print(f"Total Tasks: {{results['total_tasks']}}") + print("=== END RESULTS ===") + + # output JSON format for parsing + print("=== JSON RESULTS ===") + print(json.dumps(results)) + print("=== END JSON ===") + + except Exception as e: + print(f"ERROR: {{str(e)}}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + """ + + # use runner to execute + result = self.runner.run( + script = run_script, + timeout = 300, # 5 minutes timeout + capture_output = True + ) + + # parse output for metrics + metrics = self.parse_output(result.stdout) + return Experiment( + success = result.returncode == 0, + metrics = metrics, + logs = result.stdout, + errors = result.stderr + ) + + #parse all the metrics from stdout + def parse_output(self, stdout: str): + metrics = { + "success_rate": 0.0, + "avg_time": float('inf'), + "error_count": 1, + "total_tasks": 0 + } + + try: + # try to extract JSON block first + import json + import re + json_match = re.search(r'=== JSON RESULTS ===\n(.*?)\n=== END JSON ===', stdout, re.DOTALL) + if json_match: + result_data = json.loads(json_match.group(1)) + metrics.update(result_data) + return metrics + + # fallback to text parsing + for line in stdout.splitlines(): + if "Success Rate:" in line: + metrics["success_rate"] = float(line.split(":")[1].strip()) + elif "Average Time:" in line: + metrics["avg_time"] = float(line.split(":")[1].strip()) + elif "Error Count:" in line: + metrics["error_count"] = int(line.split(":")[1].strip()) + elif "Total Tasks:" in line: + metrics["total_tasks"] = int(line.split(":")[1].strip()) + + except Exception as e: + print(f"Failed to parse output: {e}") + + return metrics + + + def get_deepresearch_score(self, dimension: str = 'overall') -> float: + """Get DeepResearch Bench score for specific dimension""" + if not self.deepresearch_scores: + return 0.0 + return self.deepresearch_scores.get(dimension, 0.0) + + def get_evaluation_summary(self): + """Get comprehensive evaluation summary""" + if not hasattr(self, 'result') or not self.result: + return {'status': 'no_results'} + + summary = { + 'execution_metrics': self.result.get('execution_results', {}), + 'deepresearch_scores': self.result.get('deepresearch_evaluation', {}), + 'overall_score': self.get_deepresearch_score('overall') + } + + return summary \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/feedback.py b/rdagent/scenarios/agentic_sys/feedback.py new file mode 100644 index 000000000..69d9d9847 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/feedback.py @@ -0,0 +1,244 @@ +from asyncio.log import logger +from pathlib import Path +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import Experiment2Feedback, ExperimentFeedback, Trace +import re +import json + + +class AgenticSysExp2Feedback(Experiment2Feedback): + def generate_feedback(self, experiment: Experiment, trace: Trace) -> ExperimentFeedback: + + # BEGIN drafting + # read content from `expriment.workspace_path` + # END drafting + 
try: + if hasattr(experiment, 'experiment_workspace') and experiment.experiment_workspace: + ws_path = Path(experiment.experiment_workspace.workspace_path) + if ws_path.exists() and ws_path.is_dir(): + logger.info(f"Reading results from workspace: {ws_path}") + #Try to read result files in order of preference + result_files = [ + "result.json", + "detailed_result.json", + "output.json", + "error_result.json" + ] + for result_file in result_files: + result_path = ws_path / result_file + if result_path.exists(): + try: + content = result_path.read_text() + data = json.loads(content) + #Extract execution results if nested + if isinstance(data, dict): + if "execution_result" in data: + experiment.result = data["execution_result"] + else: + experiment.result = data + else: + experiment.result = data + break + except Exception as e: + logger.warning(f"Failed to parse {result_file}: {e}") + continue + #if no result file found, try parsing stdout/stderr from workspace + if not hasattr(experiment, 'result') or experiment.result is None: + self.try_parse_logs(experiment, ws_path) + except Exception as e: + logger.warning(f"Failed to read workspace contents: {e}") + + + + # 1. check whether experiment ran successfully + if not hasattr(experiment, 'result') or experiment.result is None: + return ExperimentFeedback( + reason = "Experiment did not complete execution.", + decision = False, + exception = getattr(experiment, 'exception', None) + ) + + #2. extract important metrics from experiment result + result = experiment.result + + #evaluation metrics + success_rate = result.get('success_rate', 0) + avg_time = result.get('avg_time', float('inf')) + error_count = result.get('error_count', 0) + + #3. formulate success criteria + MIN_SUCCESS_RATE = 0.7 + MAX_AVG_TIME = 30 + MAX_ERROR_COUNT = 2 + + is_successful = ( + success_rate >= MIN_SUCCESS_RATE and + avg_time <= MAX_AVG_TIME and + error_count <= MAX_ERROR_COUNT + ) + + #4. Compare with past experiments in the trace + historical_best = self.get_best_from_trace(trace) + is_improvement = False + + if historical_best: + best_success_rate = historical_best.get('success_rate', 0) + best_avg_time = historical_best.get('avg_time', float('inf')) + + is_improvement = ( + success_rate > best_success_rate or + (success_rate == best_success_rate and avg_time < best_avg_time) + ) + + else: + #first-time experiment, we should still accept it even if it is fail. + is_improvement = True + + #5. 
Generate detailed feedback + reason_parts = [] + reason_parts.append(f"Success Rate: {success_rate:.2f}") + reason_parts.append(f"Average Time: {avg_time:.2f}s") + if error_count > 0: + reason_parts.append(f"Errors Encountered: {error_count}") + if is_improvement: + reason_parts.append("This experiment shows improvement over past results.") + elif historical_best: + reason_parts.append( + f"No improvement (best: {historical_best.get('success_rate', 0)})" + ) + reason = "|".join(reason_parts) + return ExperimentFeedback( + reason = reason, + decision = is_improvement, + ) + + def try_parse_logs(self, experiment, ws_path): + """Try to parse result from workspace log files""" + try: + # look for commonn log files patterns + log_pattern = ["*.log", "*.out", "train_output.txt","execution.log"] + for pattern in log_pattern: + for log_file in ws_path.glob(pattern): + try: + content = log_file.read_text() + parsed = self.parse_stdout_for_metrics(content) + if parsed: + experiment.result = parsed + return + except Exception as e: + logger.warning(f"Failed to parse log file {log_file}: {e}") + continue + except Exception as e: + logger.warning(f"Failed to read workspace contents: {e}") + + + def parse_stdout_for_metrics(self, stdout): + """Parse metrics from stdout text""" + if not stdout: + return None + try: + # Method 1: Try to extract JSON block + json_pattern = r'=== JSON RESULTS ===\s*\n(.*?)\n=== END JSON ===' + match = re.search(json_pattern, stdout, re.DOTALL) + if match: + try: + return json.loads(match.group(1).strip()) + except json.JSONDecodeError: + pass + # Method 2: Try to find any JSON object + json_obj_pattern = r'\{[^{}]*"success_rate"[^{}]*\}' + match = re.search(json_obj_pattern, stdout) + if match: + try: + return json.loads(match.group(0)) + except json.JSONDecodeError: + pass + + # Method 3: Parse text patterns + metrics = {} + + success_match = re.search(r'Success Rate:\s*([0-9.]+)', stdout) + time_match = re.search(r'Average Time:\s*([0-9.]+)', stdout) + error_match = re.search(r'Error Count:\s*([0-9]+)', stdout) + tasks_match = re.search(r'Total Tasks:\s*([0-9]+)', stdout) + + if success_match: + metrics['success_rate'] = float(success_match.group(1)) + metrics['avg_time'] = float(time_match.group(1)) if time_match else 0.0 + metrics['error_count'] = int(error_match.group(1)) if error_match else 0 + metrics['total_tasks'] = int(tasks_match.group(1)) if tasks_match else 0 + return metrics + + except Exception as e: + logger.debug(f"Failed to parse stdout: {e}") + + return None + + + + + + def get_best_from_trace(self, trace:Trace): + # Extract the best experiment result from the trace + if not hasattr(trace, 'hist') or not trace.hist: + return None + best_result = None + best_success_rate = -1 + for exp, feedback in trace.hist: + if hasattr(exp, 'result') and exp.result: + success_rate = exp.result.get('success_rate', 0) + if success_rate > best_success_rate: + best_success_rate = success_rate + best_result = exp.result + return best_result + + def analyze_performance_issues(self, result): + # analyze performance issues based on result metrics + issues = [] + success_rate = result.get('success_rate', 0) + avg_time = result.get('avg_time', float('inf')) + error_count = result.get('error_count', 0) + + if success_rate < 0.3: + issues.append("Critical: Very low success rate - review core algorithm") + elif success_rate < 0.7: + issues.append("Warning: Success rate below target - optimize task handling") + + if avg_time > 10: + issues.append("Performance: High execution time - 
consider optimization") + + if error_count > 5: + issues.append("Stability: High error count - improve error handling") + + return issues + + def get_evaluation_summary(self, trace): + """Get summary of all experiments in trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return {"total": 0, "successful": 0, "average_success_rate": 0.0} + + total = len(trace.hist) + successful = 0 + success_rates = [] + + for exp, feedback in trace.hist: + if hasattr(exp, 'result') and exp.result: + success_rate = exp.result.get('success_rate', 0) + success_rates.append(success_rate) + if feedback and feedback.decision: + successful += 1 + + return { + "total": total, + "successful": successful, + "success_ratio": successful / total if total > 0 else 0, + "average_success_rate": sum(success_rates) / len(success_rates) if success_rates else 0, + "best_success_rate": max(success_rates) if success_rates else 0 + } + + + + + + + diff --git a/rdagent/scenarios/agentic_sys/loop.py b/rdagent/scenarios/agentic_sys/loop.py new file mode 100644 index 000000000..92d032288 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/loop.py @@ -0,0 +1,116 @@ +import asyncio +import shutil +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Any, Optional, Union + +from rdagent.app.agentic_sys.conf import AgenticSysSetting +from rdagent.components.workflow.conf import BasePropSetting +from rdagent.components.workflow.rd_loop import RDLoop +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.developer import Developer +from rdagent.core.exception import CoderError, PolicyError, RunnerError +from rdagent.core.experiment import Experiment +from rdagent.core.proposal import Experiment2Feedback, ExperimentFeedback, ExpGen, Trace +from rdagent.core.scenario import Scenario +from rdagent.core.utils import import_class +from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.agentic_sys.exp import AgenticSysExperiment +from rdagent.core.proposal import ExpGen + + +class AgenticSysRDLoop(RDLoop): + # NOTE: we move the DataScienceRDLoop here to be easier to be imported + # Maintain experiment loop history and context + # support multi-iteration optimization + skip_loop_error = (CoderError, RunnerError) + withdraw_loop_error = (PolicyError,) + + def __init__(self, PROP_SETTING: AgenticSysSetting): + + scen = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) + self.scen: Scenario = scen + self.exp_gen: ExpGen = import_class(PROP_SETTING.exp_gen)(scen) + + self.coder: Developer = import_class(PROP_SETTING.coder)(scen) + self.runner: Developer = import_class(PROP_SETTING.runner)(scen) + + self.summarizer: Experiment2Feedback = import_class(PROP_SETTING.feedback)(scen) + self.trace = Trace(scen=scen) + + #Store configuration + self.setting = PROP_SETTING + + super(RDLoop, self).__init__() + + logger.info(f"AgenticSysRDLoop initialized for competition: {PROP_SETTING.competition}") + + async def direct_exp_gen(self, prev_out: dict[str, Any]): + exp = await self.exp_gen.async_gen(self.trace, self) + return {"exp_gen": exp} + + def record(self, prev_out: dict[str, Any]): + cur_loop_id = prev_out[self.LOOP_IDX_KEY] + + if (e := prev_out.get(self.EXCEPTION_KEY, None)) is None: + exp = prev_out["running"] + self.trace.sync_dag_parent_and_hist((exp, prev_out["feedback"]), cur_loop_id) + else: + exp: DSExperiment = prev_out["direct_exp_gen"] if isinstance(e, CoderError) else prev_out["coding"] + self.trace.sync_dag_parent_and_hist( + ( + exp, + 
ExperimentFeedback.from_exception(e), + ), + cur_loop_id, + ) + + async def propose(self, prev_out): + """Propose hypothesis""" + #integrate web search tool + hypothesis = self.hypothesis_gen.gen(self.trace) + #record result + if hasattr(hypothesis, 'external_sources'): + logger.log_object( + hypothesis.external_sources, + tag = "research.external.sources" + ) + return hypothesis + + + async def develop(self, prev_out): + """ + Develop code with optional web search enhancement + + Args: + prev_out: Previous output containing hypothesis + + Returns: + Developed experiment + """ + logger.info("=" * 80) + logger.info("DEVELOP PHASE: Generating code") + logger.info("=" * 80) + + hypothesis = prev_out.get("hypothesis") + + exp = Experiment() + exp.hypothesis = hypothesis.hypothesis if hypothesis else "Default hypothesis" + exp.iteration_number = len(self.trace.hist) + + # Develop code (web search is called inside if needed) + exp = self.developer.develop(exp) + + # Track web search usage in development phase + if hasattr(self.developer, 'web_search_tool'): + web_tool = self.developer.web_search_tool + if web_tool is not None and hasattr(exp, 'used_web_search'): + if exp.used_web_search: + self.web_search_usage['total_calls'] += 1 + self.web_search_usage['successful_calls'] += 1 + + return exp + + + diff --git a/rdagent/scenarios/agentic_sys/prompts.yaml b/rdagent/scenarios/agentic_sys/prompts.yaml new file mode 100644 index 000000000..8212af61f --- /dev/null +++ b/rdagent/scenarios/agentic_sys/prompts.yaml @@ -0,0 +1,838 @@ +# ==================== Knowledge Retrieval and RAG ==================== + +KG_hypothesis_gen_RAG: + system: |- + You are an expert in agentic systems research with access to knowledge from previous research tasks and current experiments. + + user: |- + {% if insights %} + ====== Cross-Task Insights (Transferable Knowledge) ====== + {% for insight in insights %} + Insight {{ loop.index }}: + - Task Domain: {{ insight.domain }} + - Research Method: {{ insight.method }} + - Key Finding: {{ insight.finding }} + - Applicability: {{ insight.applicability }} + {% endfor %} + {% endif %} + + {% if experiences %} + ====== Current Task History ====== + {% for exp in experiences %} + Experiment {{ loop.index }}: + - Hypothesis: {{ exp.hypothesis }} + - Approach: {{ exp.approach }} + - Dimensions Improved: {{ exp.improved_dims }} + - Lessons Learned: {{ exp.lessons }} + {% endfor %} + {% endif %} + + {% if external_sources %} + ====== Retrieved External Sources ====== + {% for source in external_sources %} + Source {{ loop.index }}: {{ source.citation }} + - Relevance Score: {{ source.relevance }} + - Key Information: {{ source.summary }} + {% endfor %} + {% endif %} + +retrieval_query_generation: + system: |- + You are an expert in formulating effective search queries for research tasks. + + user: |- + Based on the current research task, generate search queries to retrieve relevant information. + + Task: {{ task_description }} + Current Knowledge Gaps: {{ knowledge_gaps }} + + Generate: + 1. **Primary Queries**: Core concepts and requirements + 2. **Exploratory Queries**: Adjacent topics and methodologies + 3. **Validation Queries**: Fact-checking and source verification + + Output Format: + { + "primary_queries": ["query1", "query2", ...], + "exploratory_queries": ["query1", "query2", ...], + "validation_queries": ["query1", "query2", ...] 
+ } + +# ==================== Scenario and Task Description ==================== + +scenario_description: + system: |- + You are an expert in agentic system design and DeepResearch benchmark evaluation. + + user: |- + {% if use_raw_description -%} + ====== Background of the Research Task ====== + {{ raw_description }} + {% else %} + ====== Background of the Research Task ====== + {{ background }} + {% endif %} + + {% if system_analysis_output is not none %} + ====== Current System Analysis ====== + The following is the analysis of the current agentic system implementation: + {{ system_analysis_output }} + {% endif %} + + ====== Task Requirements ====== + Your agentic system must address the following research task: + {{ task_requirements }} + + ====== System Specifications ====== + Please ensure your system adheres to the following specifications: + - **Architecture**: {{ architecture_requirements }} + - **Agent Communication**: {{ communication_protocol }} + - **Error Handling**: Graceful failure recovery and proper logging + - **Modularity**: Clear separation of concerns and reusable components + + ====== Evaluation Metrics ====== + Your system will be evaluated on **four dimensions** with scores from 0-10 (continuous): + + **1. Comprehensiveness (Coverage)** - Weight: {{ comprehensiveness_weight | default(0.25) }} + Intent: Breadth and depth of content; no major omissions. + - Coverage of all required subtopics and scope (time/geo/segments) + - Multiple data sources and evidence + - Balanced perspectives + Scoring Anchors: + - 0-2: Misses core parts; narrow, superficial + - 4-6: Covers basics; some gaps or shallow treatment + - 6-8: Covers all key areas with adequate depth and evidence + - 8-10: Exhaustive, balanced, well-evidenced, no meaningful gaps + + **2. Insight (Depth and Originality)** - Weight: {{ insight_weight | default(0.30) }} + Intent: Why-think, causality, synthesis, non-obvious implications. + - Causal chains and quantified reasoning + - Trade-offs and counterfactual analysis + - Novel synthesis and frameworks + - Acknowledges limitations + Scoring Anchors: + - 0-2: Descriptive only; platitudes + - 4-6: Some analysis; shallow drivers; limited originality + - 6-8: Clear causal logic; non-trivial implications; data-backed claims + - 8-10: Original frameworks; quantifies impact; anticipates edge cases + + **3. Instruction Following (Task Fit)** - Weight: {{ instruction_weight | default(0.25) }} + Intent: Strict adherence to task requirements and constraints. + - Answers all sub-questions + - Respects scope (topic/geo/time) + - Required deliverables and methods + - No out-of-scope content + Scoring Anchors: + - 0-2: Largely off-task; violates constraints + - 4-6: Partially compliant; missing notable requirements + - 6-8: Fully compliant with minor misses + - 8-10: Exact, complete, and precise compliance + + **4. Readability (Clarity and Presentation)** - Weight: {{ readability_weight | default(0.20) }} + Intent: Clear structure, fluent language, effective data presentation. 
+ - Logical outline with clear headings + - Cohesive and precise wording + - Concise tables/figures + - Defined terms and consistent formatting + Scoring Anchors: + - 0-2: Hard to follow; disorganized; errors + - 4-6: Understandable but clunky or poorly organized + - 6-8: Clear, well-structured, minimal friction + - 8-10: Publication-ready polish; visuals aid understanding + + {% if evaluation_details is not none %} + ====== Additional Evaluation Details ====== + {{ evaluation_details }} + {% endif %} + + ====== Scoring Method ====== + - **Per Criterion**: Evidence-based scoring; 5 = baseline adequate, adjust ± + - **Per Dimension**: Weighted average of its criteria + - **Overall Score**: Weighted sum of four dimension scores + - **Pairwise Normalization**: target_normalized = target_score / (target_score + reference_score) + + {% if time_limit is not none %} + ====== Time Limit On System Execution ====== + Your system's execution is limited to **{{ time_limit }}**. Ensure efficient implementation. + {% endif %} + + {% if runtime_environment is not none %} + ====== Runtime Environment ====== + {{ runtime_environment }} + {% endif %} + +task_description_template: + system: |- + You are an expert in agentic system design and research task analysis. + The user will provide a research task description, and you need to extract structured information. + Please answer in JSON format with the following schema: + { + "Task Type": "The type of research task, e.g., 'Literature Review', 'Multi-hop QA', 'Data Analysis', 'Code Generation', 'Scientific Research'", + "Domain": "The domain of the task, e.g., 'Scientific Research', 'Business Intelligence', 'Software Engineering', 'Healthcare'", + "Brief Description": "A brief description of the task (2-3 sentences)", + "Scope Requirements": { + "Temporal": "Time range if specified, e.g., '2020-2024'", + "Geographical": "Geographical scope if relevant, e.g., 'Global', 'US only'", + "Topical": "Topic boundaries and depth" + }, + "Required Deliverables": "List of expected outputs, e.g., ['Report', 'Data tables', 'Visualizations', 'Code']", + "Data Sources": "Expected or required data sources", + "Sub-questions": "List of sub-questions that must be answered", + "Constraints": "Any specific constraints or limitations", + "Evaluation Focus": { + "Comprehensiveness": "What aspects determine coverage completeness", + "Insight": "What constitutes deep analysis for this task", + "Instruction Following": "Key requirements that must be met", + "Readability": "Presentation format expectations" + }, + "Complexity Level": "Low/Medium/High based on research depth and multi-hop reasoning requirements" + } + + user: |- + Research Task Description: + {{ task_raw_description }} + + Additional Context: + {{ task_context }} + +task_background: + system: |- + You are a world-class AI researcher and system architect specializing in agentic systems for research automation. + + Your expertise includes: + - Multi-agent coordination and planning + - Information retrieval and synthesis + - Causal reasoning and analysis + - Research methodology and evaluation + - Large Language Model orchestration + + user: |- + The task type for this research scenario is **{{ task_type }}**. + Domain: **{{ domain }}**. + + Brief task description: {{ brief_description }}. + + Scope Requirements: + {{ scope_requirements }}. + + Required Deliverables: + {{ required_deliverables }}. + + The task will be evaluated on four dimensions: + 1. **Comprehensiveness**: {{ comprehensiveness_focus }} + 2. 
**Insight**: {{ insight_focus }} + 3. **Instruction Following**: {{ instruction_focus }} + 4. **Readability**: {{ readability_focus }} + +# ==================== Hypothesis Generation ==================== + +hypothesis_generation: + system: |- + You are an expert in agentic system optimization and research automation. + Your task is to propose hypotheses to improve the system's performance on DeepResearch evaluation dimensions. + + user: |- + You are proposing a hypothesis to improve the agentic system for research tasks. + + ====== Current System State ====== + {{ current_system_description }} + + ====== Performance on DeepResearch Dimensions ====== + Current Scores (0-10 scale): + - Comprehensiveness: {{ current_comprehensiveness | default("N/A") }} + - Insight: {{ current_insight | default("N/A") }} + - Instruction Following: {{ current_instruction_following | default("N/A") }} + - Readability: {{ current_readability | default("N/A") }} + + ====== Previous Experiments ====== + {{ experiment_history }} + + ====== Identified Weaknesses ====== + {{ performance_gaps }} + + ====== Task ====== + Propose a hypothesis for system improvement that targets one or more evaluation dimensions. + + Your hypothesis should: + 1. **Target Dimension(s)**: Which evaluation dimension(s) will this improve? + 2. **Current Gap**: What specific weakness does it address? + 3. **Proposed Change**: Concrete architectural or algorithmic modification + 4. **Expected Impact**: How will this improve the target dimension score(s)? + 5. **Trade-offs**: Any potential negative impacts on other dimensions? + 6. **Implementation Feasibility**: Complexity and resource requirements + + Format your response as: + **Hypothesis**: [One clear sentence] + **Target Dimensions**: [List with expected improvement, e.g., "Comprehensiveness (+1.5), Insight (+0.8)"] + **Rationale**: [Why this will work, with evidence from experiments] + **Implementation Plan**: [Step-by-step approach] + **Risk Mitigation**: [How to avoid hurting other dimensions] + +hypothesis_output_format: + system: |- + You must format your hypothesis according to the following JSON schema. + + user: |- + The output should follow JSON format with the following schema: + { + "action": "Choose from ['Information_Gathering', 'Analysis_Synthesis', 'Structure_Refinement', 'Compliance_Verification']. If 'hypothesis_specification' provides the action you need to take, please follow it. Otherwise, based on previous experimental results, suggest the action you believe is most appropriate.", + "hypothesis": "One clear sentence stating what improvement will be made", + "target_dimensions": [ + { + "name": "Comprehensiveness/Insight/Instruction_Following/Readability", + "current_score": 0.0, + "target_score": 0.0, + "expected_improvement": 0.0, + "confidence": "Low/Medium/High" + } + ], + "current_gap": "Specific weakness being addressed (one sentence)", + "rationale": "Why this hypothesis should work, with evidence from previous experiments or theoretical principles (2-3 sentences)", + "implementation_plan": { + "step_1": "First concrete step", + "step_2": "Second concrete step", + "step_3": "Third concrete step (if needed)" + }, + "risk_assessment": { + "potential_negative_impacts": [ + {"dimension": "dimension_name", "reason": "why it might be affected", "severity": "Low/Medium/High"} + ], + "mitigation_strategies": ["strategy1", "strategy2", ...] 
+ }, + "resource_requirements": { + "time_estimate": "Estimated time to implement and validate", + "external_tools": ["tool1", "tool2", ...], + "complexity": "Low/Medium/High", + "dependencies": ["dependency1", ...] + }, + "success_criteria": { + "primary": "Main success indicator (e.g., 'Comprehensiveness score increases by at least 1.0')", + "secondary": ["Additional indicators of success"], + "validation_method": "How to verify the improvement" + }, + "concise_knowledge": "One-line transferable principle using conditional grammar (e.g., 'If X, then Y'; 'When A, do B'). Must be clear and unambiguous without referencing 'previous hypothesis' or other context-dependent terms." + } + +hypothesis_and_feedback: + system: |- + You have access to the complete history of previous experiments and their results. + Analyze patterns and learn from past successes and failures. + + user: |- + ====== Recent Experiment History (Last {{ history_window | default(10) }} iterations) ====== + + {% for experiment, feedback in trace.hist[-history_window:] %} + ====== Iteration {{ loop.index }} ====== + **Hypothesis**: {{ experiment.hypothesis }} + **Action Type**: {{ experiment.action_type }} + **Target Dimensions**: {{ experiment.target_dimensions }} + + **Results**: + - Comprehensiveness: {{ feedback.comprehensiveness_score }} (Δ {{ feedback.comprehensiveness_delta }}) + - Insight: {{ feedback.insight_score }} (Δ {{ feedback.insight_delta }}) + - Instruction Following: {{ feedback.instruction_score }} (Δ {{ feedback.instruction_delta }}) + - Readability: {{ feedback.readability_score }} (Δ {{ feedback.readability_delta }}) + - Overall: {{ feedback.overall_score }} (Δ {{ feedback.overall_delta }}) + + **Observations**: {{ feedback.observations }} + **Decision**: {{ feedback.decision }} (Success/Partial/Failure) + **Reason**: {{ feedback.reason }} + **Lessons Learned**: {{ feedback.lessons }} + + {% endfor %} + + ====== Pattern Analysis ====== + - Most successful action type: {{ most_successful_action }} + - Most improved dimension: {{ most_improved_dimension }} + - Persistent weaknesses: {{ persistent_weaknesses }} + - Effective strategies: {{ effective_strategies }} + +# ==================== Action Type Specifications ==================== + +hypothesis_specification: + Information_Gathering: + system: |- + You are an expert in information gathering and comprehensive research methodologies. + + user: |- + Action: Information Gathering + + Focus: Comprehensive data collection and source validation + + Guidelines: + - Start with authoritative sources (peer-reviewed papers, official databases) + - Cover multiple perspectives and timeframes + - Verify facts through cross-referencing + - Document all sources with proper citations + + Evaluation Impact: + - Primary: **Comprehensiveness** (improved coverage and evidence) + - Secondary: **Instruction Following** (adherence to source requirements) + + Common Pitfalls: + - Relying on single sources + - Missing key subtopics + - Ignoring temporal or geographical constraints + + Output Format: + { + "sources": [ + { + "citation": "Author (Year). Title. Publisher.", + "relevance": "How this source addresses this task", + "key_information": "Summary of relevant content", + "credibility": "Assessment of source quality" + } + ], + "coverage_checklist": { + "temporal_scope": "Covered/Partial/Missing", + "geographical_scope": "Covered/Partial/Missing", + "subtopics": ["topic1: covered", "topic2: partial", ...] 
+ } + } + + Analysis_Synthesis: + system: |- + You are an expert in causal analysis, quantitative reasoning, and knowledge synthesis. + + user: |- + Action: Analysis and Synthesis + + Focus: Deep causal reasoning and novel insights + + Guidelines: + - Identify causal relationships (not just correlations) + - Quantify impacts where possible + - Consider counterfactuals and trade-offs + - Acknowledge limitations and uncertainties + - Propose original frameworks or synthesis + + Evaluation Impact: + - Primary: **Insight** (depth of analysis and originality) + - Secondary: **Comprehensiveness** (improved understanding of topic) + + Common Pitfalls: + - Descriptive summaries without analysis + - Correlation presented as causation + - Generic "pros and cons" without depth + - Ignoring edge cases + + Output Format: + { + "causal_chains": [ + { + "cause": "...", + "mechanism": "...", + "effect": "...", + "evidence": "...", + "quantification": "X% increase/decrease" + } + ], + "trade_offs": [ + { + "dimension1": "...", + "dimension2": "...", + "relationship": "...", + "implications": "..." + } + ], + "novel_insights": "...", + "limitations": "..." + } + + Structure_Refinement: + system: |- + You are an expert in technical writing, information architecture, and presentation design. + + user: |- + Action: Structure and Presentation Refinement + + Focus: Clear organization and effective communication + + Guidelines: + - Logical hierarchical structure + - Clear section headings + - Effective use of tables/figures + - Consistent terminology + - Smooth transitions between sections + + Evaluation Impact: + - Primary: **Readability** (clarity and presentation quality) + - Secondary: **Instruction Following** (meeting format requirements) + + Common Pitfalls: + - Walls of text without structure + - Undefined acronyms or jargon + - Inconsistent formatting + - Cluttered or unclear visualizations + + Output Format: + { + "structure": { + "sections": [ + { + "title": "...", + "subsections": [...], + "key_points": [...] + } + ] + }, + "visual_elements": [ + { + "type": "table/figure/chart", + "purpose": "...", + "data": "..." + } + ], + "terminology": { + "term1": "definition", + "term2": "definition" + } + } + + Compliance_Verification: + system: |- + You are an expert in requirement validation and compliance checking. + + user: |- + Action: Compliance and Requirement Verification + + Focus: Ensuring all task requirements are met + + Guidelines: + - Check all sub-questions are answered + - Verify scope adherence (time/geo/topic) + - Confirm all deliverables are provided + - Validate required methods are used + - Remove out-of-scope content + + Evaluation Impact: + - Primary: **Instruction Following** (requirement adherence) + + Common Pitfalls: + - Missing mandatory sections + - Scope creep + - Wrong timeframe or geography + - Ignoring format specifications + + Output Format: + { + "compliance_checklist": { + "sub_questions": [ + {"question": "...", "status": "answered/partial/missing"} + ], + "scope_verification": { + "temporal": "compliant/violated", + "geographical": "compliant/violated", + "topical": "compliant/violated" + }, + "deliverables": [ + {"required": "...", "status": "provided/missing"} + ] + }, + "violations": ["list of any violations"], + "corrective_actions": ["list of needed fixes"] + } + +# ==================== Code Generation ==================== + +code_generation: + system: |- + You are an expert software engineer specializing in agentic systems and research automation. 
+ You write clean, well-documented, evaluation-aware code. + + user: |- + You are implementing the following hypothesis: + {{ hypothesis }} + + Target Evaluation Dimensions: {{ target_dimensions }} + + ====== Current Codebase ====== + {{ current_code }} + + ====== Implementation Requirements ====== + Your implementation must: + 1. **Maintain/Improve Target Dimensions**: + {% for dim in target_dimensions %} + - {{ dim.name }}: Focus on {{ dim.focus_areas }} + {% endfor %} + + 2. **Code Quality Standards**: + - Clear documentation explaining how code improves target dimensions + - Error handling with informative messages + - Logging for debugging and analysis + - Modular design for maintainability + + 3. **Evaluation-Aware Design**: + - For **Comprehensiveness**: Ensure complete coverage of required topics + - For **Insight**: Include causal reasoning, quantification, synthesis logic + - For **Instruction Following**: Validate all requirements are met + - For **Readability**: Structure output clearly, use proper formatting + + ====== Implementation Guidelines ====== + {{ implementation_guidelines }} + + Please generate code with: + - Comments explaining dimension-specific improvements + - Docstrings describing evaluation impact + - Unit tests for critical functionality + +# ==================== Feedback and Analysis ==================== + +feedback_analysis: + system: |- + You are an expert in experimental analysis and performance evaluation for agentic systems. + Analyze results across multiple dimensions and provide actionable insights. + + user: |- + ====== Experiment Results ====== + Hypothesis: {{ hypothesis }} + Target Dimensions: {{ target_dimensions }} + + ====== Performance Metrics (0-10 scale) ====== + {% if metrics %} + Current vs. Baseline: + - Comprehensiveness: {{ metrics.comprehensiveness.current }} (Δ {{ metrics.comprehensiveness.delta }}) + - Insight: {{ metrics.insight.current }} (Δ {{ metrics.insight.delta }}) + - Instruction Following: {{ metrics.instruction_following.current }} (Δ {{ metrics.instruction_following.delta }}) + - Readability: {{ metrics.readability.current }} (Δ {{ metrics.readability.delta }}) + - Overall Score: {{ metrics.overall.current }} (Δ {{ metrics.overall.delta }}) + + Pairwise Normalized Score: {{ metrics.normalized_score }} + {% endif %} + + ====== Execution Logs ====== + {{ logs }} + + ====== Detailed Dimension Analysis ====== + {% if dimension_feedback %} + {{ dimension_feedback }} + {% endif %} + + ====== Analysis Task ====== + Provide a comprehensive analysis: + + 1. **Success Assessment** (Pass/Fail for each dimension): + - Did we improve target dimension(s)? + - Were there unexpected changes in non-target dimensions? + + 2. **Dimension-Specific Findings**: + For each dimension, explain: + - **Comprehensiveness**: Coverage gaps or improvements + - **Insight**: Quality of reasoning and originality + - **Instruction Following**: Compliance issues or successes + - **Readability**: Clarity and presentation quality + + 3. **Root Cause Analysis**: + - Why did improvements/regressions occur? + - What worked as expected vs. surprises? + + 4. **Trade-off Analysis**: + - Did improving one dimension hurt others? + - Is the trade-off acceptable? + + 5. **Next Steps**: + - Should we iterate on this hypothesis? + - New hypothesis directions based on learnings? + + 6. **Knowledge Update**: + - What general principles did we learn? + - What to avoid in future experiments? 
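For quick local inspection, the Jinja-style prompt entries above can also be rendered outside of RD-Agent's own T() helper. The sketch below is an illustration only: it assumes PyYAML and Jinja2 are installed, that it is run from the repository root, and the hypothesis, dimension list, and log text are made-up placeholder values.

from pathlib import Path

import yaml
from jinja2 import Template

# Load the prompt entries defined in this file and render the feedback_analysis user prompt.
prompts = yaml.safe_load(Path("rdagent/scenarios/agentic_sys/prompts.yaml").read_text())
user_prompt = Template(prompts["feedback_analysis"]["user"]).render(
    hypothesis="Add a source-verification step before synthesis",      # placeholder value
    target_dimensions=["Comprehensiveness", "Instruction Following"],  # placeholder value
    metrics=None,                 # skip the optional metrics block
    logs="(execution logs here)",
    dimension_feedback=None,      # skip the optional per-dimension block
)
print(user_prompt[:400])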
+ +# ==================== Evaluation Rubric ==================== + +evaluation_rubric: + system: |- + You are an expert evaluator trained on the DeepResearch Benchmark rubric. + Apply consistent, evidence-based scoring across all four dimensions. + + user: |- + ====== DeepResearch Benchmark Evaluation Rubric ====== + + Use this rubric to score outputs on each dimension (0-10 continuous): + + **Comprehensiveness (0-10)** + Check: + - [ ] All required subtopics covered + - [ ] Appropriate scope (time/geography/segments) + - [ ] Multiple data sources and evidence cited + - [ ] Balanced perspectives presented + - [ ] No major omissions + + Pitfalls to avoid: + - Ignoring time/geographic constraints + - One-sided coverage + - Missing data/evidence + - Superficial treatment of topics + + **Insight (0-10)** + Check: + - [ ] Causal chains explained (not just correlation) + - [ ] Quantified reasoning where possible + - [ ] Trade-offs and counterfactuals discussed + - [ ] Limitations acknowledged + - [ ] Novel synthesis or frameworks + + Pitfalls to avoid: + - Purely descriptive content + - Platitudes and generic statements + - Untested assertions + - Shallow "pros and cons" lists + + **Instruction Following (0-10)** + Check: + - [ ] All sub-questions answered + - [ ] Scope respected (topic/geo/time) + - [ ] Required deliverables provided + - [ ] Required methods used + - [ ] No out-of-scope content + + Pitfalls to avoid: + - Missing mandatory sections + - Scope drift + - Wrong timeframe or geography + - Ignoring format requirements + + **Readability (0-10)** + Check: + - [ ] Logical structure with clear headings + - [ ] Cohesive flow between sections + - [ ] Precise and concise wording + - [ ] Effective tables/figures + - [ ] Terms defined, formatting consistent + + Pitfalls to avoid: + - Walls of text + - Undefined acronyms + - Noisy or unclear visualizations + - Inconsistent terminology + +# ==================== UI and Display ==================== + +rich_style_description: + system: |- + You are describing the agentic system scenario for display purposes. + + user: |- + ### {{ name }} Agent: Automated Research System for DeepResearch Tasks + + #### [Overview](#_summary) + This scenario focuses on automated research and development of agentic systems + optimized for DeepResearch Benchmark evaluation criteria. + + #### {{ name }} Task Info + Current Task: {{ task_name }} + Task Type: {{ task_type }} + Domain: {{ domain }} + + #### [Evaluation Dimensions](#_metrics) + - **Comprehensiveness** ({{ comprehensiveness_weight }}): Coverage breadth and depth + - **Insight** ({{ insight_weight }}): Causal reasoning and originality + - **Instruction Following** ({{ instruction_weight }}): Task requirement adherence + - **Readability** ({{ readability_weight }}): Clarity and presentation quality + + #### [Automated R&D Loop](#_rdloops) + + - **[R (Research)](#_research)** + - Hypothesis generation targeting evaluation dimensions + - Analysis of dimension-specific performance gaps + - Knowledge construction from scored experiments + + - **[D (Development)](#_development)** + - Code evolution optimizing for target dimensions + - Multi-dimensional performance validation + - Trade-off analysis across dimensions + + #### [Objective](#_summary) + To automatically discover and implement system improvements that maximize + performance across all four DeepResearch evaluation dimensions through + autonomous, dimension-aware research and development cycles. 
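To make the scoring method in the rubric above concrete, here is a small worked example of the weighted overall score and the pairwise normalization. The four dimension scores are invented for illustration; the weights are the defaults quoted in scenario_description (0.25 / 0.30 / 0.25 / 0.20).

# Weighted overall score and pairwise normalization against a reference run.
weights = {
    "comprehensiveness": 0.25,
    "insight": 0.30,
    "instruction_following": 0.25,
    "readability": 0.20,
}
target = {"comprehensiveness": 7.0, "insight": 6.5, "instruction_following": 8.0, "readability": 7.5}
reference = {"comprehensiveness": 6.0, "insight": 7.0, "instruction_following": 7.0, "readability": 8.0}

overall_target = sum(weights[d] * target[d] for d in weights)        # 7.20
overall_reference = sum(weights[d] * reference[d] for d in weights)  # 6.95

# target_normalized = target_score / (target_score + reference_score)
normalized = overall_target / (overall_target + overall_reference)   # ~0.509
print(round(overall_target, 2), round(overall_reference, 2), round(normalized, 3))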
+ +system_prompt_template: + system: |- + You are an advanced agentic system designed for research tasks. + + Your core capabilities: + - Multi-hop reasoning and information synthesis + - Causal analysis and quantitative reasoning + - Structured output generation + - Source verification and citation + + user: |- + Evaluation awareness: + You will be evaluated on four dimensions (0-10 each): + 1. **Comprehensiveness**: Complete coverage, no gaps + 2. **Insight**: Deep analysis, causal thinking, originality + 3. **Instruction Following**: Strict requirement adherence + 4. **Readability**: Clear structure and presentation + + Always optimize for all four dimensions in your responses. + + +# ...existing code... + +# NEW: Hypothesis generation with external knowledge +hypothesis_gen_with_external_knowledge: + system: | + You are an expert AI researcher specializing in agentic systems. + Your task is to generate innovative hypotheses based on: + 1. The scenario description + 2. Previous experimental results + 3. External knowledge from research papers and best practices + + Generate a clear, specific, and testable hypothesis in JSON format. + + user: | + # Scenario + {{ scenario_desc }} + + # Previous Trials + {{ previous_trials }} + + {% if external_knowledge %} + # External Knowledge (from web search) + {% for source in external_knowledge %} + {{ loop.index }}. [{{ source.credibility_level }}] {{ source.title }} + Summary: {{ source.summary }} + URL: {{ source.url }} + {% endfor %} + {% endif %} + + # Task + Generate a hypothesis to improve the agentic system. + Consider the external knowledge and previous results. + + Output format: + { + "hypothesis": "Your hypothesis here", + "reasoning": "Why this hypothesis is promising", + "expected_improvement": "What improvements you expect", + "implementation_approach": "How to implement this", + "external_sources_used": ["List of URLs used"] + } + +# NEW: Code generation with external knowledge +code_gen_with_external_knowledge: + system: | + You are an expert software engineer specializing in agentic systems. + Generate production-quality code based on the hypothesis and external knowledge. + + user: | + # Hypothesis + {{ hypothesis }} + + # External Knowledge Summary + {{ external_knowledge_summary }} + + # High-Credibility Sources + {% for source in high_cred_sources %} + - {{ source.title }}: {{ source.url }} + {% endfor %} + + # Task + Generate complete, working code for: + 1. agent.py - Main agent implementation + 2. evaluator.py - Performance evaluator + 3. train.py - Execution script + + Follow best practices from the external sources. 
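The hypothesis_gen_with_external_knowledge prompt above asks the model for a JSON object with a fixed key set. Below is a minimal sketch of how a caller could validate such a response before turning it into a Hypothesis; the response text is fabricated for illustration and the required key list simply mirrors the output format requested in the prompt.

import json

REQUIRED_KEYS = {
    "hypothesis",
    "reasoning",
    "expected_improvement",
    "implementation_approach",
    "external_sources_used",
}

# Example LLM response (fabricated for illustration only).
response_text = """
{
  "hypothesis": "Adding a citation-verification step improves Comprehensiveness",
  "reasoning": "Low coverage scores correlated with unverified sources in earlier runs",
  "expected_improvement": "Comprehensiveness +1.0, Instruction Following +0.5",
  "implementation_approach": "Insert a verification agent between retrieval and synthesis",
  "external_sources_used": ["https://example.com/agentic-systems-survey"]
}
"""

data = json.loads(response_text)
missing = REQUIRED_KEYS - data.keys()
if missing:
    raise ValueError(f"LLM response is missing keys: {sorted(missing)}")
print(data["hypothesis"])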
\ No newline at end of file
diff --git a/rdagent/scenarios/agentic_sys/proposal.py b/rdagent/scenarios/agentic_sys/proposal.py
new file mode 100644
index 000000000..5a8f63eb1
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/proposal.py
@@ -0,0 +1,1019 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from rdagent.core.experiment import Task
+from rdagent.core.proposal import (
+    ExpGen,
+    Hypothesis,
+    HypothesisGen,
+    Trace,
+    Experiment2Feedback,
+)
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.agentic_sys.exp import AgenticSysExperiment
+from rdagent.scenarios.agentic_sys.scen import AgenticSysScen
+from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool
+from rdagent.utils.agent.tpl import T  # use the T template system to render prompts.yaml
+
+
+class AgenticSysHypothesisGen(HypothesisGen):
+    """
+    Generate hypotheses for agentic system improvements based on the DeepResearch evaluation dimensions.
+    Uses the T() template system to render prompts from prompts.yaml.
+    """
+
+    def __init__(self, scen: AgenticSysScen):
+        super().__init__(scen=scen)
+        self.scen = scen
+
+        # Initialize LLM backend
+        self.api_backend = APIBackend()
+
+        # Initialize web search tool
+        search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+        self.web_search = create_web_search_tool(config_path=search_config_path)
+        # Backing field for the lazy `web_search_tool` property (avoids AttributeError on first access)
+        self._web_search_tool = self.web_search
+
+        logger.info("AgenticSysHypothesisGen initialized with T() template system")
+
+    @property
+    def web_search_tool(self):
+        """Lazily load the web search tool when needed"""
+        if self._web_search_tool is None:
+            try:
+                search_config_path = Path(__file__).parent / "tools" / "search_config.yaml"
+                if search_config_path.exists():
+                    self._web_search_tool = create_web_search_tool(search_config_path)
+                    logger.info("✓ Web search tool initialized in HypothesisGen")
+                else:
+                    logger.warning(f"Search config not found: {search_config_path}")
+                    self._web_search_tool = False
+            except Exception as e:
+                logger.warning(f"Failed to initialize web search tool: {e}")
+                self._web_search_tool = False
+        return self._web_search_tool if self._web_search_tool is not False else None
+
+    def gen(self, trace: Trace) -> Hypothesis:
+        """
+        Generate hypothesis based on trace history and evaluation dimensions.
+ + Args: + trace: Experiment trace containing history + + Returns: + Hypothesis object with structured hypothesis data + """ + logger.info("Generating hypothesis...") + + # Prepare base context + scenario_desc = trace.scen.get_scenario_all_desc() + previous_trials = self._extract_previous_trials(trace) + + # Optionally enhance with web search + external_knowledge = [] + if self._should_use_web_search(trace): + external_knowledge = self._retrieve_external_knowledge(trace) + + # Generate hypothesis using LLM + system_prompt = self._build_system_prompt() + user_prompt = self._build_user_prompt( + scenario_desc=scenario_desc, + previous_trials=previous_trials, + external_knowledge=external_knowledge + ) + + response = APIBackend().build_messages_and_create_chat_completion( + user_prompt=user_prompt, + system_prompt=system_prompt, + json_mode=True + ) + + # Parse and return hypothesis + hypothesis = self._parse_hypothesis(response, trace) + + logger.info(f"Generated hypothesis: {hypothesis.hypothesis[:100]}...") + return hypothesis + + + def _should_use_web_search(self, trace: Trace) -> bool: + """Determine if web search should be used""" + # Check if tool is available + if self.web_search_tool is None: + return False + + # Check if service is healthy + if not self.web_search_tool.client.health_check(): + logger.warning("Web search service not healthy") + return False + + # Use for early iterations + iteration = len(trace.hist) + if iteration < 3: + logger.info(f"Early iteration ({iteration}/3), enabling web search") + return True + + # Use if previous performance is low + if trace.hist and hasattr(trace.hist[-1][1], 'overall_score'): + last_score = trace.hist[-1][1].overall_score + if last_score < 6.0: # Threshold for low performance + logger.info(f"Low previous score ({last_score}), enabling web search") + return True + + return False + + def _retrieve_external_knowledge(self, trace: Trace) -> list: + """ + Retrieve external knowledge using web search tool + + Args: + trace: Execution trace + + Returns: + List of external sources + """ + try: + scenario_desc = trace.scen.get_scenario_all_desc() + + # Identify knowledge gaps + knowledge_gaps = self._identify_knowledge_gaps(trace) + + # Prepare search context + search_context = { + 'iteration': len(trace.hist), + 'domain': getattr(trace.scen, 'domain', 'general') + } + + # Call web search tool + logger.info("Retrieving external knowledge via web search...") + external_sources = self.web_search_tool.search_for_hypothesis( + task_description=scenario_desc, + current_gaps=knowledge_gaps, + context=search_context + ) + + logger.info(f"Retrieved {len(external_sources)} external sources") + return external_sources + + except Exception as e: + logger.error(f"Failed to retrieve external knowledge: {e}") + return [] + + def _identify_knowledge_gaps(self, trace: Trace) -> list: + """Identify knowledge gaps from trace history""" + gaps = [] + + if trace.hist: + last_feedback = trace.hist[-1][1] + + # Check which dimensions performed poorly + if hasattr(last_feedback, 'dimension_feedback'): + for dim, feedback in last_feedback.dimension_feedback.items(): + if hasattr(feedback, 'score') and feedback.score < 6.0: + gaps.append(f"improve {dim}") + + # Default gaps if none identified + if not gaps: + gaps = [ + "agentic system best practices", + "system design patterns", + "performance optimization" + ] + + return gaps[:5] + + def _extract_previous_trials(self, trace: Trace) -> str: + """Extract previous trials from trace""" + if not trace.hist: + return "No 
previous trials" + + trials = [] + for exp, feedback in trace.hist[-3:]: # Last 3 trials + trial_summary = { + 'hypothesis': getattr(exp, 'hypothesis', 'N/A'), + 'result': getattr(feedback, 'decision', 'N/A'), + 'score': getattr(feedback, 'overall_score', 0.0) + } + trials.append(trial_summary) + + return str(trials) + + def _build_system_prompt(self) -> str: + """Build system prompt for hypothesis generation""" + return """You are an expert AI researcher specializing in agentic systems. +Your task is to generate innovative hypotheses for improving agentic system performance. + +Consider: +1. Previous experimental results +2. External knowledge from research papers and best practices +3. Novel approaches and methodologies +4. Feasibility and implementability + +Generate a clear, specific, and testable hypothesis.""" + + def _build_user_prompt( + self, + scenario_desc: str, + previous_trials: str, + external_knowledge: list + ) -> str: + """Build user prompt with all context""" + prompt = f"""# Scenario +{scenario_desc} + +# Previous Trials +{previous_trials} +""" + + if external_knowledge: + prompt += "\n# External Knowledge\n" + for idx, source in enumerate(external_knowledge[:5], 1): + prompt += f"\n{idx}. [{source['credibility_level']}] {source['title']}\n" + prompt += f" Summary: {source['summary'][:150]}...\n" + prompt += f" URL: {source['url']}\n" + + prompt += "\n# Task\nGenerate a hypothesis to improve the agentic system." + + return prompt + + def _parse_hypothesis(self, response: str, trace: Trace) -> Hypothesis: + """Parse LLM response into Hypothesis object""" + # Simplified parsing - in real implementation, use structured output + hypothesis_text = response.strip() + + hypothesis = Hypothesis( + hypothesis=hypothesis_text, + reason="Generated based on scenario and previous results", + concise_reason="Improve system performance", + concise_observation="", + concise_justification="", + concise_knowledge="" + ) + + return hypothesis + + def prepare_context(self, trace: Trace): + """ + Prepare context for hypothesis generation from trace history. + + KEY METHOD: Uses T() template system like Kaggle scenario + + Args: + trace: Experiment trace + + Returns: + Tuple of (context dictionary, is_first_experiment flag) + """ + is_first_experiment = not (hasattr(trace, 'hist') and trace.hist) + + # Use T() to render hypothesis_and_feedback prompt + hypothesis_and_feedback = ( + T("scenarios.agentic_sys.prompts:hypothesis_and_feedback").r( + trace=trace, + history_window=10, + most_successful_action=self._get_most_successful_action(trace), + most_improved_dimension=self._get_most_improved_dimension(trace), + persistent_weaknesses=self._get_persistent_weaknesses(trace), + effective_strategies=self._get_effective_strategies(trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." + ) + + context = { + "is_first_experiment": is_first_experiment, + "current_system_description": self._get_system_description(trace), + "experiment_history": hypothesis_and_feedback, # 使用渲染后的提示词 + "performance_gaps": self._identify_performance_gaps(trace), + "current_scores": self._extract_current_scores(trace), + } + + return context, is_first_experiment + + def prepare_rag_context(self, trace: Trace): + """ + Prepare RAG (Retrieval-Augmented Generation) context. + + Uses T() template system for RAG prompt rendering. 
+
+        Args:
+            trace: Experiment trace
+
+        Returns:
+            Dictionary with RAG context
+        """
+        # Retrieve knowledge sources
+        insights = self.retrieve_cross_task_insights()
+        experiences = self.retrieve_current_task_experiences(trace)
+        external_sources = self.retrieve_external_sources(trace)
+
+        # Render RAG prompt if sources are available
+        rag_prompt = ""
+        if insights or experiences or external_sources:
+            try:
+                rag_prompt = T("scenarios.agentic_sys.prompts:KG_hypothesis_gen_RAG").r(
+                    insights=insights,
+                    experiences=experiences,
+                    external_sources=external_sources
+                )
+            except Exception as e:
+                logger.warning(f"Failed to render KG_hypothesis_gen_RAG: {e}")
+
+        return {
+            "insights": insights,
+            "experiences": experiences,
+            "external_sources": external_sources,
+            "rag_prompt": rag_prompt  # rendered RAG prompt
+        }
+
+    def generate_hypothesis_with_llm(
+        self,
+        context: Dict[str, Any],
+        rag_context: Dict[str, Any],
+        trace: Trace
+    ) -> Dict[str, Any]:
+        """
+        Generate hypothesis using LLM with prompts from prompts.yaml.
+
+        Uses T() template system to render all prompts.
+
+        Args:
+            context: Context dictionary
+            rag_context: RAG context dictionary
+            trace: Experiment trace
+
+        Returns:
+            Parsed hypothesis data dictionary
+        """
+        # Step 1: Build system prompt using T()
+        try:
+            system_prompt = T("scenarios.agentic_sys.prompts:hypothesis_generation").s()
+            logger.info("Rendered hypothesis_generation system prompt")
+        except Exception as e:
+            logger.warning(f"Failed to render hypothesis_generation system prompt: {e}")
+            system_prompt = """You are an expert in agentic system optimization and research automation. Your task is to propose hypotheses to improve the system's performance on DeepResearch evaluation dimensions."""
+
+        # Step 2: Build user prompt using T()
+        user_prompt = self.build_user_prompt_with_t(context, rag_context, trace)
+
+        # Step 3: Call LLM
+        logger.info("Calling LLM for hypothesis generation...")
+        response = self.api_backend.build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=system_prompt,
+            json_mode=True
+        )
+
+        # Step 4: Parse JSON response
+        try:
+            hypothesis_data = json.loads(response)
+            logger.info("Successfully parsed hypothesis JSON")
+            return hypothesis_data
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse hypothesis JSON: {e}")
+            logger.error(f"Response: {response}")
+            return self.get_fallback_hypothesis(context)
+
+    def build_user_prompt_with_t(
+        self,
+        context: Dict[str, Any],
+        rag_context: Dict[str, Any],
+        trace: Trace
+    ) -> str:
+        """
+        Build user prompt using T() template system.
+
+        KEY METHOD: Shows how to use T() to render and combine multiple prompts.
+
+        Pattern:
+        1. T("path:prompt_name").r(**variables) - Render user part
+        2. T("path:prompt_name").s(**variables) - Render system part (if needed)
+        3.
Combine multiple rendered prompts with "\n\n" + + Args: + context: Context dictionary + rag_context: RAG context dictionary + trace: Experiment trace + + Returns: + Complete user prompt string + """ + prompt_parts = [] + + # Part 1: Task background (user part) + try: + task_bg_user = T("scenarios.agentic_sys.prompts:task_background").r( + task_type=getattr(self.scen, 'task_type', 'Research Automation'), + domain=getattr(self.scen, 'domain', 'Agentic Systems'), + brief_description=getattr(self.scen, 'description', 'Automated research system'), + scope_requirements=getattr(self.scen, 'scope', 'N/A'), + required_deliverables=getattr(self.scen, 'deliverables', 'N/A'), + comprehensiveness_focus=getattr(self.scen, 'comprehensiveness_focus', 'Complete coverage'), + insight_focus=getattr(self.scen, 'insight_focus', 'Deep analysis'), + instruction_focus=getattr(self.scen, 'instruction_focus', 'Strict adherence'), + readability_focus=getattr(self.scen, 'readability_focus', 'Clear presentation') + ) + prompt_parts.append(task_bg_user) + except Exception as e: + logger.warning(f"Failed to render task_background: {e}") + prompt_parts.append(f"""Task Type: {getattr(self.scen, 'task_type', 'Research Automation')} +Domain: {getattr(self.scen, 'domain', 'Agentic Systems')} +Brief Description: {getattr(self.scen, 'description', 'Automated research system')}""") + + # Part 2: RAG context (if available) + if rag_context.get("rag_prompt"): + prompt_parts.append(rag_context["rag_prompt"]) + logger.info("Added RAG context") + + # Part 3: Main hypothesis generation instruction + try: + hypothesis_gen = T("scenarios.agentic_sys.prompts:hypothesis_generation").r( + current_system_description=context["current_system_description"], + current_comprehensiveness=context["current_scores"]["comprehensiveness"], + current_insight=context["current_scores"]["insight"], + current_instruction_following=context["current_scores"]["instruction_following"], + current_readability=context["current_scores"]["readability"], + experiment_history=context["experiment_history"], + performance_gaps=context["performance_gaps"] + ) + prompt_parts.append(hypothesis_gen) + logger.info("Rendered hypothesis_generation user prompt") + except Exception as e: + logger.error(f"Failed to render hypothesis_generation: {e}") + raise + + # Part 4: Output format specification + try: + output_format = T("scenarios.agentic_sys.prompts:hypothesis_output_format").r() + prompt_parts.append(output_format) + except Exception as e: + logger.warning(f"Failed to render hypothesis_output_format: {e}") + + # Combine all parts + full_prompt = "\n\n".join(prompt_parts) + + return full_prompt + + # ==================== Helper Methods for Context Preparation ==================== + + def get_most_successful_action(self, trace: Trace) -> str: + """Get most successful action type from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + action_success = {} + for exp, feedback in trace.hist: + action_type = getattr(exp, 'action_type', 'Unknown') + if getattr(feedback, 'decision', False): + action_success[action_type] = action_success.get(action_type, 0) + 1 + + return max(action_success, key=action_success.get) if action_success else "N/A" + + def get_most_improved_dimension(self, trace: Trace) -> str: + """Get most improved dimension from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + dimension_improvements = { + "comprehensiveness": 0, + "insight": 0, + "instruction_following": 0, + "readability": 0 
+ } + + for exp, feedback in trace.hist: + for dim in dimension_improvements.keys(): + delta_attr = f"{dim}_delta" + if hasattr(feedback, delta_attr): + delta = getattr(feedback, delta_attr, 0) + if delta > 0: + dimension_improvements[dim] += delta + + return max(dimension_improvements, key=dimension_improvements.get) + + def get_persistent_weaknesses(self, trace: Trace) -> str: + """Identify persistent weaknesses from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "N/A" + + weaknesses = [] + if trace.hist: + _, last_feedback = trace.hist[-1] + for dim in ["comprehensiveness", "insight", "instruction_following", "readability"]: + score_attr = f"{dim}_score" + if hasattr(last_feedback, score_attr): + score = getattr(last_feedback, score_attr, 0) + if score < 6.0: + weaknesses.append(f"{dim} (score: {score:.1f})") + + return ", ".join(weaknesses) if weaknesses else "None identified" + + def get_effective_strategies(self, trace: Trace) -> str: + """Get effective strategies from trace history""" + most_successful = self._get_most_successful_action(trace) + if most_successful != "N/A": + return f"{most_successful} action type has been most successful" + return "No clear pattern yet" + + def get_system_description(self, trace: Trace) -> str: + """Get current system description from trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return "No previous system implementation. Starting from baseline." + + last_exp, last_feedback = trace.hist[-1] + + description = f"Current system status:\n" + description += f"- Last hypothesis: {getattr(last_exp, 'hypothesis', 'N/A')}\n" + description += f"- Last feedback: {getattr(last_feedback, 'reason', 'N/A')[:200]}\n" + description += f"- Success rate: {self._calculate_success_rate(trace):.1%}\n" + + return description + + def identify_performance_gaps(self, trace: Trace) -> str: + """Identify performance gaps from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return "Initial baseline establishment needed. Focus on core functionality." + + gaps = [] + + # Analyze recent failures + failed_experiments = [ + (exp, fb) for exp, fb in trace.hist[-5:] + if not getattr(fb, 'decision', False) + ] + + if failed_experiments: + gaps.append(f"- {len(failed_experiments)} recent failures indicate instability") + + # Check success rate + success_rate = self._calculate_success_rate(trace) + if success_rate < 0.5: + gaps.append(f"- Low success rate ({success_rate:.1%}) requires fundamental improvements") + elif success_rate < 0.8: + gaps.append(f"- Moderate success rate ({success_rate:.1%}) suggests refinement opportunities") + + return "\n".join(gaps) if gaps else "System performing well. Focus on advanced optimizations." 
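The trace-analysis helpers above rely on an attribute convention for the feedback objects stored in trace.hist: a boolean decision plus per-dimension <dim>_score and <dim>_delta fields. The sketch below illustrates that contract with SimpleNamespace stand-ins; the concrete experiment/feedback classes are not shown in this patch, so the field values here are assumptions for illustration only.

```python
# Illustration only: fake trace entries that satisfy the attribute contract used above.
from types import SimpleNamespace

from rdagent.scenarios.agentic_sys.proposal import AgenticSysHypothesisGen

feedback = SimpleNamespace(
    decision=True,
    comprehensiveness_score=5.5, insight_score=7.0,
    instruction_score=8.0, readability_score=6.5,
    insight_delta=1.2, readability_delta=0.3,  # only improved dimensions carry deltas here
)
exp = SimpleNamespace(hypothesis="Add a reflection step", action_type="Information_Gathering")
trace = SimpleNamespace(hist=[(exp, feedback)])

# Bypass __init__ for the sketch; these helpers do not touch instance state.
gen = AgenticSysHypothesisGen.__new__(AgenticSysHypothesisGen)

print(gen.get_most_improved_dimension(trace))  # -> "insight"
print(gen.get_persistent_weaknesses(trace))    # -> "comprehensiveness (score: 5.5)"
```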
+ + def extract_current_scores(self, trace: Trace) -> Dict[str, Optional[float]]: + """Extract current dimension scores from latest feedback""" + if not hasattr(trace, 'hist') or not trace.hist: + return { + "comprehensiveness": None, + "insight": None, + "instruction_following": None, + "readability": None + } + + _, last_feedback = trace.hist[-1] + + return { + "comprehensiveness": getattr(last_feedback, 'comprehensiveness_score', None), + "insight": getattr(last_feedback, 'insight_score', None), + "instruction_following": getattr(last_feedback, 'instruction_score', None), + "readability": getattr(last_feedback, 'readability_score', None) + } + + def calculate_success_rate(self, trace: Trace) -> float: + """Calculate success rate from trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return 0.0 + + success_count = sum( + 1 for _, fb in trace.hist + if getattr(fb, 'decision', False) + ) + + return success_count / len(trace.hist) + + def extract_concise_observation(self, trace: Trace) -> str: + """Extract concise observation from trace""" + if not hasattr(trace, 'hist') or not trace.hist: + return "Starting baseline implementation" + + _, last_feedback = trace.hist[-1] + observations = getattr(last_feedback, 'observations', '') + + if observations: + first_sentence = observations.split('.')[0] + return first_sentence[:100] + "..." if len(first_sentence) > 100 else first_sentence + + return "Previous experiment completed" + + # ==================== RAG Methods ==================== + + def retrieve_cross_task_insights(self) -> List[Dict[str, Any]]: + """Retrieve insights from other similar tasks""" + # TODO: Implement actual knowledge base retrieval + return [] + + def retrieve_current_task_experiences(self, trace: Trace) -> List[Dict[str, Any]]: + """Retrieve relevant experiences from current task's trace history""" + if not hasattr(trace, 'hist') or not trace.hist: + return [] + + experiences = [] + for exp, fb in trace.hist[-5:]: + experiences.append({ + "hypothesis": getattr(exp, 'hypothesis', 'N/A'), + "approach": getattr(exp, 'action_type', 'N/A') if hasattr(exp, 'action_type') else 'N/A', + "improved_dims": self._extract_improved_dimensions(fb), + "lessons": getattr(fb, 'reason', 'N/A')[:200] + }) + + return experiences + + def retrieve_external_sources(self, trace: Trace) -> List[Dict[str, Any]]: + """Retrieve external sources + Args: + trace: Experiment trace + Returns: + List of external source dictionaries + """ + + #check if web search is available + if not self.web_search.client_health_check(): + logger.warning("SearxNG service unavailable. 
Skipping external search") + return [] + #prepare search content + task_description = getattr(self.scen, 'description', 'Automated research system') + knowledge_gaps = self._identify_performance_gaps(trace) + context = { + "weak_dimension": self._get_most_improved_dimension(trace), + "methodology": getattr(self.scen, 'task_type', '') + } + try: + #perform web search + external_sources = self.web_search.search_for_hypothesis( + task_description = task_description, + current_gaps = knowledge_gaps, + context = context, + ) + + logger.info(f"Retrieved {len(external_sources)} external sources") + return external_sources + except Exception as e: + logger.error(f"Failed to retrieve external sources: {e}") + return [] + + def identify_knowledge_gaps(self, trace): + """ + Identify knowledge gaps from trace history for external search + Args: + trace: Experiment trace + Returns: + List of knowledge gap descriptions + """ + gaps = [] + if not hasattr(trace, 'hist') or not trace.hist: + gaps.append("baseline system design") + gaps.append("evaluation metrics implementation") + return gaps + #analyze recent failures + for exp, feedback in trace.hist[-3:]: + if not getattr(feedback, 'decision', False): + reason = getattr(feedback, 'reason', '') + if 'error' in reason.lower(): + gaps.append("error handling strategies") + if 'coverage' in reason.lower(): + gaps.append("comprehensive task coverage techniques") + if 'insight' in reason.lower(): + gaps.append("methods to enhance insight generation") + + #check dimension scores + if hasattr(trace, 'hist') and trace.hist: + _, last_feedback = trace.hist[-1] + dimensions = { + 'comprehensiveness': getattr(last_feedback, 'comprehensiveness_score', 0), + 'insight': getattr(last_feedback, 'insight_score', 0), + 'instruction_following': getattr(last_feedback, 'instruction_score', 0), + 'readability': getattr(last_feedback, 'readability_score', 0) + } + #identify low scoring dimensions + for dim, score in dimensions.items(): + if score and score < 6.0: + gaps.append(f"improving {dim} techniques") + + return gaps if gaps else ["general agentic system optimization"] + + + def get_weak_dimension(self, trace): + """ + get the weakest evaluation dimension from trace history + """ + if not hasattr(trace, 'hist') or not trace.hist: + return None + _, last_feedback = trace.hist[-1] + dimensions = { + "comprehensiveness": getattr(last_feedback, 'comprehensiveness_score', 10), + "insight": getattr(last_feedback, 'insight_score', 10), + "instruction_following": getattr(last_feedback, 'instruction_score', 10), + "readability": getattr(last_feedback, 'readability_score', 10) + } + + if dimensions: + weakest = min(dimensions, key = lambda x: x[1]) + return weakest[0] + + return None + + + + + def extract_improved_dimensions(self, feedback) -> List[str]: + """Extract which dimensions improved from feedback""" + improved = [] + + for dim in ["comprehensiveness", "insight", "instruction_following", "readability"]: + delta_attr = f"{dim}_delta" + if hasattr(feedback, delta_attr) and getattr(feedback, delta_attr, 0) > 0: + improved.append(dim.replace("_", " ").title()) + + return improved if improved else ["None"] + + def get_fallback_hypothesis(self, context: Dict[str, Any]) -> Dict[str, Any]: + """Get fallback hypothesis when LLM parsing fails""" + return { + "action": "Information_Gathering", + "hypothesis": "Improve system based on previous feedback", + "target_dimensions": [ + { + "name": "Comprehensiveness", + "current_score": context["current_scores"]["comprehensiveness"] or 
0.0, + "target_score": (context["current_scores"]["comprehensiveness"] or 0.0) + 1.0, + "expected_improvement": 1.0, + "confidence": "Low" + } + ], + "current_gap": "Unable to generate structured hypothesis", + "rationale": "LLM response parsing failed. Using fallback hypothesis.", + "implementation_plan": { + "step_1": "Review previous feedback", + "step_2": "Implement basic improvements", + "step_3": "Validate changes" + }, + "risk_assessment": { + "potential_negative_impacts": [], + "mitigation_strategies": ["Incremental changes", "Thorough testing"] + }, + "success_criteria": { + "primary": "System runs without errors", + "secondary": ["Performance maintained or improved"], + "validation_method": "Manual verification" + }, + "concise_knowledge": "When LLM parsing fails, use incremental improvements" + } + + +class AgenticSysExpGen(ExpGen): + """Generate experiment based on hypothesis""" + + def __init__(self, scen: AgenticSysScen): + self.scen = scen + self.api_backend = APIBackend() + logger.info("AgenticSysExpGen initialized with T() template system") + + def gen(self, trace: Trace) -> AgenticSysExperiment: + """ + Generate experiment based on trace and hypothesis. + + Uses T() template system for task description generation. + + Args: + trace: Experiment trace + + Returns: + AgenticSysExperiment object + """ + logger.info("Generating experiment from hypothesis...") + + # Step 1: Get hypothesis from trace + hypothesis = self.get_latest_hypothesis(trace) + + # Step 2: Generate task description using T() + task_desc = self.generate_task_description_with_t(hypothesis, trace) + + # Step 3: Create experiment + main_task = Task(task_desc) + experiment = AgenticSysExperiment( + sub_tasks=[main_task] + ) + + # Step 4: Attach hypothesis and metadata + if hypothesis: + experiment.hypothesis = hypothesis.hypothesis + experiment.action_type = getattr(hypothesis, 'action_type', 'Information_Gathering') + experiment.target_dimensions = getattr(hypothesis, 'target_dimensions', []) + experiment.implementation_plan = getattr(hypothesis, 'implementation_plan', {}) + experiment.hypothesis_obj = hypothesis + else: + experiment.hypothesis = "Baseline implementation" + experiment.action_type = "Information_Gathering" + + logger.info(f"Generated experiment with action type: {experiment.action_type}") + + return experiment + + def get_latest_hypothesis(self, trace: Trace) -> Optional[Hypothesis]: + """Get the latest hypothesis from trace""" + if hasattr(trace, 'hypothesis') and trace.hypothesis: + return trace.hypothesis + + if hasattr(trace, 'hist') and trace.hist: + last_exp, _ = trace.hist[-1] + if hasattr(last_exp, 'hypothesis_obj'): + return last_exp.hypothesis_obj + + return None + + def generate_task_description_with_t( + self, + hypothesis: Optional[Hypothesis], + trace: Trace + ) -> str: + """ + Generate task description using T() template system. + + KEY METHOD: Shows how to use action-specific prompts with T(). 
+ + Args: + hypothesis: Hypothesis object + trace: Experiment trace + + Returns: + Task description string + """ + is_first_experiment = not (hasattr(trace, 'hist') and trace.hist) + + # First experiment: baseline task + if is_first_experiment: + return self.get_baseline_task() + + # No hypothesis: fallback + if not hypothesis: + return self.get_improvement_task_fallback(trace) + + # Generate task based on action type using T() + action_type = getattr(hypothesis, 'action_type', 'Information_Gathering') + + try: + # Use T() to render action-specific specification + action_spec = T(f"scenarios.agentic_sys.prompts:hypothesis_specification.{action_type}").r() + + # Build complete task description + task_desc = f"""Action: {action_type} + +Hypothesis: {hypothesis.hypothesis} + +Target Dimensions: +{self.format_target_dimensions(getattr(hypothesis, 'target_dimensions', []))} + +Implementation Plan: +{self.format_implementation_plan(getattr(hypothesis, 'implementation_plan', {}))} + +====== Action-Specific Guidelines ====== +{action_spec} + +====== Success Criteria ====== +{self.format_success_criteria(getattr(hypothesis, 'success_criteria', {}))} + +====== Risk Assessment ====== +{self.format_risk_assessment(getattr(hypothesis, 'risk_assessment', {}))} +""" + return task_desc + + except Exception as e: + logger.warning(f"Failed to use T() for action specification: {e}") + return self._get_improvement_task_fallback(trace) + + # ==================== Formatting Helper Methods ==================== + + def format_target_dimensions(self, target_dimensions: List[Dict]) -> str: + """Format target dimensions""" + if not target_dimensions: + return "- No specific dimension targets" + + lines = [] + for dim in target_dimensions: + name = dim.get('name', 'Unknown') + current = dim.get('current_score', 'N/A') + target = dim.get('target_score', 'N/A') + improvement = dim.get('expected_improvement', 'N/A') + confidence = dim.get('confidence', 'N/A') + + lines.append(f"- {name}: {current} → {target} (Δ{improvement}, confidence: {confidence})") + + return "\n".join(lines) + + def format_implementation_plan(self, plan: Dict) -> str: + """Format implementation plan""" + if not plan: + return "- No specific implementation plan" + + lines = [] + for key, value in plan.items(): + lines.append(f"- {key}: {value}") + + return "\n".join(lines) + + def format_success_criteria(self, criteria: Dict) -> str: + """Format success criteria""" + if not criteria: + return "- Complete implementation without errors" + + lines = [] + + primary = criteria.get('primary', None) + if primary: + lines.append(f"- Primary: {primary}") + + secondary = criteria.get('secondary', []) + if secondary: + lines.append("- Secondary:") + for criterion in secondary: + lines.append(f" * {criterion}") + + validation = criteria.get('validation_method', None) + if validation: + lines.append(f"- Validation: {validation}") + + return "\n".join(lines) if lines else "- Complete implementation without errors" + + def format_risk_assessment(self, risk_assessment: Dict) -> str: + """Format risk assessment""" + if not risk_assessment: + return "- No specific risks identified" + + lines = [] + + negative_impacts = risk_assessment.get('potential_negative_impacts', []) + if negative_impacts: + lines.append("Potential Negative Impacts:") + for impact in negative_impacts: + if isinstance(impact, dict): + dimension = impact.get('dimension', 'Unknown') + reason = impact.get('reason', 'N/A') + severity = impact.get('severity', 'N/A') + lines.append(f" - {dimension}: 
{reason} (Severity: {severity})") + else: + lines.append(f" - {impact}") + + mitigations = risk_assessment.get('mitigation_strategies', []) + if mitigations: + lines.append("\nMitigation Strategies:") + for strategy in mitigations: + lines.append(f" - {strategy}") + + return "\n".join(lines) if lines else "- No specific risks identified" + + def get_baseline_task(self) -> str: + """Get baseline task description for first experiment""" + competition = getattr(self.scen, "competition", 'general') if self.scen else 'general' + + return f"""Design and implement a baseline agentic system for {competition}. + +Requirements: +1. Create an AgenticSystem class for autonomous research task execution +2. Implement task execution with performance monitoring +3. Include metrics collection for DeepResearch dimensions: + - Comprehensiveness, Insight, Instruction Following, Readability +4. Add error handling and logging +5. Output results in structured JSON format + +Target Scores: Comprehensiveness ≥6.0, Insight ≥5.0, Instruction Following ≥7.0, Readability ≥6.0 +""" + + def get_improvement_task_fallback(self, trace: Trace) -> str: + """Fallback task generation when hypothesis unavailable""" + if not hasattr(trace, 'hist') or not trace.hist: + return self._get_baseline_task() + + last_exp, last_feedback = trace.hist[-1] + + decision = getattr(last_feedback, 'decision', None) + base_desc = "Enhance successful system" if decision else "Fix issues in previous implementation" + + feedback_reason = getattr(last_feedback, 'reason', 'No feedback')[:200] + + return f"""{base_desc} + +Previous feedback: {feedback_reason} + +Focus on improving lowest-scoring dimension. + +Current Scores: +{self.format_current_scores(last_feedback)} +""" + + def format_current_scores(self, feedback) -> str: + """Format current dimension scores""" + scores = { + "Comprehensiveness": getattr(feedback, 'comprehensiveness_score', 'N/A'), + "Insight": getattr(feedback, 'insight_score', 'N/A'), + "Instruction Following": getattr(feedback, 'instruction_score', 'N/A'), + "Readability": getattr(feedback, 'readability_score', 'N/A') + } + + return "\n".join(f"- {dim}: {score}" for dim, score in scores.items()) \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/research_task.py b/rdagent/scenarios/agentic_sys/research_task.py new file mode 100644 index 000000000..d2235f6f1 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/research_task.py @@ -0,0 +1,212 @@ +""" +DeepResearch Bench Dataset Loader for Agentic System +""" +import json +import logging +from pathlib import Path +from typing import Dict, List, Any, Optional +import requests +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +@dataclass +class ResearchTask: + """ + Research task from DeepResearch Bench + """ + task_id: str + title: str + description: str + domain: str + difficulty: str + evaluation_metrics: Dict[str, Any] + input_data: Optional[Dict] = None + expected_output: Optional[Dict] = None + metadata: Optional[Dict] = None + +class DeepResearchBenchLoader: + """ + Load and manage DeepResearch Bench loader + """ + def __init__(self, data_path, cache_dir): + """ + Initialize DeepResearch Bench Loader + + Args: + data_path: Path to local dataset (if already download) + cache_dir: Directory to cache downloaded data + """ + self.data_path = data_path + self.cache_dir = cache_dir + self.cache_dir.mkdir(parents = True, exist_ok = True) + self.tasks: List[ResearchTask] = [] + + def load_dataset(self, subset:str): + """ + Load 
DeepResearch Bench dataset + Args: + subset: Dataset subset to load (e.g., 'easy', 'medium', 'hard') + """ + logger.info(f"Loading DeepResearch Bench dataset (subset={subset})") + if self.data_path and self.data_path.exists(): + #load from local path + self.tasks = self.load_from_local(self.data_path, subset) + else: + #download from remote + self.tasks = self.download_and_load(subset) + logger.info(f"Loaded {len(self.tasks)} tasks from DeepResearch Bench (subset={subset})") + return self.tasks + + def load_from_local(self, data_path: Path, subset: str): + "load dataset from local path" + tasks = [] + #assume JSON format + json_files = list(data_path.glob(".json")) + + for json_file in json_files: + try: + with open(json_file, 'r') as f: + data = json.load(f) + + #Parse task data + if isinstance(data,list): + for item in data: + task = self.parse_task(item) + if self.matches_subset(task, subset): + tasks.append(task) + else: + task = self.parse_task(data) + if self.matches_subset(task, subset): + tasks.append(task) + except Exception as e: + logger.error(f"Failed to load task from {json_file}: {e}") + return tasks + + def download_and_load(self, subset): + "download dataset from DeepResearch Bench and load" + base_url = "https://github.com/Ayanami0730/deep_research_bench" + + tasks = [] + cache_file = self.cache_dir / f"tasks_{subset}.json" + + #Check cache first + if cache_file.exists(): + logger.info(f"Loading from cache: {cache_file}") + with open(cache_file, 'r') as f: + cached_data = json.load(f) + return [self.parse_task(item) for item in cached_data] + try: + #Download task list + logger.info(f"Downloading tasks from {base_url}") + response = requests.get(f"{base_url}/tree/main/data/{subset}_data/{subset}.jsonl",timeout=30) + response.raise_for_status() + data = response.json() + tasks_data = data.get('tasks', []) + + #Parse tasks + for item in tasks_data: + task = self.parse_task(item) + tasks.append(task) + + #Cache downloaded data + with open(cache_file, 'w') as f: + json.dump(tasks_data, f) + + logger.info(f"Downloaded and cached {len(tasks)} tasks") + except Exception as e: + logger.error(f"Failed to download dataset: {e}") + tasks = self.create_mock_tasks(subset) + + return tasks + + def parse_task(self, data: Dict) -> ResearchTask: + """Parse task data into ResearchTask object""" + return ResearchTask( + task_id=data.get('id', 'unknown'), + title=data.get('title', ''), + description=data.get('description', ''), + domain=data.get('domain', 'general'), + difficulty=data.get('difficulty', 'medium'), + evaluation_criteria=data.get('evaluation_criteria', {}), + input_data=data.get('input_data'), + expected_output=data.get('expected_output'), + metadata=data.get('metadata', {}) + ) + + def matches_subset(self, task: ResearchTask, subset: str) -> bool: + """Check if task matches requested subset""" + if subset == "all": + return True + return task.difficulty.lower() == subset.lower() + + def create_mock_tasks(self, subset: str) -> List[ResearchTask]: + """Create mock tasks for testing when download fails""" + logger.warning("Creating mock tasks for testing") + + mock_tasks = [ + { + 'id': 'mock_001', + 'title': 'Literature Review Synthesis', + 'description': 'Synthesize findings from multiple research papers', + 'domain': 'research_synthesis', + 'difficulty': 'medium', + 'evaluation_criteria': { + 'completeness': 0.3, + 'coherence': 0.3, + 'accuracy': 0.4 + }, + 'input_data': { + 'papers': ['paper1.pdf', 'paper2.pdf', 'paper3.pdf'], + 'query': 'What are the main findings on topic 
X?' + } + }, + { + 'id': 'mock_002', + 'title': 'Hypothesis Generation', + 'description': 'Generate research hypotheses based on existing literature', + 'domain': 'hypothesis_generation', + 'difficulty': 'hard', + 'evaluation_criteria': { + 'novelty': 0.4, + 'feasibility': 0.3, + 'clarity': 0.3 + } + }, + { + 'id': 'mock_003', + 'title': 'Experiment Design', + 'description': 'Design an experiment to test a given hypothesis', + 'domain': 'experiment_design', + 'difficulty': 'easy', + 'evaluation_criteria': { + 'validity': 0.4, + 'completeness': 0.3, + 'practicality': 0.3 + } + } + ] + + return [self._parse_task(task) for task in mock_tasks + if self._matches_subset(self._parse_task(task), subset)] + + def get_task_by_id(self, task_id: str) -> Optional[ResearchTask]: + """Get specific task by ID""" + for task in self.tasks: + if task.task_id == task_id: + return task + return None + + def get_tasks_by_domain(self, domain: str): + """Get tasks filtered by domain""" + return [task for task in self.tasks if task.domain == domain] + + def get_tasks_by_difficulty(self, difficulty: str): + """Get tasks filtered by difficulty""" + return [task for task in self.tasks + if task.difficulty.lower() == difficulty.lower()] + + + + + diff --git a/rdagent/scenarios/agentic_sys/scen.py b/rdagent/scenarios/agentic_sys/scen.py new file mode 100644 index 000000000..0b9fdecbb --- /dev/null +++ b/rdagent/scenarios/agentic_sys/scen.py @@ -0,0 +1,299 @@ +from typing import Any, Dict, Optional +from rdagent.core.experiment import Task +from rdagent.core.scenario import Scenario +from rdagent.scenarios.agentic_sys.evaluator import DeepResearchEvaluator, EvaluationResult + +#define experiment scenario +#scenario abstraction for agentic system development +#support different competition contexts +class AgenticSysScen(Scenario): + def __init__(self, competition: str,evaluation_weights: Optional[Dict[str, float]] = None) -> None: + self.competition = competition + + #Initialize DeepResearch Bench evaluator + self.evaluator = DeepResearchEvaluator(dimension_weights=evaluation_weights) + + # Set competition-specific evaluation weights + self.evaluation_weights = evaluation_weights or { + 'comprehensiveness': 0.25, + 'insight': 0.25, + 'instruction_following': 0.25, + 'readability': 0.25 + } + + + # Implement dummy functions for the abstract methods in Scenario + @property + def background(self) -> str: + """Background information""" + background_template = { + "deepresearch": "Advanced AI agent research focusing on autonomous reasoning and complex problem solving", + "tool_usage": "Development of agents with sophisticated tool usage and API integration capabilities", + "multi_agent": "Multi-agent systems with coordination, communication, and collaborative task execution", + "planning": "Agent planning systems with strategic thinking and multi-step task decomposition", + "general": "General-purpose agentic system development with broad task handling capabilities" + } + base_desc = background_template.get(self.competition, f"Agentic system development for {self.competition}") + + evaluation_info = f""" + + Evaluation Framework: DeepResearch Bench Standards + - Comprehensiveness (weight: {self.evaluator.weights['comprehensiveness']:.2f}): Breadth and depth of coverage + - Insight (weight: {self.evaluator.weights['insight']:.2f}): Causal reasoning and originality + - Instruction Following (weight: {self.evaluator.weights['instruction_following']:.2f}): Task compliance + - Readability (weight: 
{self.evaluator.weights['readability']:.2f}): Clarity and presentation + """ + + return f"""Competition: {self.competition},Objective: {base_desc}, Focus: Create autonomous AI agents that can execute complex tasks with minimal human intervention. +Key requirements include task planning, execution monitoring, error handling, and performance optimization. {evaluation_info}""" + + + #running environment description and standards + def get_runtime_environment(self) -> str: + """Get the runtime environment information""" + return f"""Runtime Environment for competition {self.competition}: + Base Requirements: + - Python 3.8+ execution environment + - JSON serialization support for results + - File I/O capabilities for workspace management + - Standard Library access + + Agent Framework: + - Task execution and monitoring system + - Performance metrics collection module (success rate, average time, error count) + - Error handling and logging mechanisms + - Structured output format (JSON) + - DeepResearch Bench evaluation integration + + Execution Context: + - Isolated workspace directory + - Configurable timeout settings + - Resource monitoring (CPU, Memory usage) and cleanup + - Result validation and reporting + - Multi-dimensional quality assessment (Comprehensiveness, Insight, Instruction Following, Readability) + """ + + #task content analyze + def get_scenario_all_desc( + self, + task: Task | None = None, + filtered_tag: str | None = None, + simple_background: bool | None = None, + ) -> str: + """Combine all descriptions together""" + parts = [] + + #1. basic information processing + if simple_background: + parts.append(f"Competition: {self.competition}. Develop an autonomous agentic system.") + else: + parts.append(self.background) + parts.append(self.get_runtime_environment()) + + #2. 
task specific processing + if task: + parts.append(f"\n--- Current Task ---") + parts.append(task.description) + task_desc = task.description.lower() + if 'memory' in task_desc: + parts.append("Additional Focus: Memory management and state persistence.") + elif 'parallel' in task_desc: + parts.append("Additional Focus: Parallel execution and concurrency handling.") + elif 'planning' in task_desc: + parts.append("Additional Focus: Advanced planning and multi-step task decomposition.") + + # Add evaluation criteria for this task + parts.append(self.get_task_evaluation_criteria(task)) + + if filtered_tag: + parts.append(f"\n--- Filtered Tags: {filtered_tag} ---") + tag_guidance = self.get_tag_guidance(filtered_tag) + if tag_guidance: + parts.append(tag_guidance) + + if not simple_background: + parts.append(self.get_success_criteria()) + + return "\n".join(parts) + + def get_task_evaluation_criteria(self, task: Task) -> str: + """Get evaluation criteria specific to the task""" + + #extract task-specific information + task_desc = task.description.lower() if task and task.description else "" + task_domain = getattr(task, 'domain', 'general') if task else 'general' + + focus_areas = [] + emphasis_dimensions = {} + + #Analyze task description to adjust criteria emphasis + if 'comprehensive' in task_desc: + focus_areas.append("comprehensive coverage") + emphasis_dimensions['comprehensiveness'] = 'emphasized' + + if 'analyze' in task_desc or 'explain' in task_desc or 'reason' in task_desc: + focus_areas.append("analytical reasoning") + emphasis_dimensions['insight'] = 'emphasized' + + if 'follow' in task_desc or 'present' in task_desc or 'format' in task_desc: + focus_areas.append("strict instruction adherence") + emphasis_dimensions['instruction_following'] = 'emphasized' + + if 'report' in task_desc or 'present' in task_desc or 'clarity' in task_desc: + focus_areas.append("clear presentation") + emphasis_dimensions['readability'] = 'emphasized' + + #build focus statement + focus_statement = "" + if focus_areas: + focus_statement = f"\n**Task Focus**: This task particularly emphasizes {', '.join(focus_areas)}.\n" + else: + focus_statement = "\n**Task Focus**: Standard evaluation across all dimensions, including comprehensiveness, Insight, Instruction following and readability\n" + + #domain specific guidance + domain_guidance = self.get_domain_specific_guidance(task_domain) + + #build criteria with emphasis markers + comp_marker = emphasis_dimensions.get('comprehensiveness', '') + insight_marker = emphasis_dimensions.get('insight', '') + instruction_marker = emphasis_dimensions.get('instruction_following', '') + readability_marker = emphasis_dimensions.get('readability', '') + + return f""" +--- Evaluation Criteria (DeepResearch Bench) --- +{focus_statement} + +Your solution will be evaluated on four dimensions (0-10 scale each): + +1. Comprehensiveness ({self.evaluator.weights['comprehensiveness']:.0%} weight): + - Coverage of all required subtopics + - Depth of analysis with evidence + - Multiple perspectives considered + - No major omissions + +2. Insight ({self.evaluator.weights['insight']:.0%} weight): + - Causal reasoning and why-think + - Quantified analysis with data + - Non-obvious implications identified + - Novel synthesis or frameworks + +3. Instruction Following ({self.evaluator.weights['instruction_following']:.0%} weight): + - Answers all sub-questions + - Respects scope and constraints + - Required deliverables present + - Avoids out-of-scope content + +4. 
Readability ({self.evaluator.weights['readability']:.0%} weight): + - Clear structure and organization + - Fluent, precise language + - Effective data presentation + - Proper formatting + +Overall Score: Weighted sum of four dimensions +Target: >= 7.0/10.0 overall for success +""" + + def get_tag_guidance(self, tag): + """acquire specific guidance based on tag""" + tag_guidance = { + "performance": "Optimize for speed and resource efficiency. Evaluation: Focus on insight (efficiency analysis) and comprehensiveness (performance metrics).", + "robustness": "Focus on error handling and system stability. Evaluation: Emphasize comprehensiveness (edge cases) and instruction following (requirements).", + "scalability": "Design for handling larger and more complex tasks. Evaluation: Highlight insight (scalability analysis) and comprehensiveness (architectural depth).", + "planning": "Emphasize strategic thinking and multi-step execution. Evaluation: Prioritize insight (causal reasoning) and comprehensiveness (planning depth).", + "coordination": "Multi-agent communication and collaboration. Evaluation: Focus on comprehensiveness (interaction coverage) and readability (clear protocols)." + } + return tag_guidance.get(tag.lower(), f"Focus on {tag} aspects.") + + + + def get_success_criteria(self): + '''acquire success criteria with DeepResearch Bench standards''' + return f""" +--- Success Criteria --- + +Primary Metrics (Execution): +- Task Success Rate: >= 70% +- Average Execution Time: Within reasonable limits +- Error Rate: < 10% + +Quality Metrics (DeepResearch Bench): +- Comprehensiveness: >= 6.0/10.0 (adequate coverage) +- Insight: >= 6.0/10.0 (clear reasoning) +- Instruction Following: >= 7.0/10.0 (compliant) +- Readability: >= 6.0/10.0 (clear presentation) +- Overall Score: >= 7.0/10.0 + +Implementation Requirements: +- Clean, maintainable code structure +- Proper error handling and logging +- JSON-formatted result output with evaluation scores +- Autonomous task execution capability +- Documented reasoning and decision-making process + +Scoring Guidance: +- 0-2: Poor/Missing - Major issues +- 4-6: Basic/Adequate - Meets minimum requirements +- 6-8: Good/Complete - Solid implementation +- 8-10: Excellent/Exhaustive - Outstanding quality +""" + + def evaluate_output( + self, + output: Any, + task: Optional[Task] = None, + reference_output: Optional[Any] = None + ) -> EvaluationResult: + """ + Evaluate output using DeepResearch Bench standards + + Args: + output: The agent's output to evaluate + task: Optional task for context + reference_output: Optional reference for normalization + + Returns: + EvaluationResult with scores for all dimensions + """ + # Prepare task requirements + task_requirements = {} + task_context = {} + + if task: + task_context = { + 'task_description': task.description, + 'competition': self.competition + } + + # Extract requirements from task description + task_desc_lower = task.description.lower() + task_requirements['required_sections'] = [] + + if 'results' in task_desc_lower or 'output' in task_desc_lower: + task_requirements['required_sections'].append('results') + if 'analysis' in task_desc_lower or 'evaluate' in task_desc_lower: + task_requirements['required_sections'].append('analysis') + if 'metrics' in task_desc_lower or 'performance' in task_desc_lower: + task_requirements['required_sections'].append('metrics') + + # Evaluate using the evaluator + reference_result = None + if reference_output: + reference_result = self.evaluator.evaluate( + 
reference_output, + task_requirements, + task_context + ) + + result = self.evaluator.evaluate( + output, + task_requirements, + task_context, + reference_result + ) + + return result + + @property + def rich_style_description(self) -> str: + """Rich style description to present""" + return f"AgenticSysScen for competition: {self.competition} with DeepResearch Bench evaluation" \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh b/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh new file mode 100644 index 000000000..124b50961 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# filepath: /data/userdata/v-wangzhu/RD-Agent/rdagent/scenarios/agentic_sys/tools/deploy_searxng.sh + +# SearxNG Deployment Script +set -e + +SEARXNG_DIR="${HOME}/apps/searxng" +SEARXNG_PORT=8888 +CONTAINER_NAME="searxng" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +deploy() { + log_info "Deploying SearxNG..." + mkdir -p "${SEARXNG_DIR}/config" "${SEARXNG_DIR}/data" + + if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + log_warn "Container exists, removing..." + docker rm -f ${CONTAINER_NAME} + fi + + docker run --name ${CONTAINER_NAME} -d \ + -p ${SEARXNG_PORT}:8080 \ + -v "${SEARXNG_DIR}/config:/etc/searxng/" \ + -v "${SEARXNG_DIR}/data:/var/cache/searxng/" \ + --restart unless-stopped \ + docker.io/searxng/searxng:latest + + log_info "SearxNG deployed at http://localhost:${SEARXNG_PORT}" + sleep 5 +} + +update_config() { + log_info "Updating configuration..." + CONFIG_FILE="${SEARXNG_DIR}/config/settings.yml" + + local attempts=0 + while [ ! -f "$CONFIG_FILE" ] && [ $attempts -lt 10 ]; do + log_info "Waiting for config file... ($((attempts+1))/10)" + sleep 2 + attempts=$((attempts+1)) + done + + if [ ! -f "$CONFIG_FILE" ]; then + log_error "Config file not found!" + exit 1 + fi + + sudo chmod 777 -R "${SEARXNG_DIR}/config/" + + if ! command -v yq &> /dev/null; then + log_warn "Installing yq..." + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + + yq -i '.search.formats = ["html", "json", "csv"]' "$CONFIG_FILE" + log_info "Configuration updated" + + restart +} + +restart() { + log_info "Restarting SearxNG..." 
+ docker restart ${CONTAINER_NAME} >/dev/null + log_info "Restarted successfully" + sleep 3 +} + +status() { + log_info "SearxNG Status:" + if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo -e "${GREEN} Status: Running${NC}" + echo " URL: http://localhost:${SEARXNG_PORT}" + else + echo -e "${RED} Status: Not running${NC}" + fi +} + +case "${1:-help}" in + deploy) deploy ;; + update_config) update_config ;; + restart) restart ;; + status) status ;; + *) echo "Usage: $0 {deploy|update_config|restart|status}" ;; +esac \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/how_to_use.md b/rdagent/scenarios/agentic_sys/tools/how_to_use.md new file mode 100644 index 000000000..441fc1839 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/how_to_use.md @@ -0,0 +1,19 @@ +from pathlib import Path +from rdagent.scenarios.agentic_sys.tools.web_search import create_web_search_tool + +# Initialize +config_path = Path(__file__).parent / "tools" / "search_config.yaml" +search_tool = create_web_search_tool(config_path) + +# Search for hypothesis +results = search_tool.search_for_hypothesis( + task_description="Improve agentic system", + current_gaps=["information gathering"], + context={'weak_dimension': 'comprehensiveness'} +) + +# Process results +for result in results: + print(f"Title: {result['title']}") + print(f"URL: {result['url']}") + print(f"Relevance: {result['relevance']}") \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/search_config.yaml b/rdagent/scenarios/agentic_sys/tools/search_config.yaml new file mode 100644 index 000000000..1216a09fc --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/search_config.yaml @@ -0,0 +1,38 @@ +# SearxNG Configuration +base_url: "http://localhost:8888" # Change to your SearxNG server +timeout: 30 +max_retries: 3 +default_format: "json" +relevance_threshold: 0.3 + +# Search Strategy +max_results_per_query: 5 +preferred_engines: + - google + - bing + - google_scholar + +# Credibility Scoring +credibility_weights: + edu_domain: 0.3 + gov_domain: 0.3 + org_domain: 0.2 + academic_engine: 0.4 + tech_blog: 0.1 + +# Query Generation +max_queries_per_search: 5 +query_templates: + gap_specific: "how to improve {gap}" + best_practice: "best practices for {gap}" + case_study: "{methodology} case studies" + optimization: "improve {dimension} in research systems" + +#Search categories +categories: + - general + - science + +default_language: "auto" + +safesearch: 0 # 0: off, 1: moderate, 2: strict \ No newline at end of file diff --git a/rdagent/scenarios/agentic_sys/tools/searxng_client.py b/rdagent/scenarios/agentic_sys/tools/searxng_client.py new file mode 100644 index 000000000..79566ecb8 --- /dev/null +++ b/rdagent/scenarios/agentic_sys/tools/searxng_client.py @@ -0,0 +1,220 @@ +""" +SearXNG client for web search. Based on the deployment script's search API +""" +import requests +import json +import csv +from io import StringIO +from pathlib import Path +from typing import List, Dict, Any, Optional +from rdagent.log import rdagent_logger as logger +import yaml + +class SearxNGClient: + """ + Client for SearxNG search engine with multi-format support. 
+
+    Features:
+    - Multiple output formats (json, csv, html)
+    - Result filtering by relevance score
+    - Error handling and retry logic
+    - Configurable via a YAML config file
+    """
+
+    def __init__(self, config_path=None):
+        """
+        Initialize SearxNG client
+        Args:
+            config_path (Optional[Path]): Path to the SearxNG configuration file
+        """
+        # load configuration
+        if config_path and config_path.exists():
+            with open(config_path, 'r') as f:
+                config = yaml.safe_load(f)
+        else:
+            # base configuration
+            config = {
+                'base_url': "http://localhost:8888",
+                'timeout': 30,
+                'max_retries': 3,
+                'default_format': 'json',
+                'relevance_threshold': 0.3
+            }
+
+        self.base_url = config.get('base_url', 'http://localhost:8888')
+        self.timeout = config.get('timeout', 30)
+        self.max_retries = config.get('max_retries', 3)
+        self.default_format = config.get('default_format', 'json')
+        self.relevance_threshold = config.get('relevance_threshold', 0.3)
+
+        logger.info(f"SearxNGClient initialized with base_url: {self.base_url}")
+
+    def search(self, query, format=None, categories=None, engines=None,
+               languages='auto', time_range=None, safesearch=0):
+        """Perform web search using SearxNG API."""
+        if not query or not query.strip():
+            logger.warning("Empty query provided to SearxNGClient.search")
+            return self.empty_result(query)
+        format = format or self.default_format
+
+        # build search parameters
+        params = {
+            'q': query,
+            'format': format
+        }
+        if categories:
+            params['categories'] = ','.join(categories)
+
+        if engines:
+            params['engines'] = ','.join(engines)
+
+        if languages != 'auto':
+            params['language'] = languages
+
+        if time_range:
+            params['time_range'] = time_range
+
+        if safesearch > 0:
+            params['safesearch'] = safesearch
+
+        # perform search with retry logic
+        for attempt in range(self.max_retries):
+            try:
+                logger.info(f"Searching SearxNG (attempt {attempt + 1}/{self.max_retries}): {query}")
+                response = requests.get(
+                    f"{self.base_url}/search",
+                    params=params,
+                    timeout=self.timeout
+                )
+                response.raise_for_status()
+
+                # Parse response based on format
+                if format == 'json':
+                    result = response.json()
+                elif format == 'csv':
+                    result = self.parse_csv_response(response.text, query)
+                elif format == 'html':
+                    result = self.parse_html_response(response.text, query)
+                else:
+                    raise ValueError(f"Unsupported format: {format}")
+                logger.info(f"Search completed: {len(result.get('results', []))} results")
+                return result
+
+            except requests.Timeout:
+                logger.warning("Search timeout")
+                if attempt == self.max_retries - 1:
+                    return self.empty_result(query, error="Timeout")
+
+            except requests.RequestException as e:
+                logger.error(f"Search request failed: {e}")
+                if attempt == self.max_retries - 1:
+                    return self.empty_result(query, error=str(e))
+
+            except Exception as e:
+                logger.error(f"Error processing search response: {e}")
+                return self.empty_result(query, error=str(e))
+        return self.empty_result(query)
+
+    def search_json(self, query, **kwargs):
+        """Search with JSON output"""
+        return self.search(query, format='json', **kwargs)
+
+    def search_with_filter(
+        self,
+        query,
+        min_score=None,
+        max_results=None,
+        **kwargs
+    ):
+        """Search and filter results by relevance score.
+
+        Args:
+            query: Search query
+            min_score: Minimum relevance score (0 to 1)
+            max_results: Maximum number of results to return
+            **kwargs: Additional search parameters forwarded to ``search``
+        Returns:
+            Filtered list of search results
+        """
+        min_score = min_score or self.relevance_threshold
+        result = self.search(query, format='json', **kwargs)
+
+        # Filter out low-scoring results
+        filtered = [
+            r for r in result.get('results', [])
+            if r.get('score', 0) >= min_score
+        ]
+
+        # Sort by score (descending)
+        filtered.sort(key=lambda r: r.get('score', 0), reverse=True)
+
+        # Limit the number of results
+        if max_results:
+            filtered = filtered[:max_results]
+
+        return filtered
+
+    def empty_result(self, query, error=None):
+        """Return an empty search result."""
+        result = {
+            'query': query,
+            'number_of_results': 0,
+            'results': [],
+            'answers': [],
+            'suggestions': [],
+            'corrections': [],
+            'infoboxes': [],
+            'unresponsive_engines': [],
+        }
+        if error:
+            result['error'] = error
+        return result
+
+    def parse_csv_response(self, csv_text, query):
+        """Parse a CSV response into the JSON result structure."""
+        results = []
+        reader = csv.DictReader(StringIO(csv_text))
+        for row in reader:
+            results.append({
+                'title': row.get('title', ''),
+                'url': row.get('url', ''),
+                'content': row.get('content', ''),
+                'score': 1 / (len(results) + 1),  # Simple score based on rank order
+            })
+        return {
+            'query': query,
+            'number_of_results': len(results),
+            'results': results,
+            'answers': [],
+            'suggestions': [],
+        }
+
+    def parse_html_response(self, html_text, query):
+        """Parse an HTML response (not implemented yet; returns an empty result set)."""
+        return {
+            'query': query,
+            'number_of_results': 0,
+            'results': [],
+            'answers': [],
+            'suggestions': [],
+        }
+
+
+def create_searxng_client(config_path=None):
+    """Factory function to create a SearxNG client."""
+    return SearxNGClient(config_path)
diff --git a/rdagent/scenarios/agentic_sys/tools/web_search.py b/rdagent/scenarios/agentic_sys/tools/web_search.py
new file mode 100644
index 000000000..b2297f373
--- /dev/null
+++ b/rdagent/scenarios/agentic_sys/tools/web_search.py
@@ -0,0 +1,226 @@
+"""
+Web search tool for the agentic system.
+Uses SearxNG for external knowledge retrieval.
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from rdagent.log import rdagent_logger as logger
+from rdagent.scenarios.agentic_sys.tools.searxng_client import SearxNGClient
+
+
+class WebSearchTool:
+    """
+    High-level web search tool for hypothesis generation.
+
+    Features:
+    - Query generation from context
+    - Multi-source search support
+    - Result ranking and filtering
+    - Source validation
+    - Knowledge extraction
+    """
+
+    def __init__(self, config_path: Optional[Path] = None):
+        """
+        Initialize WebSearchTool with a SearxNG client.
+
+        Args:
+            config_path (Optional[Path]): Path to the SearxNG configuration file
+        """
+        self.client = SearxNGClient(config_path)
+        # Search strategy configuration.
+        # Note: these defaults are hard-coded here; the preferred_engines list in
+        # search_config.yaml is not read by this class.
+        self.max_results_per_query = 5
+        self.min_relevance_score = 0.3
+        self.preferred_engines = ['duckduckgo', 'google', 'bing']
+        logger.info("WebSearchTool initialized with SearxNGClient")
+
+    def search_for_hypothesis(self, task_description, current_gaps, context):
+        """
+        Search for information to support hypothesis generation.
+
+        Args:
+            task_description: Description of the research task
+            current_gaps: List of identified knowledge gaps
+            context: Additional context
+        Returns:
+            List of relevant external sources with metadata
+        """
+        # Generate search queries
+        queries = self.generate_queries(task_description, current_gaps, context)
+
+        # Execute searches
+        all_results = []
+        for query in queries:
+            try:
+                results = self.client.search_with_filter(
+                    query=query,
+                    min_score=self.min_relevance_score,
+                    max_results=self.max_results_per_query,
+                    engines=self.preferred_engines,
+                )
+                all_results.extend(results)
+            except Exception as e:
+                logger.error(f"Error during search for query '{query}': {e}")
+                continue
+
+        # Deduplicate and rank results
+        ranked_results = self.deduplicate_results(all_results)
+
+        # Validate sources
+        validated_results = self.validate_sources(ranked_results)
+
+        # Extract key information
+        enriched = self.extract_knowledge(validated_results)
+
+        logger.info(f"Search completed with {len(enriched)} relevant sources found")
+        return enriched
+
+    def generate_queries(self, task_description, gaps, context):
+        """
+        Generate search queries based on the task and identified gaps.
+
+        Strategy:
+        1. Primary queries: direct task-related questions
+        2. Gap-specific queries: target identified knowledge gaps
+        3. Exploratory queries: adjacent topics and methodologies
+        """
+        queries = []
+        # Primary query
+        if task_description:
+            queries.append(task_description[:200])
+
+        # Gap-specific queries
+        for gap in gaps:
+            queries.append(f"how to improve {gap}")
+
+        # Context-based queries
+        if context:
+            # If previous experiments were weak in a specific dimension
+            if 'weak_dimension' in context:
+                dim = context['weak_dimension']
+                queries.append(f"improve {dim} in research system")
+                queries.append(f"{dim} optimization techniques")
+
+            # If a specific methodology is being used
+            if 'methodology' in context:
+                method = context['methodology']
+                queries.append(f"{method} case studies")
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_queries = []
+        for q in queries:
+            if q.lower() not in seen:
+                seen.add(q.lower())
+                unique_queries.append(q)
+        logger.info(f"Generated {len(unique_queries)} search queries")
+        return unique_queries
+
+    def deduplicate_results(self, results):
+        """
+        Remove duplicate results based on URL, keeping the highest-scored entry per URL.
+        """
+        seen_urls = set()
+        deduplicated = []
+
+        # Sort by score first so the best-scored duplicate wins
+        sorted_results = sorted(
+            results,
+            key=lambda x: x.get('score', 0), reverse=True
+        )
+        for result in sorted_results:
+            url = result.get('url')
+            if url and url not in seen_urls:
+                seen_urls.add(url)
+                deduplicated.append(result)
+        return deduplicated
+
+    def extract_knowledge(self, results):
+        """
+        Extract and structure key knowledge from search results.
+
+        Args:
+            results: Validated search results
+        Returns:
+            Enriched results with structured knowledge
+        """
+        enriched = []
+        for idx, result in enumerate(results):
+            enriched_result = {
+                'citation': f"{result.get('title', 'Untitled')} ({result.get('url', 'No URL')})",
+                'title': result.get('title', ''),
+                'url': result.get('url', ''),
+                'summary': result.get('content', '')[:300],  # First 300 chars
+                'relevance': result.get('score', 0),
+                'credibility': result.get('credibility', 0.5),
+                'credibility_level': result.get('credibility_level', 'Medium'),
+                'source_engine': result.get('engine', 'unknown'),
+                'rank': idx,
+            }
+            enriched.append(enriched_result)
+        return enriched
+
+    def validate_sources(self, results):
+        """
+        Validate source credibility.
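+
+        Args:
+            results: Deduplicated search results
+        Returns:
+            The same results annotated with ``credibility`` and ``credibility_level``,
+            sorted by credibility and score. Scores come from the heuristics in
+            ``calculate_credibility``; the ``credibility_weights`` section of
+            search_config.yaml is not consulted here.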
+ """ + validated = [] + for result in results: + url = result.get('url', '') + + #calculate credibility score + credibility = self.calculate_credibility(url, result) + + #Add credibility to result + result['credibility'] = credibility + result['credibility_level'] = self.credibility_level(credibility) + validated.append(result) + + validated.sort( + key = lambda r: (r.get('credibility', 0), r.get('score', 0)), + reverse=True + ) + return validated + + def calculate_credibility(self, url, result): + """ + Calculate source credibility score based on heuristics + """ + score = 0.5 # Baseline + + # Domain-based scoring + if any(domain in url.lower() for domain in ['.edu', '.gov', '.org']): + score += 0.3 + elif any(domain in url.lower() for domain in ['arxiv.org', 'scholar.google', 'pubmed']): + score += 0.4 # Academic sources + elif any(domain in url.lower() for domain in ['medium.com', 'towardsdatascience']): + score += 0.1 # Tech blogs + + # Title-based signals + title = result.get('title', '').lower() + if any(keyword in title for keyword in ['research', 'study', 'analysis', 'survey']): + score += 0.1 + + # Engine-based trust + engine = result.get('engine', '') + if engine in ['google_scholar', 'semantic_scholar']: + score += 0.2 + + # Normalize to [0, 1] + return min(1.0, score) + + def credibility_level(self, score): + """ + convert credibility score to qualitative label + """ + if score >= 0.8: + return 'High' + elif score >= 0.5: + return 'Medium' + else: + return 'Low' + +def create_web_search_tool(config_path): + """ + Factory function to create web search tool + """ + return WebSearchTool(config_path=config_path) \ No newline at end of file diff --git a/rdagent/scenarios/data_science/experiment/experiment.py b/rdagent/scenarios/data_science/experiment/experiment.py index 209effdae..4af528fd0 100644 --- a/rdagent/scenarios/data_science/experiment/experiment.py +++ b/rdagent/scenarios/data_science/experiment/experiment.py @@ -41,3 +41,19 @@ def is_ready_to_run(self) -> bool: def set_local_selection(self, local_selection: tuple[int, ...]) -> None: self.local_selection = local_selection + + +class ExperimentResult: + def __init__( + self, + success: bool, + metrics: dict[str, float] | pd.DataFrame | None = None, + logs: str | None = None, + errors: str | None = None, + metadata: dict | None = None, + ) -> None: + self.success = success + self.metrics = metrics + self.logs = logs + self.errors = errors + self.metadata = metadata if metadata is not None else {} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/base.py b/rdagent/scenarios/data_science/proposal/exp_gen/base.py index d94a054ac..ca5863cb9 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/base.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/base.py @@ -61,8 +61,6 @@ def __init__(self, scen: DataScienceScen, knowledge_base: KnowledgeBase | None = self.sota_exp_to_submit: DSExperiment | None = None # grab the global best exp to submit - self.uncommitted_experiments: dict[int, DSExperiment] = {} # loop_id -> DSExperiment - def should_inject_diversity(self, current_selection: tuple[int, ...] | None = None) -> bool: """ Check if diversity context should be injected based on the current selection. @@ -78,13 +76,6 @@ def should_inject_diversity(self, current_selection: tuple[int, ...] 
| None = No COMPLETE_ORDER = ("DataLoadSpec", "FeatureEng", "Model", "Ensemble", "Workflow") - def register_uncommitted_exp(self, exp: DSExperiment, loop_id: int): - self.uncommitted_experiments[loop_id] = exp - - def deregister_uncommitted_exp(self, loop_id: int): - if loop_id in self.uncommitted_experiments: - del self.uncommitted_experiments[loop_id] - def set_sota_exp_to_submit(self, exp: DSExperiment) -> None: self.sota_exp_to_submit = exp @@ -136,32 +127,6 @@ def get_sibling_exps(self, current_selection: tuple[int, ...] | None = None): sibling_exps.append(self.hist[idx][0]) return sibling_exps - def sync_dag_parent_and_hist( - self, - exp_and_fb: tuple[Experiment, ExperimentFeedback], - cur_loop_id: int, - ) -> None: - """ - Adding corresponding parent index to the dag_parent when the hist is going to be changed. - Should be called when the hist is changed. - """ - - if len(self.hist) == 0 or len(self.get_current_selection()) == 0: - # the node we are going to add is the first node of hist / root node of a new sub-trace - self.dag_parent.append(()) - - else: - current_node_idx = self.current_selection[0] - - if current_node_idx == -1: - # the current selection is the latest one - current_node_idx = len(self.hist) - 1 - - self.dag_parent.append((current_node_idx,)) - self.hist.append(exp_and_fb) - self.idx2loop_id[len(self.hist) - 1] = cur_loop_id - self.deregister_uncommitted_exp(cur_loop_id) - def retrieve_search_list( self, search_type: Literal["all", "ancestors"] = "ancestors",