diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index ce7dbe625..bafebf08e 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -156,6 +156,15 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): """Number of failures tolerated before escalating to next timeout level (stage width). Every 'patience' failures, timeout increases by 'runner_timeout_increase_stage'""" show_hard_limit: bool = True + #### hypothesis critique and rewrite + enable_hypo_critique_rewrite: bool = True + """Enable hypothesis critique and rewrite stages for improving hypothesis quality""" + enable_scale_check: bool = False + + #### mcp in coder + enable_context7: bool = True + """enable the use of context7 as mcp to search for relevant documents of current implementation errors""" + #### enable runner code change summary runner_enable_code_change_summary: bool = True diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py index f296d986f..1973c81d5 100644 --- a/rdagent/components/coder/data_science/pipeline/eval.py +++ b/rdagent/components/coder/data_science/pipeline/eval.py @@ -1,6 +1,7 @@ # tess successfully running. # (GPT) if it aligns with the spec & rationality of the spec. -import json +import asyncio +import concurrent.futures import re from dataclasses import dataclass from pathlib import Path @@ -20,6 +21,7 @@ from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env from rdagent.components.coder.data_science.share.notebook import NotebookConverter from rdagent.components.coder.data_science.utils import remove_eda_part +from rdagent.components.mcp import query_context7 from rdagent.core.experiment import FBWorkspace, Task from rdagent.log import rdagent_logger as logger from rdagent.scenarios.data_science.test_eval import get_test_eval @@ -76,7 +78,7 @@ def __str__(self) -> str: if self.error_message is not None: # Check if error_message contains Context7 documentation results - if "### API Documentation Reference:" in self.error_message: + if "### Relevant Documentation Reference:" in self.error_message: base_str += f"-------------------Error Analysis & Documentation Search Results ------------------\n{self.error_message}\n" else: base_str += f"-------------------Error Message------------------\n{self.error_message}\n" @@ -270,8 +272,8 @@ def evaluate( else: eda_output = implementation.file_dict.get("EDA.md", None) - # extract enable_mcp_documentation_search from data science configuration - enable_mcp_documentation_search = DS_RD_SETTING.enable_mcp_documentation_search + # extract enable_context7 from setting + enable_context7 = DS_RD_SETTING.enable_context7 queried_similar_successful_knowledge = ( queried_knowledge.task_to_similar_task_successful_knowledge[target_task.get_task_information()] @@ -282,7 +284,7 @@ def evaluate( system_prompt = T(".prompts:pipeline_eval.system").r( is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition), debug_mode=DS_RD_SETTING.sample_data_by_LLM, - enable_mcp_documentation_search=enable_mcp_documentation_search, + enable_context7=enable_context7, mle_check=DS_RD_SETTING.sample_data_by_LLM, queried_similar_successful_knowledge=queried_similar_successful_knowledge, ) @@ -303,33 +305,33 @@ def evaluate( init_kwargs_update_func=PipelineSingleFeedback.val_and_update_init_dict, ) - # judge whether we should perform documentation search - do_documentation_search = enable_mcp_documentation_search and 
wfb.requires_documentation_search
-
-        if do_documentation_search:
-            # Use MCPAgent for clean, user-friendly interface
+        if enable_context7 and wfb.requires_documentation_search is True:
             try:
-                # Create agent targeting Context7 service - model config comes from mcp_config.json
-                doc_agent = DocAgent()
-
-                # Synchronous query - perfect for evaluation context
-                if wfb.error_message:  # Type safety check
-                    context7_result = doc_agent.query(query=wfb.error_message)
-
-                    if context7_result:
-                        logger.info("Context7: Documentation search completed successfully")
-                        wfb.error_message += f"\n\n### API Documentation Reference:\nThe following API documentation was retrieved based on the error. This provides factual information about API changes or parameter specifications only:\n\n{context7_result}"
-                    else:
-                        logger.warning("Context7: Documentation search failed or no results found")
-                else:
-                    logger.warning("Context7: No error message to search for")
-            # TODO: confirm what exception will be raised when timeout
-            # except concurrent.futures.TimeoutError:
-            #     logger.error("Context7: Query timed out after 180 seconds")
+                def run_context7_sync():
+                    """Run the Context7 query in a fresh event loop."""
+                    # Create a new event loop to avoid conflicts with any loop already running
+                    new_loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(new_loop)
+                    try:
+                        return new_loop.run_until_complete(
+                            query_context7(error_message=wfb.error_message, full_code=implementation.all_codes)
+                        )
+                    finally:
+                        new_loop.close()
+
+                # Execute in a thread pool to avoid event loop conflicts
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(run_context7_sync)
+                    context7_result = future.result(timeout=120)  # 120s timeout leaves headroom for the retry mechanism
+
+                if context7_result:
+                    logger.info("Context7: Documentation search completed successfully")
+                    wfb.error_message += f"\n\n### Relevant Documentation Reference:\nThe following API documentation was retrieved based on the error. This provides factual information about API changes or parameter specifications only:\n\n{context7_result}"
+                else:
+                    logger.warning("Context7: Documentation search failed or no results found")
             except Exception as e:
-                error_msg = str(e) if str(e) else type(e).__name__
-                logger.error(f"Context7: Query failed - {error_msg}")
+                logger.error(f"Context7: Query failed - {str(e) or type(e).__name__}")

         if score_ret_code != 0 and wfb.final_decision is True:
             wfb.final_decision = False
diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml
index 55b13ce5b..fd7fef384 100644
--- a/rdagent/components/coder/data_science/pipeline/prompts.yaml
+++ b/rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -233,11 +233,12 @@ pipeline_eval:
       - Notes:
         - Model performance is not evaluated in this step; focus solely on successful execution.
         - Warnings are acceptable if they do not interfere with successful code execution.
+        - **Environment Constraint**: The coding environment is fixed and pre-configured. No package installation or modification is allowed. Code must use only existing pre-installed packages.
       - If the code executes successfully:
-        - Proceed to Step 2.
+        - Proceed to Step 2 and skip the remaining checks in Step 1.
       - If the code does not execute successfully:
        - Set the "final_decision" to false.
-        {% if enable_mcp_documentation_search %}
+        {% if enable_context7 %}
        - Given that the package/environment is fixed and unchangeable, first go through the code and the execution output. If the problem could be solved by looking up official documentation to confirm feature/API availability, compatible usage, or official alternatives in the fixed environment, set "requires_documentation_search" to true.
         {% endif %}
       - Write complete analysis in the "execution" field.
@@ -314,14 +315,14 @@ pipeline_eval:
     Please respond with your feedback in the following JSON format without anything else.
    ```json
    {
-      {% if enable_mcp_documentation_search %}
+      {% if enable_context7 %}
      "requires_documentation_search": <true/false>,
-      {% endif %}"execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
+      {% endif %}
+      "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
      "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
      "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
-      {% if enable_mcp_documentation_search %}
      "error_message": "If the code execution has problems, extract the error information in the following format, otherwise set to empty string: ### TRACEBACK: <full traceback> ### SUPPLEMENTARY_INFO: <supplementary information>",
-      {% endif %}"final_decision": <true/false>
+      "final_decision": <true/false>
    }
    ```
diff --git a/rdagent/components/mcp/__init__.py b/rdagent/components/mcp/__init__.py
new file mode 100644
index 000000000..8f627d33f
--- /dev/null
+++ b/rdagent/components/mcp/__init__.py
@@ -0,0 +1,8 @@
+"""MCP (Model Context Protocol) integration for RD-Agent.
+
+This module provides Context7 functionality for documentation search.
+"""
+
+from .context7 import query_context7
+
+__all__ = ["query_context7"]
diff --git a/rdagent/components/mcp/cache.py b/rdagent/components/mcp/cache.py
new file mode 100644
index 000000000..ec06f8a22
--- /dev/null
+++ b/rdagent/components/mcp/cache.py
@@ -0,0 +1,225 @@
+"""MCP cache management module.
+
+Provides general caching for MCP tools and query results.
+Reuses RD-Agent's existing SQLite cache system with a permanent (no-expiry) caching strategy.
+"""
+
+import hashlib
+from typing import Any, Dict, Optional
+
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.backend.base import SQliteLazyCache
+from rdagent.oai.llm_conf import LLM_SETTINGS
+
+
+class MCPCache:
+    """MCP cache manager built on the existing SQLite cache system.
+
+    Uses a permanent caching strategy, consistent with LITELLM.
+    """
+
+    def __init__(self):
+        """Initialize the cache manager.
+
+        Entries are cached permanently, without an expiration time.
+        """
+        self._cache = SQliteLazyCache(cache_location=LLM_SETTINGS.prompt_cache_path)
+        self._stats = {"tools_hits": 0, "tools_misses": 0, "query_hits": 0, "query_misses": 0}
+
+    def _get_cached_result(self, cache_key: str) -> Optional[str]:
+        """Get a result from the SQLite cache."""
+        return self._cache.chat_get(cache_key)
+
+    def _set_cached_result(self, cache_key: str, result: str):
+        """Store a result in the SQLite cache."""
+        self._cache.chat_set(cache_key, result)
+
+    def get_tools(self, mcp_url: str) -> Optional[Any]:
+        """Get cached tools.
+
+        Args:
+            mcp_url: MCP service URL
+
+        Returns:
+            Cached tools list, or None on a cache miss
+        """
+        # Tool objects are complex to serialize, so tool caching is not implemented yet.
+        self._stats["tools_misses"] += 1
+        logger.info(f"Tools cache miss for URL: {mcp_url} (tools caching disabled)")
+        return None
+
+    def set_tools(self, mcp_url: str, tools: Any):
+        """Set the tools cache.
+
+        Args:
+            mcp_url: MCP service URL
+            tools: Tools list to cache (currently unused)
+        """
+        # Tool objects are not cached yet because they contain complex objects that are difficult to serialize.
+        logger.info(f"Tools caching skipped for URL: {mcp_url} (complex objects)")
+
+    def get_query_result(self, error_message: str) -> Optional[str]:
+        """Get a cached query result.
+
+        Args:
+            error_message: Error message used as the cache key
+
+        Returns:
+            Cached query result, or None on a cache miss
+        """
+        cache_key = f"mcp_query:{hashlib.md5(error_message.encode('utf-8')).hexdigest()}"
+        cached_result = self._get_cached_result(cache_key)
+
+        if cached_result:
+            self._stats["query_hits"] += 1
+            logger.info(f"Query cache hit for key: {cache_key[-8:]}...")
+            return cached_result
+
+        self._stats["query_misses"] += 1
+        logger.info(f"Query cache miss for key: {cache_key[-8:]}...")
+        return None
+
+    def set_query_result(self, error_message: str, result: str):
+        """Cache a query result.
+
+        Args:
+            error_message: Error message used as the cache key
+            result: Query result
+        """
+        cache_key = f"mcp_query:{hashlib.md5(error_message.encode('utf-8')).hexdigest()}"
+        self._set_cached_result(cache_key, result)
+        logger.info(f"Query result cached for key: {cache_key[-8:]}...")
+
+    def clear_cache(self):
+        """Clear all MCP cache entries."""
+        cleared_count = 0
+
+        # Ideally this would delete every cache key with the "mcp_" prefix, but the
+        # SQLite interface does not expose key iteration, so the entries cannot be
+        # enumerated here; log a hint instead.
+        logger.warning("Clearing all MCP cache entries...")
+        logger.info(
+            "To completely clear the MCP cache, delete the SQLite cache file or use clear_mcp_cache_by_file()"
+        )
+
+        return cleared_count
+
+    def clear_query_cache(self, error_message: Optional[str] = None):
+        """Clear the query cache.
+
+        Args:
+            error_message: If specified, clear only the cache entry for this error message; otherwise nothing is removed and a hint is logged
+        """
+        if error_message:
+            # Clear the cache entry for a specific query.
+            cache_key = f"mcp_query:{hashlib.md5(error_message.encode('utf-8')).hexdigest()}"
+            # The SQLite cache exposes no delete operation, so overwrite the entry with
+            # an empty string; lookups treat the empty value as a cache miss.
+            self._set_cached_result(cache_key, "")
+            logger.info(f"Cleared cache for specific query: {cache_key[-8:]}...")
+        else:
+            logger.info("To clear all query cache entries, use clear_mcp_cache_by_file() or delete the cache file")
+
+    def get_cache_info(self):
+        """Get cache information."""
+        stats = self.get_cache_stats()
+        cache_file = getattr(self._cache, "cache_location", "unknown")
+
+        info = {"cache_file": cache_file, "stats": stats, "cache_type": "SQLite (shared with LITELLM)"}
+
+        logger.info(f"Cache info: {info}")
+        return info
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        total_tools = self._stats["tools_hits"] + self._stats["tools_misses"]
+        total_queries = self._stats["query_hits"] + self._stats["query_misses"]
+
+        return {
+            "tools_cache": {
+                "hits": self._stats["tools_hits"],
+                "misses": self._stats["tools_misses"],
+                "hit_rate": self._stats["tools_hits"] / max(total_tools, 1),
+                "size": "N/A (SQLite)",
+            },
+            "query_cache": {
+                "hits": self._stats["query_hits"],
+                "misses": self._stats["query_misses"],
+                "hit_rate": self._stats["query_hits"] / max(total_queries, 1),
+                "size": "N/A (SQLite)",
+            },
+        }
+
+    def log_cache_stats(self):
+        """Log cache statistics."""
+        stats = self.get_cache_stats()
+        logger.info(
+            f"Cache stats - Tools: {stats['tools_cache']['hits']}/{stats['tools_cache']['hits'] + stats['tools_cache']['misses']} hits "
+            f"({stats['tools_cache']['hit_rate']:.2%}), "
+            f"Queries: {stats['query_cache']['hits']}/{stats['query_cache']['hits'] + stats['query_cache']['misses']} hits "
+            f"({stats['query_cache']['hit_rate']:.2%})"
+        )
+
+
+# Global cache instance
+_global_cache: Optional[MCPCache] = None
+
+
+def get_mcp_cache() -> MCPCache:
+    """Get the global MCP cache instance.
+
+    Returns:
+        MCP cache instance (permanent cache)
+    """
+    global _global_cache
+    if _global_cache is None:
+        _global_cache = MCPCache()
+    return _global_cache
+
+
+def clear_mcp_cache_by_file():
+    """Clear all cache entries by deleting the SQLite cache file.
+
+    Note: This clears ALL cached entries, including the LITELLM cache!
+ """ + import os + + from rdagent.oai.llm_conf import LLM_SETTINGS + + cache_file = LLM_SETTINGS.prompt_cache_path + if os.path.exists(cache_file): + try: + os.remove(cache_file) + logger.info(f"Successfully deleted cache file: {cache_file}") + + # Reset global cache instance + global _global_cache + _global_cache = None + + return True + except Exception as e: + logger.error(f"Failed to delete cache file {cache_file}: {e}") + return False + else: + logger.info(f"Cache file does not exist: {cache_file}") + return True + + +def get_cache_file_info(): + """Get cache file information.""" + import os + + from rdagent.oai.llm_conf import LLM_SETTINGS + + cache_file = LLM_SETTINGS.prompt_cache_path + + if os.path.exists(cache_file): + stat = os.stat(cache_file) + size_mb = stat.st_size / (1024 * 1024) + + info = {"file_path": cache_file, "exists": True, "size_mb": round(size_mb, 2), "modified_time": stat.st_mtime} + else: + info = {"file_path": cache_file, "exists": False, "size_mb": 0, "modified_time": None} + + logger.info(f"Cache file info: {info}") + return info diff --git a/rdagent/components/mcp/context7.py b/rdagent/components/mcp/context7.py new file mode 100644 index 000000000..53dcaf17f --- /dev/null +++ b/rdagent/components/mcp/context7.py @@ -0,0 +1,198 @@ +"""Context7 MCP integration for documentation search. + +This module provides a simplified interface for querying documentation +using MCP (Model Context Protocol) tools. +""" + +import asyncio +import time +from typing import Optional + +from llama_index.core.agent.workflow import ReActAgent +from llama_index.core.workflow import Context +from llama_index.llms.openai import OpenAI +from llama_index.tools.mcp import aget_tools_from_mcp_url +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from rdagent.components.mcp.cache import get_mcp_cache +from rdagent.components.mcp.util import get_context7_settings +from rdagent.log import rdagent_logger as logger + + +@retry( + stop=stop_after_attempt(3), # Retry 2 times, total 3 attempts + wait=wait_exponential(multiplier=1, min=3, max=20), # Exponential backoff: 3s, 6s, 12s + retry=retry_if_exception_type( + ( + ConnectionError, + TimeoutError, + RuntimeError, + OSError, + ) + ), +) +async def _query_context7_with_retry( + error_message: str, full_code: Optional[str] = None, verbose: bool = False +) -> Optional[str]: + """Internal function with retry mechanism for Context7 queries. 
+
+    Args:
+        error_message: The error message or traceback to search for
+        full_code: Complete code context for better understanding (optional)
+        verbose: Enable verbose logging for the ReAct agent (default: False)
+
+    Returns:
+        Documentation search result as a string, or None if the search failed
+    """
+    # Load configuration using pydantic settings
+    settings = get_context7_settings()
+    # Initialize the cache when enabled in settings (permanent caching)
+    cache = get_mcp_cache() if settings.cache_enabled else None
+
+    # Check the query cache first
+    if cache:
+        cached_result = cache.get_query_result(error_message)
+        if cached_result:
+            logger.info("Returning cached query result")
+            cache.log_cache_stats()
+            return cached_result
+
+    # Record start time for execution timing
+    start_time = time.time()
+
+    # Try to get tools from the cache first
+    tools = cache.get_tools(settings.mcp_url) if cache else None
+
+    if tools is None:
+        # Cache miss or cache disabled; load tools from the MCP URL
+        tool_start_time = time.time()
+        tools = await aget_tools_from_mcp_url(settings.mcp_url)
+        tool_end_time = time.time()
+        logger.info(f"Context7 tool loading time: {tool_end_time - tool_start_time:.2f} seconds")
+
+        # Cache the tools for future use
+        if cache:
+            cache.set_tools(settings.mcp_url, tools)
+    else:
+        logger.info("Using cached tools, loading time: 0.00 seconds")
+
+    # Initialize the LLM with the OpenAI configuration
+    llm = OpenAI(model=settings.model, api_key=settings.api_key, api_base=settings.api_base)
+
+    # Create a ReAct agent with the loaded tools
+    agent = ReActAgent(tools=tools, llm=llm, verbose=verbose)
+    ctx = Context(agent)
+
+    # Record time for agent execution
+    agent_start_time = time.time()
+
+    # Construct the query from the error message and Context7 instructions
+    # TODO: find a reliable way to force the agent to use both Context7 tools
+    # TODO: extend to more APIs (currently only GPT models through llama_index)
+
+    # Special case for the timm library
+    timm_trigger = error_message.lower().count("timm") >= 3
+    if timm_trigger:
+        timm_trigger_text = "6.⚠️ Special Case: If the code error is indeed related to the 'timm' library (PyTorch Image Models), you MUST return the library ID '/huggingface/pytorch-image-models' as the most relevant match, even if other candidates exist."
+    else:
+        timm_trigger_text = ""
+    # End of the timm special case
+
+    # Build context information
+    context_info = ""
+    if full_code:
+        context_info += f"\n\nCURRENT CODE CONTEXT:\n```python\n{full_code}\n```\n"
+
+    query = f"""ERROR MESSAGE:
+{error_message}
+{context_info}
+
+IMPORTANT INSTRUCTIONS:
+1. ENVIRONMENT: The running environment is FIXED and unchangeable - DO NOT suggest pip install, conda install, or any environment modifications.
+
+2. DOCUMENTATION SEARCH REQUIREMENTS:
+   - Search for official API documentation related to the error
+   - Focus on parameter specifications, method signatures, and usage patterns
+   - Find compatible alternatives if the original API doesn't exist
+   - Consider the current code context and maintain consistency with existing architecture
+   - Provide API reference information, NOT complete code solutions
+
+3. RESPONSE FORMAT:
+   - Start with a brief explanation of the root cause
+   - Provide relevant API documentation excerpts
+   - List available parameters and their descriptions
+   - Show method signatures and basic usage patterns
+   - If multiple API options exist, document all viable alternatives
+
+4.
STRICT CONSTRAINTS: + - DO NOT provide complete working code replacements + - DO NOT suggest hardware configuration changes (CPU/GPU) + - DO NOT recommend architecture or framework changes + - DO NOT provide performance optimization suggestions + - ONLY provide API documentation and parameter information + +5. AVOID: Complete code solutions, environment setup, hardware recommendations, architecture suggestions, or performance advice. + +{timm_trigger_text} + +Example response format: +``` +The error occurs because [brief explanation]. + +API Documentation: +- Method: library.function_name(param1, param2, ...) +- Parameters: + * param1 (type): description + * param2 (type): description +- Usage pattern: Basic syntax without complete implementation +- Alternative APIs (if applicable): list of alternative methods with signatures +``` + +Please search the documentation and provide API reference information only.""" + + # Execute agent query + response = await agent.run(query, ctx=ctx) + + agent_end_time = time.time() + logger.info(f"Context7 agent execution time: {agent_end_time - agent_start_time:.2f} seconds") + + # Calculate and display total execution time + end_time = time.time() + total_time = end_time - start_time + logger.info(f"Context7 total execution time: {total_time:.2f} seconds") + logger.info(f"Context7 execution completed at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}") + + result = str(response) + + # Cache the query result + if cache: + cache.set_query_result(error_message, result) + cache.log_cache_stats() + + return result + + +async def query_context7(error_message: str, full_code: Optional[str] = None, verbose: bool = False) -> Optional[str]: + """Query context7 documentation for error resolution with retry mechanism. 
+ + Args: + error_message: The error message or traceback to search for + full_code: Complete code context for better understanding (optional) + verbose: Enable verbose logging for ReAct agent (default: False) + Returns: + Documentation search result as string, or None if failed + """ + try: + return await _query_context7_with_retry(error_message, full_code, verbose) + except (ConnectionError, TimeoutError, RuntimeError, OSError) as e: + # These are retryable errors, but retries have failed + logger.error(f"Context7 query failed after retries due to {type(e).__name__}: {str(e)}") + return None + except Exception as e: + # Other non-retryable errors (e.g., configuration errors, authentication failures) + logger.error(f"Context7 query failed due to non-retryable error {type(e).__name__}: {str(e)}") + return None diff --git a/rdagent/components/mcp/example.py b/rdagent/components/mcp/example.py new file mode 100644 index 000000000..d8c33d20d --- /dev/null +++ b/rdagent/components/mcp/example.py @@ -0,0 +1,452 @@ +"""Example usage of Context7 MCP integration.""" + +import asyncio + +from context7 import query_context7 + + +async def main(): + """Main function for testing context7 functionality.""" + error_msg = """### TRACEBACK: Traceback (most recent call last): +File "/opt/conda/envs/kaggle/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial +value_or_values = func(trial) +^^^^^^^^^^^ +File "/workspace/RD-Agent/git_ignore_folder/RD-Agent_workspace/3ee7771734bb4b04af50f76e8c0e5ed7/main.py", line 226, in +study.optimize(lambda trial: lgb_optuna_objective(trial, X_sub, y_sub, num_class, debug), +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +File "/workspace/RD-Agent/git_ignore_folder/RD-Agent_workspace/3ee7771734bb4b04af50f76e8c0e5ed7/main.py", line 201, in lgb_optuna_objective +gbm = lgb.train( +^^^^^^^^^^ +File "/opt/conda/envs/kaggle/lib/python3.11/site-packages/lightgbm/engine.py", line 282, in train +booster = Booster(params=params, train_set=train_set) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +File "/opt/conda/envs/kaggle/lib/python3.11/site-packages/lightgbm/basic.py", line 3641, in __init__ +_safe_call( +File "/opt/conda/envs/kaggle/lib/python3.11/site-packages/lightgbm/basic.py", line 296, in _safe_call +raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8")) +lightgbm.basic.LightGBMError: No OpenCL device found +### SUPPLEMENTARY_INFO: lgb.train called with device=gpu, gpu_platform_id=0, gpu_device_id=0.""" + full_code = """ + import os +import sys +import time +import numpy as np +import pandas as pd +import argparse + +import lightgbm as lgb +import optuna + +from sklearn.model_selection import train_test_split, StratifiedKFold +from sklearn.metrics import accuracy_score + +def print_eda(train, test): + eda = [] + eda.append("=== Start of EDA part ===") + eda.append("# Initial Data Assessment & Sanitization") + + eda.append(f"Train shape: {train.shape}") + eda.append(f"Test shape: {test.shape}") + + eda.append("\nFirst 5 rows of train:") + eda.append(str(train.head().to_string())) + + eda.append("\nFirst 5 rows of test:") + eda.append(str(test.head().to_string())) + + eda.append("\n# Data Types per column (train):") + eda.append(str(train.dtypes.value_counts())) + eda.append("\n# Data Types per column (test):") + eda.append(str(test.dtypes.value_counts())) + + eda.append("\nMissing values per column (train):") + eda.append(str(train.isnull().sum().sort_values(ascending=False).head(10))) + eda.append("\nMissing values per column 
(test):") + eda.append(str(test.isnull().sum().sort_values(ascending=False).head(10))) + + eda.append("\nUnique values per column (train):") + uniques = train.nunique().sort_values(ascending=False) + eda.append(str(uniques.head(15))) + eda.append("\nUnique values per column (test):") + uniques_test = test.nunique().sort_values(ascending=False) + eda.append(str(uniques_test.head(15))) + + if "Cover_Type" in train.columns: + eda.append("\nTarget variable distribution (Cover_Type):") + eda.append(str(train['Cover_Type'].value_counts().sort_index())) + + num_cols = [col for col in train.columns if train[col].dtype in ['int32', 'int64', 'float32', 'float64']] + if len(num_cols) > 20: + num_cols = [c for c in num_cols if "Soil" not in c and "Wilder" not in c and c not in ["Id", "Cover_Type"]] + eda.append('\n# Numerical column summary (central tendency, spread, potential outliers):') + eda.append(str(train[num_cols].describe().transpose().head(10))) + + bin_cols = [col for col in train.columns if train[col].nunique(dropna=False) == 2 and col not in ["Cover_Type"]] + eda.append("\n# Number of binary indicator features in train: {}".format(len(bin_cols))) + eda.append("Sample binary indicator columns: {}".format(bin_cols[:10])) + + eda.append("=== End of EDA part ===") + print('\n'.join(eda[:10000])) # truncate if very long + +def get_numeric_int32_cols(df, exclude=[]): + cols = [] + for col in df.columns: + if col in exclude: + continue + if pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]): + cols.append(col) + return cols + +def engineer_features(df): + df = df.copy() + pi = np.pi + df['Aspect_rad'] = (df['Aspect'] * pi / 180).astype(np.float32) + df['Aspect_sin'] = np.sin(df['Aspect_rad']).astype(np.float32) + df['Aspect_cos'] = np.cos(df['Aspect_rad']).astype(np.float32) + df['Elevation_minus_VertHydro'] = (df['Elevation'] - df['Vertical_Distance_To_Hydrology']).astype(np.float32) + den = (df['Horizontal_Distance_To_Hydrology'] + 1).astype(np.float32) + ratio = df['Elevation'] / den + ratio = ratio.replace([np.inf, -np.inf], 0.0) + ratio = ratio.fillna(0).astype(np.float32) + df['Elevation_Hydro_Ratio'] = ratio + df['Slope_Hillshade'] = (df['Slope'] * df['Hillshade_Noon']).astype(np.float32) + return df + +def stratified_subsample(X, y, frac, seed, n_class): + n_total = len(y) + n_sub = int(n_total * frac) + value_counts = y.value_counts() + min_per_class = value_counts.min() + # Always guarantee at least 2 samples per class (for Sklearn stratify) + if n_sub < n_class * 2: + n_sub = max(n_class * 2, n_sub) + if min_per_class < 2 or min_per_class*n_class < n_sub: + print(f"[WARNING] Subsample size ({n_sub}) or class frequency is too small, using full data for Optuna in this run.") + return X, y + for c in range(n_class): + if (y == c).sum() < 2: + print(f"[WARNING] Class {c} has <2 samples, using full data for Optuna.") + return X, y + X_sub, _, y_sub, _ = train_test_split( + X, y, train_size=n_sub, stratify=y, random_state=seed, shuffle=True, + ) + return X_sub, y_sub + +def compute_class_scaling(val_pred_proba, val_labels): + from scipy.optimize import minimize + n_class = val_pred_proba.shape[1] + y_true = val_labels + def custom_acc(scaling): + arr = val_pred_proba * scaling.reshape((1, -1)) + pred = np.argmax(arr, axis=1) + return -accuracy_score(y_true, pred) + scaling0 = np.ones(n_class, dtype=np.float32) + bounds = [(0.8, 1.2) for _ in range(n_class)] + result = minimize(custom_acc, scaling0, method='L-BFGS-B', bounds=bounds) + best_scaling = result.x + 
acc_scaled = -result.fun + acc_base = accuracy_score(y_true, np.argmax(val_pred_proba, axis=1)) + if acc_scaled - acc_base >= 0.001: + return best_scaling + return None + +def apply_scaling(pred_proba, scaling_factors): + if scaling_factors is None: + return pred_proba + return pred_proba * scaling_factors.reshape((1, -1)) + +def build_lgb_dataset(X, y=None, categorical_feature='auto', free_raw_data=True): + if y is not None: + return lgb.Dataset( + data=X, + label=y, + free_raw_data=free_raw_data, + categorical_feature=categorical_feature + ) + else: + return lgb.Dataset( + data=X, + free_raw_data=free_raw_data, + categorical_feature=categorical_feature + ) + +def get_dtype_map_train(train, exclude=[]): + dt = {} + for col in train.columns: + if col in exclude: + continue + if pd.api.types.is_integer_dtype(train[col]) or pd.api.types.is_float_dtype(train[col]): + dt[col] = np.int32 + return dt + +def get_dtype_map_test(test, exclude=[]): + return get_dtype_map_train(test, exclude=exclude) + +def free_memory(objs): + import gc + for x in objs: + del x + gc.collect() + +def lgb_optuna_objective(trial, X_sub, y_sub, num_class, debug): + param = { + 'objective': 'multiclass', + 'num_class': num_class, + 'metric': 'multi_logloss', + 'boosting_type': 'gbdt', + 'verbosity': -1, + 'device': 'gpu', + 'gpu_platform_id': 0, + 'gpu_device_id': 0, + 'random_state': 42, + } + param['learning_rate'] = trial.suggest_float('learning_rate', 0.03, 0.21, log=True) + max_depth = trial.suggest_int('max_depth', 5, 14) + param['max_depth'] = max_depth + param['num_leaves'] = trial.suggest_int("num_leaves", 2 ** (max_depth - 1), 2 ** max_depth - 1) + param['min_child_samples'] = trial.suggest_int('min_child_samples', 10, 120) + param['min_child_weight'] = trial.suggest_float('min_child_weight', 1e-3, 100.0, log=True) + param['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-3, 5.0, log=True) + param['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-3, 5.0, log=True) + param['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1.0) + param['subsample'] = trial.suggest_float('subsample', 0.5, 1.0) + param['subsample_freq'] = 1 + param['n_jobs'] = 4 + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + val_scores = [] + # Setup LightGBM callbacks for log-eval and early-stopping + if debug: + num_boost_round = 10 + early_stopping_rounds = 5 + else: + num_boost_round = 200 + early_stopping_rounds = 100 + callbacks = [lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(50)] + for train_idx, val_idx in skf.split(X_sub, y_sub): + X_train, X_val = X_sub.iloc[train_idx, :], X_sub.iloc[val_idx, :] + y_train, y_val = y_sub.iloc[train_idx], y_sub.iloc[val_idx] + lgb_train = build_lgb_dataset(X_train, y_train) + lgb_val = build_lgb_dataset(X_val, y_val) + gbm = lgb.train( + param, + lgb_train, + valid_sets=[lgb_train, lgb_val], + valid_names=['train', 'valid'], + num_boost_round=num_boost_round, + callbacks=callbacks + ) + val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration) + val_pred_labels = np.argmax(val_pred, axis=1) + acc = accuracy_score(y_val, val_pred_labels) + val_scores.append(acc) + mean_acc = np.mean(val_scores) + trial.set_user_attr('mean_accuracy', mean_acc) + print(f"[Optuna trial] Params: {param} --> mean valid accuracy: {mean_acc:.5f}") + return mean_acc + +def run_optuna_search(X_sub, y_sub, num_class, debug, timeout_min=60, n_trials=100): + sampler = optuna.samplers.TPESampler(seed=42) + study = optuna.create_study(direction="maximize", sampler=sampler) 
+ if debug: + timeout = 120 + n_trials = 10 + else: + timeout = int(timeout_min * 60) + study.optimize(lambda trial: lgb_optuna_objective(trial, X_sub, y_sub, num_class, debug), + timeout=timeout, n_trials=n_trials, show_progress_bar=False) + trials = sorted([t for t in study.trials if t.value is not None], key=lambda t: t.value, reverse=True) + unique_param_sets = [] + seen_strs = set() + for t in trials: + param_str = str({k: t.params[k] for k in sorted(t.params.keys())}) + if param_str not in seen_strs: + unique_param_sets.append((t.value, t.params)) + seen_strs.add(param_str) + if len(unique_param_sets) >= 3: + break + print('Top 3 hyperparameter sets:') + for i, (score, params) in enumerate(unique_param_sets, 1): + print(f" Rank {i}: Accuracy={score:.5f} | Params={params}") + return [p for _, p in unique_param_sets] + +def train_full_and_select(train_features, train_labels, num_class, best_param_list, debug): + results = [] + for i, params in enumerate(best_param_list): + X, y = train_features, train_labels + X_tr, X_val, y_tr, y_val = train_test_split( + X, y, test_size=0.1, random_state=42, stratify=y + ) + lgb_train = build_lgb_dataset(X_tr, y_tr) + lgb_val = build_lgb_dataset(X_val, y_val) + full_params = dict(params) + full_params.update({ + 'objective': 'multiclass', + 'num_class': num_class, + 'metric': 'multi_logloss', + 'boosting_type': 'gbdt', + 'verbosity': -1, + 'device': 'gpu', + 'gpu_platform_id': 0, + 'gpu_device_id': 0, + 'random_state': 42, + }) + if debug: + num_boost_round = 10 + early_stopping_rounds = 5 + else: + num_boost_round = 1000 + early_stopping_rounds = 200 + callbacks = [lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(50)] + print(f"[Full Training] Model {i+1}, start, params: {full_params}") + bst = lgb.train( + full_params, + lgb_train, + valid_sets=[lgb_train, lgb_val], + valid_names=['train', 'valid'], + num_boost_round=num_boost_round, + callbacks=callbacks + ) + val_pred_proba = bst.predict(X_val, num_iteration=bst.best_iteration) + val_pred_labels = np.argmax(val_pred_proba, axis=1) + acc = accuracy_score(y_val, val_pred_labels) + scaling_factors = compute_class_scaling(val_pred_proba, y_val) + if scaling_factors is not None: + scaled_proba = apply_scaling(val_pred_proba, scaling_factors) + acc_scaled = accuracy_score(y_val, np.argmax(scaled_proba, axis=1)) + print(f"[Full {i+1}] val_acc_raw={acc:.5f} val_acc_scaled={acc_scaled:.5f}") + else: + acc_scaled = None + print(f"[Full {i+1}] val_acc_raw={acc:.5f} no scaling improvement") + results.append({ + "model": bst, + "val_acc_raw": acc, + "val_acc_scaled": acc_scaled, + "scaling_factors": scaling_factors, + "val_pred_proba": val_pred_proba, + "val_labels": y_val, + "params": full_params, + }) + best_idx = 0 + best_acc = -1 + for i, row in enumerate(results): + acc_this = row["val_acc_scaled"] if row["val_acc_scaled"] is not None else row["val_acc_raw"] + if acc_this > best_acc: + best_acc = acc_this + best_idx = i + print("Selecting Model", best_idx+1, "as final_model") + final_row = results[best_idx] + return final_row, results + +def main(): + import warnings + warnings.filterwarnings("ignore") + parser = argparse.ArgumentParser() + parser.add_argument('--debug', action='store_true', help='Run in debug mode') + args = parser.parse_args() + DEBUG = args.debug + + dpath = './workspace_input/' + print("[INFO] Loading data...") + + train_sample = pd.read_csv(os.path.join(dpath, 'train.csv'), nrows=500) + test_sample = pd.read_csv(os.path.join(dpath, 'test.csv'), nrows=500) + 
numeric_cols_train = get_numeric_int32_cols(train_sample, exclude=[]) + dtype_map_train = {c: np.int32 for c in numeric_cols_train if c not in ["Id", "Cover_Type"] + []} + numeric_cols_test = get_numeric_int32_cols(test_sample, exclude=[]) + dtype_map_test = {c: np.int32 for c in numeric_cols_test if c not in ["Id"] + []} + + try: + train = pd.read_csv( + os.path.join(dpath, 'train.csv'), + dtype=dtype_map_train, + low_memory=False + ) + test = pd.read_csv( + os.path.join(dpath, 'test.csv'), + dtype=dtype_map_test, + low_memory=False + ) + except Exception as e: + print("Error loading files:", e) + sys.exit(1) + + print_eda(train, test) + + print("[INFO] Engineering features ...") + train = engineer_features(train) + test = engineer_features(test) + + y = train['Cover_Type'] - 1 + X = train.drop(columns=['Cover_Type']) + X_test = test.copy() + + n_class = 7 + if DEBUG: + optuna_frac = 0.1 + else: + optuna_frac = 0.20 + print(f"[INFO] Creating stratified Optuna sample: fraction={optuna_frac}") + X_sub, y_sub = stratified_subsample(X, y, frac=optuna_frac, seed=42, n_class=n_class) + print(f"[INFO] Optuna subsample size: {len(X_sub)}") + + print("[INFO] Running Optuna search for LightGBM ...") + optuna_best_params_list = run_optuna_search(X_sub, y_sub, n_class, DEBUG, + timeout_min=2 if DEBUG else 60, + n_trials=10 if DEBUG else 100) + + print("[INFO] Full training ...") + start_time = time.time() + best_row, all_model_results = train_full_and_select(X, y, n_class, optuna_best_params_list, DEBUG) + end_time = time.time() + debug_time = end_time - start_time + + if DEBUG: + if len(X_sub) == len(X): + scale = (1000 / 10) + else: + scale = (1/optuna_frac) * (1000 / 10) + est = scale * debug_time + print("=== Start of Debug Information ===") + print(f"debug_time: {debug_time:.2f}") + print(f"estimated_time: {est:.2f}") + print("=== End of Debug Information ===") + + final_model = best_row["model"] + scaling_factors = best_row["scaling_factors"] + + print("[INFO] Inference and Submission ...") + test_pred_proba = final_model.predict(X_test, num_iteration=final_model.best_iteration) + if scaling_factors is not None: + test_pred_proba = apply_scaling(test_pred_proba, scaling_factors) + test_pred_label = np.argmax(test_pred_proba, axis=1) + 1 + submission = pd.DataFrame({'Id': X_test['Id'], 'Cover_Type': test_pred_label.astype(np.int32)}) + submission = submission[['Id', 'Cover_Type']] + submission.to_csv('submission.csv', index=False) + print("[INFO] Saved submission.csv") + + ind = ["model_1", "ensemble"] + accs = [] + val_acc = best_row["val_acc_scaled"] if best_row["val_acc_scaled"] is not None else best_row["val_acc_raw"] + accs.append(val_acc) + accs.append(val_acc) + df_scores = pd.DataFrame({'Accuracy': accs}, index=ind) + df_scores.index.name = "Model" + df_scores.to_csv("scores.csv") + print("[INFO] Saved scores.csv") + + print("[COMPLETE]") + +if __name__ == '__main__': + main() + """ + # Normal usage (verbose=False by default) + result = await query_context7(error_message=error_msg, full_code=full_code, verbose=True) + print("Result:", result) + + # Debug usage with verbose output + # result = await query_context7(error_msg, verbose=True) + # print("Debug Result:", result) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/rdagent/components/mcp/util.py b/rdagent/components/mcp/util.py new file mode 100644 index 000000000..0013f1a05 --- /dev/null +++ b/rdagent/components/mcp/util.py @@ -0,0 +1,51 @@ +"""Context7 MCP configuration using pydantic BaseSettings. 
+ +This module provides clean configuration management for Context7 MCP integration. +""" + +import os +from typing import Optional + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Context7Settings(BaseSettings): + """Context7 MCP configuration settings. + + All settings can be configured via environment variables with CONTEXT7_ prefix. + """ + + # MCP service configuration + mcp_url: str = Field(default="http://localhost:8123/mcp", description="MCP service URL for Context7") + + # LLM configuration + model: str = Field(default="gpt-4-turbo", description="LLM model name") + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + api_base: Optional[str] = Field(default=None, description="OpenAI API base URL") + + # Cache configuration + cache_enabled: bool = Field(default=False, description="Enable MCP caching (permanent cache)") + + model_config = SettingsConfigDict( + env_prefix="CONTEXT7_", + extra="ignore", + ) + + def model_post_init(self, __context): + """Post-initialization fallback to common environment variables.""" + # Simple environment variable fallback mechanism + if self.api_key is None: + self.api_key = os.getenv("OPENAI_API_KEY") + + if self.api_base is None: + self.api_base = os.getenv("OPENAI_API_BASE") + + +# Global configuration instance +context7_settings = Context7Settings() + + +def get_context7_settings() -> Context7Settings: + """Get the global Context7 settings instance.""" + return context7_settings diff --git a/rdagent/log/ui/ds_trace.py b/rdagent/log/ui/ds_trace.py index 6ed43f877..2f7cc67ac 100644 --- a/rdagent/log/ui/ds_trace.py +++ b/rdagent/log/ui/ds_trace.py @@ -14,6 +14,9 @@ from streamlit import session_state as state from rdagent.app.data_science.loop import DataScienceRDLoop + +# Import necessary classes for the response format +from rdagent.components.coder.data_science.pipeline.eval import DSCoderFeedback from rdagent.log.storage import FileStorage from rdagent.log.ui.conf import UI_SETTING from rdagent.log.ui.utils import ( @@ -32,8 +35,6 @@ ) from rdagent.oai.backend.litellm import LITELLM_SETTINGS from rdagent.oai.llm_utils import APIBackend - -# Import necessary classes for the response format from rdagent.scenarios.data_science.proposal.exp_gen.proposal import ( CodingSketch, HypothesisList, diff --git a/requirements.txt b/requirements.txt index adf39826c..62925537d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,6 +68,9 @@ mlflow azureml-mlflow types-pytz +# mcp +llama-index +llama-index-tools-mcp # Agent pydantic-ai-slim[mcp,openai] nest-asyncio
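
Reviewer note: the synchronous calling pattern that `eval.py` uses around the async `query_context7` can be exercised in isolation. Below is a minimal sketch of that pattern; it assumes a Context7 MCP server reachable via `CONTEXT7_MCP_URL` and OpenAI credentials in the environment, and the `lookup_docs` wrapper name is illustrative, not part of this PR.

```python
import asyncio
import concurrent.futures
from typing import Optional

from rdagent.components.mcp import query_context7


def lookup_docs(error_message: str, full_code: Optional[str] = None) -> Optional[str]:
    """Call the async query_context7 from synchronous code, mirroring eval.py."""

    def _run() -> Optional[str]:
        # Run the coroutine in a fresh event loop so it never clashes with a loop
        # that may already be running in the caller's thread.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(
                query_context7(error_message=error_message, full_code=full_code)
            )
        finally:
            loop.close()

    # A single-worker thread pool isolates the new loop from the current thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(_run).result(timeout=120)  # same 120s budget as eval.py


if __name__ == "__main__":
    docs = lookup_docs("lightgbm.basic.LightGBMError: No OpenCL device found")
    print(docs or "No documentation returned")
```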