Commit a54b1c9

Metrics migration, migrate rouge + answer relevance (#2335)
Parent: ef7892c

10 files changed: +963 -0 lines changed
Lines changed: 11 additions & 0 deletions

"""Collections of metrics using modern component architecture."""

from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
from ragas.metrics.collections._rouge_score import RougeScore
from ragas.metrics.collections.base import BaseMetric

__all__ = [
    "AnswerRelevancy",  # Class-based answer relevancy
    "RougeScore",  # Class-based rouge score
    "BaseMetric",  # Base class for creating new v2 metrics
]
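
A minimal sketch of the resulting public surface, assuming a ragas build that includes this commit: both concrete metrics are importable from ragas.metrics.collections and share the BaseMetric base class, and RougeScore can be constructed without any LLM or embedding components.

# Sketch only - assumes a ragas install that contains this commit.
from ragas.metrics.collections import AnswerRelevancy, BaseMetric, RougeScore

# RougeScore needs no LLM or embeddings, so it can be constructed directly.
rouge = RougeScore(rouge_type="rougeL", mode="fmeasure")

print(isinstance(rouge, BaseMetric))            # True
print(issubclass(AnswerRelevancy, BaseMetric))  # True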
Lines changed: 135 additions & 0 deletions

"""Answer Relevancy metric v2 - Class-based implementation with modern components."""

import typing as t

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_relevance import answer_relevancy_prompt

if t.TYPE_CHECKING:
    from ragas.embeddings.base import BaseRagasEmbedding
    from ragas.llms.base import InstructorBaseRagasLLM


class AnswerRelevanceOutput(BaseModel):
    """Structured output for answer relevance question generation."""

    question: str
    noncommittal: int


class AnswerRelevancy(BaseMetric):
    """
    Evaluate answer relevancy by generating questions from the response and comparing them to the original question.

    This implementation uses modern instructor LLMs with structured output and modern embeddings.
    Only modern components are supported; legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.embeddings.base import embedding_factory
        >>> from ragas.metrics.collections import AnswerRelevancy
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>> embeddings = embedding_factory("openai", model="text-embedding-ada-002", client=client, interface="modern")
        >>>
        >>> # Create metric instance
        >>> metric = AnswerRelevancy(llm=llm, embeddings=embeddings, strictness=3)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is the capital of France?",
        ...     response="Paris is the capital of France."
        ... )
        >>> print(f"Score: {result.value}")
        >>>
        >>> # Batch evaluation
        >>> results = await metric.abatch_score([
        ...     {"user_input": "Q1", "response": "A1"},
        ...     {"user_input": "Q2", "response": "A2"},
        ... ])

    Attributes:
        llm: Modern instructor-based LLM for question generation
        embeddings: Modern embeddings model with embed_text() and embed_texts() methods
        name: The metric name
        strictness: Number of questions to generate per answer (3-5 recommended)
        allowed_values: Score range (0.0 to 1.0)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"
    embeddings: "BaseRagasEmbedding"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        embeddings: "BaseRagasEmbedding",
        name: str = "answer_relevancy",
        strictness: int = 3,
        **kwargs,
    ):
        """Initialize AnswerRelevancy metric with required components."""
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.embeddings = embeddings
        self.strictness = strictness

        # Call super() for validation (without passing llm/embeddings in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(self, user_input: str, response: str) -> MetricResult:
        """
        Calculate the answer relevancy score asynchronously.

        Components are guaranteed to be validated and non-None by the base class.

        Args:
            user_input: The original question
            response: The response to evaluate

        Returns:
            MetricResult with relevancy score (0.0-1.0)
        """
        prompt = answer_relevancy_prompt(response)

        generated_questions = []
        noncommittal_flags = []

        for _ in range(self.strictness):
            result = await self.llm.agenerate(prompt, AnswerRelevanceOutput)

            if result.question:
                generated_questions.append(result.question)
                noncommittal_flags.append(result.noncommittal)

        if not generated_questions:
            return MetricResult(value=0.0)

        all_noncommittal = np.all(noncommittal_flags)

        question_vec = np.asarray(self.embeddings.embed_text(user_input)).reshape(1, -1)
        gen_question_vec = np.asarray(
            self.embeddings.embed_texts(generated_questions)
        ).reshape(len(generated_questions), -1)

        norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(
            question_vec, axis=1
        )
        cosine_sim = (
            np.dot(gen_question_vec, question_vec.T).reshape(
                -1,
            )
            / norm
        )

        score = cosine_sim.mean() * int(not all_noncommittal)

        return MetricResult(value=float(score))
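
The final score in ascore() is plain vector arithmetic: the mean cosine similarity between the generated-question embeddings and the original-question embedding, zeroed out when every generation was flagged noncommittal. A self-contained sketch of just that step, with made-up 3-dimensional vectors standing in for real embedding output:

# Toy illustration of the scoring math; the vectors and flags are invented.
import numpy as np

question_vec = np.asarray([0.1, 0.9, 0.2]).reshape(1, -1)   # original question embedding
gen_question_vec = np.asarray([                              # embeddings of generated questions
    [0.1, 0.8, 0.3],
    [0.2, 0.9, 0.1],
    [0.0, 1.0, 0.2],
])
noncommittal_flags = [0, 0, 0]                               # 1 = evasive/noncommittal answer

norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1)
cosine_sim = np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm

all_noncommittal = np.all(noncommittal_flags)
score = cosine_sim.mean() * int(not all_noncommittal)
print(float(score))  # close to 1.0 for near-identical questions; 0.0 if every flag is set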
Lines changed: 85 additions & 0 deletions

"""Rouge Score metric v2 - Class-based implementation with automatic validation."""

import typing as t

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult


class RougeScore(BaseMetric):
    """
    Calculate ROUGE score between reference and response texts.

    This implementation provides automatic validation and pure async design
    without requiring LLM or embedding components.

    Usage:
        >>> from ragas.metrics.collections import RougeScore
        >>>
        >>> # Create metric instance (no LLM/embeddings needed)
        >>> metric = RougeScore(rouge_type="rougeL", mode="fmeasure")
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference="The capital of France is Paris.",
        ...     response="Paris is the capital of France."
        ... )
        >>> print(f"Score: {result.value}")
        >>>
        >>> # Batch evaluation
        >>> results = await metric.abatch_score([
        ...     {"reference": "Text 1", "response": "Response 1"},
        ...     {"reference": "Text 2", "response": "Response 2"},
        ... ])

    Attributes:
        name: The metric name
        rouge_type: Type of ROUGE metric ("rouge1" for unigrams, "rougeL" for LCS)
        mode: Scoring mode ("fmeasure", "precision", or "recall")
        allowed_values: Score range (0.0 to 1.0)

    Note: This metric doesn't define llm or embeddings fields, so no validation is performed.
    """

    def __init__(
        self,
        name: str = "rouge_score",
        rouge_type: t.Literal["rouge1", "rougeL"] = "rougeL",
        mode: t.Literal["fmeasure", "precision", "recall"] = "fmeasure",
        **kwargs,
    ):
        """Initialize RougeScore metric."""
        super().__init__(name=name, **kwargs)
        self.rouge_type = rouge_type
        self.mode = mode

    async def ascore(
        self,
        reference: str,
        response: str,
    ) -> MetricResult:
        """
        Calculate ROUGE score asynchronously.

        Args:
            reference: The reference/ground truth text
            response: The response text to evaluate

        Returns:
            MetricResult with ROUGE score (0.0-1.0)
        """
        # Import and check dependencies
        try:
            from rouge_score import rouge_scorer
        except ImportError:
            raise ImportError(
                "rouge_score is required for ROUGE score calculation. "
                "Please install it using `pip install rouge_score`"
            )

        # Calculate ROUGE score
        scorer = rouge_scorer.RougeScorer([self.rouge_type], use_stemmer=True)
        scores = scorer.score(reference, response)
        score_value = getattr(scores[self.rouge_type], self.mode)

        return MetricResult(value=float(score_value))
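
The heavy lifting is delegated to the rouge_score package. A small sketch of the same call path outside the class, assuming rouge_score is installed, showing how rouge_type and mode select a single number from the returned Score tuple:

# Sketch of the underlying rouge_score call (assumes `pip install rouge_score`).
from rouge_score import rouge_scorer

rouge_type, mode = "rougeL", "fmeasure"            # same defaults as RougeScore
scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True)

# score(target, prediction): the reference plays the target, the response the prediction.
scores = scorer.score(
    "The capital of France is Paris.",             # reference
    "Paris is the capital of France.",             # response
)

# Each entry is a Score tuple with precision, recall and fmeasure fields;
# `mode` picks one of them, exactly as ascore() does via getattr().
print(getattr(scores[rouge_type], mode))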
Lines changed: 131 additions & 0 deletions

"""Base class for collections metrics with modern component validation."""

import asyncio
import typing as t

from ragas.embeddings.base import BaseRagasEmbedding
from ragas.llms.base import InstructorBaseRagasLLM
from ragas.metrics.base import SimpleBaseMetric
from ragas.metrics.result import MetricResult
from ragas.metrics.validators import NumericValidator


class BaseMetric(SimpleBaseMetric, NumericValidator):
    """
    Base class for collections metrics with modern component validation.

    This class inherits from SimpleBaseMetric and NumericValidator to provide:
    - All the base metric functionality (ascore, abatch_score, score, batch_score)
    - Numeric validation with configurable ranges
    - Modern LLM and embedding component validation (when defined by a subclass)
    - Rejection of legacy wrappers with helpful error messages
    - Consistent error handling and type safety

    Attributes:
        name: The metric name
        allowed_values: Score range for numeric validation (tuple of min, max)

    Note: Subclasses define llm and/or embeddings fields only if they need them.
    The base classes handle all the core metric functionality; this class just adds modern component validation.
    """

    def __init__(
        self,
        name: str = "base_metric",
        allowed_values: t.Tuple[float, float] = (0.0, 1.0),
        **kwargs,
    ):
        """Initialize the base metric with validation."""
        super().__init__(name=name, allowed_values=allowed_values)

        # Validate components only if the metric defines them
        # Check if this instance has these attributes after initialization
        if hasattr(self, "llm"):
            self._validate_llm()
        if hasattr(self, "embeddings"):
            self._validate_embeddings()

    async def ascore(self, **kwargs) -> MetricResult:
        """
        Default async scoring method - subclasses should override this.

        This base implementation just returns a placeholder result.
        Subclasses should override this method with their specific logic.

        The base class handles component validation during initialization.
        """
        return MetricResult(
            value=0.0, reason="Base metric placeholder - override ascore() in subclass"
        )

    def score(self, **kwargs) -> MetricResult:
        """
        Synchronous scoring method that wraps ascore().

        This is a convenience method for backward compatibility and sync usage.
        For better performance, prefer using ascore() directly in async contexts.

        Returns:
            MetricResult object
        """
        try:
            # Check if we're already in an async context
            asyncio.get_running_loop()
            # If we get here, there's already a running loop
            raise RuntimeError(
                "Cannot call sync score() from an async context. Use ascore() instead."
            )
        except RuntimeError as e:
            if "Use ascore() instead" in str(e):
                raise  # Re-raise our custom error
            # No running loop found, safe to use asyncio.run()
            return asyncio.run(self.ascore(**kwargs))

    def batch_score(
        self,
        inputs: t.List[t.Dict[str, t.Any]],
    ) -> t.List[MetricResult]:
        """
        Synchronous batch scoring that wraps abatch_score().

        This is a convenience method for backward compatibility and sync usage.
        For better performance, prefer using abatch_score() directly in async contexts.

        Args:
            inputs: List of input dictionaries for scoring

        Returns:
            List of MetricResult objects
        """
        try:
            # Check if we're already in an async context
            asyncio.get_running_loop()
            # If we get here, there's already a running loop
            raise RuntimeError(
                "Cannot call sync batch_score() from an async context. Use abatch_score() instead."
            )
        except RuntimeError as e:
            if "Use abatch_score() instead" in str(e):
                raise  # Re-raise our custom error
            # No running loop found, safe to use asyncio.run()
            return asyncio.run(self.abatch_score(inputs))

    def _validate_llm(self):
        """Validate that a modern InstructorLLM is provided."""
        llm = getattr(self, "llm", None)

        if not isinstance(llm, InstructorBaseRagasLLM):
            raise ValueError(
                f"Collections metrics only support modern InstructorLLM. Found: {type(llm).__name__}. "
                f"Use: instructor_llm_factory('openai', model='gpt-4o-mini', client=openai_client)"
            )

    def _validate_embeddings(self):
        """Validate that modern embeddings are provided."""
        embeddings = getattr(self, "embeddings", None)

        if not isinstance(embeddings, BaseRagasEmbedding):
            raise ValueError(
                f"Collections metrics only support modern embeddings. Found: {type(embeddings).__name__}. "
                f"Use: embedding_factory('openai', model='text-embedding-ada-002', client=openai_client, interface='modern')"
            )
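
Because component validation only runs for fields a subclass actually defines, a new v2 metric that needs neither an LLM nor embeddings stays small. A hedged sketch of such a subclass (ExactMatch below is hypothetical and not part of this commit):

# Hypothetical subclass, for illustration only: a component-free metric.
# It defines neither `llm` nor `embeddings`, so BaseMetric.__init__ runs no
# component validation; the inherited sync wrapper still works outside async code.
import asyncio

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult


class ExactMatch(BaseMetric):
    """Return 1.0 when response and reference match exactly, else 0.0."""

    def __init__(self, name: str = "exact_match", **kwargs):
        super().__init__(name=name, **kwargs)

    async def ascore(self, reference: str, response: str) -> MetricResult:
        return MetricResult(value=float(reference.strip() == response.strip()))


metric = ExactMatch()
result = asyncio.run(metric.ascore(reference="Paris", response="Paris"))
print(result.value)                                              # 1.0
print(metric.score(reference="Paris", response="paris").value)   # 0.0, via the sync wrapper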
