
Commit cc59f6a

feat: bleu score migrated to collections (#2352)
1 parent 8b190e1 commit cc59f6a

3 files changed: +294 -3 lines changed
Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,13 @@
 """Collections of metrics using modern component architecture."""
 
 from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
+from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections.base import BaseMetric
 
 __all__ = [
-    "AnswerRelevancy",  # Class-based answer relevancy
-    "RougeScore",  # Class-based rouge score
-    "BaseMetric",  # Base class for creating new v2 metrics
+    "BaseMetric",  # Base class
+    "AnswerRelevancy",
+    "BleuScore",
+    "RougeScore",
 ]
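
With this change, BleuScore is exported from ragas.metrics.collections alongside the other class-based metrics. A minimal usage sketch against that import surface (the example texts are illustrative, and asyncio.run is used here only to drive the async API from a script):

import asyncio

from ragas.metrics.collections import BleuScore


async def main() -> None:
    metric = BleuScore()
    result = await metric.ascore(
        reference="The capital of France is Paris.",  # illustrative text
        response="Paris is the capital of France.",  # illustrative text
    )
    print(f"BLEU: {result.value:.4f}")  # MetricResult.value is a float in 0.0-1.0


asyncio.run(main())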
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
"""BLEU Score metric v2 - Class-based implementation with automatic validation."""

import typing as t

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult


class BleuScore(BaseMetric):
    """
    Calculate BLEU score between reference and response texts.

    This implementation provides automatic validation and pure async design
    without requiring LLM or embedding components. Uses sacrebleu library.

    Usage:
        >>> from ragas.metrics.collections import BleuScore
        >>>
        >>> metric = BleuScore()
        >>>
        >>> result = await metric.ascore(
        ...     reference="The capital of France is Paris.",
        ...     response="Paris is the capital of France."
        ... )
        >>> print(f"Score: {result.value}")
        >>>
        >>> results = await metric.abatch_score([
        ...     {"reference": "Text 1", "response": "Response 1"},
        ...     {"reference": "Text 2", "response": "Response 2"},
        ... ])

    Attributes:
        name: The metric name
        kwargs: Additional arguments to pass to sacrebleu.corpus_bleu
        allowed_values: Score range (0.0 to 1.0)
    """

    def __init__(
        self,
        name: str = "bleu_score",
        kwargs: t.Optional[t.Dict[str, t.Any]] = None,
        **base_kwargs,
    ):
        """Initialize BleuScore metric."""
        super().__init__(name=name, **base_kwargs)
        self.kwargs = kwargs or {}

    async def ascore(
        self,
        reference: str,
        response: str,
    ) -> MetricResult:
        """
        Calculate BLEU score asynchronously.

        Args:
            reference: The reference/ground truth text
            response: The response text to evaluate

        Returns:
            MetricResult with BLEU score (0.0-1.0)
        """
        try:
            from sacrebleu import corpus_bleu
        except ImportError:
            raise ImportError(
                "sacrebleu is required for BLEU score calculation. "
                "Please install it using `pip install sacrebleu`"
            )

        assert isinstance(reference, str), "BleuScore expects a valid reference string"
        assert isinstance(response, str), "BleuScore expects a valid response string"

        reference_sentences = reference.split(". ")
        response_sentences = response.split(". ")

        reference_formatted = [[ref] for ref in reference_sentences]
        response_formatted = response_sentences

        score = (
            corpus_bleu(response_formatted, reference_formatted, **self.kwargs).score
            / 100
        )

        assert isinstance(score, float), "Expecting a float"
        return MetricResult(value=float(score))
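
For context on the normalization: sacrebleu's corpus_bleu reports BLEU on a 0-100 scale, and the implementation above divides by 100 after splitting both texts on ". ". A standalone sketch of that same computation, using the single-sentence example from the docstring so each side splits into exactly one segment:

from sacrebleu import corpus_bleu

reference = "The capital of France is Paris."
response = "Paris is the capital of France."

# Mirror the formatting used by BleuScore.ascore above.
reference_formatted = [[ref] for ref in reference.split(". ")]
response_formatted = response.split(". ")

raw_score = corpus_bleu(response_formatted, reference_formatted).score  # 0-100 scale
print(raw_score / 100)  # the 0.0-1.0 value that BleuScore reports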
Lines changed: 203 additions & 0 deletions
@@ -0,0 +1,203 @@
"""E2E tests for BLEU score metric migration from v1 to v2."""

import pytest

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import BleuScore as LegacyBleuScore, MetricResult
from ragas.metrics.collections import BleuScore


class TestBleuE2EMigration:
    """E2E test compatibility between legacy BleuScore and new V2 implementations."""

    @pytest.fixture
    def sample_data(self):
        """Real-world sample reference and response texts for testing."""
        return [
            {
                "reference": "The cat sat on the mat. The dog ran in the park.",
                "response": "The cat sat on the mat. The dog ran in the park.",
                "description": "Exact match",
            },
            {
                "reference": "Python is a high-level programming language. It was created by Guido van Rossum.",
                "response": "Python is a programming language. It was developed by Guido van Rossum.",
                "description": "Similar content with paraphrasing",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence. It enables computers to learn from data.",
                "response": "Deep learning uses neural networks. It processes complex patterns in data.",
                "description": "Related but different content",
            },
            {
                "reference": "The capital of France is Paris.",
                "response": "Paris is the capital and largest city of France.",
                "description": "Reordered content",
            },
            {
                "reference": "",
                "response": "Some response text",
                "description": "Empty reference",
            },
            {
                "reference": "Some reference text",
                "response": "",
                "description": "Empty response",
            },
        ]

    @pytest.mark.asyncio
    async def test_legacy_vs_v2_class_e2e_compatibility(self, sample_data):
        """E2E test that legacy and v2 class implementations produce identical scores."""

        for i, data in enumerate(sample_data):
            print(f"\n🧪 Testing BLEU - Case {i + 1}: {data['description']}")
            print(f" Reference: {data['reference'][:50]}...")
            print(f" Response: {data['response'][:50]}...")

            legacy_bleu = LegacyBleuScore()
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=data["response"],
                reference=data["reference"],
            )
            legacy_score = await legacy_bleu._single_turn_ascore(legacy_sample, None)

            v2_class_metric = BleuScore()
            v2_class_result = await v2_class_metric.ascore(
                reference=data["reference"],
                response=data["response"],
            )

            class_diff = abs(legacy_score - v2_class_result.value)

            print(f" Legacy: {legacy_score:.6f}")
            print(f" V2 Class: {v2_class_result.value:.6f}")
            print(f" Diff: {class_diff:.10f}")

            assert class_diff < 1e-10, (
                f"Case {i + 1} ({data['description']}): BLEU mismatch: "
                f"{legacy_score} != {v2_class_result.value}"
            )

            assert isinstance(legacy_score, float)
            assert isinstance(v2_class_result, MetricResult)

            print(" ✅ Legacy and V2 class produce identical scores!")

    @pytest.mark.asyncio
    async def test_bleu_score_performance_comparison(self, sample_data):
        """Compare performance characteristics between legacy and v2 class."""
        import time

        test_case = sample_data[0]

        print("\n⚡ Performance test: BLEU score")

        legacy_bleu = LegacyBleuScore()
        legacy_sample = SingleTurnSample(
            user_input="dummy",
            response=test_case["response"],
            reference=test_case["reference"],
        )

        start_time = time.time()
        legacy_score = await legacy_bleu._single_turn_ascore(legacy_sample, None)
        legacy_time = time.time() - start_time

        v2_class_metric = BleuScore()
        start_time = time.time()
        v2_class_result = await v2_class_metric.ascore(
            reference=test_case["reference"],
            response=test_case["response"],
        )
        v2_class_time = time.time() - start_time

        print(f" Legacy: {legacy_time:.4f}s → {legacy_score:.6f}")
        print(f" V2 Class: {v2_class_time:.4f}s → {v2_class_result.value:.6f}")

        assert abs(legacy_score - v2_class_result.value) < 1e-10
        assert isinstance(legacy_score, float)
        assert isinstance(v2_class_result, MetricResult)

    @pytest.mark.asyncio
    async def test_v2_class_no_components_needed(self):
        """Test that V2 class-based BleuScore doesn't require LLM or embeddings."""

        print("\n🔧 Testing V2 BleuScore component requirements:")

        metric = BleuScore()

        print(f" has llm attr: {hasattr(metric, 'llm')}")
        print(f" has embeddings attr: {hasattr(metric, 'embeddings')}")

        result = await metric.ascore(
            reference="The capital of France is Paris.",
            response="Paris is the capital of France.",
        )

        print(f" Score: {result.value:.6f}")

        assert not hasattr(metric, "llm") or metric.__dict__.get("llm") is None
        assert (
            not hasattr(metric, "embeddings")
            or metric.__dict__.get("embeddings") is None
        )
        assert isinstance(result.value, float)
        assert 0.0 <= result.value <= 1.0

        print(" ✅ V2 BleuScore works without LLM/embeddings!")

    @pytest.mark.asyncio
    async def test_v2_class_batch_processing(self, sample_data):
        """Test V2 class-based BleuScore batch processing."""

        metric = BleuScore()

        batch_inputs = [
            {"reference": case["reference"], "response": case["response"]}
            for case in sample_data[:3]
        ]

        print(f"\n📦 Testing V2 class batch processing with {len(batch_inputs)} items:")

        results = await metric.abatch_score(batch_inputs)

        assert len(results) == len(batch_inputs)

        for i, (case, result) in enumerate(zip(sample_data[:3], results)):
            print(f" Case {i + 1}: {result.value:.6f} - {case['description']}")
            assert isinstance(result.value, float)
            assert -1e-10 <= result.value <= 1.0 + 1e-10
            assert result.reason is None

        print(" ✅ V2 class batch processing works correctly!")

    @pytest.mark.asyncio
    async def test_bleu_with_custom_kwargs(self):
        """Test that custom kwargs are passed correctly to sacrebleu."""

        print("\n🔧 Testing BleuScore with custom kwargs:")

        metric_default = BleuScore()
        metric_custom = BleuScore(kwargs={"smooth_method": "exp"})

        reference = "The quick brown fox jumps over the lazy dog."
        response = "The quick brown fox jumps."

        result_default = await metric_default.ascore(
            reference=reference, response=response
        )
        result_custom = await metric_custom.ascore(
            reference=reference, response=response
        )

        print(f" Default kwargs: {result_default.value:.6f}")
        print(f" Custom kwargs: {result_custom.value:.6f}")

        assert isinstance(result_default.value, float)
        assert isinstance(result_custom.value, float)
        assert 0.0 <= result_default.value <= 1.0
        assert 0.0 <= result_custom.value <= 1.0

        print(" ✅ Custom kwargs work correctly!")
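
The async test methods above are marked with @pytest.mark.asyncio, so running this suite locally requires pytest-asyncio in addition to pytest, ragas, and sacrebleu. A hedged sketch of invoking it programmatically; the test file path is hypothetical and should be adjusted to wherever this module lives in the repository:

import sys

import pytest

# Hypothetical path for illustration only.
exit_code = pytest.main(["-v", "tests/e2e/test_bleu_score_migration.py"])
sys.exit(exit_code)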
