Commit c7a063a

Adding New Task SLR-Bench as a Community Task: Scalable Logical Reasoning Benchmark (#983)

* add slr_bench evals function
* implement feedback on PR
* remove logging and raise exception when judge not loaded
1 parent 5137e03 commit c7a063a

File tree

2 files changed: +127 -0 lines changed


community_tasks/slr_bench_evals.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# MIT License

# Copyright (c) 2025 Lukas Helff

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
SLR-Bench is a large-scale benchmark for scalable logical reasoning with language models, comprising 19,000 prompts organized into 20 curriculum levels.
The tasks progressively increase in relational, arithmetic, and recursive complexity, requiring models to synthesize Prolog rules that classify train compositions.
For more details see: https://huggingface.co/datasets/AIML-TUDA/SLR-Bench
The paper can be found here: https://arxiv.org/abs/2506.15787
Before using this task, please ensure that SWI-Prolog is installed on your system and that the `evaluate` Python package is available; both are required for symbolic verification of the generated Prolog programs.
"""

import logging
import shutil

import numpy as np
from evaluate import load

from lighteval.metrics.utils.metric_utils import SampleLevelComputation, SampleLevelMetric
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod


logger = logging.getLogger(__name__)


# Check for SWI-Prolog installation
if shutil.which("swipl") is None:
    raise ImportError(
        "SWI-Prolog (swipl) is not installed or not in PATH. "
        "Please install SWI-Prolog to use this task. "
        "You can install required dependencies with: pip install -r community_tasks/slr_bench_requirements.txt"
    )

# Load the symbolic judge for evaluating Prolog programs
symbolic_judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning")


def prompt_fn(line: dict, task_name: str):
    """Defines how to go from a dataset line to a doc object."""
    return Doc(
        task_name=task_name, query=line["prompt"], choices=[str(line.get("validation program", ""))], gold_index=0
    )


class VerifiableRewardMetric(SampleLevelComputation):
    def compute(self, doc, model_response, **kwargs):
        try:
            prediction = model_response.final_text[0]
            validation_program = doc.choices[0] if doc.choices else ""
            ref_format = [
                {
                    "validation_program": validation_program,
                    "evaluation_config": {"positive_predicate": "eastbound", "negative_predicate": "westbound"},
                }
            ]

            results = symbolic_judge.compute(predictions=[prediction], references=ref_format)
            return results["accuracy"]

        except Exception as e:
            logger.error("Error during the computation of the metric")
            raise RuntimeError(f"Failed to compute verifiable reward metric: {e}")


custom_metric = SampleLevelMetric(
    metric_name="verifiable_reward",
    higher_is_better=True,
    category=SamplingMethod.GENERATIVE,
    sample_level_fn=VerifiableRewardMetric(),
    corpus_level_fn=np.mean,
)

# Define the subsets available in the SLR-Bench dataset
CONFIGURATIONS = ["All", "Basic", "Easy", "Medium", "Hard"]


class SLRBenchTask(LightevalTaskConfig):
    """Task configuration for SLR-Bench evaluation."""

    def __init__(
        self,
        config: str,
    ):
        name = f"slr_bench_{config.lower()}"
        super().__init__(
            name=name,
            hf_subset=f"v1-{config}",
            prompt_function=prompt_fn,
            hf_repo="AIML-TUDA/SLR-Bench",
            metrics=[custom_metric],
            hf_avail_splits=["train", "validation", "test"],
            evaluation_splits=["test"],
            few_shots_split="train",
            few_shots_select="random_sampling_from_train",
            suite=["community"],
            generation_size=4096,
            stop_sequence=None,
            version=1,
        )


# Create a single task instance for each configuration
TASKS = [SLRBenchTask(config) for config in CONFIGURATIONS]

# Export tasks table
TASKS_TABLE = TASKS
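
For reference, the metric above forwards a prediction and a reference dict to the symbolic judge. The following is a minimal sketch, not part of this commit, of calling the judge directly with the same reference structure that VerifiableRewardMetric.compute assembles. The Prolog rule and the toy validation program are hypothetical placeholders; real SLR-Bench rows supply the validation program via the dataset's "validation program" column, and its exact encoding may differ.

# Minimal sketch (not part of the commit): call the symbolic judge directly with the
# same reference structure that VerifiableRewardMetric.compute builds.
# The Prolog rule and validation program below are hypothetical, for illustration only.
from evaluate import load

judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning")

prediction = "eastbound(T) :- has_car(T, C), short(C)."  # hypothetical model output

# Hypothetical ground-truth facts; the encoding used by the real dataset may differ.
validation_program = """
has_car(train1, car11).
short(car11).
eastbound(train1).
has_car(train2, car21).
long(car21).
westbound(train2).
"""

results = judge.compute(
    predictions=[prediction],
    references=[
        {
            "validation_program": validation_program,
            "evaluation_config": {"positive_predicate": "eastbound", "negative_predicate": "westbound"},
        }
    ],
)
print(results["accuracy"])  # 1.0 if the predicted rule classifies the labeled trains correctly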
community_tasks/slr_bench_requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
evaluate
swipl
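
As a quick sanity check, here is a sketch (not part of the commit) that imports the new module and lists the tasks it registers. It assumes the lighteval repository root is on the Python path so that community_tasks is importable, and that SWI-Prolog is installed, since the module raises at import time otherwise.

# Sketch: list the SLR-Bench tasks this community module exposes to lighteval.
# Assumes the repository root is on sys.path and SWI-Prolog is installed;
# the import itself raises ImportError when swipl is missing.
from community_tasks.slr_bench_evals import TASKS_TABLE

for task in TASKS_TABLE:
    # e.g. slr_bench_all (v1-All), slr_bench_basic (v1-Basic), ...
    print(f"{task.name} ({task.hf_subset})")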
