Skip to content

Commit f6a7724

Browse files
PR comments
1 parent 296981d commit f6a7724

File tree

2 files changed

+163
-50
lines changed

2 files changed

+163
-50
lines changed

src/inspect_evals/docvqa/docvqa.py

Lines changed: 68 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,61 @@
3131
{question}
3232
"""
3333

34+
# Local cache directory where extracted (and downsized) DocVQA page images are stored.
IMAGE_BASE_DIR = Path(user_cache_dir("inspect_evals")) / "docvqa_images"
35+
36+
37+
def _levenshtein_distance(str1: str, str2: str) -> int:
38+
"""Computes a Levenshtein distance, same as Levenshtein.distance in the python-Levenshtein package."""
39+
# Create a matrix of size (len(str1) + 1) x (len(str2) + 1)
40+
matrix = [[0 for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
41+
42+
# Initialize the first row and column
43+
for i in range(len(str1) + 1):
44+
matrix[i][0] = i
45+
for j in range(len(str2) + 1):
46+
matrix[0][j] = j
47+
48+
# Fill in the rest of the matrix
49+
for i in range(1, len(str1) + 1):
50+
for j in range(1, len(str2) + 1):
51+
matrix[i][j] = min(
52+
matrix[i - 1][j] + 1, # deletion
53+
matrix[i][j - 1] + 1, # insertion
54+
matrix[i - 1][j - 1] + int(str1[i - 1] != str2[j - 1]), # substitution
55+
)
56+
57+
return matrix[len(str1)][len(str2)]
58+
59+
60+
def _best_normalized_levenshtein_similiarity(
    completion: str, ground_truths: list[str], threshold: float
) -> float:
    """
    Compute the Normalized Levenshtein Similarity against each ground truth
    and return the best, as defined in equation (1) of
    https://arxiv.org/pdf/1907.00490.pdf

    Note that the "average" in ANLS is computed by the accuracy metric -- not
    here. This function computes the term inside the summation of equation (1).

    Args:
        completion: The answer extracted from the model output.
        ground_truths: Acceptable reference answers; the best match wins.
        threshold: Normalized-distance cutoff; comparisons at or above it
            score 0.0 (the paper uses 0.5).

    Returns:
        The highest similarity in [0.0, 1.0] across all ground truths.
    """
    best_score = 0.0
    for ground_truth in ground_truths:
        if len(ground_truth) == 0 and len(completion) == 0:
            # Both empty is a perfect match; also avoids max(0, 0) == 0
            # as a divisor below. (Fixed: was the int literal `1`, which
            # contradicted the declared `float` return type.)
            best_score = 1.0
            break
        # Comparison is case-insensitive.
        levenshtein_distance = _levenshtein_distance(
            completion.lower(), ground_truth.lower()
        )
        normed_levenshtein_distance = levenshtein_distance / max(
            len(completion), len(ground_truth)
        )
        if normed_levenshtein_distance < threshold:
            score = 1.0 - normed_levenshtein_distance
        else:
            score = 0.0
        if score > best_score:
            best_score = score
            if best_score == 1.0:
                # An exact match cannot be improved upon; stop early.
                break
    return best_score
3489

3590
@task
3691
def docvqa() -> Task:
@@ -52,30 +107,9 @@ def docvqa() -> Task:
52107

53108
@scorer(metrics=[accuracy(), stderr()])
54109
def docvqa_scorer() -> Scorer:
55-
def distance(str1: str, str2: str) -> int:
56-
# Create a matrix of size (len(str1) + 1) x (len(str2) + 1)
57-
matrix = [[0 for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
58-
59-
# Initialize the first row and column
60-
for i in range(len(str1) + 1):
61-
matrix[i][0] = i
62-
for j in range(len(str2) + 1):
63-
matrix[0][j] = j
64-
65-
# Fill in the rest of the matrix
66-
for i in range(1, len(str1) + 1):
67-
for j in range(1, len(str2) + 1):
68-
matrix[i][j] = min(
69-
matrix[i - 1][j] + 1, # deletion
70-
matrix[i][j - 1] + 1, # insertion
71-
matrix[i - 1][j - 1]
72-
+ int(str1[i - 1] != str2[j - 1]), # substitution
73-
)
74-
75-
return matrix[len(str1)][len(str2)]
76-
77-
async def get_ANLS_score(state: TaskState, target: Target) -> Score:
78-
"""Follows https://arxiv.org/pdf/1907.00490.pdf"""
110+
async def normalized_levenshtein_similiarity_score(
111+
state: TaskState, target: Target
112+
) -> Score:
79113
threshold = 0.5
80114
ground_truths = target.target
81115
match = re.search(
@@ -85,25 +119,10 @@ async def get_ANLS_score(state: TaskState, target: Target) -> Score:
85119
)
86120
if match:
87121
completion = match.groups()[0]
88-
best_score = 0.0
89-
for ground_truth in ground_truths:
90-
if len(ground_truth) == 0 and len(completion) == 0:
91-
best_score = 1
92-
break
93-
levenshtein_distance = distance(
94-
completion.lower(), ground_truth.lower()
95-
)
96-
normed_levenshtein_distance = levenshtein_distance / max(
97-
len(completion), len(ground_truth)
98-
)
99-
if normed_levenshtein_distance < threshold:
100-
score = 1.0 - normed_levenshtein_distance
101-
else:
102-
score = 0.0
103-
if score > best_score:
104-
best_score = score
105122
return Score(
106-
value=best_score,
123+
value=_best_normalized_levenshtein_similiarity(
124+
completion, ground_truths, threshold
125+
),
107126
answer=completion,
108127
)
109128

@@ -115,7 +134,7 @@ async def get_ANLS_score(state: TaskState, target: Target) -> Score:
115134
+ f"{state.output.completion}",
116135
)
117136

118-
return get_ANLS_score
137+
return normalized_levenshtein_similiarity_score
119138

120139

121140
@solver
@@ -131,27 +150,26 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
131150

132151
def record_to_sample(record: dict[str, Any]) -> Sample:
133152
# extract image
134-
IMAGE_BASE_DIR = Path(user_cache_dir("inspect_evals")) / "docvqa_images"
135-
image = Path(IMAGE_BASE_DIR / record["image"]["path"])
153+
image_path = Path(IMAGE_BASE_DIR / record["image"]["path"])
136154

137155
image_bytes = record["image"]["bytes"]
138156
assert is_image_png(image_bytes)
139157

140-
if not image.exists():
141-
print(f"Extracting {image.name}")
158+
if not image_path.exists():
159+
print(f"Extracting {image_path.name}")
142160
# ensure parent
143-
image.parent.mkdir(exist_ok=True, parents=True)
161+
image_path.parent.mkdir(exist_ok=True, parents=True)
144162
# reduce the image size
145163
img = Image.open(BytesIO(image_bytes))
146164
img.thumbnail((1024, 1024))
147165
# save preserving format
148-
img.save(image, format=img.format)
166+
img.save(image_path, format=img.format)
149167

150168
message: list[ChatMessage] = [
151169
ChatMessageUser(
152170
content=[
153171
ContentText(text=record["question"]),
154-
ContentImage(image=image.as_posix()),
172+
ContentImage(image=image_path.as_posix()),
155173
]
156174
)
157175
]

tests/docvqa/test_docvqa.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from inspect_evals.docvqa.docvqa import (
2+
_levenshtein_distance as levenshtein,
3+
_best_normalized_levenshtein_similiarity as best,
4+
)
5+
6+
7+
def test_levenshtein():
    """Sanity checks for the pure-Python Levenshtein implementation."""
    # (left, right, expected distance)
    cases = [
        ("", "", 0),  # empty strings
        ("a", "a", 0),  # same single char
        ("abc", "abc", 0),  # same string
        ("a", "", 1),  # single deletion
        ("", "a", 1),  # single insertion
        ("a", "b", 1),  # single substitution
        ("kitten", "sitting", 3),  # classic example
        ("sunday", "saturday", 3),  # real words
    ]
    for left, right, expected in cases:
        assert levenshtein(left, right) == expected
def test_best_normalized_levenshtein_distance():
    """Exercises the best-of-ground-truths normalized Levenshtein similarity."""

    def score(completion, ground_truths, threshold=2.0):
        return round(best(completion, ground_truths, threshold), 3)

    # Cases that resolve to an exact 1.0 match or a flat 0.0, default threshold
    exact_cases = [
        ("", [""], 1.0),  # empty strings
        ("a", ["a"], 1.0),  # single char match
        ("", ["a"], 0.0),  # empty vs char
        ("a", ["b"], 0.0),  # different chars
        ("color", ["color", "colour"], 1.0),  # exact match with variants
        ("theatre", ["theater", "theatre"], 1.0),  # regional spellings
        ("HELLO", ["hello", "hola"], 1.0),  # all case differences
        ("2nd floor", ["second floor", "2nd floor", "floor 2"], 1.0),  # numbers
        ("dept", ["department", "dept.", "dept"], 1.0),  # abbreviation matches
        ("new york", ["newyork", "new york", "ny"], 1.0),  # space variations
        ("café", ["cafe", "café", "caffè"], 1.0),  # accent marks
        ("kiwi", ["banana", "orange"], 0.0),  # no similarity
    ]
    for completion, truths, expected in exact_cases:
        assert score(completion, truths) == expected

    # Partial match: one deletion against the closest ground truth
    assert score("thetre", ["theater", "theatre"]) == round(1 - 1 / 7, 3)

    # Length differences: short completion vs longer options
    assert score("hi", ["hello", "hey"]) == round(1 - 2 / 3, 3)

    # Threshold behaviour: distances at/above the cutoff collapse to zero
    assert score("hi", ["hello", "hey"], 0.5) == 0.0
    assert score("hi", ["hello", "hey"], 0.75) == round(1 - 2 / 3, 3)

    # Relative ordering: the closer misspelling gets the better partial score
    state_truths = ["california", "calif", "ca"]
    assert score("californa", state_truths) > score("calfrnia", state_truths)

    # Minor spelling error in a long string stays close to 1.0
    assert (
        score("mississipi river", ["mississippi river", "river mississippi"]) > 0.9
    )

0 commit comments

Comments
 (0)