 import random
+import re
 from dataclasses import dataclass
+from functools import partial
 from pathlib import Path
 
 import logfire
+from nltk import edit_distance
 from pydantic import TypeAdapter
-from pydantic_ai import Agent, BinaryContent
+from pydantic_ai import Agent, BinaryContent, AudioUrl
+from pydantic_evals import Dataset, Case
+from pydantic_evals.evaluators import Evaluator, EvaluatorContext, EvaluatorOutput
 
-logfire.configure(service_name='pai-audio-evals')
-logfire.instrument_pydantic_ai()
-
-this_dir = Path(__file__).parent
-assets = this_dir / 'assets'
 
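+# custom pydantic_evals evaluator: word-level edit similarity between the model's
+# transcription and the reference text (1.0 = exact match, 0.0 = nothing in common)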
+@dataclass
+class EditSimilarity(Evaluator[object, str, object]):
+    def evaluate(self, ctx: EvaluatorContext[object, str, object]) -> EvaluatorOutput:
+        if ctx.expected_output is None:
+            return {}  # no reference transcript, so no metric to report
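+        # normalize both sides: lowercase, drop punctuation, compare word tokens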
+        actual_tokens = re.sub(r'[^a-z0-9\s]', '', ctx.output.lower()).split()
+        expected_tokens = re.sub(r'[^a-z0-9\s]', '', ctx.expected_output.lower()).split()
+        distance = edit_distance(actual_tokens, expected_tokens)
+        normalized_distance = distance / max(len(actual_tokens), len(expected_tokens))
+        return 1 - normalized_distance
 
-def levenshtein_distance(s1: str, s2: str) -> int:
-    if len(s1) < len(s2):
-        return levenshtein_distance(s2, s1)
-    if len(s2) == 0:
-        return len(s1)
 
-    previous_row = range(len(s2) + 1)
-    for i, c1 in enumerate(s1):
-        current_row = [i + 1]
-        for j, c2 in enumerate(s2):
-            insertions = previous_row[j + 1] + 1
-            deletions = current_row[j] + 1
-            substitutions = previous_row[j] + (c1 != c2)
-            current_row.append(min(insertions, deletions, substitutions))
-        previous_row = current_row
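+# scrubbing=False keeps transcript text visible in traces; console=False silences local output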
+logfire.configure(service_name='pai-audio-evals', scrubbing=False, console=False)
+logfire.instrument_pydantic_ai()
 
-    return previous_row[-1]
+this_dir = Path(__file__).parent
+assets = this_dir / 'assets'
 
 
 @dataclass
 class AudioFile:
     file: str
     text: str
 
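+    # hosted copy of the audio, so eval cases can reference it by URL
+    # instead of uploading the raw bytes with every request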
+    def audio_url(self) -> AudioUrl:
+        return AudioUrl(f'https://smokeshow.helpmanual.io/4l1l1s0s6q4741012x1w/{self.file}')
+
     def binary_content(self) -> BinaryContent:
         path = assets / self.file
         return BinaryContent(data=path.read_bytes(), media_type='audio/mpeg')
@@ -45,12 +47,19 @@ def binary_content(self) -> BinaryContent:
 files_schema = TypeAdapter(list[AudioFile])
 files = files_schema.validate_json((this_dir / 'assets.json').read_bytes())
 random.shuffle(files)
+
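+# evaluate a fixed-size random sample of the shuffled files to bound cost and runtime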
+n_files = 30
 audio_agent = Agent(instructions='return the transcription only, no prefix or quotes')
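+# one Case per audio file: the input is the audio URL, the expected output is the
+# reference transcript that EditSimilarity scores against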
+dataset = Dataset(
+    cases=[Case(name=file.file, inputs=file.audio_url(), expected_output=file.text) for file in files][:n_files],
+    evaluators=[EditSimilarity()],
+)
+
+
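+# the task under evaluation; the model name is bound with functools.partial below so
+# the same dataset can be run once per model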
+async def task(audio_url: AudioUrl, model: str) -> str:
+    return (await audio_agent.run(['transcribe', audio_url], model=model)).output
+
 
-for audio_file in files[:3]:
-    with logfire.span('Transcribing audio {audio_file.text!r}', audio_file=audio_file):
-        model_distances: list[tuple[str, int]] = []
-        for model in 'gpt-4o-audio-preview', 'gpt-4o-mini-audio-preview', 'google-vertex:gemini-2.0-flash':
-            result = audio_agent.run_sync(['transcribe', audio_file.binary_content()], model=model)
-            model_distances.append((model, levenshtein_distance(audio_file.text, result.output)))
-        logfire.info(f'{model_distances}')
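+# each evaluate_sync call runs the whole dataset against one model and records the
+# results (with the model name as the run name) in Logfire for comparison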
+with logfire.span('Compare models'):
+    for model in 'gpt-4o-audio-preview', 'gpt-4o-mini-audio-preview', 'google-vertex:gemini-2.0-flash':
+        dataset.evaluate_sync(partial(task, model=model), name=model, max_concurrency=10)