@@ -43,9 +43,7 @@ def get_weather(location: str, unit: str = "celsius") -> dict[str, str | int]:
4343 }
4444
4545
46- def get_flights (
47- origin : str , destination : str , date : str
48- ) -> dict [str , list [dict [str , str ]]]:
46+ def get_flights (origin : str , destination : str , date : str ) -> dict [str , list [dict [str , str ]]]:
4947 flights = [
5048 {"flight" : "GA123" , "depart" : f"{ date } 08:00" , "arrive" : f"{ date } 12:30" },
5149 {"flight" : "GA456" , "depart" : f"{ date } 15:45" , "arrive" : f"{ date } 20:10" },
@@ -160,9 +158,7 @@ def _stage_lines(stage_name: str, stage_results: Iterable) -> list[str]:
160158 # Header with status and confidence
161159 lines .append (f"[bold]{ stage_name .upper ()} [/bold] · { name } · { status } " )
162160 if confidence != "N/A" :
163- lines .append (
164- f" 📊 Confidence: { confidence } (threshold: { info .get ('threshold' , 'N/A' )} )"
165- )
161+ lines .append (f" 📊 Confidence: { confidence } (threshold: { info .get ('threshold' , 'N/A' )} )" )
166162
167163 # Prompt injection detection-specific details
168164 if name == "Prompt Injection Detection" :
@@ -176,9 +172,7 @@ def _stage_lines(stage_name: str, stage_results: Iterable) -> list[str]:
176172
177173 # Add interpretation
178174 if r .tripwire_triggered :
179- lines .append (
180- " ⚠️ PROMPT INJECTION DETECTED: Action does not serve user's goal!"
181- )
175+ lines .append (" ⚠️ PROMPT INJECTION DETECTED: Action does not serve user's goal!" )
182176 else :
183177 lines .append (" ✨ ALIGNED: Action serves user's goal" )
184178 else :
@@ -235,9 +229,7 @@ async def main(malicious: bool = False) -> None:
235229 messages .append ({"role" : "user" , "content" : user_input })
236230
237231 try :
238- resp = await client .chat .completions .create (
239- model = "gpt-4.1-nano" , messages = messages , tools = tools
240- )
232+ resp = await client .chat .completions .create (model = "gpt-4.1-nano" , messages = messages , tools = tools )
241233 print_guardrail_results ("initial" , resp )
242234 choice = resp .llm_response .choices [0 ]
243235 message = choice .message
@@ -246,12 +238,12 @@ async def main(malicious: bool = False) -> None:
246238 info = getattr (e , "guardrail_result" , None )
247239 info = info .info if info else {}
248240 lines = [
249- f"Guardrail: { info .get ('guardrail_name' ,'Unknown' )} " ,
250- f"Stage: { info .get ('stage_name' ,'unknown' )} " ,
251- f"User goal: { info .get ('user_goal' ,'N/A' )} " ,
252- f"Action: { info .get ('action' ,'N/A' )} " ,
253- f"Observation: { info .get ('observation' ,'N/A' )} " ,
254- f"Confidence: { info .get ('confidence' ,'N/A' )} " ,
241+ f"Guardrail: { info .get ('guardrail_name' , 'Unknown' )} " ,
242+ f"Stage: { info .get ('stage_name' , 'unknown' )} " ,
243+ f"User goal: { info .get ('user_goal' , 'N/A' )} " ,
244+ f"Action: { info .get ('action' , 'N/A' )} " ,
245+ f"Observation: { info .get ('observation' , 'N/A' )} " ,
246+ f"Confidence: { info .get ('confidence' , 'N/A' )} " ,
255247 ]
256248 console .print (
257249 Panel (
@@ -292,12 +284,8 @@ async def main(malicious: bool = False) -> None:
292284
293285 # Malicious injection test mode
294286 if malicious :
295- console .print (
296- "[yellow]⚠️ MALICIOUS TEST: Injecting unrelated sensitive data into function output[/yellow]"
297- )
298- console .print (
299- "[yellow] This should trigger the Prompt Injection Detection guardrail as misaligned![/yellow]"
300- )
287+ console .print ("[yellow]⚠️ MALICIOUS TEST: Injecting unrelated sensitive data into function output[/yellow]" )
288+ console .print ("[yellow] This should trigger the Prompt Injection Detection guardrail as misaligned![/yellow]" )
301289 result = {
302290 ** result ,
303291 "bank_account" : "1234567890" ,
@@ -319,17 +307,13 @@ async def main(malicious: bool = False) -> None:
319307 "role" : "tool" ,
320308 "tool_call_id" : call .id ,
321309 "name" : fname ,
322- "content" : json .dumps (
323- {"error" : f"Unknown function: { fname } " }
324- ),
310+ "content" : json .dumps ({"error" : f"Unknown function: { fname } " }),
325311 }
326312 )
327313
328314 # Final call
329315 try :
330- resp = await client .chat .completions .create (
331- model = "gpt-4.1-nano" , messages = messages , tools = tools
332- )
316+ resp = await client .chat .completions .create (model = "gpt-4.1-nano" , messages = messages , tools = tools )
333317
334318 print_guardrail_results ("final" , resp )
335319 final_message = resp .llm_response .choices [0 ].message
@@ -342,19 +326,17 @@ async def main(malicious: bool = False) -> None:
342326 )
343327
344328 # Add final assistant response to conversation
345- messages .append (
346- {"role" : "assistant" , "content" : final_message .content }
347- )
329+ messages .append ({"role" : "assistant" , "content" : final_message .content })
348330 except GuardrailTripwireTriggered as e :
349331 info = getattr (e , "guardrail_result" , None )
350332 info = info .info if info else {}
351333 lines = [
352- f"Guardrail: { info .get ('guardrail_name' ,'Unknown' )} " ,
353- f"Stage: { info .get ('stage_name' ,'unknown' )} " ,
354- f"User goal: { info .get ('user_goal' ,'N/A' )} " ,
355- f"Action: { info .get ('action' ,'N/A' )} " ,
356- f"Observation: { info .get ('observation' ,'N/A' )} " ,
357- f"Confidence: { info .get ('confidence' ,'N/A' )} " ,
334+ f"Guardrail: { info .get ('guardrail_name' , 'Unknown' )} " ,
335+ f"Stage: { info .get ('stage_name' , 'unknown' )} " ,
336+ f"User goal: { info .get ('user_goal' , 'N/A' )} " ,
337+ f"Action: { info .get ('action' , 'N/A' )} " ,
338+ f"Observation: { info .get ('observation' , 'N/A' )} " ,
339+ f"Confidence: { info .get ('confidence' , 'N/A' )} " ,
358340 ]
359341 console .print (
360342 Panel (
@@ -380,9 +362,7 @@ async def main(malicious: bool = False) -> None:
380362
381363
382364if __name__ == "__main__" :
383- parser = argparse .ArgumentParser (
384- description = "Chat Completions with Prompt Injection Detection guardrails"
385- )
365+ parser = argparse .ArgumentParser (description = "Chat Completions with Prompt Injection Detection guardrails" )
386366 parser .add_argument (
387367 "--malicious" ,
388368 action = "store_true" ,
0 commit comments