Commit f5fc6bf

test(ai): Remove flaky token count integration expect directives (#9401)
We had some "ballpark" checks for the number of tokens returned in responses. However, the number of tokens returned had a high variance based on general AI chaos and the model that was used. This PR replaces somewhat exacting token count checks with simple non-zero checks in order to reduce flake. Additionally, the change removes `toolUsePromptTokenCount` checks for the `gemini-2.0-flash` model, which doesn't support this field.
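The core change, in miniature: a hypothetical sketch using chai's expect, where the UsageMetadata interface and checkUsage helper are illustrative stand-ins (not the SDK's actual types), though the field names match the response fields the tests assert on.

    import { expect } from 'chai';

    // Illustrative stand-in for the SDK's usage metadata.
    interface UsageMetadata {
      promptTokenCount: number;
      candidatesTokenCount: number;
      totalTokenCount: number;
    }

    function checkUsage(usage: UsageMetadata): void {
      // Before (flaky): exact "ballpark" assertions such as
      //   expect(usage.promptTokenCount).to.be.closeTo(22, TOKEN_COUNT_DELTA);
      // fail whenever the model tokenizes slightly differently between runs.
      // After (stable): assert only that each count was populated at all.
      expect(usage.promptTokenCount).to.not.equal(0);
      expect(usage.candidatesTokenCount).to.not.equal(0);
      expect(usage.totalTokenCount).to.not.equal(0);
    }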
1 parent: 9101b46

2 files changed: +29 -88 lines

packages/ai/integration/chat.test.ts

Lines changed: 13 additions & 54 deletions
@@ -24,7 +24,7 @@ import {
   SafetySetting,
   getGenerativeModel
 } from '../src';
-import { testConfigs, TOKEN_COUNT_DELTA } from './constants';
+import { testConfigs } from './constants';
 
 describe('Chat Session', () => {
   testConfigs.forEach(testConfig => {
@@ -98,62 +98,21 @@ describe('Chat Session', () => {
 
       if (model.model.includes('gemini-2.5-flash')) {
         // Token counts can vary slightly in chat context
-        expect(response1.usageMetadata!.promptTokenCount).to.be.closeTo(
-          17, // "What is the capital of France?" + system instruction
-          TOKEN_COUNT_DELTA + 2 // More variance for chat context
-        );
-        expect(response1.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          8, // "Paris"
-          TOKEN_COUNT_DELTA
-        );
-        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
-          49, // "What is the capital of France?" + system instruction + "Paris"
-          TOKEN_COUNT_DELTA + 3 // More variance for chat context
-        );
-        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
-          49, // "What is the capital of France?" + system instruction + "Paris"
-          TOKEN_COUNT_DELTA + 3 // More variance for chat context
-        );
-
-        expect(response2.usageMetadata!.promptTokenCount).to.be.closeTo(
-          32, // History + "And what about Italy?" + system instruction
-          TOKEN_COUNT_DELTA + 5 // More variance for chat context with history
-        );
-        expect(response2.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          8,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response2.usageMetadata!.totalTokenCount).to.be.closeTo(
-          68,
-          TOKEN_COUNT_DELTA + 2
-        );
+        expect(response1.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response1.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response1.usageMetadata!.totalTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.totalTokenCount).to.not.equal(0);
       } else if (model.model.includes('gemini-2.0-flash')) {
         expect(response1.usageMetadata).to.not.be.null;
         // Token counts can vary slightly in chat context
-        expect(response1.usageMetadata!.promptTokenCount).to.be.closeTo(
-          15, // "What is the capital of France?" + system instruction
-          TOKEN_COUNT_DELTA + 2 // More variance for chat context
-        );
-        expect(response1.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          8, // "Paris"
-          TOKEN_COUNT_DELTA
-        );
-        expect(response1.usageMetadata!.totalTokenCount).to.be.closeTo(
-          23, // "What is the capital of France?" + system instruction + "Paris"
-          TOKEN_COUNT_DELTA + 3 // More variance for chat context
-        );
-        expect(response2.usageMetadata!.promptTokenCount).to.be.closeTo(
-          28, // History + "And what about Italy?" + system instruction
-          TOKEN_COUNT_DELTA + 5 // More variance for chat context with history
-        );
-        expect(response2.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          8,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response2.usageMetadata!.totalTokenCount).to.be.closeTo(
-          36,
-          TOKEN_COUNT_DELTA
-        );
+        expect(response1.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response1.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response1.usageMetadata!.totalTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response2.usageMetadata!.totalTokenCount).to.not.equal(0);
       }
     });
   });

packages/ai/integration/generate-content.test.ts

Lines changed: 16 additions & 34 deletions
@@ -29,7 +29,7 @@ import {
   URLRetrievalStatus,
   getGenerativeModel
 } from '../src';
-import { testConfigs, TOKEN_COUNT_DELTA } from './constants';
+import { testConfigs } from './constants';
 
 describe('Generate Content', function () {
   this.timeout(20_000);
@@ -88,22 +88,10 @@ describe('Generate Content', function () {
       expect(response.usageMetadata).to.not.be.null;
 
       if (model.model.includes('gemini-2.5-flash')) {
-        expect(response.usageMetadata!.promptTokenCount).to.be.closeTo(
-          22,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          2,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response.usageMetadata!.thoughtsTokenCount).to.be.closeTo(
-          30,
-          TOKEN_COUNT_DELTA * 2
-        );
-        expect(response.usageMetadata!.totalTokenCount).to.be.closeTo(
-          55,
-          TOKEN_COUNT_DELTA * 2
-        );
+        expect(response.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response.usageMetadata!.thoughtsTokenCount).to.not.equal(0);
+        expect(response.usageMetadata!.totalTokenCount).to.not.equal(0);
         expect(response.usageMetadata!.promptTokensDetails).to.not.be.null;
         expect(response.usageMetadata!.promptTokensDetails!.length).to.equal(
           1
@@ -113,22 +101,13 @@ describe('Generate Content', function () {
         ).to.equal(Modality.TEXT);
         expect(
           response.usageMetadata!.promptTokensDetails![0].tokenCount
-        ).to.closeTo(22, TOKEN_COUNT_DELTA);
+        ).to.not.equal(0);
 
         // candidatesTokenDetails comes back about half the time, so let's just not test it.
       } else if (model.model.includes('gemini-2.0-flash')) {
-        expect(response.usageMetadata!.promptTokenCount).to.be.closeTo(
-          21,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response.usageMetadata!.candidatesTokenCount).to.be.closeTo(
-          4,
-          TOKEN_COUNT_DELTA
-        );
-        expect(response.usageMetadata!.totalTokenCount).to.be.closeTo(
-          25,
-          TOKEN_COUNT_DELTA * 2
-        );
+        expect(response.usageMetadata!.promptTokenCount).to.not.equal(0);
+        expect(response.usageMetadata!.candidatesTokenCount).to.not.equal(0);
+        expect(response.usageMetadata!.totalTokenCount).to.not.equal(0);
         expect(response.usageMetadata!.promptTokensDetails).to.not.be.null;
         expect(response.usageMetadata!.promptTokensDetails!.length).to.equal(
           1
@@ -149,7 +128,7 @@ describe('Generate Content', function () {
         ).to.equal(Modality.TEXT);
         expect(
           response.usageMetadata!.candidatesTokensDetails![0].tokenCount
-        ).to.be.closeTo(4, TOKEN_COUNT_DELTA);
+        ).to.not.equal(0);
       }
     });
 
@@ -230,8 +209,11 @@
 
       const usageMetadata = response.usageMetadata;
       expect(usageMetadata).to.exist;
-      expect(usageMetadata?.toolUsePromptTokenCount).to.exist;
-      expect(usageMetadata?.toolUsePromptTokenCount).to.be.greaterThan(0);
+      // usageMetaData.toolUsePromptTokenCount does not exist in Gemini 2.0 flash responses.
+      if (!model.model.includes('gemini-2.0-flash')) {
+        expect(usageMetadata?.toolUsePromptTokenCount).to.exist;
+        expect(usageMetadata?.toolUsePromptTokenCount).to.be.greaterThan(0);
+      }
     });
 
     it('generateContent: url context and google search grounding', async () => {
@@ -288,7 +270,7 @@
     });
 
     const result = await model.generateContent(
-      'Recommend 3 books for beginners to read to learn more about the latest advancements in Quantum Computing.'
+      'Recommend 3 books for beginners to read to learn more about the latest advancements in Quantum Computing'
     );
     const response = result.response;
     const urlContextMetadata =
