
Commit 7d5adac

Add response token count logic to OpenAI instrumentation. (#1498)
* Add OpenAI token counts.
* Add token counts to langchain + openai tests.
* Remove unused expected events.
* Linting
* [MegaLinter] Apply linters fixes

Co-authored-by: Tim Pansino <timpansino@gmail.com>
1 parent 19a7291 commit 7d5adac

14 files changed: +241 -500 lines changed
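Context for the diff below: the agent lets applications register a token-counting callback, and this commit makes the instrumentation prefer that callback's counts while falling back to the usage figures in the OpenAI response. A minimal registration sketch, assuming tiktoken as the tokenizer (tiktoken and the count_tokens name are illustrative, not part of this commit):

import tiktoken

import newrelic.agent


def count_tokens(model, content):
    # Return an integer token count for a model/content pair. Any counting
    # strategy works; tiktoken is just one plausible choice (assumption).
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(content))


# Counts from this callback take precedence over the token usage reported
# in the OpenAI response (see the hunks below).
newrelic.agent.set_llm_token_count_callback(count_tokens)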

newrelic/hooks/mlmodel_openai.py

Lines changed: 67 additions & 20 deletions
@@ -129,11 +129,11 @@ def create_chat_completion_message_event(
     span_id,
     trace_id,
     response_model,
-    request_model,
     response_id,
     request_id,
     llm_metadata,
     output_message_list,
+    all_token_counts,
 ):
     settings = transaction.settings if transaction.settings is not None else global_settings()

@@ -153,11 +153,6 @@ def create_chat_completion_message_event(
             "request_id": request_id,
             "span_id": span_id,
             "trace_id": trace_id,
-            "token_count": (
-                settings.ai_monitoring.llm_token_count_callback(request_model, message_content)
-                if settings.ai_monitoring.llm_token_count_callback
-                else None
-            ),
             "role": message.get("role"),
             "completion_id": chat_completion_id,
             "sequence": index,
@@ -166,6 +161,9 @@ def create_chat_completion_message_event(
             "ingest_source": "Python",
         }

+        if all_token_counts:
+            chat_completion_input_message_dict["token_count"] = 0
+
         if settings.ai_monitoring.record_content.enabled:
             chat_completion_input_message_dict["content"] = message_content

@@ -193,11 +191,6 @@ def create_chat_completion_message_event(
             "request_id": request_id,
             "span_id": span_id,
             "trace_id": trace_id,
-            "token_count": (
-                settings.ai_monitoring.llm_token_count_callback(response_model, message_content)
-                if settings.ai_monitoring.llm_token_count_callback
-                else None
-            ),
             "role": message.get("role"),
             "completion_id": chat_completion_id,
             "sequence": index,
@@ -207,6 +200,9 @@ def create_chat_completion_message_event(
             "is_response": True,
         }

+        if all_token_counts:
+            chat_completion_output_message_dict["token_count"] = 0
+
         if settings.ai_monitoring.record_content.enabled:
             chat_completion_output_message_dict["content"] = message_content

@@ -280,15 +276,18 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg
             else getattr(attribute_response, "organization", None)
         )

+        response_total_tokens = attribute_response.get("usage", {}).get("total_tokens") if response else None
+
+        total_tokens = (
+            settings.ai_monitoring.llm_token_count_callback(response_model, input_)
+            if settings.ai_monitoring.llm_token_count_callback and input_
+            else response_total_tokens
+        )
+
         full_embedding_response_dict = {
             "id": embedding_id,
             "span_id": span_id,
             "trace_id": trace_id,
-            "token_count": (
-                settings.ai_monitoring.llm_token_count_callback(response_model, input_)
-                if settings.ai_monitoring.llm_token_count_callback
-                else None
-            ),
             "request.model": kwargs.get("model") or kwargs.get("engine"),
             "request_id": request_id,
             "duration": ft.duration * 1000,
@@ -313,6 +312,7 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg
             "response.headers.ratelimitRemainingRequests": check_rate_limit_header(
                 response_headers, "x-ratelimit-remaining-requests", True
             ),
+            "response.usage.total_tokens": total_tokens,
             "vendor": "openai",
             "ingest_source": "Python",
         }
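Taken together, the two hunks above give embeddings a simple precedence rule: use the registered callback when both it and the input text are available, otherwise report the response's usage.total_tokens. A standalone sketch of that rule (the helper name and shape are hypothetical):

def resolve_embedding_total_tokens(callback, response_model, input_, usage):
    # Hypothetical mirror of the logic above: callback first, response second.
    response_total_tokens = usage.get("total_tokens") if usage else None
    if callback and input_:
        return callback(response_model, input_)
    return response_total_tokens


# With no callback registered, the API-reported total passes through.
assert resolve_embedding_total_tokens(None, "text-embedding-ada-002", "2 + 4", {"total_tokens": 8}) == 8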
@@ -475,12 +475,15 @@ def _handle_completion_success(transaction, linking_metadata, completion_id, kwa


 def _record_completion_success(transaction, linking_metadata, completion_id, kwargs, ft, response_headers, response):
+    settings = transaction.settings if transaction.settings is not None else global_settings()
     span_id = linking_metadata.get("span.id")
     trace_id = linking_metadata.get("trace.id")
+
     try:
         if response:
             response_model = response.get("model")
             response_id = response.get("id")
+            token_usage = response.get("usage") or {}
             output_message_list = []
             finish_reason = None
             choices = response.get("choices") or []
@@ -494,6 +497,7 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
         else:
             response_model = kwargs.get("response.model")
             response_id = kwargs.get("id")
+            token_usage = {}
             output_message_list = []
             finish_reason = kwargs.get("finish_reason")
             if "content" in kwargs:
@@ -505,10 +509,44 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
                 output_message_list = []
         request_model = kwargs.get("model") or kwargs.get("engine")

-        request_id = response_headers.get("x-request-id")
-        organization = response_headers.get("openai-organization") or getattr(response, "organization", None)
         messages = kwargs.get("messages") or [{"content": kwargs.get("prompt"), "role": "user"}]
         input_message_list = list(messages)
+
+        # Extract token counts from response object
+        if token_usage:
+            response_prompt_tokens = token_usage.get("prompt_tokens")
+            response_completion_tokens = token_usage.get("completion_tokens")
+            response_total_tokens = token_usage.get("total_tokens")
+
+        else:
+            response_prompt_tokens = None
+            response_completion_tokens = None
+            response_total_tokens = None
+
+        # Calculate token counts by checking if a callback is registered and if we have the necessary content to pass
+        # to it. If not, then we use the token counts provided in the response object
+        input_message_content = " ".join([msg.get("content", "") for msg in input_message_list if msg.get("content")])
+        prompt_tokens = (
+            settings.ai_monitoring.llm_token_count_callback(request_model, input_message_content)
+            if settings.ai_monitoring.llm_token_count_callback and input_message_content
+            else response_prompt_tokens
+        )
+        output_message_content = " ".join([msg.get("content", "") for msg in output_message_list if msg.get("content")])
+        completion_tokens = (
+            settings.ai_monitoring.llm_token_count_callback(response_model, output_message_content)
+            if settings.ai_monitoring.llm_token_count_callback and output_message_content
+            else response_completion_tokens
+        )
+
+        total_tokens = (
+            prompt_tokens + completion_tokens if all([prompt_tokens, completion_tokens]) else response_total_tokens
+        )
+
+        all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)
+
+        request_id = response_headers.get("x-request-id")
+        organization = response_headers.get("openai-organization") or getattr(response, "organization", None)
+
         full_chat_completion_summary_dict = {
             "id": completion_id,
             "span_id": span_id,
@@ -553,6 +591,12 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
             ),
             "response.number_of_messages": len(input_message_list) + len(output_message_list),
         }
+
+        if all_token_counts:
+            full_chat_completion_summary_dict["response.usage.prompt_tokens"] = prompt_tokens
+            full_chat_completion_summary_dict["response.usage.completion_tokens"] = completion_tokens
+            full_chat_completion_summary_dict["response.usage.total_tokens"] = total_tokens
+
         llm_metadata = _get_llm_attributes(transaction)
         full_chat_completion_summary_dict.update(llm_metadata)
         transaction.record_custom_event("LlmChatCompletionSummary", full_chat_completion_summary_dict)
@@ -564,11 +608,11 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
             span_id,
             trace_id,
             response_model,
-            request_model,
             response_id,
             request_id,
             llm_metadata,
             output_message_list,
+            all_token_counts,
         )
     except Exception:
         _logger.warning(RECORD_EVENTS_FAILURE_LOG_MESSAGE, traceback.format_exception(*sys.exc_info()))
@@ -579,6 +623,7 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg
     trace_id = linking_metadata.get("trace.id")
     request_message_list = kwargs.get("messages", None) or []
     notice_error_attributes = {}
+
     try:
         if OPENAI_V1:
             response = getattr(exc, "response", None)
@@ -643,18 +688,20 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg
         output_message_list = []
         if "content" in kwargs:
             output_message_list = [{"content": kwargs.get("content"), "role": kwargs.get("role")}]
+
         create_chat_completion_message_event(
             transaction,
             request_message_list,
             completion_id,
             span_id,
             trace_id,
             kwargs.get("response.model"),
-            request_model,
             response_id,
             request_id,
             llm_metadata,
             output_message_list,
+            # We do not record token counts in error cases, so set all_token_counts to True so the pipeline tokenizer does not run
+            all_token_counts=True,
         )
     except Exception:
         _logger.warning(RECORD_EVENTS_FAILURE_LOG_MESSAGE, traceback.format_exception(*sys.exc_info()))

tests/mlmodel_langchain/test_chain.py

Lines changed: 8 additions & 0 deletions
@@ -359,6 +359,7 @@
             "response.headers.ratelimitResetRequests": "20ms",
             "response.headers.ratelimitRemainingTokens": 999992,
             "response.headers.ratelimitRemainingRequests": 2999,
+            "response.usage.total_tokens": 8,
             "vendor": "openai",
             "ingest_source": "Python",
             "input": "[[3923, 374, 220, 17, 489, 220, 19, 30]]",
@@ -382,6 +383,7 @@
             "response.headers.ratelimitResetRequests": "20ms",
             "response.headers.ratelimitRemainingTokens": 999998,
             "response.headers.ratelimitRemainingRequests": 2999,
+            "response.usage.total_tokens": 1,
             "vendor": "openai",
             "ingest_source": "Python",
             "input": "[[10590]]",
@@ -452,6 +454,9 @@
             "response.headers.ratelimitResetRequests": "8.64s",
             "response.headers.ratelimitRemainingTokens": 199912,
             "response.headers.ratelimitRemainingRequests": 9999,
+            "response.usage.prompt_tokens": 73,
+            "response.usage.completion_tokens": 375,
+            "response.usage.total_tokens": 448,
             "response.number_of_messages": 3,
         },
     ],
@@ -467,6 +472,7 @@
             "sequence": 0,
             "response.model": "gpt-3.5-turbo-0125",
             "vendor": "openai",
+            "token_count": 0,
             "ingest_source": "Python",
             "content": "You are a generator of quiz questions for a seminar. Use the following pieces of retrieved context to generate 5 multiple choice questions (A,B,C,D) on the subject matter. Use a three sentence maximum and keep the answer concise. Render the output as HTML\n\nWhat is 2 + 4?",
         },
@@ -483,6 +489,7 @@
             "sequence": 1,
             "response.model": "gpt-3.5-turbo-0125",
             "vendor": "openai",
+            "token_count": 0,
             "ingest_source": "Python",
             "content": "math",
         },
@@ -499,6 +506,7 @@
             "sequence": 2,
             "response.model": "gpt-3.5-turbo-0125",
             "vendor": "openai",
+            "token_count": 0,
             "ingest_source": "Python",
             "is_response": True,
             "content": "```html\n<!DOCTYPE html>\n<html>\n<head>\n <title>Math Quiz</title>\n</head>\n<body>\n <h2>Math Quiz Questions</h2>\n <ol>\n <li>What is the result of 5 + 3?</li>\n <ul>\n <li>A) 7</li>\n <li>B) 8</li>\n <li>C) 9</li>\n <li>D) 10</li>\n </ul>\n <li>What is the product of 6 x 7?</li>\n <ul>\n <li>A) 36</li>\n <li>B) 42</li>\n <li>C) 48</li>\n <li>D) 56</li>\n </ul>\n <li>What is the square root of 64?</li>\n <ul>\n <li>A) 6</li>\n <li>B) 7</li>\n <li>C) 8</li>\n <li>D) 9</li>\n </ul>\n <li>What is the result of 12 / 4?</li>\n <ul>\n <li>A) 2</li>\n <li>B) 3</li>\n <li>C) 4</li>\n <li>D) 5</li>\n </ul>\n <li>What is the sum of 15 + 9?</li>\n <ul>\n <li>A) 22</li>\n <li>B) 23</li>\n <li>C) 24</li>\n <li>D) 25</li>\n </ul>\n </ol>\n</body>\n</html>\n```",

tests/mlmodel_openai/test_chat_completion.py

Lines changed: 9 additions & 3 deletions
@@ -15,7 +15,7 @@
 import openai
 from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
+    add_token_counts_to_chat_events,
     disabled_ai_monitoring_record_content_settings,
     disabled_ai_monitoring_settings,
     disabled_ai_monitoring_streaming_settings,
@@ -55,6 +55,9 @@
             "response.organization": "new-relic-nkmd8b",
             "request.temperature": 0.7,
             "request.max_tokens": 100,
+            "response.usage.completion_tokens": 11,
+            "response.usage.total_tokens": 64,
+            "response.usage.prompt_tokens": 53,
             "response.choices.finish_reason": "stop",
             "response.headers.llmVersion": "2020-10-01",
             "response.headers.ratelimitLimitRequests": 200,
@@ -81,6 +84,7 @@
             "role": "system",
             "completion_id": None,
             "sequence": 0,
+            "token_count": 0,
             "response.model": "gpt-3.5-turbo-0613",
             "vendor": "openai",
             "ingest_source": "Python",
@@ -99,6 +103,7 @@
             "role": "user",
             "completion_id": None,
             "sequence": 1,
+            "token_count": 0,
             "response.model": "gpt-3.5-turbo-0613",
             "vendor": "openai",
             "ingest_source": "Python",
@@ -117,6 +122,7 @@
             "role": "assistant",
             "completion_id": None,
             "sequence": 2,
+            "token_count": 0,
             "response.model": "gpt-3.5-turbo-0613",
             "vendor": "openai",
             "is_response": True,
@@ -172,7 +178,7 @@ def test_openai_chat_completion_sync_no_content(set_trace_info):

 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(chat_completion_recorded_events))
+@validate_custom_events(add_token_counts_to_chat_events(chat_completion_recorded_events))
 # One summary event, one system message, one user message, and one response message from the assistant
 @validate_custom_event_count(count=4)
 @validate_transaction_metrics(
@@ -343,7 +349,7 @@ def test_openai_chat_completion_async_no_content(loop, set_trace_info):

 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(chat_completion_recorded_events))
+@validate_custom_events(add_token_counts_to_chat_events(chat_completion_recorded_events))
 # One summary event, one system message, one user message, and one response message from the assistant
 @validate_custom_event_count(count=4)
 @validate_transaction_metrics(
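These tests swap add_token_count_to_events for add_token_counts_to_chat_events, whose job is to rewrite the expected events for runs where a token callback is registered. A plausible sketch of such a helper, assuming the (event_info, event_attrs) tuple shape used in these fixtures (the real implementation lives in testing_support/ml_testing_utils.py and may differ):

import copy


def add_token_counts_to_chat_events(expected_events, token_count=105):
    # Hypothetical sketch: when a callback is registered, summary events carry
    # callback-derived usage values (total = prompt + completion).
    events = copy.deepcopy(expected_events)
    for event_info, event_attrs in events:
        if event_info["type"] == "LlmChatCompletionSummary":
            event_attrs["response.usage.prompt_tokens"] = token_count
            event_attrs["response.usage.completion_tokens"] = token_count
            event_attrs["response.usage.total_tokens"] = token_count * 2
    return events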
