Improving Notebook for Live Evaluation (#332)

chrisalexiuk-nvidia · web-flow · commit d1181fe11f47 · 2025-08-22T09:19:30.000-07:00
Signed-off-by: Chris Alexiuk <c.s.alexiuk@gmail.com>
diff --git a/nemo/Evaluator/Live Evaluation/docker_compose.yaml b/nemo/Evaluator/Live Evaluation/docker_compose.yaml
@@ -104,12 +104,12 @@ services:
         condition: service_healthy
       evaluator-postgres-db-migration:
         condition: service_completed_successfully
-      otel-collector:
-        condition: service_started
+        # otel-collector:
+        #   condition: service_started
     networks:
       - nemo-ms
     healthcheck:
-      test: ["CMD", "curl", "http://localhost:7331/health"]
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:7331/health')"]
       interval: 10s
       timeout: 3s
       retries: 3
@@ -124,16 +124,16 @@ services:
       SERVICE_ACCOUNT: nemo-evaluator-test-workflow-executor
       EVAL_ENABLE_VALIDATION: False
       # OpenTelemetry environmental variables
-      OTEL_SERVICE_NAME: nemo-evaluator
-      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
-      OTEL_TRACES_EXPORTER: otlp
-      OTEL_METRICS_EXPORTER: none
-      OTEL_LOGS_EXPORTER: otlp
-      OTEL_PYTHON_EXCLUDED_URLS: "health"
-      OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED: "true"
-      CONSOLE_LOG_LEVEL: DEBUG
-      OTEL_LOG_LEVEL: DEBUG
-      LOG_LEVEL: DEBUG
+      # OTEL_SERVICE_NAME: nemo-evaluator
+      # OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
+      # OTEL_TRACES_EXPORTER: otlp
+      # OTEL_METRICS_EXPORTER: none
+      # OTEL_LOGS_EXPORTER: otlp
+      # OTEL_PYTHON_EXCLUDED_URLS: "health"
+      # OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED: "true"
+      # CONSOLE_LOG_LEVEL: DEBUG
+      # OTEL_LOG_LEVEL: DEBUG
+      # LOG_LEVEL: DEBUG
 
   evaluator-postgres-db-migration:
     image: ${EVALUATOR_IMAGE:-""}
@@ -288,16 +288,16 @@ services:
   #  adapted from https://jessitron.com/2021/08/11/run-an-opentelemetry-collector-locally-in-docker/
   #  and https://github.com/open-telemetry/opentelemetry-demo/blob/main/docker-compose.yml
   ###
-  otel-collector:
-    image: otel/opentelemetry-collector-contrib:0.91.0
-    command: ["--config=/etc/otel-collector-config.yaml"]
-    volumes:
-      - ./config/otel-collector-config.yaml:/etc/otel-collector-config.yaml
-    ports:
-      - "4317:4317" # OTLP over gRPC receiver
-      - "55679:55679" # UI
-    networks:
-      - nemo-ms
+  # otel-collector:
+  #   image: otel/opentelemetry-collector-contrib:0.91.0
+  #   command: ["--config=/etc/otel-collector-config.yaml"]
+  #   volumes:
+  #     - ./config/otel-collector-config.yaml:/etc/otel-collector-config.yaml
+  #   ports:
+  #     - "4317:4317" # OTLP over gRPC receiver
+  #     - "55679:55679" # UI
+  #   networks:
+  #     - nemo-ms
 
 networks:
   nemo-ms:
diff --git a/nemo/Evaluator/Live Evaluation/live_evaluation.ipynb b/nemo/Evaluator/Live Evaluation/live_evaluation.ipynb
@@ -50,6 +50,17 @@
     "```"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Installing Dependencies with `uv`\n",
+    "\n",
+    "Before moving forward in the notebook, please ensure you're using the virtual environment created by running `uv sync` in the root directory of this notebook.  \n",
+    "\n",
+    "This will install all the necessary dependencies for the remainder of the notebook. "
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -63,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -88,15 +99,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Status: completed\n",
-      "Results: EvaluationResult(job='eval-JoR3GCSrjtRkC9jPFYyMv2', id='evaluation_result-3iaZjDE3a6tt4W7ag3vNsZ', created_at=datetime.datetime(2025, 7, 16, 22, 12, 21, 687891), custom_fields={}, description=None, files_url=None, groups={}, namespace='default', ownership=None, project=None, tasks={'qa': TaskResult(metrics={'accuracy': MetricResult(scores={'string-check': Score(value=1.0, stats=ScoreStats(count=1, max=None, mean=1.0, min=None, stddev=None, stderr=None, sum=1.0, sum_squared=None, variance=None))})})}, updated_at=datetime.datetime(2025, 7, 16, 22, 12, 21, 687893))\n"
+      "Results: EvaluationResult(job='eval-Akk2TPTzp96YCQjyvaJsMt', id='evaluation_result-2iKms1yr9GNjVWGSJVV7ZP', created_at=datetime.datetime(2025, 8, 16, 0, 53, 21, 87425), custom_fields={}, description=None, files_url=None, groups={}, namespace='default', ownership=None, project=None, tasks={'qa': TaskResult(metrics={'accuracy': MetricResult(scores={'string-check': Score(value=1.0, stats=ScoreStats(count=1, max=None, mean=1.0, min=None, stddev=None, stderr=None, sum=1.0, sum_squared=None, variance=None))})})}, updated_at=datetime.datetime(2025, 8, 16, 0, 53, 21, 89177))\n"
      ]
     }
    ],
@@ -153,7 +164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,14 +182,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
     "model_target = {\n",
     "    \"api_endpoint\": {\n",
     "        \"url\": \"https://integrate.api.nvidia.com/v1\",\n",
-    "        \"model_id\": \"nvidia/llama-3.3-nemotron-super-49b-v1\",\n",
+    "        \"model_id\": \"nvidia/llama-3.3-nemotron-super-49b-v1.5\",\n",
     "        \"api_key\": os.getenv(\"NVIDIA_API_KEY\")\n",
     "    }\n",
     "}"
@@ -195,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -220,15 +231,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Status: completed\n",
-      "Results: {'correct': Score(value=2.8, stats=ScoreStats(count=5, max=None, mean=2.8, min=None, stddev=None, stderr=None, sum=14.0, sum_squared=None, variance=None))}\n"
+      "Results: {'correct': Score(value=2.6, stats=ScoreStats(count=5, max=None, mean=2.6, min=None, stddev=None, stderr=None, sum=13.0, sum_squared=None, variance=None))}\n"
      ]
     }
    ],
@@ -248,7 +259,7 @@
     "                                \"messages\": [\n",
     "                                    {\n",
     "                                        \"role\": \"system\",\n",
-    "                                        \"content\": \"detailed thinking off\"\n",
+    "                                        \"content\": \"/no_think\"\n",
     "                                    },\n",
     "                                    {\n",
     "                                        \"role\": \"user\",\n",