
Commit 561679e

fix: align Makefile format command with CI and fix notebook linting
- Add --exclude src/ragas/_version.py to format and CI commands
- This matches GitHub CI behavior, which overrides pyproject.toml exclusions
- Ensures notebooks are formatted locally, preventing CI failures
- Add noqa: E402 comment to notebook for intentional sys.path modification
1 parent 44034ff commit 561679e
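
Context for the --exclude change: ruff treats a command-line --exclude as replacing, not extending, the exclude list from the config file, so CI's flag re-includes everything pyproject.toml had skipped except src/ragas/_version.py. A minimal sketch of the difference (that pyproject.toml's exclude list covers the notebooks is taken from the commit message, not shown in this diff):

# Honors pyproject.toml's exclude list, so excluded notebooks are skipped:
uv run --active ruff format src tests docs --config pyproject.toml

# CLI --exclude replaces the config list: only src/ragas/_version.py is
# skipped, so the notebooks get formatted, matching GitHub CI:
uv run --active ruff format src tests docs --exclude src/ragas/_version.py --config pyproject.toml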

File tree: 2 files changed (+25 −46 lines)

Makefile

Lines changed: 9 additions & 9 deletions
@@ -52,11 +52,11 @@ install: ## Install full dependencies with uv sync (backward compatible - modern
 format: ## Format and lint all code
 	@echo "Formatting and linting all code..."
 	@echo "(ruff format) Formatting ragas..."
-	$(Q)uv run --active ruff format src tests docs --config pyproject.toml
+	$(Q)uv run --active ruff format src tests docs --exclude src/ragas/_version.py --config pyproject.toml
 	@echo "(ruff) Auto-fixing ragas (includes import sorting and unused imports)..."
-	$(Q)uv run --active ruff check src tests docs --fix-only --config pyproject.toml
+	$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --fix-only --config pyproject.toml
 	@echo "(ruff) Final linting check for ragas..."
-	$(Q)uv run --active ruff check src tests docs --config pyproject.toml
+	$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
 
 type: ## Type check all code
 	@echo "Type checking all code..."
@@ -93,8 +93,8 @@ benchmarks-test: ## Run benchmarks for ragas unit tests
 run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly)
 	@echo "Running complete CI pipeline..."
 	@echo "Format check..."
-	$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-	$(Q)uv run --active ruff check src tests docs --config pyproject.toml
+	$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+	$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
 	@echo "Type check..."
 	$(Q)$(MAKE) type
 	@echo "Unit tests..."
@@ -104,8 +104,8 @@ run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly)
 run-ci-format-check: ## Run format check in dry-run mode (like GitHub CI)
 	@echo "Running format check (dry-run, like GitHub CI)..."
 	@echo "Checking ragas formatting..."
-	$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-	$(Q)uv run --active ruff check src docs tests --config pyproject.toml
+	$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+	$(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml
 
 run-ci-type: ## Run type checking (matches GitHub CI)
 	@echo "Running type checking (matches GitHub CI)..."
@@ -118,8 +118,8 @@ run-ci-tests: ## Run all tests with CI options
 run-ci-fast: ## Fast CI check for quick local validation (2-3 minutes)
 	@echo "Running fast CI check for quick feedback..."
 	@echo "Format check..."
-	$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-	$(Q)uv run --active ruff check src docs tests --config pyproject.toml
+	$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+	$(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml
 	@echo "Core unit tests (no nbmake for speed)..."
 	$(Q)uv run --active pytest tests/unit --dist loadfile -n auto -x
 	@echo "Fast CI check completed!"

tests/e2e/metrics_migration/metric_score_diff.ipynb

Lines changed: 16 additions & 37 deletions
@@ -34,16 +34,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import asyncio\n",
-    "import time\n",
-    "from dataclasses import dataclass\n",
-    "from typing import Any, Dict, List, Tuple\n",
-    "\n",
     "import numpy as np\n",
-    "import pandas as pd\n",
     "\n",
-    "# Ragas imports\n",
-    "from ragas.dataset_schema import SingleTurnSample"
+    "# Ragas imports"
    ]
   },
   {
@@ -135,31 +128,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "✓ Setup complete\n",
-      "✓ Metric classes loaded:\n",
-      "  Legacy: LLMContextPrecisionWithReference from ragas.metrics._context_precision\n",
-      "  Modern: ContextPrecision from ragas.metrics.collections\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import os\n",
+    "import importlib\n",
     "import sys\n",
     "from pathlib import Path\n",
-    "import importlib\n",
     "\n",
     "# Add project root to path\n",
     "project_root = Path.cwd().parent.parent.parent\n",
     "sys.path.insert(0, str(project_root))\n",
     "\n",
-    "from tests.utils import check_api_key\n",
+    "from tests.utils import check_api_key  # noqa: E402\n",
     "\n",
     "# Check for OpenAI API key\n",
     "check_api_key(\"openai\")\n",
@@ -182,7 +163,7 @@
    "LegacyMetric = load_metric_class(METRIC_CONFIG[\"legacy_import\"])\n",
    "ModernMetric = load_metric_class(METRIC_CONFIG[\"modern_import\"])\n",
    "\n",
-   "print(f\"✓ Metric classes loaded:\")\n",
+   "print(\"✓ Metric classes loaded:\")\n",
    "print(\n",
    "    f\"  Legacy: {METRIC_CONFIG['legacy_import']['class_name']} from {METRIC_CONFIG['legacy_import']['module']}\"\n",
    ")\n",
@@ -329,7 +310,7 @@
    "legacy_name = getattr(legacy_metric, \"name\", legacy_metric.__class__.__name__)\n",
    "modern_name = getattr(modern_metric, \"name\", modern_metric.__class__.__name__)\n",
    "\n",
-   "print(f\"✓ Metrics initialized:\")\n",
+   "print(\"✓ Metrics initialized:\")\n",
    "print(f\"  Legacy: {legacy_name}\")\n",
    "print(f\"  Modern: {modern_name}\")\n",
    "print(f\"  Dataset fields required: {METRIC_CONFIG['dataset_fields']}\")"
@@ -404,7 +385,7 @@
    "\n",
    "print(f\"✓ Prepared {len(amnesty_test_data)} samples for testing\")\n",
    "if amnesty_test_data:\n",
-   "    print(f\"\\nFirst sample fields:\")\n",
+   "    print(\"\\nFirst sample fields:\")\n",
    "    first_sample = amnesty_test_data[0]\n",
    "    for key, value in first_sample.items():\n",
    "        if isinstance(value, list):\n",
@@ -561,9 +542,7 @@
    }
   ],
   "source": [
-   "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
-   "import numpy as np\n",
    "\n",
    "# Get detailed DataFrame\n",
    "df_amnesty = amnesty_result.to_dataframe()\n",
@@ -586,12 +565,12 @@
    "print(\"DETAILED STATISTICAL ANALYSIS\")\n",
    "print(\"=\" * 70)\n",
    "print(f\"\\nDataset: amnesty_qa ({len(df_amnesty)} samples)\")\n",
-   "print(f\"\\nScore Statistics:\")\n",
+   "print(\"\\nScore Statistics:\")\n",
    "print(f\"  Legacy Mean: {amnesty_result.old_mean:.4f}\")\n",
    "print(f\"  New Mean: {amnesty_result.new_mean:.4f}\")\n",
    "print(f\"  Score Shift: {amnesty_result.mean_diff:+.4f}\")\n",
    "\n",
-   "print(f\"\\nDifference Statistics:\")\n",
+   "print(\"\\nDifference Statistics:\")\n",
    "print(f\"  Mean |Diff|: {df_amnesty['abs_diff'].mean():.4f}\")\n",
    "print(f\"  Std Dev: {amnesty_result.std_diff:.4f}\")\n",
    "print(f\"  Max Diff: {amnesty_result.max_diff:.4f}\")\n",
@@ -602,7 +581,7 @@
    "# For LLM-based metrics: use [0.1, 0.15, 0.2, 0.25, 0.3]\n",
    "# For deterministic metrics: use [1e-10, 1e-8, 1e-6, 1e-4, 0.01]\n",
    "tolerance_levels = [0.1, 0.15, 0.2, 0.25, 0.3]\n",
-   "print(f\"\\nTolerance Analysis:\")\n",
+   "print(\"\\nTolerance Analysis:\")\n",
    "for tol in tolerance_levels:\n",
    "    within = (df_amnesty[\"abs_diff\"] < tol).sum()\n",
    "    pct = within / len(df_amnesty) * 100\n",
@@ -896,7 +875,7 @@
    "\n",
    "print(f\"✓ Prepared {len(fiqa_test_data)} samples for testing\")\n",
    "if fiqa_test_data:\n",
-   "    print(f\"\\nFirst sample fields:\")\n",
+   "    print(\"\\nFirst sample fields:\")\n",
    "    first_sample = fiqa_test_data[0]\n",
    "    for key, value in first_sample.items():\n",
    "        if isinstance(value, list):\n",
@@ -1049,12 +1028,12 @@
    "print(\"DETAILED STATISTICAL ANALYSIS\")\n",
    "print(\"=\" * 70)\n",
    "print(f\"\\nDataset: fiqa ({len(df_fiqa)} samples)\")\n",
-   "print(f\"\\nScore Statistics:\")\n",
+   "print(\"\\nScore Statistics:\")\n",
    "print(f\"  Legacy Mean: {fiqa_result.old_mean:.4f}\")\n",
    "print(f\"  New Mean: {fiqa_result.new_mean:.4f}\")\n",
    "print(f\"  Score Shift: {fiqa_result.mean_diff:+.4f}\")\n",
    "\n",
-   "print(f\"\\nDifference Statistics:\")\n",
+   "print(\"\\nDifference Statistics:\")\n",
    "print(f\"  Mean |Diff|: {df_fiqa['abs_diff'].mean():.4f}\")\n",
    "print(f\"  Std Dev: {fiqa_result.std_diff:.4f}\")\n",
    "print(f\"  Max Diff: {fiqa_result.max_diff:.4f}\")\n",
@@ -1065,7 +1044,7 @@
    "# For LLM-based metrics: use [0.1, 0.15, 0.2, 0.25, 0.3]\n",
    "# For deterministic metrics: use [1e-10, 1e-8, 1e-6, 1e-4, 0.01]\n",
    "tolerance_levels = [0.1, 0.15, 0.2, 0.25, 0.3]\n",
-   "print(f\"\\nTolerance Analysis:\")\n",
+   "print(\"\\nTolerance Analysis:\")\n",
    "for tol in tolerance_levels:\n",
    "    within = (df_fiqa[\"abs_diff\"] < tol).sum()\n",
    "    pct = within / len(df_fiqa) * 100\n",
@@ -1311,4 +1290,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
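
The diff also nulls execution_count and clears the stored cell outputs, which keeps the notebook diff-stable across runs. One common way to strip outputs (a sketch assuming nbconvert is available; the commit does not show which tool was used):

jupyter nbconvert --clear-output --inplace tests/e2e/metrics_migration/metric_score_diff.ipynb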
