diff --git a/.github/workflows/RunTests.yml b/.github/workflows/RunTests.yml index c07d7e5ac..856a38422 100644 --- a/.github/workflows/RunTests.yml +++ b/.github/workflows/RunTests.yml @@ -53,64 +53,77 @@ jobs: build_mode: jax_ai_image base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest - gpu_image: - needs: prelim - uses: ./.github/workflows/build_upload_internal.yml - with: - device_type: gpu - device_name: a100-40gb-4 - cloud_runner: linux-x86-n2-16-buildkit - build_mode: jax_ai_image - base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest + # gpu_image: + # needs: prelim + # uses: ./.github/workflows/build_upload_internal.yml + # with: + # device_type: gpu + # device_name: a100-40gb-4 + # cloud_runner: linux-x86-n2-16-buildkit + # build_mode: jax_ai_image + # base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest - cpu_unit_tests: - needs: tpu_image - strategy: - fail-fast: false - matrix: - worker_group: [1, 2, 3, 4] - uses: ./.github/workflows/run_tests_internal.yml - with: - device_type: cpu - device_name: X64 - image_type: tpu - pytest_marker: 'cpu_only' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - worker_group: ${{ matrix.worker_group }} - total_workers: 4 + # cpu_unit_tests: + # needs: tpu_image + # strategy: + # fail-fast: false + # matrix: + # worker_group: [1, 2, 3, 4] + # uses: ./.github/workflows/run_tests_internal.yml + # with: + # device_type: cpu + # device_name: X64 + # image_type: tpu + # pytest_marker: 'cpu_only' + # xla_python_client_mem_fraction: 0.75 + # tf_force_gpu_allow_growth: false + # container_resource_option: "--privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} + # worker_group: ${{ matrix.worker_group }} + # total_workers: 4 - tpu_unit_tests: - needs: tpu_image - uses: ./.github/workflows/run_tests_internal.yml - with: - device_type: tpu - device_name: v4-8 - cloud_runner: linux-x86-ct4p-240-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} + # tpu_unit_tests: + # needs: tpu_image + # uses: ./.github/workflows/run_tests_internal.yml + # with: + # device_type: tpu + # device_name: v4-8 + # cloud_runner: linux-x86-ct4p-240-4tpu + # pytest_marker: 'not cpu_only and not gpu_only and not integration_test' + # xla_python_client_mem_fraction: 0.75 + # tf_force_gpu_allow_growth: false + # container_resource_option: "--privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} - tpu_pathways_unit_tests: - needs: tpu_image - uses: ./.github/workflows/run_pathways_tests_internal.yml - with: - device_type: tpu - device_name: v4-8 - cloud_runner: linux-x86-ct4p-240-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} + # tpu_pathways_unit_tests: + # needs: tpu_image + # uses: ./.github/workflows/run_pathways_tests_internal.yml + # with: + # device_type: tpu + # device_name: v4-8 + # cloud_runner: linux-x86-ct4p-240-4tpu + # pytest_marker: 'not cpu_only and not gpu_only and not integration_test' + # xla_python_client_mem_fraction: 0.75 + # tf_force_gpu_allow_growth: false + # container_resource_option: "--privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} + + # tpu_integration_tests: + # needs: tpu_image + # uses: ./.github/workflows/run_tests_internal.yml + # with: + # device_type: tpu + # device_name: v4-8 + # cloud_runner: linux-x86-ct4p-240-4tpu + # pytest_marker: 'not cpu_only and not gpu_only and integration_test' + # xla_python_client_mem_fraction: 0.75 + # tf_force_gpu_allow_growth: false + # container_resource_option: "--privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} - tpu_integration_tests: + tpu_pathways_integration_tests: needs: tpu_image - uses: ./.github/workflows/run_tests_internal.yml + uses: ./.github/workflows/run_pathways_tests_internal.yml with: device_type: tpu device_name: v4-8 @@ -121,37 +134,38 @@ jobs: container_resource_option: "--privileged" is_scheduled_run: ${{ github.event_name == 'schedule' }} - gpu_unit_tests: - needs: gpu_image - uses: ./.github/workflows/run_tests_internal.yml - with: - device_type: gpu - device_name: a100-40gb-4 - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and not integration_test' - pytest_addopts: '--ignore=tests/sft_hooks_test.py' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} + # gpu_unit_tests: + # needs: gpu_image + # uses: ./.github/workflows/run_tests_internal.yml + # with: + # device_type: gpu + # device_name: a100-40gb-4 + # cloud_runner: linux-x86-a2-48-a100-4gpu + # pytest_marker: 'not cpu_only and not tpu_only and not integration_test' + # pytest_addopts: '--ignore=tests/sft_hooks_test.py' + # xla_python_client_mem_fraction: 0.65 + # tf_force_gpu_allow_growth: true + # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} - gpu_integration_tests: - needs: gpu_image - uses: ./.github/workflows/run_tests_internal.yml - with: - device_type: gpu - device_name: a100-40gb-4 - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and integration_test' - pytest_addopts: '--ignore=tests/sft_hooks_test.py' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} + # gpu_integration_tests: + # needs: gpu_image + # uses: ./.github/workflows/run_tests_internal.yml + # with: + # device_type: gpu + # device_name: a100-40gb-4 + # cloud_runner: linux-x86-a2-48-a100-4gpu + # pytest_marker: 'not cpu_only and not tpu_only and integration_test' + # pytest_addopts: '--ignore=tests/sft_hooks_test.py' + # xla_python_client_mem_fraction: 0.65 + # tf_force_gpu_allow_growth: true + # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" + # is_scheduled_run: ${{ github.event_name == 'schedule' }} clean_up: if: ${{ always() }} - needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests] + needs: [tpu_pathways_integration_tests] + # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests] name: "Clean up" runs-on: ["self-hosted"] permissions: @@ -170,7 +184,8 @@ jobs: notify_failure: name: Notify failed build # creates an issue or modifies last open existing issue for failed build - needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests] + needs: [tpu_pathways_integration_tests] + # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests] if: ${{ always() }} runs-on: ubuntu-latest permissions: @@ -198,52 +213,52 @@ jobs: # It will not fail if the labels don't exist. gh issue remove-label $ISSUE_NUMBER "success-run-1" "success-run-2" --repo $GH_REPO || echo "No success labels to remove." - notify_success_and_close: - name: Close issue after 3 successful builds - # This job runs only if all the preceding test jobs succeeded - if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }} - needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests] - runs-on: ubuntu-latest - permissions: - issues: write - steps: - - name: Find existing failure issue - id: find_issue - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GH_REPO: ${{ github.repository }} - run: | - ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number') - if [[ -z "$ISSUE_NUMBER" ]]; then - echo "No open build failure issue found. Nothing to do." - echo "issue_number=" >> $GITHUB_OUTPUT - else - echo "Found open build failure issue: #${ISSUE_NUMBER}" - echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT - fi - - - name: Add success label or close issue - if: steps.find_issue.outputs.issue_number != '' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GH_REPO: ${{ github.repository }} - run: | - ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }} - LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name') - - if echo "$LABELS" | grep -q "success-run-2"; then - echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}." - gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically." - gh issue close $ISSUE_NUMBER - # Clean up all tracking labels - gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO - elif echo "$LABELS" | grep -q "success-run-1"; then - echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}." - gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue." - gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO - gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO - else - echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}." - gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs." - gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO - fi + # notify_success_and_close: + # name: Close issue after 3 successful builds + # # This job runs only if all the preceding test jobs succeeded + # if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }} + # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests] + # runs-on: ubuntu-latest + # permissions: + # issues: write + # steps: + # - name: Find existing failure issue + # id: find_issue + # env: + # GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # GH_REPO: ${{ github.repository }} + # run: | + # ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number') + # if [[ -z "$ISSUE_NUMBER" ]]; then + # echo "No open build failure issue found. Nothing to do." + # echo "issue_number=" >> $GITHUB_OUTPUT + # else + # echo "Found open build failure issue: #${ISSUE_NUMBER}" + # echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT + # fi + + # - name: Add success label or close issue + # if: steps.find_issue.outputs.issue_number != '' + # env: + # GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # GH_REPO: ${{ github.repository }} + # run: | + # ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }} + # LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name') + + # if echo "$LABELS" | grep -q "success-run-2"; then + # echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}." + # gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically." + # gh issue close $ISSUE_NUMBER + # # Clean up all tracking labels + # gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO + # elif echo "$LABELS" | grep -q "success-run-1"; then + # echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}." + # gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue." + # gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO + # gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO + # else + # echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}." + # gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs." + # gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO + # fi diff --git a/.github/workflows/run_pathways_tests_internal.yml b/.github/workflows/run_pathways_tests_internal.yml index 079c712b0..4aef1c4e2 100644 --- a/.github/workflows/run_pathways_tests_internal.yml +++ b/.github/workflows/run_pathways_tests_internal.yml @@ -64,6 +64,7 @@ jobs: IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS: true JAX_PLATFORMS: "proxy" JAX_BACKEND_TARGET: "grpc://localhost:29000" + JAX_COORDINATOR_ADDRESS: "localhost" options: ${{ inputs.container_resource_option }} steps: - uses: actions/checkout@v4 diff --git a/src/MaxText/max_utils.py b/src/MaxText/max_utils.py index 7993b9a36..cdf89d0bc 100644 --- a/src/MaxText/max_utils.py +++ b/src/MaxText/max_utils.py @@ -161,6 +161,7 @@ def maybe_initialize_jax_distributed_system(raw_keys): For CPUs, we call jax.distributed.initialize() explicitly, with the specified arguments. """ + print(f"LOG: maybe_initialize_jax_distributed_system - {raw_keys = }") if raw_keys["skip_jax_distributed_system"]: max_logging.log("Skipping jax distributed system due to skip_jax_distributed_system=True flag.") return diff --git a/tests/integration_tests/checkpointing_test.py b/tests/integration_tests/checkpointing_test.py index 4350c7324..deba5e984 100644 --- a/tests/integration_tests/checkpointing_test.py +++ b/tests/integration_tests/checkpointing_test.py @@ -85,6 +85,18 @@ def run_checkpointing(hardware, attention_type): "grain_worker_count=0", "grain_train_files=/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*", ] + + command = get_checkpointing_command( + run_date, + hardware=hardware, + steps=1, + metrics_file="saved_metrics.txt", + attention_type=attention_type, + dataset_type="grain", + dataset_path="/tmp/gcsfuse", + ) + grain_command + print(f"LOG: {command = }") + train_main( get_checkpointing_command( run_date,