diff --git a/.buildkite/benchmarks/pipeline.yml b/.buildkite/benchmarks/pipeline.yml index 8eb4d8c833..cd1f7fd8db 100644 --- a/.buildkite/benchmarks/pipeline.yml +++ b/.buildkite/benchmarks/pipeline.yml @@ -4,7 +4,6 @@ agents: modules: climacommon/2025_05_15 env: - JULIA_NVTX_CALLBACKS: gc OPENBLAS_NUM_THREADS: 1 OMPI_MCA_opal_warn_on_missing_libcuda: 0 SLURM_KILL_BAD_EXIT: 1 diff --git a/.buildkite/hierarchies/pipeline.yml b/.buildkite/hierarchies/pipeline.yml index 7014a5e5de..5a5e732e95 100644 --- a/.buildkite/hierarchies/pipeline.yml +++ b/.buildkite/hierarchies/pipeline.yml @@ -6,7 +6,6 @@ agents: env: JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite" OPENBLAS_NUM_THREADS: 1 - JULIA_NVTX_CALLBACKS: gc OMPI_MCA_opal_warn_on_missing_libcuda: 0 JULIA_MAX_NUM_PRECOMPILE_FILES: 100 GKSwstype: 100 diff --git a/.buildkite/longruns/pipeline.yml b/.buildkite/longruns/pipeline.yml index 01ef08dc3e..1b0922c3f6 100644 --- a/.buildkite/longruns/pipeline.yml +++ b/.buildkite/longruns/pipeline.yml @@ -6,7 +6,6 @@ agents: env: JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite" OPENBLAS_NUM_THREADS: 1 - JULIA_NVTX_CALLBACKS: gc OMPI_MCA_opal_warn_on_missing_libcuda: 0 JULIA_MAX_NUM_PRECOMPILE_FILES: 100 GKSwstype: 100 diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8dd13aed80..b7e19262ec 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -7,7 +7,6 @@ env: JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite" JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/cpu" OPENBLAS_NUM_THREADS: 1 - JULIA_NVTX_CALLBACKS: gc OMPI_MCA_opal_warn_on_missing_libcuda: 0 JULIA_MAX_NUM_PRECOMPILE_FILES: 100 GKSwstype: 100 diff --git a/experiments/ClimaEarth/cli_options.jl b/experiments/ClimaEarth/cli_options.jl index 704eaf3a26..64d720c06e 100644 --- a/experiments/ClimaEarth/cli_options.jl +++ b/experiments/ClimaEarth/cli_options.jl @@ -141,6 
+141,10 @@ function argparse_settings() help = "An optional YAML file used to overwrite the default model parameters." arg_type = String default = nothing + "--atmos_log_progress" + help = "Use the ClimaAtmos walltime logging callback instead of the default ClimaCoupler one [`false` (default), `true`]" + arg_type = Bool + default = false "--albedo_model" help = "Type of albedo model. [`ConstantAlbedo`, `RegressionFunctionAlbedo`, `CouplerAlbedo` (default)]" arg_type = String diff --git a/experiments/ClimaEarth/components/atmosphere/climaatmos.jl b/experiments/ClimaEarth/components/atmosphere/climaatmos.jl index bad1411ce8..8204acd0ae 100644 --- a/experiments/ClimaEarth/components/atmosphere/climaatmos.jl +++ b/experiments/ClimaEarth/components/atmosphere/climaatmos.jl @@ -626,6 +626,7 @@ function get_atmos_config_dict( # can pick up from where we have left. NOTE: This should not be needed, but # there is no easy way to initialize ClimaAtmos with a different t_start atmos_config["dt_save_state_to_disk"] = coupler_config["checkpoint_dt"] + atmos_config["log_progress"] = coupler_config["atmos_log_progress"] # The Atmos `get_simulation` function expects the atmos config to contains its timestep size # in the `dt` field. 
If there is a `dt_atmos` field in coupler_config, we add it to the atmos config as `dt`
diff --git a/experiments/ClimaEarth/setup_run.jl b/experiments/ClimaEarth/setup_run.jl
index d9f26c542c..f1d4d15f41 100644
--- a/experiments/ClimaEarth/setup_run.jl
+++ b/experiments/ClimaEarth/setup_run.jl
@@ -495,7 +495,13 @@ function CoupledSimulation(config_dict::AbstractDict)
         EveryCalendarDtSchedule(TimeManager.time_to_period(checkpoint_dt); start_date)
     checkpoint_cb = TimeManager.Callback(schedule_checkpoint, Checkpointer.checkpoint_sims)
 
-    callbacks = (checkpoint_cb,)
+    # Skip the coupler walltime logging when atmos does its own (`atmos_log_progress` is true)
+    if config_dict["atmos_log_progress"]
+        callbacks = (checkpoint_cb,)
+    else
+        walltime_cb = TimeManager.capped_geometric_walltime_cb(t_start, t_end, Δt_cpl)
+        callbacks = (checkpoint_cb, walltime_cb)
+    end
 
     #= Set up default AMIP diagnostics
     Use ClimaDiagnostics for default AMIP diagnostics, which currently include turbulent energy fluxes.
diff --git a/src/TimeManager.jl b/src/TimeManager.jl
index bdaff79afe..d8d17168f9 100644
--- a/src/TimeManager.jl
+++ b/src/TimeManager.jl
@@ -9,6 +9,7 @@ module TimeManager
 import Dates
 import ..Interfacer
 import ..Utilities: time_to_seconds
+import ClimaUtilities.OnlineLogging: WallTimeInfo, report_walltime
 
 """
     time_to_period(s::String)
@@ -100,4 +101,44 @@ function (::NeverSchedule)(args...)
     return false
 end
 
+"""
+    capped_geometric_walltime_cb(t_start, t_end, Δt_cpl)
+
+Create a callback that reports walltime when the number of steps taken is a power of 2, or
+a multiple of the number of steps covering 5% of the simulation (rounded up to a whole
+step). This skips the first two steps to avoid compilation time noise.
+
+# Arguments
+- `t_start`: start time of the simulation
+- `t_end`: end time of the simulation
+- `Δt_cpl`: coupling timestep
+
+# Returns
+A `TimeManager.Callback` whose schedule reads the current time from the `t` property of
+its argument, and whose affect calls `report_walltime` on the atmosphere integrator of
+the coupled simulation it is given.
+"""
+function capped_geometric_walltime_cb(t_start, t_end, Δt_cpl)
+    tot_steps = ceil(Int, float(t_end - t_start) / float(Δt_cpl))
+    # Keep the interval at one step or more so the integer remainder below is always
+    # defined, even when the simulation has no steps (`t_end == t_start`)
+    five_percent_steps = max(1, ceil(Int, 0.05 * tot_steps))
+    # Round to the nearest whole step: a time accumulated in floating point is rarely an
+    # exact multiple of `Δt_cpl`, which would defeat the exact checks below. This assumes
+    # the schedule is only evaluated at coupling-step boundaries
+    steps_taken =
+        (integrator) -> round(Int, float(integrator.t - t_start) / float(Δt_cpl))
+    walltime_report_cond =
+        (integrator) -> begin
+            nsteps = steps_taken(integrator)
+            # skip first two steps for compilation
+            (nsteps <= 2) && return false
+            return nsteps % five_percent_steps == 0 || ispow2(nsteps)
+        end
+    walltime_affect! = let wt = WallTimeInfo()
+        (coupled_sim) -> report_walltime(wt, coupled_sim.model_sims.atmos_sim.integrator)
+    end
+    return TimeManager.Callback(walltime_report_cond, walltime_affect!)
+end
+
 end
diff --git a/test/utilities_tests.jl b/test/utilities_tests.jl
index 2a73c7781e..8d2d789709 100644
--- a/test/utilities_tests.jl
+++ b/test/utilities_tests.jl
@@ -5,6 +5,7 @@ import Test: @testset, @test
 import ClimaComms
 ClimaComms.@import_required_backends
 import ClimaCoupler: Utilities
+import ClimaCoupler: TimeManager
 import ClimaCore as CC
 
 # Initialize MPI context, in case
@@ -71,4 +72,37 @@ for FT in (Float32, Float64)
         )
         @test Utilities.integral(ones(space3d)) == sum(ones(space3d))
     end
+
+    @testset "WallTime Callback" begin
+        t_start = 0.0
+        t_end = 10.0
+        Δt_cpl = 0.1
+
+        cb = TimeManager.capped_geometric_walltime_cb(t_start, t_end, Δt_cpl)
+
+        # First two steps should not trigger
+        fake_integrator = (; t = t_start + Δt_cpl)
+        @test !cb.schedule(fake_integrator)
+        fake_integrator = (; t = t_start + Δt_cpl * 2)
+        @test !cb.schedule(fake_integrator)
+        # step 4, 8, 16 should trigger
+        fake_integrator = (; t = t_start + Δt_cpl * 4)
+        @test cb.schedule(fake_integrator)
+        fake_integrator = (; t = t_start + Δt_cpl * 8)
+        @test cb.schedule(fake_integrator)
+        fake_integrator = (; t = t_start + Δt_cpl * 14)
+        @test !cb.schedule(fake_integrator)
+        fake_integrator = (; t = t_start + Δt_cpl * 16)
+        @test cb.schedule(fake_integrator)
+
+        # 20% should trigger
+        fake_integrator = (; t = t_start + Δt_cpl * 20)
+        @test cb.schedule(fake_integrator)
+
+        # A time accumulated step by step carries floating-point error, but
+        # step 8 should still trigger
+        t_accumulated = foldl(+, fill(Δt_cpl, 8); init = t_start)
+        fake_integrator = (; t = t_accumulated)
+        @test cb.schedule(fake_integrator)
+    end
 end