From 357363208d53c08aa11e11570078a1fd4321637e Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Tue, 4 Nov 2025 17:56:55 +0800
Subject: [PATCH 01/11] Add runtime dispatch (mld_ntt_native, mld_intt_native,
 mld_poly_permute_bitrev_to_custom)

- Change mld_ntt_native() return type from void to int
- Add runtime capability checking with fallback support
- Implement dispatch logic in mld_poly_ntt() to try native first,
  fallback to C
- Add MLD_NATIVE_FUNC_SUCCESS/FALLBACK return codes
- Add mld_sys_check_capability() for system capability detection
- Add test configuration for AVX2, static ON/OFF, add to CI test.

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 .github/actions/config-variations/action.yml  |  67 +-
 BIBLIOGRAPHY.md                               |   6 +
 dev/aarch64_clean/meta.h                      |   7 +-
 dev/x86_64/meta.h                             |  21 +-
 .../custom_no_randomized_config.h             |  30 +
 examples/monolithic_build/config_44.h         |  30 +
 examples/monolithic_build/config_65.h         |  30 +
 examples/monolithic_build/config_87.h         |  30 +
 .../multilevel_config.h                       |  30 +
 .../multilevel_config.h                       |  30 +
 examples/monolithic_build_native/config_44.h  |  30 +
 examples/monolithic_build_native/config_65.h  |  30 +
 examples/monolithic_build_native/config_87.h  |  30 +
 mldsa/mldsa_native.S                          |   6 +
 mldsa/mldsa_native.c                          |   4 +
 mldsa/src/config.h                            |  30 +
 mldsa/src/fips202/native/api.h                |  16 +
 mldsa/src/native/aarch64/meta.h               |   7 +-
 mldsa/src/native/api.h                        |  21 +-
 mldsa/src/native/x86_64/meta.h                |  21 +-
 mldsa/src/ntt.c                               |  15 +-
 mldsa/src/ntt.h                               |   4 -
 mldsa/src/poly.c                              |  42 +-
 mldsa/src/sys.h                               |  36 ++
 test/break_pct_config.h                       |  30 +
 test/configs.yml                              |  80 +++
 test/custom_memcpy_config.h                   |  30 +
 test/custom_memset_config.h                   |  30 +
 test/custom_native_capability_config_0.h      | 578 +++++++++++++++++
 test/custom_native_capability_config_1.h      | 577 +++++++++++++++++
 ...stom_native_capability_config_CPUID_AVX2.h | 609 ++++++++++++++++++
 test/custom_randombytes_config.h              |  30 +
 test/custom_stdlib_config.h                   |  30 +
 test/custom_zeroize_config.h                  |  30 +
 test/no_asm_config.h                          |  30 +
 test/serial_fips202_config.h                  |  30 +
 36 files changed, 2594 insertions(+), 63 deletions(-)
 create mode 100644 test/custom_native_capability_config_0.h
 create mode 100644 test/custom_native_capability_config_1.h
 create mode 100644 test/custom_native_capability_config_CPUID_AVX2.h

diff --git a/.github/actions/config-variations/action.yml b/.github/actions/config-variations/action.yml
index 0cf83315f..166faa704 100644
--- a/.github/actions/config-variations/action.yml
+++ b/.github/actions/config-variations/action.yml
@@ -7,7 +7,7 @@ inputs:
     description: 'GitHub token'
     required: true
   tests:
-    description: 'List of tests to run (space-separated IDs) or "all" for all tests. Available IDs: pct-enabled, pct-enabled-broken, custom-zeroize, no-asm, custom-randombytes, custom-memcpy, custom-memset, custom-stdlib, serial-fips202'
+    description: 'List of tests to run (space-separated IDs) or "all" for all tests. Available IDs: pct-enabled, pct-enabled-broken, custom-zeroize, native-cap-ON, native-cap-OFF, native-cap-CPUID_AVX2, no-asm, serial-fips202, custom-randombytes, custom-memcpy, custom-memset, custom-stdlib'
     required: false
     default: 'all'
   opt:
@@ -59,6 +59,45 @@ runs:
         acvp: true
         opt: ${{ inputs.opt }}
         examples: false # Some examples use a custom config themselves
+    - name: "Custom native capability functions (static ON)"
+      if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'native-cap-ON') }}
+      uses: ./.github/actions/multi-functest
+      with:
+        gh_token: ${{ inputs.gh_token }}
+        compile_mode: native
+        cflags: "-std=c11 -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/custom_native_capability_config_1.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        func: true
+        kat: true
+        acvp: true
+        opt: ${{ inputs.opt }}
+        examples: false # Some examples use a custom config themselves
+    - name: "Custom native capability functions (static OFF)"
+      if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'native-cap-OFF') }}
+      uses: ./.github/actions/multi-functest
+      with:
+        gh_token: ${{ inputs.gh_token }}
+        compile_mode: native
+        cflags: "-std=c11 -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/custom_native_capability_config_0.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        func: true
+        kat: true
+        acvp: true
+        opt: ${{ inputs.opt }}
+        examples: false # Some examples use a custom config themselves
+    - name: "Custom native capability functions (CPUID AVX2 detection)"
+      if: ${{ (inputs.tests == 'all' || contains(inputs.tests, 'native-cap-CPUID_AVX2')) && runner.os == 'Linux' && runner.arch == 'X64' }}
+      uses: ./.github/actions/multi-functest
+      with:
+        gh_token: ${{ inputs.gh_token }}
+        compile_mode: native
+        cflags: "-std=c11 -mavx2 -mbmi2 -mpopcnt -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/custom_native_capability_config_CPUID_AVX2.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        func: true
+        kat: true
+        acvp: true
+        opt: ${{ inputs.opt }}
+        examples: false # Some examples use a custom config themselves
     - name: "No ASM"
       if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'no-asm') }}
       uses: ./.github/actions/multi-functest
@@ -72,6 +111,19 @@ runs:
         acvp: true
         opt: ${{ inputs.opt }}
         examples: false # Some examples use a custom config themselves
+    - name: "Serial FIPS202 (no batched Keccak)"
+      if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'serial-fips202') }}
+      uses: ./.github/actions/multi-functest
+      with:
+        gh_token: ${{ inputs.gh_token }}
+        compile_mode: native
+        cflags: "-std=c11 -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/serial_fips202_config.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        func: true
+        kat: true
+        acvp: true
+        opt: ${{ inputs.opt }}
+        examples: false # Some examples use a custom config themselves
     - name: "Custom randombytes"
       if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'custom-randombytes') }}
       uses: ./.github/actions/multi-functest
@@ -124,16 +176,3 @@ runs:
         acvp: true
         opt: ${{ inputs.opt }}
         examples: false # Some examples use a custom config themselves
-    - name: "Serial FIPS202 (no batched Keccak)"
-      if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'serial-fips202') }}
-      uses: ./.github/actions/multi-functest
-      with:
-        gh_token: ${{ inputs.gh_token }}
-        compile_mode: native
-        cflags: "-std=c11 -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/serial_fips202_config.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
-        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
-        func: true
-        kat: true
-        acvp: true
-        opt: ${{ inputs.opt }}
-        examples: false # Some examples use a custom config themselves
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
index ed9d7858d..49a948c7c 100644
--- a/BIBLIOGRAPHY.md
+++ b/BIBLIOGRAPHY.md
@@ -40,6 +40,9 @@ source code and documentation.
   - [test/break_pct_config.h](test/break_pct_config.h)
   - [test/custom_memcpy_config.h](test/custom_memcpy_config.h)
   - [test/custom_memset_config.h](test/custom_memset_config.h)
+  - [test/custom_native_capability_config_0.h](test/custom_native_capability_config_0.h)
+  - [test/custom_native_capability_config_1.h](test/custom_native_capability_config_1.h)
+  - [test/custom_native_capability_config_CPUID_AVX2.h](test/custom_native_capability_config_CPUID_AVX2.h)
   - [test/custom_randombytes_config.h](test/custom_randombytes_config.h)
   - [test/custom_stdlib_config.h](test/custom_stdlib_config.h)
   - [test/custom_zeroize_config.h](test/custom_zeroize_config.h)
@@ -88,6 +91,9 @@ source code and documentation.
   - [test/break_pct_config.h](test/break_pct_config.h)
   - [test/custom_memcpy_config.h](test/custom_memcpy_config.h)
   - [test/custom_memset_config.h](test/custom_memset_config.h)
+  - [test/custom_native_capability_config_0.h](test/custom_native_capability_config_0.h)
+  - [test/custom_native_capability_config_1.h](test/custom_native_capability_config_1.h)
+  - [test/custom_native_capability_config_CPUID_AVX2.h](test/custom_native_capability_config_CPUID_AVX2.h)
   - [test/custom_randombytes_config.h](test/custom_randombytes_config.h)
   - [test/custom_stdlib_config.h](test/custom_stdlib_config.h)
   - [test/custom_zeroize_config.h](test/custom_zeroize_config.h)
diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 72da0ab85..bb18c8f9e 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -32,18 +32,21 @@
 
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/arith_native_aarch64.h"
 
-static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
 {
   mld_ntt_asm(data, mld_aarch64_ntt_zetas_layer123456,
               mld_aarch64_ntt_zetas_layer78);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
 {
   mld_intt_asm(data, mld_aarch64_intt_zetas_layer78,
                mld_aarch64_intt_zetas_layer123456);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 7aa2b563a..8d9a5c620 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -33,20 +33,35 @@
 #if !defined(__ASSEMBLER__)
 #include <string.h>
 #include "../../common.h"
+#include "../api.h"
 #include "src/arith_native_x86_64.h"
 
 static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])
 {
-  mld_nttunpack_avx2((__m256i *)(data));
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_nttunpack_avx2((__m256i *)(data));
+  }
 }
 
-static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_ntt_avx2((__m256i *)data, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
-static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
diff --git a/examples/basic_deterministic/mldsa_native/custom_no_randomized_config.h b/examples/basic_deterministic/mldsa_native/custom_no_randomized_config.h
index 7b130c90d..1622a4526 100644
--- a/examples/basic_deterministic/mldsa_native/custom_no_randomized_config.h
+++ b/examples/basic_deterministic/mldsa_native/custom_no_randomized_config.h
@@ -361,6 +361,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build/config_44.h b/examples/monolithic_build/config_44.h
index bebe4c233..f07b330a6 100644
--- a/examples/monolithic_build/config_44.h
+++ b/examples/monolithic_build/config_44.h
@@ -359,6 +359,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build/config_65.h b/examples/monolithic_build/config_65.h
index c8814d14c..e22a74509 100644
--- a/examples/monolithic_build/config_65.h
+++ b/examples/monolithic_build/config_65.h
@@ -359,6 +359,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build/config_87.h b/examples/monolithic_build/config_87.h
index e3df5cf06..2fa64c3c5 100644
--- a/examples/monolithic_build/config_87.h
+++ b/examples/monolithic_build/config_87.h
@@ -359,6 +359,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build_multilevel/multilevel_config.h b/examples/monolithic_build_multilevel/multilevel_config.h
index 32b204b9c..3d1f94fea 100644
--- a/examples/monolithic_build_multilevel/multilevel_config.h
+++ b/examples/monolithic_build_multilevel/multilevel_config.h
@@ -360,6 +360,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build_multilevel_native/multilevel_config.h b/examples/monolithic_build_multilevel_native/multilevel_config.h
index 3af21c56e..037f026d8 100644
--- a/examples/monolithic_build_multilevel_native/multilevel_config.h
+++ b/examples/monolithic_build_multilevel_native/multilevel_config.h
@@ -367,6 +367,36 @@ static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
 #endif /* !__ASSEMBLER__ */
 
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build_native/config_44.h b/examples/monolithic_build_native/config_44.h
index c609cd422..a5f933a56 100644
--- a/examples/monolithic_build_native/config_44.h
+++ b/examples/monolithic_build_native/config_44.h
@@ -357,6 +357,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build_native/config_65.h b/examples/monolithic_build_native/config_65.h
index fa5d16605..af93757a2 100644
--- a/examples/monolithic_build_native/config_65.h
+++ b/examples/monolithic_build_native/config_65.h
@@ -357,6 +357,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/examples/monolithic_build_native/config_87.h b/examples/monolithic_build_native/config_87.h
index 5d470757b..b588c5899 100644
--- a/examples/monolithic_build_native/config_87.h
+++ b/examples/monolithic_build_native/config_87.h
@@ -357,6 +357,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/mldsa/mldsa_native.S b/mldsa/mldsa_native.S
index 6fd5ac269..80720bacd 100644
--- a/mldsa/mldsa_native.S
+++ b/mldsa/mldsa_native.S
@@ -419,8 +419,10 @@
 #undef MLD_RESTRICT
 #undef MLD_SYS_AARCH64
 #undef MLD_SYS_AARCH64_EB
+#undef MLD_SYS_APPLE
 #undef MLD_SYS_BIG_ENDIAN
 #undef MLD_SYS_H
+#undef MLD_SYS_LINUX
 #undef MLD_SYS_LITTLE_ENDIAN
 #undef MLD_SYS_PPC64LE
 #undef MLD_SYS_RISCV32
@@ -482,6 +484,8 @@
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
 /* mldsa/src/fips202/native/api.h */
 #undef MLD_FIPS202_NATIVE_API_H
+#undef MLD_NATIVE_FUNC_FALLBACK
+#undef MLD_NATIVE_FUNC_SUCCESS
 /* mldsa/src/fips202/native/auto.h */
 #undef MLD_FIPS202_NATIVE_AUTO_H
 #if defined(MLD_SYS_AARCH64)
@@ -535,6 +539,8 @@
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
 /* mldsa/src/native/api.h */
 #undef MLD_NATIVE_API_H
+#undef MLD_NATIVE_FUNC_FALLBACK
+#undef MLD_NATIVE_FUNC_SUCCESS
 /* mldsa/src/native/meta.h */
 #undef MLD_NATIVE_META_H
 #if defined(MLD_SYS_AARCH64)
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 7113bbac9..9e6ad4c29 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -416,8 +416,10 @@
 #undef MLD_RESTRICT
 #undef MLD_SYS_AARCH64
 #undef MLD_SYS_AARCH64_EB
+#undef MLD_SYS_APPLE
 #undef MLD_SYS_BIG_ENDIAN
 #undef MLD_SYS_H
+#undef MLD_SYS_LINUX
 #undef MLD_SYS_LITTLE_ENDIAN
 #undef MLD_SYS_PPC64LE
 #undef MLD_SYS_RISCV32
@@ -532,6 +534,8 @@
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
 /* mldsa/src/native/api.h */
 #undef MLD_NATIVE_API_H
+#undef MLD_NATIVE_FUNC_FALLBACK
+#undef MLD_NATIVE_FUNC_SUCCESS
 /* mldsa/src/native/meta.h */
 #undef MLD_NATIVE_META_H
 #if defined(MLD_SYS_AARCH64)
diff --git a/mldsa/src/config.h b/mldsa/src/config.h
index 7c849b68d..9e48023db 100644
--- a/mldsa/src/config.h
+++ b/mldsa/src/config.h
@@ -345,6 +345,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/mldsa/src/fips202/native/api.h b/mldsa/src/fips202/native/api.h
index f15fb0763..1dc51cfb1 100644
--- a/mldsa/src/fips202/native/api.h
+++ b/mldsa/src/fips202/native/api.h
@@ -16,6 +16,22 @@
 #include <stdint.h>
 #include "../../cbmc.h"
 
+/* Backends must return MLD_NATIVE_FUNC_SUCCESS upon success. */
+#define MLD_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLD_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation.
+ *
+ * IMPORTANT: Backend implementations must ensure that the decision of whether
+ * to fallback (return MLD_NATIVE_FUNC_FALLBACK) or not must never depend on
+ * the input data itself. Fallback decisions may only depend on system
+ * capabilities (e.g., CPU features) and, where present, length information.
+ * This requirement applies to all backend functions to maintain constant-time
+ * properties.
+ */
+#define MLD_NATIVE_FUNC_FALLBACK (-1)
+
 /*
  * This is the C<->native interface allowing for the drop-in
  * of custom Keccak-F1600 implementations.
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 72da0ab85..bb18c8f9e 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -32,18 +32,21 @@
 
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/arith_native_aarch64.h"
 
-static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
 {
   mld_ntt_asm(data, mld_aarch64_ntt_zetas_layer123456,
               mld_aarch64_ntt_zetas_layer78);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
 {
   mld_intt_asm(data, mld_aarch64_intt_zetas_layer78,
                mld_aarch64_intt_zetas_layer123456);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 138ba47c9..1728b56d2 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -22,6 +22,22 @@
 #include "../cbmc.h"
 #include "../common.h"
 
+/* Backends must return MLD_NATIVE_FUNC_SUCCESS upon success. */
+#define MLD_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLD_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation.
+ *
+ * IMPORTANT: Backend implementations must ensure that the decision of whether
+ * to fallback (return MLD_NATIVE_FUNC_FALLBACK) or not must never depend on
+ * the input data itself. Fallback decisions may only depend on system
+ * capabilities (e.g., CPU features) and, where present, length information.
+ * This requirement applies to all backend functions to maintain constant-time
+ * properties.
+ */
+#define MLD_NATIVE_FUNC_FALLBACK (-1)
+
 /*
  * This is the C<->native interface allowing for the drop-in of
  * native code for performance critical arithmetic components of ML-DSA.
@@ -30,7 +46,6 @@
  *
  * To add a function to a backend, define MLD_USE_NATIVE_XXX and
  * implement `static inline xxx(...)` in the profile header.
- *
  */
 
 /*
@@ -52,7 +67,7 @@
  *
  * Arguments:   - int32_t p[MLDSA_N]: pointer to in/output polynomial
  **************************************************/
-static MLD_INLINE void mld_ntt_native(int32_t p[MLDSA_N]);
+static MLD_INLINE int mld_ntt_native(int32_t p[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_NTT */
 
 
@@ -96,7 +111,7 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N]);
  *
  * Arguments:   - uint32_t p[MLDSA_N]: pointer to in/output polynomial
  **************************************************/
-static MLD_INLINE void mld_intt_native(int32_t p[MLDSA_N]);
+static MLD_INLINE int mld_intt_native(int32_t p[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_INTT */
 
 #if defined(MLD_USE_NATIVE_REJ_UNIFORM)
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 7aa2b563a..8d9a5c620 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -33,20 +33,35 @@
 #if !defined(__ASSEMBLER__)
 #include <string.h>
 #include "../../common.h"
+#include "../api.h"
 #include "src/arith_native_x86_64.h"
 
 static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N])
 {
-  mld_nttunpack_avx2((__m256i *)(data));
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_nttunpack_avx2((__m256i *)(data));
+  }
 }
 
-static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_ntt_avx2((__m256i *)data, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
-static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
diff --git a/mldsa/src/ntt.c b/mldsa/src/ntt.c
index befe0928c..a916a640a 100644
--- a/mldsa/src/ntt.c
+++ b/mldsa/src/ntt.c
@@ -14,9 +14,7 @@
 
 #include "common.h"
 
-#if !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
-    (!defined(MLD_USE_NATIVE_NTT) || !defined(MLD_USE_NATIVE_INTT))
-
+#if !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
 
 #include <stdint.h>
 
@@ -42,8 +40,6 @@ __contract__(
 
 #include "zetas.inc"
 
-#if !defined(MLD_USE_NATIVE_NTT)
-
 /* mld_ntt_butterfly_block()
  *
  * Computes a block CT butterflies with a fixed twiddle factor,
@@ -164,9 +160,7 @@ void mld_ntt(int32_t a[MLDSA_N])
   /* directly implies the postcondition in that coefficients */
   /* are bounded in magnitude by 9 * MLDSA_Q                 */
 }
-#endif /* !MLD_USE_NATIVE_NTT */
 
-#if !defined(MLD_USE_NATIVE_INTT)
 /*************************************************
  * Name:        mld_fqscale
  *
@@ -261,10 +255,7 @@ void mld_invntt_tomont(int32_t a[MLDSA_N])
     a[j] = mld_fqscale(a[j]);
   }
 }
-#endif /* !MLD_USE_NATIVE_INTT */
 
-#else  /* !MLD_CONFIG_MULTILEVEL_NO_SHARED && (!MLD_USE_NATIVE_NTT || \
-          !MLD_USE_NATIVE_INTT) */
+#else  /* !MLD_CONFIG_MULTILEVEL_NO_SHARED */
 MLD_EMPTY_CU(mld_ntt)
-#endif /* !(!MLD_CONFIG_MULTILEVEL_NO_SHARED && (!MLD_USE_NATIVE_NTT || \
-          !MLD_USE_NATIVE_INTT)) */
+#endif /* MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/src/ntt.h b/mldsa/src/ntt.h
index 152626354..fbb7ddabc 100644
--- a/mldsa/src/ntt.h
+++ b/mldsa/src/ntt.h
@@ -24,7 +24,6 @@
 /* Absolute exclusive upper bound for the output of the inverse NTT*/
 #define MLD_INTT_BOUND MLDSA_Q
 
-#if !defined(MLD_USE_NATIVE_NTT)
 #define mld_ntt MLD_NAMESPACE(ntt)
 /*************************************************
  * Name:        mld_ntt
@@ -54,9 +53,7 @@ __contract__(
   assigns(memory_slice(a, MLDSA_N * sizeof(int32_t)))
   ensures(array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND))
 );
-#endif /* !MLD_USE_NATIVE_NTT */
 
-#if !defined(MLD_USE_NATIVE_INTT)
 #define mld_invntt_tomont MLD_NAMESPACE(invntt_tomont)
 /*************************************************
  * Name:        mld_invntt_tomont
@@ -79,6 +76,5 @@ __contract__(
   assigns(memory_slice(a, MLDSA_N * sizeof(int32_t)))
   ensures(array_abs_bound(a, 0, MLDSA_N, MLD_INTT_BOUND))
 );
-#endif /* !MLD_USE_NATIVE_INTT */
 
 #endif /* !MLD_NTT_H */
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index bc9d7908d..1a5b1b428 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -142,41 +142,43 @@ void mld_poly_shiftl(mld_poly *a)
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
 }
 
-#if !defined(MLD_USE_NATIVE_NTT)
 MLD_INTERNAL_API
 void mld_poly_ntt(mld_poly *a)
 {
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+#if defined(MLD_USE_NATIVE_NTT)
+  {
+    int ret;
+    ret = mld_ntt_native(a->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
+      return;
+    }
+  }
+#endif /* MLD_USE_NATIVE_NTT */
   mld_ntt(a->coeffs);
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
 }
-#else  /* !MLD_USE_NATIVE_NTT */
-MLD_INTERNAL_API
-void mld_poly_ntt(mld_poly *p)
-{
-  mld_assert_abs_bound(p->coeffs, MLDSA_N, MLDSA_Q);
-  mld_ntt_native(p->coeffs);
-  mld_assert_abs_bound(p->coeffs, MLDSA_N, MLD_NTT_BOUND);
-}
-#endif /* MLD_USE_NATIVE_NTT */
 
-#if !defined(MLD_USE_NATIVE_INTT)
 MLD_INTERNAL_API
 void mld_poly_invntt_tomont(mld_poly *a)
 {
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+#if defined(MLD_USE_NATIVE_INTT)
+  {
+    int ret;
+    ret = mld_intt_native(a->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_INTT_BOUND);
+      return;
+    }
+  }
+#endif /* MLD_USE_NATIVE_INTT */
   mld_invntt_tomont(a->coeffs);
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_INTT_BOUND);
 }
-#else  /* !MLD_USE_NATIVE_INTT */
-MLD_INTERNAL_API
-void mld_poly_invntt_tomont(mld_poly *a)
-{
-  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
-  mld_intt_native(a->coeffs);
-  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_INTT_BOUND);
-}
-#endif /* MLD_USE_NATIVE_INTT */
 
 MLD_INTERNAL_API
 void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a,
diff --git a/mldsa/src/sys.h b/mldsa/src/sys.h
index f9280c695..f5f3cc4ac 100644
--- a/mldsa/src/sys.h
+++ b/mldsa/src/sys.h
@@ -67,6 +67,14 @@
 #define MLD_SYS_WINDOWS
 #endif
 
+#if defined(__linux__)
+#define MLD_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLD_SYS_APPLE
+#endif
+
 /* If MLD_FORCE_AARCH64 is set, assert that we're indeed on an AArch64 system.
  */
 #if defined(MLD_FORCE_AARCH64) && !defined(MLD_SYS_AARCH64)
@@ -203,4 +211,32 @@
 #define MLD_MUST_CHECK_RETURN_VALUE
 #endif
 
+
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+  /* x86_64 */
+  MLD_SYS_CAP_AVX2,
+  /* AArch64 */
+  MLD_SYS_CAP_SHA3
+} mld_sys_cap;
+
+#if !defined(MLD_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+{
+  /* By default, we rely on compile-time feature detection/specification:
+   * If a feature is enabled at compile-time, we assume it is supported by
+   * the host that the resulting library/binary will be built on.
+   * If this assumption is not true, you MUST overwrite this function.
+   * See the documentation of MLD_CONFIG_CUSTOM_CAPABILITY_FUNC in config.h
+   * for more information. */
+  (void)cap;
+  return 1;
+}
+#endif /* !MLD_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
 #endif /* !MLD_SYS_H */
diff --git a/test/break_pct_config.h b/test/break_pct_config.h
index 56fdc21fb..9b60a055f 100644
--- a/test/break_pct_config.h
+++ b/test/break_pct_config.h
@@ -361,6 +361,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/configs.yml b/test/configs.yml
index d37323f17..b875d0e9c 100644
--- a/test/configs.yml
+++ b/test/configs.yml
@@ -165,6 +165,86 @@ configs:
           }
           #endif
 
+  - path: test/custom_native_capability_config_0.h
+    description: "Test configuration with custom capability function returning 0"
+    defines:
+      MLD_CONFIG_CUSTOM_CAPABILITY_FUNC:
+        content: |
+          #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+          #if !defined(__ASSEMBLER__)
+          #include "../mldsa/src/sys.h"
+          /* System capability enumeration */
+
+          static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+          {
+            (void)cap; /* Ignore parameter */
+            return 0;
+          }
+          #endif /* !__ASSEMBLER__ */
+
+  - path: test/custom_native_capability_config_1.h
+    description: "Test configuration with custom capability function returning 1"
+    defines:
+      MLD_CONFIG_CUSTOM_CAPABILITY_FUNC:
+        content: |
+          #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+          #if !defined(__ASSEMBLER__)
+          #include "../mldsa/src/sys.h"
+
+          static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+          {
+            (void)cap; /* Ignore parameter */
+            return 1;
+          }
+          #endif /* !__ASSEMBLER__ */
+
+  - path: test/custom_native_capability_config_CPUID_AVX2.h
+    description: "Test configuration with CPUID-based AVX2 capability detection"
+    defines:
+      MLD_CONFIG_CUSTOM_CAPABILITY_FUNC:
+        content: |
+          #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+          #if !defined(__ASSEMBLER__)
+          #include <stdint.h>
+          #include "../mldsa/src/sys.h"
+
+          /* Assert this config is only used on Linux/x86_64 systems */
+          #if !defined(MLD_SYS_X86_64) || !defined(MLD_SYS_LINUX)
+          #error "This configuration is only supported on Linux/x86_64 systems"
+          #endif
+
+          static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+          {
+            if (cap == MLD_SYS_CAP_AVX2)
+            {
+              uint32_t eax, ebx, ecx, edx;
+
+              /* AVX2 support is queried using `cpuid` with EAX=7, ECX=0.
+               * Check first if `cpuid` supports EAX=7 by calling it with
+               * EAX=0, which gives the maximum supported value of EAX in
+               * EAX. */
+
+              __asm__ volatile("cpuid"
+                               : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                               : "a"(0));
+
+              if (eax < 7)
+              {
+                return 0; /* Extended features not supported */
+              }
+
+              __asm__ volatile("cpuid"
+                               : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                               : "a"(7), "c"(0));
+
+              /* AVX2 is bit 5 in EBX */
+              return (ebx & (1 << 5)) ? 1 : 0;
+            }
+
+            /* Default to 0 (conservative) for unknown capabilities */
+            return 0;
+          }
+          #endif /* !__ASSEMBLER__ */
 
   # Example configs
 
diff --git a/test/custom_memcpy_config.h b/test/custom_memcpy_config.h
index dec434d3d..54f11a515 100644
--- a/test/custom_memcpy_config.h
+++ b/test/custom_memcpy_config.h
@@ -368,6 +368,36 @@ static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n)
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/custom_memset_config.h b/test/custom_memset_config.h
index f19ba114e..2b0de7ea4 100644
--- a/test/custom_memset_config.h
+++ b/test/custom_memset_config.h
@@ -367,6 +367,36 @@ static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/custom_native_capability_config_0.h b/test/custom_native_capability_config_0.h
new file mode 100644
index 000000000..90179dce5
--- /dev/null
+++ b/test/custom_native_capability_config_0.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+/*
+ * Test configuration: Test configuration with custom capability function
+ * returning 0
+ *
+ * This configuration differs from the default mldsa/src/config.h in the
+ * following places:
+ *   - MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ */
+
+
+#ifndef MLD_CONFIG_H
+#define MLD_CONFIG_H
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_PARAMETER_SET
+ *
+ * Description: Specifies the parameter set for ML-DSA
+ *              - MLD_CONFIG_PARAMETER_SET=44 corresponds to ML-DSA-44
+ *              - MLD_CONFIG_PARAMETER_SET=65 corresponds to ML-DSA-65
+ *              - MLD_CONFIG_PARAMETER_SET=87 corresponds to ML-DSA-87
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#ifndef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET \
+  44 /* Change this for different security strengths */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NAMESPACE_PREFIX
+ *
+ * Description: The prefix to use to namespace global symbols from mldsa/.
+ *
+ *              In a multi-level build (that is, if either
+ *              - MLD_CONFIG_MULTILEVEL_WITH_SHARED, or
+ *              - MLD_CONFIG_MULTILEVEL_NO_SHARED,
+ *              are set, level-dependent symbols will additionally be prefixed
+ *              with the parameter set (44/65/87).
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_NAMESPACE_PREFIX)
+#define MLD_CONFIG_NAMESPACE_PREFIX MLD_DEFAULT_NAMESPACE_PREFIX
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, all MLD_CONFIG_PARAMETER_SET-independent
+ *              code will be included in the build, including code needed only
+ *              for other parameter sets.
+ *
+ *              Example: TODO: add example
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, no MLD_CONFIG_PARAMETER_SET-independent code
+ *              will be included in the build.
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FILE
+ *
+ * Description: If defined, this is a header that will be included instead
+ *              of the default configuration file mldsa/src/config.h.
+ *
+ *              When you need to build mldsa-native in multiple configurations,
+ *              using varying MLD_CONFIG_FILE can be more convenient
+ *              then configuring everything through CFLAGS.
+ *
+ *              To use, MLD_CONFIG_FILE _must_ be defined prior
+ *              to the inclusion of any mldsa-native headers. For example,
+ *              it can be set by passing `-DMLD_CONFIG_FILE="..."`
+ *              on the command line.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FILE "config.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *
+ * Description: Determines whether an native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be use is
+ *              determined by MLD_CONFIG_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_ARITH_BACKEND_FILE
+ *
+ * Description: The arithmetic backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLD_CONFIG_ARITH_BACKEND_FILE)
+#define MLD_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether an native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be use is
+ *              determined by MLD_CONFIG_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_BACKEND_FILE
+ *
+ * Description: The FIPS-202 backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option
+ *              must either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLD_CONFIG_FIPS202_BACKEND_FILE)
+#define MLD_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h"
+#endif
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202X4_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202-X4
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202x4.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_ZEROIZE
+ *
+ * Description: In compliance with @[FIPS204, Section 3.6.3], mldsa-native,
+ *              zeroizes intermediate stack buffers before returning from
+ *              function calls.
+ *
+ *              Set this option and define `mld_zeroize_native` if you want to
+ *              use a custom method to zeroize intermediate stack buffers.
+ *              The default implementation uses SecureZeroMemory on Windows
+ *              and a memset + compiler barrier otherwise. If neither of those
+ *              is available on the target platform, compilation will fail,
+ *              and you will need to use MLD_CONFIG_CUSTOM_ZEROIZE to provide
+ *              a custom implementation of `mld_zeroize_native()`.
+ *
+ *              WARNING:
+ *              The explicit stack zeroization conducted by mldsa-native
+ *              reduces the likelihood of data leaking on the stack, but
+ *              does not eliminate it! The C standard makes no guarantee about
+ *              where a compiler allocates structures and whether/where it makes
+ *              copies of them. Also, in addition to entire structures, there
+ *              may also be potentially exploitable leakage of individual values
+ *              on the stack.
+ *
+ *              If you need bullet-proof zeroization of the stack, you need to
+ *              consider additional measures instead of what this feature
+ *              provides. In this case, you can set mld_zeroize_native to a
+ *              no-op.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_ZEROIZE
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMCPY
+ *
+ * Description: Set this option and define `mld_memcpy` if you want to
+ *              use a custom method to copy memory instead of the standard
+ *              library memcpy function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memcpy function:
+ *              void *mld_memcpy(void *dest, const void *src, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMCPY
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMSET
+ *
+ * Description: Set this option and define `mld_memset` if you want to
+ *              use a custom method to set memory instead of the standard
+ *              library memset function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memset function:
+ *              void *mld_memset(void *s, int c, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMSET
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_RANDOMBYTES
+ *
+ * Description: mldsa-native does not provide a secure randombytes
+ *              implementation. Such an implementation has to provided by the
+ *              consumer.
+ *
+ *              If this option is not set, mldsa-native expects a function
+ *              void randombytes(uint8_t *out, size_t outlen).
+ *
+ *              Set this option and define `mld_randombytes` if you want to
+ *              use a custom method to sample randombytes with a different name
+ *              or signature.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_RANDOMBYTES
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+#define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+#if !defined(__ASSEMBLER__)
+#include "../mldsa/src/sys.h"
+/* System capability enumeration */
+
+static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+{
+  (void)cap; /* Ignore parameter */
+  return 0;
+}
+#endif /* !__ASSEMBLER__ */
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_RANDOMIZED_API
+ *
+ * Description: If this option is set, mldsa-native will be built without the
+ *              randomized API functions (crypto_sign_keypair,
+ *              crypto_sign, crypto_sign_signature, and
+ *              crypto_sign_signature_extmu).
+ *              This allows users to build mldsa-native without providing a
+ *              randombytes() implementation if they only need the
+ *              internal deterministic API
+ *              (crypto_sign_keypair_internal, crypto_sign_signature_internal).
+ *
+ *              NOTE: This option is incompatible with MLD_CONFIG_KEYGEN_PCT
+ *              as the current PCT implementation requires
+ *              crypto_sign_signature().
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_RANDOMIZED_API */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT
+ *
+ * Description: Compliance with @[FIPS140_3_IG, p.87] requires a
+ *              Pairwise Consistency Test (PCT) to be carried out on a freshly
+ *              generated keypair before it can be exported.
+ *
+ *              Set this option if such a check should be implemented.
+ *              In this case, crypto_sign_keypair_internal and
+ *              crypto_sign_keypair will return a non-zero error code if the
+ *              PCT failed.
+ *
+ *              NOTE: This feature will drastically lower the performance of
+ *              key generation.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+ *
+ * Description: If this option is set, the user must provide a runtime
+ *              function `static inline int mld_break_pct() { ... }` to
+ *              indicate whether the PCT should be made fail.
+ *
+ *              This option only has an effect if MLD_CONFIG_KEYGEN_PCT is set.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+   #if !defined(__ASSEMBLER__)
+   #include "sys.h"
+   static MLD_INLINE int mld_break_pct(void)
+   {
+       ... return 0/1 depending on whether PCT should be broken ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_INTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of internal API.
+ *
+ *              The primary use case for this option are single-CU builds,
+ *              in which case this option can be set to `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_INTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_EXTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of mldsa-native's
+ *              public API.
+ *
+ *              The primary use case for this option are single-CU builds
+ *              where the public API exposed by mldsa-native is wrapped by
+ *              another API in the consuming application. In this case,
+ *              even mldsa-native's public API can be marked `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_EXTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CT_TESTING_ENABLED
+ *
+ * Description: If set, mldsa-native annotates data as secret / public using
+ *              valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and
+ *              VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret-
+ *              dependent control flow of variable time execution (depending
+ *              on the exact version of valgrind installed).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CT_TESTING_ENABLED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly.
+ *
+ *              By default, inline assembly is used to implement value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *              Inline assembly is also used to implement a secure zeroization
+ *              function on non-Windows platforms. If this option is set and
+ *              the target platform is not Windows, you MUST set
+ *              MLD_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization
+ *              function.
+ *
+ *              If this option is set, MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 and
+ *              and MLD_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no
+ *              native backends will be used.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM_VALUE_BARRIER
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly for value barriers.
+ *
+ *              By default, inline assembly (if available) is used to implement
+ *              value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM_VALUE_BARRIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_SERIAL_FIPS202_ONLY
+ *
+ * Description: Set this to use a FIPS202 implementation with global state
+ *              that supports only one active Keccak computation at a time
+ *              (e.g. some hardware accelerators).
+ *
+ *              If this option is set, ML-DSA will use FIPS202 operations
+ *              serially, ensuring that only one SHAKE context is active
+ *              at any given time.
+ *
+ *              This allows offloading Keccak computations to a hardware
+ *              accelerator that holds only a single Keccak state locally,
+ *              rather than requiring support for multiple concurrent
+ *              Keccak states.
+ *
+ *              NOTE: Depending on the target CPU, this may reduce
+ *              performance when using software FIPS202 implementations.
+ *              Only enable this when you have to.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+/*************************  Config internals  ********************************/
+
+/* Default namespace
+ *
+ * Don't change this. If you need a different namespace, re-define
+ * MLD_CONFIG_NAMESPACE_PREFIX above instead, and remove the following.
+ *
+ * The default MLDSA namespace is
+ *
+ *   PQCP_MLDSA_NATIVE_MLDSA<LEVEL>_
+ *
+ * e.g., PQCP_MLDSA_NATIVE_MLDSA44_
+ */
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA44
+#elif MLD_CONFIG_PARAMETER_SET == 65
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA65
+#elif MLD_CONFIG_PARAMETER_SET == 87
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA87
+#endif
+
+#endif /* !MLD_CONFIG_H */
diff --git a/test/custom_native_capability_config_1.h b/test/custom_native_capability_config_1.h
new file mode 100644
index 000000000..233d64f3c
--- /dev/null
+++ b/test/custom_native_capability_config_1.h
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+/*
+ * Test configuration: Test configuration with custom capability function
+ * returning 1
+ *
+ * This configuration differs from the default mldsa/src/config.h in the
+ * following places:
+ *   - MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ */
+
+
+#ifndef MLD_CONFIG_H
+#define MLD_CONFIG_H
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_PARAMETER_SET
+ *
+ * Description: Specifies the parameter set for ML-DSA
+ *              - MLD_CONFIG_PARAMETER_SET=44 corresponds to ML-DSA-44
+ *              - MLD_CONFIG_PARAMETER_SET=65 corresponds to ML-DSA-65
+ *              - MLD_CONFIG_PARAMETER_SET=87 corresponds to ML-DSA-87
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#ifndef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET \
+  44 /* Change this for different security strengths */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NAMESPACE_PREFIX
+ *
+ * Description: The prefix to use to namespace global symbols from mldsa/.
+ *
+ *              In a multi-level build (that is, if either
+ *              - MLD_CONFIG_MULTILEVEL_WITH_SHARED, or
+ *              - MLD_CONFIG_MULTILEVEL_NO_SHARED,
+ *              are set, level-dependent symbols will additionally be prefixed
+ *              with the parameter set (44/65/87).
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_NAMESPACE_PREFIX)
+#define MLD_CONFIG_NAMESPACE_PREFIX MLD_DEFAULT_NAMESPACE_PREFIX
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, all MLD_CONFIG_PARAMETER_SET-independent
+ *              code will be included in the build, including code needed only
+ *              for other parameter sets.
+ *
+ *              Example: TODO: add example
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, no MLD_CONFIG_PARAMETER_SET-independent code
+ *              will be included in the build.
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FILE
+ *
+ * Description: If defined, this is a header that will be included instead
+ *              of the default configuration file mldsa/src/config.h.
+ *
+ *              When you need to build mldsa-native in multiple configurations,
+ *              using varying MLD_CONFIG_FILE can be more convenient
+ *              then configuring everything through CFLAGS.
+ *
+ *              To use, MLD_CONFIG_FILE _must_ be defined prior
+ *              to the inclusion of any mldsa-native headers. For example,
+ *              it can be set by passing `-DMLD_CONFIG_FILE="..."`
+ *              on the command line.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FILE "config.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *
+ * Description: Determines whether an native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be use is
+ *              determined by MLD_CONFIG_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_ARITH_BACKEND_FILE
+ *
+ * Description: The arithmetic backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLD_CONFIG_ARITH_BACKEND_FILE)
+#define MLD_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether an native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be use is
+ *              determined by MLD_CONFIG_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_BACKEND_FILE
+ *
+ * Description: The FIPS-202 backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option
+ *              must either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLD_CONFIG_FIPS202_BACKEND_FILE)
+#define MLD_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h"
+#endif
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202X4_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202-X4
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202x4.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_ZEROIZE
+ *
+ * Description: In compliance with @[FIPS204, Section 3.6.3], mldsa-native,
+ *              zeroizes intermediate stack buffers before returning from
+ *              function calls.
+ *
+ *              Set this option and define `mld_zeroize_native` if you want to
+ *              use a custom method to zeroize intermediate stack buffers.
+ *              The default implementation uses SecureZeroMemory on Windows
+ *              and a memset + compiler barrier otherwise. If neither of those
+ *              is available on the target platform, compilation will fail,
+ *              and you will need to use MLD_CONFIG_CUSTOM_ZEROIZE to provide
+ *              a custom implementation of `mld_zeroize_native()`.
+ *
+ *              WARNING:
+ *              The explicit stack zeroization conducted by mldsa-native
+ *              reduces the likelihood of data leaking on the stack, but
+ *              does not eliminate it! The C standard makes no guarantee about
+ *              where a compiler allocates structures and whether/where it makes
+ *              copies of them. Also, in addition to entire structures, there
+ *              may also be potentially exploitable leakage of individual values
+ *              on the stack.
+ *
+ *              If you need bullet-proof zeroization of the stack, you need to
+ *              consider additional measures instead of what this feature
+ *              provides. In this case, you can set mld_zeroize_native to a
+ *              no-op.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_ZEROIZE
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMCPY
+ *
+ * Description: Set this option and define `mld_memcpy` if you want to
+ *              use a custom method to copy memory instead of the standard
+ *              library memcpy function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memcpy function:
+ *              void *mld_memcpy(void *dest, const void *src, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMCPY
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMSET
+ *
+ * Description: Set this option and define `mld_memset` if you want to
+ *              use a custom method to set memory instead of the standard
+ *              library memset function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memset function:
+ *              void *mld_memset(void *s, int c, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMSET
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_RANDOMBYTES
+ *
+ * Description: mldsa-native does not provide a secure randombytes
+ *              implementation. Such an implementation has to provided by the
+ *              consumer.
+ *
+ *              If this option is not set, mldsa-native expects a function
+ *              void randombytes(uint8_t *out, size_t outlen).
+ *
+ *              Set this option and define `mld_randombytes` if you want to
+ *              use a custom method to sample randombytes with a different name
+ *              or signature.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_RANDOMBYTES
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+#define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+#if !defined(__ASSEMBLER__)
+#include "../mldsa/src/sys.h"
+
+static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+{
+  (void)cap; /* Ignore parameter */
+  return 1;
+}
+#endif /* !__ASSEMBLER__ */
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_RANDOMIZED_API
+ *
+ * Description: If this option is set, mldsa-native will be built without the
+ *              randomized API functions (crypto_sign_keypair,
+ *              crypto_sign, crypto_sign_signature, and
+ *              crypto_sign_signature_extmu).
+ *              This allows users to build mldsa-native without providing a
+ *              randombytes() implementation if they only need the
+ *              internal deterministic API
+ *              (crypto_sign_keypair_internal, crypto_sign_signature_internal).
+ *
+ *              NOTE: This option is incompatible with MLD_CONFIG_KEYGEN_PCT
+ *              as the current PCT implementation requires
+ *              crypto_sign_signature().
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_RANDOMIZED_API */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT
+ *
+ * Description: Compliance with @[FIPS140_3_IG, p.87] requires a
+ *              Pairwise Consistency Test (PCT) to be carried out on a freshly
+ *              generated keypair before it can be exported.
+ *
+ *              Set this option if such a check should be implemented.
+ *              In this case, crypto_sign_keypair_internal and
+ *              crypto_sign_keypair will return a non-zero error code if the
+ *              PCT failed.
+ *
+ *              NOTE: This feature will drastically lower the performance of
+ *              key generation.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+ *
+ * Description: If this option is set, the user must provide a runtime
+ *              function `static inline int mld_break_pct() { ... }` to
+ *              indicate whether the PCT should be made fail.
+ *
+ *              This option only has an effect if MLD_CONFIG_KEYGEN_PCT is set.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+   #if !defined(__ASSEMBLER__)
+   #include "sys.h"
+   static MLD_INLINE int mld_break_pct(void)
+   {
+       ... return 0/1 depending on whether PCT should be broken ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_INTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of internal API.
+ *
+ *              The primary use case for this option are single-CU builds,
+ *              in which case this option can be set to `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_INTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_EXTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of mldsa-native's
+ *              public API.
+ *
+ *              The primary use case for this option are single-CU builds
+ *              where the public API exposed by mldsa-native is wrapped by
+ *              another API in the consuming application. In this case,
+ *              even mldsa-native's public API can be marked `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_EXTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CT_TESTING_ENABLED
+ *
+ * Description: If set, mldsa-native annotates data as secret / public using
+ *              valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and
+ *              VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret-
+ *              dependent control flow of variable time execution (depending
+ *              on the exact version of valgrind installed).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CT_TESTING_ENABLED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly.
+ *
+ *              By default, inline assembly is used to implement value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *              Inline assembly is also used to implement a secure zeroization
+ *              function on non-Windows platforms. If this option is set and
+ *              the target platform is not Windows, you MUST set
+ *              MLD_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization
+ *              function.
+ *
+ *              If this option is set, MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 and
+ *              and MLD_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no
+ *              native backends will be used.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM_VALUE_BARRIER
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly for value barriers.
+ *
+ *              By default, inline assembly (if available) is used to implement
+ *              value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM_VALUE_BARRIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_SERIAL_FIPS202_ONLY
+ *
+ * Description: Set this to use a FIPS202 implementation with global state
+ *              that supports only one active Keccak computation at a time
+ *              (e.g. some hardware accelerators).
+ *
+ *              If this option is set, ML-DSA will use FIPS202 operations
+ *              serially, ensuring that only one SHAKE context is active
+ *              at any given time.
+ *
+ *              This allows offloading Keccak computations to a hardware
+ *              accelerator that holds only a single Keccak state locally,
+ *              rather than requiring support for multiple concurrent
+ *              Keccak states.
+ *
+ *              NOTE: Depending on the target CPU, this may reduce
+ *              performance when using software FIPS202 implementations.
+ *              Only enable this when you have to.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+/*************************  Config internals  ********************************/
+
+/* Default namespace
+ *
+ * Don't change this. If you need a different namespace, re-define
+ * MLD_CONFIG_NAMESPACE_PREFIX above instead, and remove the following.
+ *
+ * The default MLDSA namespace is
+ *
+ *   PQCP_MLDSA_NATIVE_MLDSA<LEVEL>_
+ *
+ * e.g., PQCP_MLDSA_NATIVE_MLDSA44_
+ */
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA44
+#elif MLD_CONFIG_PARAMETER_SET == 65
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA65
+#elif MLD_CONFIG_PARAMETER_SET == 87
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA87
+#endif
+
+#endif /* !MLD_CONFIG_H */
diff --git a/test/custom_native_capability_config_CPUID_AVX2.h b/test/custom_native_capability_config_CPUID_AVX2.h
new file mode 100644
index 000000000..41cd8a823
--- /dev/null
+++ b/test/custom_native_capability_config_CPUID_AVX2.h
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+/*
+ * Test configuration: Test configuration with CPUID-based AVX2 capability
+ * detection
+ *
+ * This configuration differs from the default mldsa/src/config.h in the
+ * following places:
+ *   - MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ */
+
+
+#ifndef MLD_CONFIG_H
+#define MLD_CONFIG_H
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_PARAMETER_SET
+ *
+ * Description: Specifies the parameter set for ML-DSA
+ *              - MLD_CONFIG_PARAMETER_SET=44 corresponds to ML-DSA-44
+ *              - MLD_CONFIG_PARAMETER_SET=65 corresponds to ML-DSA-65
+ *              - MLD_CONFIG_PARAMETER_SET=87 corresponds to ML-DSA-87
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#ifndef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET \
+  44 /* Change this for different security strengths */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NAMESPACE_PREFIX
+ *
+ * Description: The prefix to use to namespace global symbols from mldsa/.
+ *
+ *              In a multi-level build (that is, if either
+ *              - MLD_CONFIG_MULTILEVEL_WITH_SHARED, or
+ *              - MLD_CONFIG_MULTILEVEL_NO_SHARED,
+ *              are set, level-dependent symbols will additionally be prefixed
+ *              with the parameter set (44/65/87).
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_NAMESPACE_PREFIX)
+#define MLD_CONFIG_NAMESPACE_PREFIX MLD_DEFAULT_NAMESPACE_PREFIX
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, all MLD_CONFIG_PARAMETER_SET-independent
+ *              code will be included in the build, including code needed only
+ *              for other parameter sets.
+ *
+ *              Example: TODO: add example
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, no MLD_CONFIG_PARAMETER_SET-independent code
+ *              will be included in the build.
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FILE
+ *
+ * Description: If defined, this is a header that will be included instead
+ *              of the default configuration file mldsa/src/config.h.
+ *
+ *              When you need to build mldsa-native in multiple configurations,
+ *              using varying MLD_CONFIG_FILE can be more convenient
+ *              then configuring everything through CFLAGS.
+ *
+ *              To use, MLD_CONFIG_FILE _must_ be defined prior
+ *              to the inclusion of any mldsa-native headers. For example,
+ *              it can be set by passing `-DMLD_CONFIG_FILE="..."`
+ *              on the command line.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FILE "config.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *
+ * Description: Determines whether an native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be use is
+ *              determined by MLD_CONFIG_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_ARITH_BACKEND_FILE
+ *
+ * Description: The arithmetic backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLD_CONFIG_ARITH_BACKEND_FILE)
+#define MLD_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether an native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be use is
+ *              determined by MLD_CONFIG_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_BACKEND_FILE
+ *
+ * Description: The FIPS-202 backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option
+ *              must either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLD_CONFIG_FIPS202_BACKEND_FILE)
+#define MLD_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h"
+#endif
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202X4_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202-X4
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202x4.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_ZEROIZE
+ *
+ * Description: In compliance with @[FIPS204, Section 3.6.3], mldsa-native,
+ *              zeroizes intermediate stack buffers before returning from
+ *              function calls.
+ *
+ *              Set this option and define `mld_zeroize_native` if you want to
+ *              use a custom method to zeroize intermediate stack buffers.
+ *              The default implementation uses SecureZeroMemory on Windows
+ *              and a memset + compiler barrier otherwise. If neither of those
+ *              is available on the target platform, compilation will fail,
+ *              and you will need to use MLD_CONFIG_CUSTOM_ZEROIZE to provide
+ *              a custom implementation of `mld_zeroize_native()`.
+ *
+ *              WARNING:
+ *              The explicit stack zeroization conducted by mldsa-native
+ *              reduces the likelihood of data leaking on the stack, but
+ *              does not eliminate it! The C standard makes no guarantee about
+ *              where a compiler allocates structures and whether/where it makes
+ *              copies of them. Also, in addition to entire structures, there
+ *              may also be potentially exploitable leakage of individual values
+ *              on the stack.
+ *
+ *              If you need bullet-proof zeroization of the stack, you need to
+ *              consider additional measures instead of what this feature
+ *              provides. In this case, you can set mld_zeroize_native to a
+ *              no-op.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_ZEROIZE
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMCPY
+ *
+ * Description: Set this option and define `mld_memcpy` if you want to
+ *              use a custom method to copy memory instead of the standard
+ *              library memcpy function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memcpy function:
+ *              void *mld_memcpy(void *dest, const void *src, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMCPY
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMSET
+ *
+ * Description: Set this option and define `mld_memset` if you want to
+ *              use a custom method to set memory instead of the standard
+ *              library memset function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memset function:
+ *              void *mld_memset(void *s, int c, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMSET
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_RANDOMBYTES
+ *
+ * Description: mldsa-native does not provide a secure randombytes
+ *              implementation. Such an implementation has to provided by the
+ *              consumer.
+ *
+ *              If this option is not set, mldsa-native expects a function
+ *              void randombytes(uint8_t *out, size_t outlen).
+ *
+ *              Set this option and define `mld_randombytes` if you want to
+ *              use a custom method to sample randombytes with a different name
+ *              or signature.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_RANDOMBYTES
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+#define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+#if !defined(__ASSEMBLER__)
+#include <stdint.h>
+#include "../mldsa/src/sys.h"
+
+/* Assert this config is only used on Linux/x86_64 systems */
+#if !defined(MLD_SYS_X86_64) || !defined(MLD_SYS_LINUX)
+#error "This configuration is only supported on Linux/x86_64 systems"
+#endif
+
+static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+{
+  if (cap == MLD_SYS_CAP_AVX2)
+  {
+    uint32_t eax, ebx, ecx, edx;
+
+    /* AVX2 support is queried using `cpuid` with EAX=7, ECX=0.
+     * Check first if `cpuid` supports EAX=7 by calling it with
+     * EAX=0, which gives the maximum supported value of EAX in
+     * EAX. */
+
+    __asm__ volatile("cpuid"
+                     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                     : "a"(0));
+
+    if (eax < 7)
+    {
+      return 0; /* Extended features not supported */
+    }
+
+    __asm__ volatile("cpuid"
+                     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                     : "a"(7), "c"(0));
+
+    /* AVX2 is bit 5 in EBX */
+    return (ebx & (1 << 5)) ? 1 : 0;
+  }
+
+  /* Default to 0 (conservative) for unknown capabilities */
+  return 0;
+}
+#endif /* !__ASSEMBLER__ */
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_RANDOMIZED_API
+ *
+ * Description: If this option is set, mldsa-native will be built without the
+ *              randomized API functions (crypto_sign_keypair,
+ *              crypto_sign, crypto_sign_signature, and
+ *              crypto_sign_signature_extmu).
+ *              This allows users to build mldsa-native without providing a
+ *              randombytes() implementation if they only need the
+ *              internal deterministic API
+ *              (crypto_sign_keypair_internal, crypto_sign_signature_internal).
+ *
+ *              NOTE: This option is incompatible with MLD_CONFIG_KEYGEN_PCT
+ *              as the current PCT implementation requires
+ *              crypto_sign_signature().
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_RANDOMIZED_API */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT
+ *
+ * Description: Compliance with @[FIPS140_3_IG, p.87] requires a
+ *              Pairwise Consistency Test (PCT) to be carried out on a freshly
+ *              generated keypair before it can be exported.
+ *
+ *              Set this option if such a check should be implemented.
+ *              In this case, crypto_sign_keypair_internal and
+ *              crypto_sign_keypair will return a non-zero error code if the
+ *              PCT failed.
+ *
+ *              NOTE: This feature will drastically lower the performance of
+ *              key generation.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+ *
+ * Description: If this option is set, the user must provide a runtime
+ *              function `static inline int mld_break_pct() { ... }` to
+ *              indicate whether the PCT should be made fail.
+ *
+ *              This option only has an effect if MLD_CONFIG_KEYGEN_PCT is set.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+   #if !defined(__ASSEMBLER__)
+   #include "sys.h"
+   static MLD_INLINE int mld_break_pct(void)
+   {
+       ... return 0/1 depending on whether PCT should be broken ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_INTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of internal API.
+ *
+ *              The primary use case for this option are single-CU builds,
+ *              in which case this option can be set to `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_INTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_EXTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of mldsa-native's
+ *              public API.
+ *
+ *              The primary use case for this option are single-CU builds
+ *              where the public API exposed by mldsa-native is wrapped by
+ *              another API in the consuming application. In this case,
+ *              even mldsa-native's public API can be marked `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_EXTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CT_TESTING_ENABLED
+ *
+ * Description: If set, mldsa-native annotates data as secret / public using
+ *              valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and
+ *              VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret-
+ *              dependent control flow of variable time execution (depending
+ *              on the exact version of valgrind installed).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CT_TESTING_ENABLED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly.
+ *
+ *              By default, inline assembly is used to implement value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *              Inline assembly is also used to implement a secure zeroization
+ *              function on non-Windows platforms. If this option is set and
+ *              the target platform is not Windows, you MUST set
+ *              MLD_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization
+ *              function.
+ *
+ *              If this option is set, MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 and
+ *              and MLD_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no
+ *              native backends will be used.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM_VALUE_BARRIER
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly for value barriers.
+ *
+ *              By default, inline assembly (if available) is used to implement
+ *              value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM_VALUE_BARRIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_SERIAL_FIPS202_ONLY
+ *
+ * Description: Set this to use a FIPS202 implementation with global state
+ *              that supports only one active Keccak computation at a time
+ *              (e.g. some hardware accelerators).
+ *
+ *              If this option is set, ML-DSA will use FIPS202 operations
+ *              serially, ensuring that only one SHAKE context is active
+ *              at any given time.
+ *
+ *              This allows offloading Keccak computations to a hardware
+ *              accelerator that holds only a single Keccak state locally,
+ *              rather than requiring support for multiple concurrent
+ *              Keccak states.
+ *
+ *              NOTE: Depending on the target CPU, this may reduce
+ *              performance when using software FIPS202 implementations.
+ *              Only enable this when you have to.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+/*************************  Config internals  ********************************/
+
+/* Default namespace
+ *
+ * Don't change this. If you need a different namespace, re-define
+ * MLD_CONFIG_NAMESPACE_PREFIX above instead, and remove the following.
+ *
+ * The default MLDSA namespace is
+ *
+ *   PQCP_MLDSA_NATIVE_MLDSA<LEVEL>_
+ *
+ * e.g., PQCP_MLDSA_NATIVE_MLDSA44_
+ */
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA44
+#elif MLD_CONFIG_PARAMETER_SET == 65
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA65
+#elif MLD_CONFIG_PARAMETER_SET == 87
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA87
+#endif
+
+#endif /* !MLD_CONFIG_H */
diff --git a/test/custom_randombytes_config.h b/test/custom_randombytes_config.h
index 9f0d53dc8..061263549 100644
--- a/test/custom_randombytes_config.h
+++ b/test/custom_randombytes_config.h
@@ -361,6 +361,36 @@ static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
 #endif /* !__ASSEMBLER__ */
 
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/custom_stdlib_config.h b/test/custom_stdlib_config.h
index f4ea4aa2f..c1d0e90ab 100644
--- a/test/custom_stdlib_config.h
+++ b/test/custom_stdlib_config.h
@@ -376,6 +376,36 @@ static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/custom_zeroize_config.h b/test/custom_zeroize_config.h
index c4155d82a..850e2afd2 100644
--- a/test/custom_zeroize_config.h
+++ b/test/custom_zeroize_config.h
@@ -361,6 +361,36 @@ static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/no_asm_config.h b/test/no_asm_config.h
index 0a2eab1ba..5f28a4a25 100644
--- a/test/no_asm_config.h
+++ b/test/no_asm_config.h
@@ -362,6 +362,36 @@ static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *
diff --git a/test/serial_fips202_config.h b/test/serial_fips202_config.h
index 58ae64f4e..7df4b0e64 100644
--- a/test/serial_fips202_config.h
+++ b/test/serial_fips202_config.h
@@ -360,6 +360,36 @@
    #endif
 */
 
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+   static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+   {
+       ... your implementation ...
+   }
+*/
+
 /******************************************************************************
  * Name:        MLD_CONFIG_NO_RANDOMIZED_API
  *

From 5ebdf0bf93eef00fbc79a421f29af6de4609b13c Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 14:33:13 +0800
Subject: [PATCH 02/11] Add runtime dispatch (mld_rej_uniform_native,
 mld_rej_uniform_eta2/eta4_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        |  9 +++++----
 dev/x86_64/meta.h               | 15 +++++++++------
 mldsa/src/native/aarch64/meta.h |  9 +++++----
 mldsa/src/native/x86_64/meta.h  | 15 +++++++++------
 mldsa/src/poly.c                |  5 +++--
 mldsa/src/poly_kl.c             | 10 ++++++----
 6 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index bb18c8f9e..457eb028c 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -53,9 +53,10 @@ static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
                                              const uint8_t *buf,
                                              unsigned buflen)
 {
-  if (len != MLDSA_N || buflen % 24 != 0)
+  if (len != MLDSA_N ||
+      buflen % 24 != 0) /* NEON support is mandatory for AArch64 */
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Safety: outlen is at most MLDSA_N, hence, this cast is safe. */
@@ -70,7 +71,7 @@ static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len,
   /* AArch64 implementation assumes specific buffer lengths */
   if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
   /* Constant time: Inputs and outputs to this function are secret.
    * It is safe to leak which coefficients are accepted/rejected.
@@ -94,7 +95,7 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   /* AArch64 implementation assumes specific buffer lengths */
   if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
   /* Constant time: Inputs and outputs to this function are secret.
    * It is safe to leak which coefficients are accepted/rejected.
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 8d9a5c620..c7962b86c 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -69,9 +69,10 @@ static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
                                              unsigned buflen)
 {
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */
@@ -84,9 +85,10 @@ static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len,
 {
   unsigned int outlen;
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Constant time: Inputs and outputs to this function are secret.
@@ -109,9 +111,10 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
 {
   unsigned int outlen;
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Constant time: Inputs and outputs to this function are secret.
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index bb18c8f9e..457eb028c 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -53,9 +53,10 @@ static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
                                              const uint8_t *buf,
                                              unsigned buflen)
 {
-  if (len != MLDSA_N || buflen % 24 != 0)
+  if (len != MLDSA_N ||
+      buflen % 24 != 0) /* NEON support is mandatory for AArch64 */
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Safety: outlen is at most MLDSA_N, hence, this cast is safe. */
@@ -70,7 +71,7 @@ static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len,
   /* AArch64 implementation assumes specific buffer lengths */
   if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
   /* Constant time: Inputs and outputs to this function are secret.
    * It is safe to leak which coefficients are accepted/rejected.
@@ -94,7 +95,7 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   /* AArch64 implementation assumes specific buffer lengths */
   if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
   /* Constant time: Inputs and outputs to this function are secret.
    * It is safe to leak which coefficients are accepted/rejected.
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 8d9a5c620..c7962b86c 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -69,9 +69,10 @@ static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
                                              unsigned buflen)
 {
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */
@@ -84,9 +85,10 @@ static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len,
 {
   unsigned int outlen;
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Constant time: Inputs and outputs to this function are secret.
@@ -109,9 +111,10 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
 {
   unsigned int outlen;
   /* AVX2 implementation assumes specific buffer lengths */
-  if (len != MLDSA_N || buflen != MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN)
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2) || len != MLDSA_N ||
+      buflen != MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN)
   {
-    return -1;
+    return MLD_NATIVE_FUNC_FALLBACK;
   }
 
   /* Constant time: Inputs and outputs to this function are secret.
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index 1a5b1b428..8df08097e 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -279,8 +279,9 @@ __contract__(
 #if defined(MLD_USE_NATIVE_REJ_UNIFORM)
   if (offset == 0)
   {
-    int ret = mld_rej_uniform_native(a, target, buf, buflen);
-    if (ret != -1)
+    int ret;
+    ret = mld_rej_uniform_native(a, target, buf, buflen);
+    if (ret != MLD_NATIVE_FUNC_FALLBACK)
     {
       unsigned res = (unsigned)ret;
       mld_assert_bound(a, res, 0, MLDSA_Q);
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c
index 4acd39bab..8e87298c7 100644
--- a/mldsa/src/poly_kl.c
+++ b/mldsa/src/poly_kl.c
@@ -192,8 +192,9 @@ __contract__(
 #if MLDSA_ETA == 2 && defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA2)
   if (offset == 0)
   {
-    int ret = mld_rej_uniform_eta2_native(a, target, buf, buflen);
-    if (ret != -1)
+    int ret;
+    ret = mld_rej_uniform_eta2_native(a, target, buf, buflen);
+    if (ret != MLD_NATIVE_FUNC_FALLBACK)
     {
       unsigned res = (unsigned)ret;
       mld_assert_abs_bound(a, res, MLDSA_ETA + 1);
@@ -204,8 +205,9 @@ __contract__(
 #elif MLDSA_ETA == 4 && defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA4)
   if (offset == 0)
   {
-    int ret = mld_rej_uniform_eta4_native(a, target, buf, buflen);
-    if (ret != -1)
+    int ret;
+    ret = mld_rej_uniform_eta4_native(a, target, buf, buflen);
+    if (ret != MLD_NATIVE_FUNC_FALLBACK)
     {
       unsigned res = (unsigned)ret;
       mld_assert_abs_bound(a, res, MLDSA_ETA + 1);

From 8749d5c46fe9eb51f1d73d5f932c943aee6c5570 Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 15:14:24 +0800
Subject: [PATCH 03/11] Add runtime dispatch (mld_poly_decompose_32/88_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        | 10 +++++---
 dev/x86_64/meta.h               | 18 +++++++++++---
 mldsa/src/native/aarch64/meta.h | 10 +++++---
 mldsa/src/native/api.h          |  8 +++---
 mldsa/src/native/x86_64/meta.h  | 18 +++++++++++---
 mldsa/src/poly_kl.c             | 43 ++++++++++++++++++++++-----------
 6 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 457eb028c..8416acc3c 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -111,16 +111,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return (int)outlen;
 }
 
-static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
   mld_poly_decompose_32_asm(a1, a0, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
   mld_poly_decompose_88_asm(a1, a0, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index c7962b86c..252360dfa 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -131,16 +131,26 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return (int)outlen;
 }
 
-static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 457eb028c..8416acc3c 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -111,16 +111,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return (int)outlen;
 }
 
-static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
   mld_poly_decompose_32_asm(a1, a0, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
   mld_poly_decompose_88_asm(a1, a0, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 1728b56d2..383e0d615 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -200,8 +200,8 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
  *              - int32_t *a0: output polynomial with coefficients c0
  *              - const int32_t *a: input polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a);
+static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a);
 #endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_32 */
 
 #if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88)
@@ -221,8 +221,8 @@ static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
  *              - int32_t *a0: output polynomial with coefficients c0
  *              - const int32_t *a: input polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a);
+static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a);
 #endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_88 */
 
 #if defined(MLD_USE_NATIVE_POLY_CADDQ)
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index c7962b86c..252360dfa 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -131,16 +131,26 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return (int)outlen;
 }
 
-static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
-                                                    const int32_t *a)
+static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                   const int32_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c
index 8e87298c7..39a7e6760 100644
--- a/mldsa/src/poly_kl.c
+++ b/mldsa/src/poly_kl.c
@@ -37,19 +37,38 @@
 MLD_INTERNAL_API
 void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
 {
-#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLD_CONFIG_PARAMETER_SET == 44
-  /* TODO: proof */
+  unsigned int i;
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-  mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs);
+#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLD_CONFIG_PARAMETER_SET == 44
+  {
+    int ret;
+    /* TODO: proof */
+    ret = mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1);
+      mld_assert_bound(a1->coeffs, MLDSA_N, 0,
+                       (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+      return;
+    }
+  }
 #elif defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) && \
     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
-  /* TODO: proof */
-  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-  mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs);
-#else /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET == 44) \
-         && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLD_CONFIG_PARAMETER_SET ==   \
-         65 || MLD_CONFIG_PARAMETER_SET == 87) */
-  unsigned int i;
+  {
+    int ret;
+    /* TODO: proof */
+    ret = mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1);
+      mld_assert_bound(a1->coeffs, MLDSA_N, 0,
+                       (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+      return;
+    }
+  }
+#endif /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET ==    \
+          44) && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLD_CONFIG_PARAMETER_SET \
+          == 65 || MLD_CONFIG_PARAMETER_SET == 87) */
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
@@ -61,10 +80,6 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
   {
     mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]);
   }
-#endif /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET ==   \
-          44) && !(MLD_USE_NATIVE_POLY_DECOMPOSE_32 &&                        \
-          (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) \
-        */
 
   mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1);
   mld_assert_bound(a1->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));

From 289b0a0fd5652a5537e694deb339bfe01f6dc62f Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 15:20:05 +0800
Subject: [PATCH 04/11] Add runtime dispatch (mld_poly_caddq_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        |  3 ++-
 dev/x86_64/meta.h               |  7 ++++++-
 mldsa/src/native/aarch64/meta.h |  3 ++-
 mldsa/src/native/api.h          |  2 +-
 mldsa/src/native/x86_64/meta.h  |  7 ++++++-
 mldsa/src/poly.c                | 22 +++++++++++-----------
 6 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 8416acc3c..d8a1ad33a 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -125,9 +125,10 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
+static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
 {
   mld_poly_caddq_asm(a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 252360dfa..1bfa2bdf1 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -153,9 +153,14 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
+static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_caddq_avx2(a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
                                                    const int32_t *h)
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 8416acc3c..d8a1ad33a 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -125,9 +125,10 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
+static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
 {
   mld_poly_caddq_asm(a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 383e0d615..35063a59b 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -234,7 +234,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
  *
  * Arguments:   - int32_t *a: pointer to input/output polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N]);
+static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_POLY_CADDQ */
 
 #if defined(MLD_USE_NATIVE_POLY_USE_HINT_32)
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 252360dfa..1bfa2bdf1 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -153,9 +153,14 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
+static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_caddq_avx2(a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
                                                    const int32_t *h)
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index 8df08097e..113b3aba7 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -51,13 +51,22 @@ void mld_poly_reduce(mld_poly *a)
 }
 
 
-#if !defined(MLD_USE_NATIVE_POLY_CADDQ)
 MLD_INTERNAL_API
 void mld_poly_caddq(mld_poly *a)
 {
   unsigned int i;
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
-
+#if defined(MLD_USE_NATIVE_POLY_CADDQ)
+  {
+    int ret;
+    ret = mld_poly_caddq_native(a->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+      return;
+    }
+  }
+#endif /* MLD_USE_NATIVE_POLY_CADDQ */
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
@@ -70,15 +79,6 @@ void mld_poly_caddq(mld_poly *a)
 
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
 }
-#else  /* !MLD_USE_NATIVE_POLY_CADDQ */
-MLD_INTERNAL_API
-void mld_poly_caddq(mld_poly *a)
-{
-  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
-  mld_poly_caddq_native(a->coeffs);
-  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-}
-#endif /* MLD_USE_NATIVE_POLY_CADDQ */
 
 /* Reference: We use destructive version (output=first input) to avoid
  *            reasoning about aliasing in the CBMC specification */

From ccb8062f3fa8be18450ffc447ad66f6492fa52a3 Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 15:25:55 +0800
Subject: [PATCH 05/11] Add runtime dispatch (mld_poly_use_hint_32/88_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        | 10 +++++---
 dev/x86_64/meta.h               | 18 +++++++++++---
 mldsa/src/native/aarch64/meta.h | 10 +++++---
 mldsa/src/native/api.h          |  8 +++---
 mldsa/src/native/x86_64/meta.h  | 18 +++++++++++---
 mldsa/src/poly_kl.c             | 44 ++++++++++++++++++++-------------
 6 files changed, 71 insertions(+), 37 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index d8a1ad33a..9a2022b2a 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -131,16 +131,18 @@ static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
   mld_poly_use_hint_32_asm(b, a, h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
   mld_poly_use_hint_88_asm(b, a, h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 1bfa2bdf1..edfa3f43e 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -162,18 +162,28 @@ static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
   mld_poly_caddq_avx2(a);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
-static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a,
                             (const __m256i *)h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a,
                             (const __m256i *)h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index d8a1ad33a..9a2022b2a 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -131,16 +131,18 @@ static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
   mld_poly_use_hint_32_asm(b, a, h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
   mld_poly_use_hint_88_asm(b, a, h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 35063a59b..51534715f 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -249,8 +249,8 @@ static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N]);
  *              - const int32_t *a: pointer to input polynomial
  *              - const int32_t *h: pointer to input hint polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h);
+static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h);
 #endif /* MLD_USE_NATIVE_POLY_USE_HINT_32 */
 
 #if defined(MLD_USE_NATIVE_POLY_USE_HINT_88)
@@ -265,8 +265,8 @@ static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
  *              - const int32_t *a: pointer to input polynomial
  *              - const int32_t *h: pointer to input hint polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h);
+static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h);
 #endif /* MLD_USE_NATIVE_POLY_USE_HINT_88 */
 
 #if defined(MLD_USE_NATIVE_POLY_CHKNORM)
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 1bfa2bdf1..edfa3f43e 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -162,18 +162,28 @@ static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N])
   mld_poly_caddq_avx2(a);
   return MLD_NATIVE_FUNC_SUCCESS;
 }
-static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_use_hint_32_avx2((__m256i *)b, (const __m256i *)a,
                             (const __m256i *)h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
-                                                   const int32_t *h)
+static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
+                                                  const int32_t *h)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_poly_use_hint_88_avx2((__m256i *)b, (const __m256i *)a,
                             (const __m256i *)h);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c
index 39a7e6760..ccbd86942 100644
--- a/mldsa/src/poly_kl.c
+++ b/mldsa/src/poly_kl.c
@@ -111,23 +111,37 @@ unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0,
 MLD_INTERNAL_API
 void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
 {
-#if defined(MLD_USE_NATIVE_POLY_USE_HINT_88) && MLD_CONFIG_PARAMETER_SET == 44
-  /* TODO: proof */
+  unsigned int i;
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
   mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
-  mld_poly_use_hint_88_native(b->coeffs, a->coeffs, h->coeffs);
+#if defined(MLD_USE_NATIVE_POLY_USE_HINT_88) && MLD_CONFIG_PARAMETER_SET == 44
+  {
+    int ret;
+    /* TODO: proof */
+    ret = mld_poly_use_hint_88_native(b->coeffs, a->coeffs, h->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_bound(b->coeffs, MLDSA_N, 0,
+                       (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+      return;
+    }
+  }
 #elif defined(MLD_USE_NATIVE_POLY_USE_HINT_32) && \
     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
-  /* TODO: proof */
-  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
-  mld_poly_use_hint_32_native(b->coeffs, a->coeffs, h->coeffs);
-#else  /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44)  \
-          && MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET == 65 \
-          || MLD_CONFIG_PARAMETER_SET == 87) */
-  unsigned int i;
-  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
+  {
+    int ret;
+    /* TODO: proof */
+    ret = mld_poly_use_hint_32_native(b->coeffs, a->coeffs, h->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_bound(b->coeffs, MLDSA_N, 0,
+                       (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+      return;
+    }
+  }
+#endif /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44) \
+          && MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET ==   \
+          65 || MLD_CONFIG_PARAMETER_SET == 87) */
 
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
@@ -137,10 +151,6 @@ void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
   {
     b->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]);
   }
-#endif /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44) \
-          && !(MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET == \
-          65 || MLD_CONFIG_PARAMETER_SET == 87)) */
-
   mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
 }
 

From 60293f00822293ac813b3c8e538cca85dc314f51 Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 13 Nov 2025 16:55:11 +0800
Subject: [PATCH 06/11] Add runtime dispatch (mld_poly_chknorm)

The inputs to chknorm are potentially secret. However, it is fine (and
unavoidable) to leak if chknorm return 0 or 1 (i.e., if all coeffs are within
bound or not). The declassification of that currently happens in sign.c.
However, for the run-time dispatch we require to branch depending on whether
mld_poly_chknorm_native returns MLD_NATIVE_FUNC_FALLBACK (-1) or not to signal
that the platform does not have the required capabilities to run the native
code (and we should hence fallback to the C code).

This commit adds a declassification of (ret == FALLBACK) which is sufficient
to make this code pass the constant time tests. The declassification of the
actual value (0/1) remains in sign.c to be consistent with the C
implementation.

Co-authored-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/x86_64/meta.h              |  4 ++++
 mldsa/src/native/x86_64/meta.h |  4 ++++
 mldsa/src/poly.c               | 39 ++++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index edfa3f43e..542c27f63 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -188,6 +188,10 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index edfa3f43e..542c27f63 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -188,6 +188,10 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
 
 static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index 113b3aba7..0934dc95d 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -589,19 +589,37 @@ void mld_polyt0_unpack(mld_poly *r, const uint8_t *a)
 MLD_INTERNAL_API
 uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
 {
-#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
-  /* TODO: proof */
-  mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
-
-  /* The native backend returns 0 if all coeffs within the bound, 1 otherwise */
-  /* Convert to 0 / 0xFFFFFFFF here */
-  return 0U - (uint32_t)mld_poly_chknorm_native(a->coeffs, B);
-#else  /* MLD_USE_NATIVE_POLY_CHKNORM */
   unsigned int i;
   uint32_t t = 0;
   mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
-
-
+#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
+  {
+    /* TODO: proof */
+    int ret;
+    int success;
+    /* The native backend returns 0 if all coefficients are within the bound,
+     * 1 if at least one coefficient exceeds the bound, and
+     * -1 (MLD_NATIVE_FUNC_FALLBACK) if the platform does not have the
+     * required capabilities to run the native function.
+     */
+    ret = mld_poly_chknorm_native(a->coeffs, B);
+
+    success = (ret != MLD_NATIVE_FUNC_FALLBACK);
+    /* Constant-time: It would be fine to leak the return value of chknorm
+     * entirely (as it is fine to leak if any coefficient exceeded the bound or
+     * not). However, it is cleaner to perform declassification in sign.c.
+     * Hence, here we only declassify if the native function returned
+     * MLD_NATIVE_FUNC_FALLBACK or not (which solely depends on system
+     * capabilities).
+     */
+    MLD_CT_TESTING_DECLASSIFY(&success, sizeof(int));
+    if (success)
+    {
+      /* Convert 0 / 1 to 0 / 0xFFFFFFFF here */
+      return 0U - (uint32_t)ret;
+    }
+  }
+#endif /* MLD_USE_NATIVE_POLY_CHKNORM */
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
@@ -630,7 +648,6 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
   }
 
   return t;
-#endif /* !MLD_USE_NATIVE_POLY_CHKNORM */
 }
 
 #else  /* !MLD_CONFIG_MULTILEVEL_NO_SHARED */

From a0c14f628a5b5e80e0347854938d93ee5c57306e Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 18:50:13 +0800
Subject: [PATCH 07/11] Add runtime dispatch (mld_polyz_unpack_17/19_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        |  8 ++++----
 dev/x86_64/meta.h               | 14 +++++++++++--
 mldsa/src/native/aarch64/meta.h |  8 ++++----
 mldsa/src/native/api.h          |  4 ++--
 mldsa/src/native/x86_64/meta.h  | 14 +++++++++++--
 mldsa/src/poly_kl.c             | 36 ++++++++++++++++++++-------------
 6 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 9a2022b2a..cb8fa3190 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -150,16 +150,16 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_asm(a, B);
 }
 
-static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r,
-                                                  const uint8_t *buf)
+static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *buf)
 {
   mld_polyz_unpack_17_asm(r, buf, mld_polyz_unpack_17_indices);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r,
-                                                  const uint8_t *buf)
+static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 {
   mld_polyz_unpack_19_asm(r, buf, mld_polyz_unpack_19_indices);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_pointwise_montgomery_native(
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 542c27f63..fca08bf9e 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -195,14 +195,24 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
-static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
+static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_polyz_unpack_17_avx2((__m256i *)r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
+static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_polyz_unpack_19_avx2((__m256i *)r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_pointwise_montgomery_native(
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 9a2022b2a..cb8fa3190 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -150,16 +150,16 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_asm(a, B);
 }
 
-static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r,
-                                                  const uint8_t *buf)
+static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *buf)
 {
   mld_polyz_unpack_17_asm(r, buf, mld_polyz_unpack_17_indices);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r,
-                                                  const uint8_t *buf)
+static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 {
   mld_polyz_unpack_19_asm(r, buf, mld_polyz_unpack_19_indices);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_pointwise_montgomery_native(
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 51534715f..1b8645405 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -296,7 +296,7 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B);
  * Arguments:   - int32_t *r: pointer to output polynomial
  *              - const uint8_t *a: byte array with bit-packed polynomial
  **************************************************/
-static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a);
+static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a);
 #endif /* MLD_USE_NATIVE_POLYZ_UNPACK_17 */
 
 #if defined(MLD_USE_NATIVE_POLYZ_UNPACK_19)
@@ -310,7 +310,7 @@ static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a);
  * Arguments:   - int32_t *r: pointer to output polynomial
  *              - const uint8_t *a: byte array with bit-packed polynomial
  **************************************************/
-static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a);
+static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a);
 #endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
 
 #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 542c27f63..fca08bf9e 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -195,14 +195,24 @@ static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
   return mld_poly_chknorm_avx2((const __m256i *)a, B);
 }
 
-static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
+static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_polyz_unpack_17_avx2((__m256i *)r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
+static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_polyz_unpack_19_avx2((__m256i *)r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_poly_pointwise_montgomery_native(
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c
index ccbd86942..75b550f97 100644
--- a/mldsa/src/poly_kl.c
+++ b/mldsa/src/poly_kl.c
@@ -750,15 +750,31 @@ void mld_polyz_pack(uint8_t *r, const mld_poly *a)
 MLD_INTERNAL_API
 void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
 {
+  unsigned int i;
 #if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) && MLD_CONFIG_PARAMETER_SET == 44
   /* TODO: proof */
-  mld_polyz_unpack_17_native(r->coeffs, a);
+  int ret;
+  ret = mld_polyz_unpack_17_native(r->coeffs, a);
+  if (ret == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    mld_assert_bound(r->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+    return;
+  }
 #elif defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) && \
     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
   /* TODO: proof */
-  mld_polyz_unpack_19_native(r->coeffs, a);
-#elif MLD_CONFIG_PARAMETER_SET == 44
-  unsigned int i;
+  int ret;
+  ret = mld_polyz_unpack_19_native(r->coeffs, a);
+  if (ret == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    mld_assert_bound(r->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+    return;
+  }
+#endif /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44)  \
+          && MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
+
+#if MLD_CONFIG_PARAMETER_SET == 44
   for (i = 0; i < MLDSA_N / 4; ++i)
   __loop__(
     invariant(i <= MLDSA_N/4)
@@ -789,11 +805,7 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
     r->coeffs[4 * i + 2] = MLDSA_GAMMA1 - r->coeffs[4 * i + 2];
     r->coeffs[4 * i + 3] = MLDSA_GAMMA1 - r->coeffs[4 * i + 3];
   }
-#else  /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44)   \
-          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET ==   \
-          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET == \
-          44 */
-  unsigned int i;
+#else  /* MLD_CONFIG_PARAMETER_SET == 44 */
   for (i = 0; i < MLDSA_N / 2; ++i)
   __loop__(
     invariant(i <= MLDSA_N/2)
@@ -813,11 +825,7 @@ void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
     r->coeffs[2 * i + 0] = MLDSA_GAMMA1 - r->coeffs[2 * i + 0];
     r->coeffs[2 * i + 1] = MLDSA_GAMMA1 - r->coeffs[2 * i + 1];
   }
-#endif /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44) \
-          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET == \
-          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET  \
-          != 44 */
-
+#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
   mld_assert_bound(r->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
 }
 

From 5afab8cc51945dea642c790a84262ed6e714a27a Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 19:21:40 +0800
Subject: [PATCH 08/11] Add runtime dispatch
 (mld_poly_pointwise_montgomery_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        |  3 ++-
 dev/x86_64/meta.h               |  7 ++++++-
 mldsa/src/native/aarch64/meta.h |  3 ++-
 mldsa/src/native/api.h          |  2 +-
 mldsa/src/native/x86_64/meta.h  |  7 ++++++-
 mldsa/src/poly.c                | 22 ++++++++++++----------
 6 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index cb8fa3190..28cd686e5 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -162,11 +162,12 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
     const int32_t in1[MLDSA_N])
 {
   mld_poly_pointwise_montgomery_asm(out, in0, in1);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index fca08bf9e..35cd26b4b 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -215,11 +215,16 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b,
                      mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index cb8fa3190..28cd686e5 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -162,11 +162,12 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
     const int32_t in1[MLDSA_N])
 {
   mld_poly_pointwise_montgomery_asm(out, in0, in1);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 1b8645405..764534f84 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -327,7 +327,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a);
  *              - const int32_t a[MLDSA_N]: first input polynomial
  *              - const int32_t b[MLDSA_N]: second input polynomial
  **************************************************/
-static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
 
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index fca08bf9e..35cd26b4b 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -215,11 +215,16 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_avx2((__m256i *)c, (const __m256i *)a, (const __m256i *)b,
                      mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index 0934dc95d..ba570e04c 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -184,18 +184,21 @@ MLD_INTERNAL_API
 void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a,
                                    const mld_poly *b)
 {
-#if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
-  /* TODO: proof */
-  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
-  mld_assert_abs_bound(b->coeffs, MLDSA_N, MLD_NTT_BOUND);
-  mld_poly_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs);
-  mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q);
-#else  /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
   unsigned int i;
-
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
   mld_assert_abs_bound(b->coeffs, MLDSA_N, MLD_NTT_BOUND);
-
+#if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
+  {
+    /* TODO: proof */
+    int ret;
+    ret = mld_poly_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
+#endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
@@ -206,7 +209,6 @@ void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a,
   }
 
   mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q);
-#endif /* !MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
 }
 
 MLD_INTERNAL_API

From 522d67f4a4979517bafefb0f2a459c73ffeff1b3 Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 6 Nov 2025 19:26:19 +0800
Subject: [PATCH 09/11] Add runtime dispatch
 (mld_polyvecl_pointwise_acc_montgomery_l4/l5/l7_native)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 dev/aarch64_clean/meta.h        |  9 +++--
 dev/x86_64/meta.h               | 21 ++++++++--
 mldsa/src/native/aarch64/meta.h |  9 +++--
 mldsa/src/native/api.h          |  6 +--
 mldsa/src/native/x86_64/meta.h  | 21 ++++++++--
 mldsa/src/polyvec.c             | 71 +++++++++++++++++++--------------
 6 files changed, 91 insertions(+), 46 deletions(-)

diff --git a/dev/aarch64_clean/meta.h b/dev/aarch64_clean/meta.h
index 28cd686e5..9f4a01835 100644
--- a/dev/aarch64_clean/meta.h
+++ b/dev/aarch64_clean/meta.h
@@ -170,28 +170,31 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
     int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
     const int32_t v[4][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
     int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
     const int32_t v[5][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
     int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
     const int32_t v[7][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 #endif /* !__ASSEMBLER__ */
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
index 35cd26b4b..7978443f4 100644
--- a/dev/x86_64/meta.h
+++ b/dev/x86_64/meta.h
@@ -227,28 +227,43 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
     int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
     const int32_t v[4][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
     int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
     const int32_t v[5][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
     int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
     const int32_t v[7][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 #endif /* !__ASSEMBLER__ */
diff --git a/mldsa/src/native/aarch64/meta.h b/mldsa/src/native/aarch64/meta.h
index 28cd686e5..9f4a01835 100644
--- a/mldsa/src/native/aarch64/meta.h
+++ b/mldsa/src/native/aarch64/meta.h
@@ -170,28 +170,31 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
     int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
     const int32_t v[4][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
     int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
     const int32_t v[5][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
     int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
     const int32_t v[7][MLDSA_N])
 {
   mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, (const int32_t *)u,
                                                (const int32_t *)v);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 #endif /* !__ASSEMBLER__ */
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
index 764534f84..333bb0c7b 100644
--- a/mldsa/src/native/api.h
+++ b/mldsa/src/native/api.h
@@ -346,7 +346,7 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
  *              - const int32_t u[MLDSA_L][MLDSA_N]: first input vector
  *              - const int32_t v[MLDSA_L][MLDSA_N]: second input vector
  **************************************************/
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
     int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
     const int32_t v[4][MLDSA_N]);
 #endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 */
@@ -366,7 +366,7 @@ static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
  *              - const int32_t u[MLDSA_L][MLDSA_N]: first input vector
  *              - const int32_t v[MLDSA_L][MLDSA_N]: second input vector
  **************************************************/
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
     int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
     const int32_t v[5][MLDSA_N]);
 #endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 */
@@ -386,7 +386,7 @@ static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
  *              - const int32_t u[MLDSA_L][MLDSA_N]: first input vector
  *              - const int32_t v[MLDSA_L][MLDSA_N]: second input vector
  **************************************************/
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
     int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
     const int32_t v[7][MLDSA_N]);
 #endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 */
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
index 35cd26b4b..7978443f4 100644
--- a/mldsa/src/native/x86_64/meta.h
+++ b/mldsa/src/native/x86_64/meta.h
@@ -227,28 +227,43 @@ static MLD_INLINE int mld_poly_pointwise_montgomery_native(
   return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native(
     int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
     const int32_t v[4][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l4_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native(
     int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
     const int32_t v[5][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l5_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
-static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
+static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native(
     int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
     const int32_t v[7][MLDSA_N])
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_pointwise_acc_l7_avx2((__m256i *)w, (const __m256i *)u,
                             (const __m256i *)v, mld_qdata.vec);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 
 #endif /* !__ASSEMBLER__ */
diff --git a/mldsa/src/polyvec.c b/mldsa/src/polyvec.c
index b8e692215..f4c8d7567 100644
--- a/mldsa/src/polyvec.c
+++ b/mldsa/src/polyvec.c
@@ -328,42 +328,57 @@ MLD_INTERNAL_API
 void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
                                            const mld_polyvecl *v)
 {
-#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
-    MLD_CONFIG_PARAMETER_SET == 44
-  /* TODO: proof */
+  unsigned int i, j;
   mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
   mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  mld_polyvecl_pointwise_acc_montgomery_l4_native(
-      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
-      (const int32_t(*)[MLDSA_N])v->vec);
-  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
+    MLD_CONFIG_PARAMETER_SET == 44
+  {
+    /* TODO: proof */
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l4_native(
+        w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+        (const int32_t(*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
 #elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) && \
     MLD_CONFIG_PARAMETER_SET == 65
-  /* TODO: proof */
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  mld_polyvecl_pointwise_acc_montgomery_l5_native(
-      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
-      (const int32_t(*)[MLDSA_N])v->vec);
-  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+  {
+    /* TODO: proof */
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l5_native(
+        w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+        (const int32_t(*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
 #elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) && \
     MLD_CONFIG_PARAMETER_SET == 87
-  /* TODO: proof */
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  mld_polyvecl_pointwise_acc_montgomery_l7_native(
-      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
-      (const int32_t(*)[MLDSA_N])v->vec);
-  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-#else  /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
+  {
+    /* TODO: proof */
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l7_native(
+        w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+        (const int32_t(*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
+#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
           MLD_CONFIG_PARAMETER_SET == 44) &&                       \
           !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
           MLD_CONFIG_PARAMETER_SET == 65) &&                       \
           MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 &&   \
           MLD_CONFIG_PARAMETER_SET == 87 */
-  unsigned int i, j;
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
   /* The first input is bounded by [0, Q-1] inclusive
    * The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can
    * safely accumulate in 64-bits without intermediate reductions as
@@ -398,12 +413,6 @@ void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
   }
 
   mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
-          MLD_CONFIG_PARAMETER_SET == 44) &&                       \
-          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
-          MLD_CONFIG_PARAMETER_SET == 65) &&                       \
-          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \
-          MLD_CONFIG_PARAMETER_SET == 87) */
 }
 
 MLD_INTERNAL_API

From bd8c62435a93abb9eef40d3498edc172cdc7295c Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Fri, 7 Nov 2025 18:03:18 +0800
Subject: [PATCH 10/11] Add runtime dispatch (FIPS202)

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 .github/actions/config-variations/action.yml  |  15 +-
 BIBLIOGRAPHY.md                               |   2 +
 dev/fips202/aarch64/x1_scalar.h               |   4 +-
 dev/fips202/aarch64/x1_v84a.h                 |   9 +-
 dev/fips202/aarch64/x2_v84a.h                 |   9 +-
 dev/fips202/aarch64/x4_v8a_scalar.h           |   4 +-
 dev/fips202/aarch64/x4_v8a_v84a_scalar.h      |   9 +-
 mldsa/mldsa_native.c                          |   2 +
 mldsa/src/fips202/keccakf1600.c               |  22 +-
 mldsa/src/fips202/native/aarch64/x1_scalar.h  |   4 +-
 mldsa/src/fips202/native/aarch64/x1_v84a.h    |   9 +-
 mldsa/src/fips202/native/aarch64/x2_v84a.h    |   9 +-
 .../fips202/native/aarch64/x4_v8a_scalar.h    |   4 +-
 .../native/aarch64/x4_v8a_v84a_scalar.h       |   9 +-
 mldsa/src/fips202/native/api.h                |   4 +-
 mldsa/src/fips202/native/x86_64/xkcp.h        |   8 +-
 test/configs.yml                              |  34 +
 ...native_capability_config_ID_AA64PFR1_EL1.h | 596 ++++++++++++++++++
 18 files changed, 731 insertions(+), 22 deletions(-)
 create mode 100644 test/custom_native_capability_config_ID_AA64PFR1_EL1.h

diff --git a/.github/actions/config-variations/action.yml b/.github/actions/config-variations/action.yml
index 166faa704..774dabc92 100644
--- a/.github/actions/config-variations/action.yml
+++ b/.github/actions/config-variations/action.yml
@@ -7,7 +7,7 @@ inputs:
     description: 'GitHub token'
     required: true
   tests:
-    description: 'List of tests to run (space-separated IDs) or "all" for all tests. Available IDs: pct-enabled, pct-enabled-broken, custom-zeroize, native-cap-ON, native-cap-OFF, native-cap-CPUID_AVX2, no-asm, serial-fips202, custom-randombytes, custom-memcpy, custom-memset, custom-stdlib'
+    description: 'List of tests to run (space-separated IDs) or "all" for all tests. Available IDs: pct-enabled, pct-enabled-broken, custom-zeroize, native-cap-ON, native-cap-OFF, native-cap-ID_AA64PFR1_EL1, native-cap-CPUID_AVX2, no-asm, serial-fips202, custom-randombytes, custom-memcpy, custom-memset, custom-stdlib'
     required: false
     default: 'all'
   opt:
@@ -85,6 +85,19 @@ runs:
         acvp: true
         opt: ${{ inputs.opt }}
         examples: false # Some examples use a custom config themselves
+    - name: "Custom native capability functions (ID_AA64PFR1_EL1 detection)"
+      if: ${{ (inputs.tests == 'all' || contains(inputs.tests, 'native-cap-ID_AA64PFR1_EL1')) && runner.os == 'Linux' && runner.arch == 'ARM64' }}
+      uses: ./.github/actions/multi-functest
+      with:
+        gh_token: ${{ inputs.gh_token }}
+        compile_mode: native
+        cflags: "-std=c11 -march=armv8.4-a+sha3 -D_GNU_SOURCE -DMLD_CONFIG_FILE=\\\\\\\"../../test/custom_native_capability_config_ID_AA64PFR1_EL1.h\\\\\\\" -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all"
+        func: true
+        kat: true
+        acvp: true
+        opt: ${{ inputs.opt }}
+        examples: false # Some examples use a custom config themselves
     - name: "Custom native capability functions (CPUID AVX2 detection)"
       if: ${{ (inputs.tests == 'all' || contains(inputs.tests, 'native-cap-CPUID_AVX2')) && runner.os == 'Linux' && runner.arch == 'X64' }}
       uses: ./.github/actions/multi-functest
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
index 49a948c7c..050ea6c73 100644
--- a/BIBLIOGRAPHY.md
+++ b/BIBLIOGRAPHY.md
@@ -43,6 +43,7 @@ source code and documentation.
   - [test/custom_native_capability_config_0.h](test/custom_native_capability_config_0.h)
   - [test/custom_native_capability_config_1.h](test/custom_native_capability_config_1.h)
   - [test/custom_native_capability_config_CPUID_AVX2.h](test/custom_native_capability_config_CPUID_AVX2.h)
+  - [test/custom_native_capability_config_ID_AA64PFR1_EL1.h](test/custom_native_capability_config_ID_AA64PFR1_EL1.h)
   - [test/custom_randombytes_config.h](test/custom_randombytes_config.h)
   - [test/custom_stdlib_config.h](test/custom_stdlib_config.h)
   - [test/custom_zeroize_config.h](test/custom_zeroize_config.h)
@@ -94,6 +95,7 @@ source code and documentation.
   - [test/custom_native_capability_config_0.h](test/custom_native_capability_config_0.h)
   - [test/custom_native_capability_config_1.h](test/custom_native_capability_config_1.h)
   - [test/custom_native_capability_config_CPUID_AVX2.h](test/custom_native_capability_config_CPUID_AVX2.h)
+  - [test/custom_native_capability_config_ID_AA64PFR1_EL1.h](test/custom_native_capability_config_ID_AA64PFR1_EL1.h)
   - [test/custom_randombytes_config.h](test/custom_randombytes_config.h)
   - [test/custom_stdlib_config.h](test/custom_stdlib_config.h)
   - [test/custom_zeroize_config.h](test/custom_zeroize_config.h)
diff --git a/dev/fips202/aarch64/x1_scalar.h b/dev/fips202/aarch64/x1_scalar.h
index eeafcaaff..681b99137 100644
--- a/dev/fips202/aarch64/x1_scalar.h
+++ b/dev/fips202/aarch64/x1_scalar.h
@@ -13,10 +13,12 @@
 #define MLD_FIPS202_AARCH64_NEED_X1_SCALAR
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x1_native(uint64_t *state)
 {
   mld_keccak_f1600_x1_scalar_asm(state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/dev/fips202/aarch64/x1_v84a.h b/dev/fips202/aarch64/x1_v84a.h
index b35861ab2..d15c8c26f 100644
--- a/dev/fips202/aarch64/x1_v84a.h
+++ b/dev/fips202/aarch64/x1_v84a.h
@@ -17,10 +17,17 @@
 #define MLD_FIPS202_AARCH64_NEED_X1_V84A
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x1_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x1_v84a_asm(state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/dev/fips202/aarch64/x2_v84a.h b/dev/fips202/aarch64/x2_v84a.h
index 0c6e4789f..16c22fa1a 100644
--- a/dev/fips202/aarch64/x2_v84a.h
+++ b/dev/fips202/aarch64/x2_v84a.h
@@ -17,13 +17,20 @@
 #define MLD_FIPS202_AARCH64_NEED_X2_V84A
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
 
 
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x2_v84a_asm(state + 0 * 25, mld_keccakf1600_round_constants);
   mld_keccak_f1600_x2_v84a_asm(state + 2 * 25, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/dev/fips202/aarch64/x4_v8a_scalar.h b/dev/fips202/aarch64/x4_v8a_scalar.h
index 76a71672d..0f7f8fe49 100644
--- a/dev/fips202/aarch64/x4_v8a_scalar.h
+++ b/dev/fips202/aarch64/x4_v8a_scalar.h
@@ -13,11 +13,13 @@
 #define MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
   mld_keccak_f1600_x4_scalar_v8a_hybrid_asm(state,
                                             mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/dev/fips202/aarch64/x4_v8a_v84a_scalar.h b/dev/fips202/aarch64/x4_v8a_v84a_scalar.h
index c95b97699..3da883b76 100644
--- a/dev/fips202/aarch64/x4_v8a_v84a_scalar.h
+++ b/dev/fips202/aarch64/x4_v8a_v84a_scalar.h
@@ -17,11 +17,18 @@
 #define MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm(
       state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 9e6ad4c29..ca37bef16 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -481,6 +481,8 @@
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
 /* mldsa/src/fips202/native/api.h */
 #undef MLD_FIPS202_NATIVE_API_H
+#undef MLD_NATIVE_FUNC_FALLBACK
+#undef MLD_NATIVE_FUNC_SUCCESS
 /* mldsa/src/fips202/native/auto.h */
 #undef MLD_FIPS202_NATIVE_AUTO_H
 #if defined(MLD_SYS_AARCH64)
diff --git a/mldsa/src/fips202/keccakf1600.c b/mldsa/src/fips202/keccakf1600.c
index b0b7b21ab..e59120c07 100644
--- a/mldsa/src/fips202/keccakf1600.c
+++ b/mldsa/src/fips202/keccakf1600.c
@@ -113,16 +113,17 @@ void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0,
 void mld_keccakf1600x4_permute(uint64_t *state)
 {
 #if defined(MLD_USE_FIPS202_X4_NATIVE)
-  mld_keccak_f1600_x4_native(state);
-#else
+  if (mld_keccak_f1600_x4_native(state) == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    return;
+  }
+#endif /* MLD_USE_FIPS202_X4_NATIVE */
   mld_keccakf1600_permute(state + MLD_KECCAK_LANES * 0);
   mld_keccakf1600_permute(state + MLD_KECCAK_LANES * 1);
   mld_keccakf1600_permute(state + MLD_KECCAK_LANES * 2);
   mld_keccakf1600_permute(state + MLD_KECCAK_LANES * 3);
-#endif /* !MLD_USE_FIPS202_X4_NATIVE */
 }
 
-#if !defined(MLD_USE_FIPS202_X1_NATIVE)
 static const uint64_t mld_KeccakF_RoundConstants[MLD_KECCAK_NROUNDS] = {
     (uint64_t)0x0000000000000001ULL, (uint64_t)0x0000000000008082ULL,
     (uint64_t)0x800000000000808aULL, (uint64_t)0x8000000080008000ULL,
@@ -137,7 +138,7 @@ static const uint64_t mld_KeccakF_RoundConstants[MLD_KECCAK_NROUNDS] = {
     (uint64_t)0x8000000080008081ULL, (uint64_t)0x8000000000008080ULL,
     (uint64_t)0x0000000080000001ULL, (uint64_t)0x8000000080008008ULL};
 
-void mld_keccakf1600_permute(uint64_t *state)
+static void mld_keccakf1600_permute_c(uint64_t *state)
 {
   unsigned round;
 
@@ -402,12 +403,17 @@ void mld_keccakf1600_permute(uint64_t *state)
   state[23] = Aso;
   state[24] = Asu;
 }
-#else  /* !MLD_USE_FIPS202_X1_NATIVE */
+
 void mld_keccakf1600_permute(uint64_t *state)
 {
-  mld_keccak_f1600_x1_native(state);
-}
+#if defined(MLD_USE_FIPS202_X1_NATIVE)
+  if (mld_keccak_f1600_x1_native(state) == MLD_NATIVE_FUNC_SUCCESS)
+  {
+    return;
+  }
 #endif /* MLD_USE_FIPS202_X1_NATIVE */
+  mld_keccakf1600_permute_c(state);
+}
 
 #else /* !MLD_CONFIG_MULTILEVEL_NO_SHARED */
 
diff --git a/mldsa/src/fips202/native/aarch64/x1_scalar.h b/mldsa/src/fips202/native/aarch64/x1_scalar.h
index eeafcaaff..681b99137 100644
--- a/mldsa/src/fips202/native/aarch64/x1_scalar.h
+++ b/mldsa/src/fips202/native/aarch64/x1_scalar.h
@@ -13,10 +13,12 @@
 #define MLD_FIPS202_AARCH64_NEED_X1_SCALAR
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x1_native(uint64_t *state)
 {
   mld_keccak_f1600_x1_scalar_asm(state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/src/fips202/native/aarch64/x1_v84a.h b/mldsa/src/fips202/native/aarch64/x1_v84a.h
index b35861ab2..d15c8c26f 100644
--- a/mldsa/src/fips202/native/aarch64/x1_v84a.h
+++ b/mldsa/src/fips202/native/aarch64/x1_v84a.h
@@ -17,10 +17,17 @@
 #define MLD_FIPS202_AARCH64_NEED_X1_V84A
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x1_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x1_v84a_asm(state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/src/fips202/native/aarch64/x2_v84a.h b/mldsa/src/fips202/native/aarch64/x2_v84a.h
index 0c6e4789f..16c22fa1a 100644
--- a/mldsa/src/fips202/native/aarch64/x2_v84a.h
+++ b/mldsa/src/fips202/native/aarch64/x2_v84a.h
@@ -17,13 +17,20 @@
 #define MLD_FIPS202_AARCH64_NEED_X2_V84A
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
 
 
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x2_v84a_asm(state + 0 * 25, mld_keccakf1600_round_constants);
   mld_keccak_f1600_x2_v84a_asm(state + 2 * 25, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h b/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h
index 76a71672d..0f7f8fe49 100644
--- a/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h
+++ b/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h
@@ -13,11 +13,13 @@
 #define MLD_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
   mld_keccak_f1600_x4_scalar_v8a_hybrid_asm(state,
                                             mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h b/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h
index c95b97699..3da883b76 100644
--- a/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h
+++ b/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h
@@ -17,11 +17,18 @@
 #define MLD_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID
 
 #if !defined(__ASSEMBLER__)
+#include "../api.h"
 #include "src/fips202_native_aarch64.h"
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_SHA3))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
+
   mld_keccak_f1600_x4_scalar_v8a_v84a_hybrid_asm(
       state, mld_keccakf1600_round_constants);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/mldsa/src/fips202/native/api.h b/mldsa/src/fips202/native/api.h
index 1dc51cfb1..c527b6a26 100644
--- a/mldsa/src/fips202/native/api.h
+++ b/mldsa/src/fips202/native/api.h
@@ -46,10 +46,10 @@
  */
 
 #if defined(MLD_USE_FIPS202_X1_NATIVE)
-static MLD_INLINE void mld_keccak_f1600_x1_native(uint64_t *state);
+static MLD_INLINE int mld_keccak_f1600_x1_native(uint64_t *state);
 #endif
 #if defined(MLD_USE_FIPS202_X4_NATIVE)
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state);
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state);
 #endif
 
 #endif /* !MLD_FIPS202_NATIVE_API_H */
diff --git a/mldsa/src/fips202/native/x86_64/xkcp.h b/mldsa/src/fips202/native/x86_64/xkcp.h
index 398694464..c03642d7e 100644
--- a/mldsa/src/fips202/native/x86_64/xkcp.h
+++ b/mldsa/src/fips202/native/x86_64/xkcp.h
@@ -13,12 +13,18 @@
 
 #if !defined(__ASSEMBLER__)
 #include <stdint.h>
+#include "../api.h"
 #include "src/KeccakP_1600_times4_SIMD256.h"
 
 #define MLD_USE_FIPS202_X4_NATIVE
-static MLD_INLINE void mld_keccak_f1600_x4_native(uint64_t *state)
+static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
 {
+  if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    return MLD_NATIVE_FUNC_FALLBACK;
+  }
   mld_keccakf1600x4_permute24(state);
+  return MLD_NATIVE_FUNC_SUCCESS;
 }
 #endif /* !__ASSEMBLER__ */
 
diff --git a/test/configs.yml b/test/configs.yml
index b875d0e9c..45b5660e9 100644
--- a/test/configs.yml
+++ b/test/configs.yml
@@ -246,6 +246,40 @@ configs:
           }
           #endif /* !__ASSEMBLER__ */
 
+  - path: test/custom_native_capability_config_ID_AA64PFR1_EL1.h
+    description: "Test configuration with ARM system register capability detection"
+    defines:
+      MLD_CONFIG_CUSTOM_CAPABILITY_FUNC:
+        content: |
+          #define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+          #if !defined(__ASSEMBLER__)
+          #include <stdint.h>
+          #include "../mldsa/src/sys.h"
+
+          #if !defined(MLD_SYS_AARCH64) || !defined(MLD_SYS_LINUX)
+          #error This configuration is only suitable for Linux/AArch64 systems
+          #endif
+
+          static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+          {
+            if (cap == MLD_SYS_CAP_SHA3)
+            {
+              uint64_t id_aa64pfr1_el1;
+
+              /* Read ID_AA64PFR1_EL1 system register */
+              __asm__ volatile("mrs %0, id_aa64pfr1_el1" : "=r"(id_aa64pfr1_el1));
+
+              /* Extract SHA3 field (bits 35:32) and check if SHA3 is supported */
+              /* SHA3 field: 0b0000 = not implemented, 0b0001 = SHA3 implemented */
+              uint64_t sha3_field = (id_aa64pfr1_el1 >> 32) & 0xF;
+
+              return (sha3_field == 1) ? 1 : 0;
+            }
+
+            /* Default to 0 (conservative) for unknown capabilities */
+            return 0;
+          }
+          #endif /* !__ASSEMBLER__ */
   # Example configs
 
   - path: examples/monolithic_build/config_44.h
diff --git a/test/custom_native_capability_config_ID_AA64PFR1_EL1.h b/test/custom_native_capability_config_ID_AA64PFR1_EL1.h
new file mode 100644
index 000000000..a1393e090
--- /dev/null
+++ b/test/custom_native_capability_config_ID_AA64PFR1_EL1.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+/*
+ * Test configuration: Test configuration with ARM system register capability
+ * detection
+ *
+ * This configuration differs from the default mldsa/src/config.h in the
+ * following places:
+ *   - MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ */
+
+
+#ifndef MLD_CONFIG_H
+#define MLD_CONFIG_H
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_PARAMETER_SET
+ *
+ * Description: Specifies the parameter set for ML-DSA
+ *              - MLD_CONFIG_PARAMETER_SET=44 corresponds to ML-DSA-44
+ *              - MLD_CONFIG_PARAMETER_SET=65 corresponds to ML-DSA-65
+ *              - MLD_CONFIG_PARAMETER_SET=87 corresponds to ML-DSA-87
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#ifndef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET \
+  44 /* Change this for different security strengths */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NAMESPACE_PREFIX
+ *
+ * Description: The prefix to use to namespace global symbols from mldsa/.
+ *
+ *              In a multi-level build (that is, if either
+ *              - MLD_CONFIG_MULTILEVEL_WITH_SHARED, or
+ *              - MLD_CONFIG_MULTILEVEL_NO_SHARED,
+ *              are set, level-dependent symbols will additionally be prefixed
+ *              with the parameter set (44/65/87).
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_NAMESPACE_PREFIX)
+#define MLD_CONFIG_NAMESPACE_PREFIX MLD_DEFAULT_NAMESPACE_PREFIX
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_WITH_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, all MLD_CONFIG_PARAMETER_SET-independent
+ *              code will be included in the build, including code needed only
+ *              for other parameter sets.
+ *
+ *              Example: TODO: add example
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_WITH_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_MULTILEVEL_NO_SHARED
+ *
+ * Description: This is for multi-level builds of mldsa-native only. If you
+ *              need only a single parameter set, keep this unset.
+ *
+ *              If this is set, no MLD_CONFIG_PARAMETER_SET-independent code
+ *              will be included in the build.
+ *
+ *              To build mldsa-native with support for all parameter sets,
+ *              build it three times -- once per parameter set -- and set the
+ *              option MLD_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of
+ *              them, and MLD_CONFIG_MULTILEVEL_NO_SHARED for the others.
+ *
+ *              See examples/multilevel_build_mldsa for an example.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FILE
+ *
+ * Description: If defined, this is a header that will be included instead
+ *              of the default configuration file mldsa/src/config.h.
+ *
+ *              When you need to build mldsa-native in multiple configurations,
+ *              using varying MLD_CONFIG_FILE can be more convenient
+ *              then configuring everything through CFLAGS.
+ *
+ *              To use, MLD_CONFIG_FILE _must_ be defined prior
+ *              to the inclusion of any mldsa-native headers. For example,
+ *              it can be set by passing `-DMLD_CONFIG_FILE="..."`
+ *              on the command line.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FILE "config.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *
+ * Description: Determines whether an native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be use is
+ *              determined by MLD_CONFIG_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_ARITH_BACKEND_FILE
+ *
+ * Description: The arithmetic backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLD_CONFIG_ARITH_BACKEND_FILE)
+#define MLD_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether an native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be use is
+ *              determined by MLD_CONFIG_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for your the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_BACKEND_FILE
+ *
+ * Description: The FIPS-202 backend to use.
+ *
+ *              If MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option
+ *              must either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
+ *
+ *              This can be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLD_CONFIG_FIPS202_BACKEND_FILE)
+#define MLD_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h"
+#endif
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_FIPS202X4_CUSTOM_HEADER
+ *
+ * Description: Custom header to use for FIPS-202-X4
+ *
+ *              This should only be set if you intend to use a custom
+ *              FIPS-202 implementation, different from the one shipped
+ *              with mldsa-native.
+ *
+ *              If set, it must be the name of a file serving as the
+ *              replacement for mldsa/src/fips202/fips202x4.h, and exposing
+ *              the same API (see FIPS202.md).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_ZEROIZE
+ *
+ * Description: In compliance with @[FIPS204, Section 3.6.3], mldsa-native,
+ *              zeroizes intermediate stack buffers before returning from
+ *              function calls.
+ *
+ *              Set this option and define `mld_zeroize_native` if you want to
+ *              use a custom method to zeroize intermediate stack buffers.
+ *              The default implementation uses SecureZeroMemory on Windows
+ *              and a memset + compiler barrier otherwise. If neither of those
+ *              is available on the target platform, compilation will fail,
+ *              and you will need to use MLD_CONFIG_CUSTOM_ZEROIZE to provide
+ *              a custom implementation of `mld_zeroize_native()`.
+ *
+ *              WARNING:
+ *              The explicit stack zeroization conducted by mldsa-native
+ *              reduces the likelihood of data leaking on the stack, but
+ *              does not eliminate it! The C standard makes no guarantee about
+ *              where a compiler allocates structures and whether/where it makes
+ *              copies of them. Also, in addition to entire structures, there
+ *              may also be potentially exploitable leakage of individual values
+ *              on the stack.
+ *
+ *              If you need bullet-proof zeroization of the stack, you need to
+ *              consider additional measures instead of what this feature
+ *              provides. In this case, you can set mld_zeroize_native to a
+ *              no-op.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_ZEROIZE
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMCPY
+ *
+ * Description: Set this option and define `mld_memcpy` if you want to
+ *              use a custom method to copy memory instead of the standard
+ *              library memcpy function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memcpy function:
+ *              void *mld_memcpy(void *dest, const void *src, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMCPY
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_MEMSET
+ *
+ * Description: Set this option and define `mld_memset` if you want to
+ *              use a custom method to set memory instead of the standard
+ *              library memset function.
+ *
+ *              The custom implementation must have the same signature and
+ *              behavior as the standard memset function:
+ *              void *mld_memset(void *s, int c, size_t n)
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_MEMSET
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void *mld_memset(void *s, int c, size_t n)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_RANDOMBYTES
+ *
+ * Description: mldsa-native does not provide a secure randombytes
+ *              implementation. Such an implementation has to provided by the
+ *              consumer.
+ *
+ *              If this option is not set, mldsa-native expects a function
+ *              void randombytes(uint8_t *out, size_t outlen).
+ *
+ *              Set this option and define `mld_randombytes` if you want to
+ *              use a custom method to sample randombytes with a different name
+ *              or signature.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CUSTOM_RANDOMBYTES
+   #if !defined(__ASSEMBLER__)
+   #include <stdint.h>
+   #include "sys.h"
+   static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len)
+   {
+       ... your implementation ...
+   }
+   #endif
+*/
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mldsa-native backends may rely on specific hardware features.
+ *              Those backends will only be included in an mldsa-native build
+ *              if support for the respective features is enabled at
+ *              compile-time. However, when building for a heteroneous set
+ *              of CPUs to run the resulting binary/library on, feature
+ *              detection at _runtime_ is needed to decided whether a backend
+ *              can be used or not.
+ *
+ *              Set this option and define `mld_sys_check_capability` if you
+ *              want to use a custom method to dispatch between implementations.
+ *
+ *              If this option is not set, mldsa-native uses compile-time
+ *              feature detection only to decide which backend to use.
+ *
+ *              If you compile mldsa-native on a system with different
+ *              capabilities than the system that the resulting binary/library
+ *              will be run on, you must use this option.
+ *
+ *****************************************************************************/
+#define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+#if !defined(__ASSEMBLER__)
+#include <stdint.h>
+#include "../mldsa/src/sys.h"
+
+#if !defined(MLD_SYS_AARCH64) || !defined(MLD_SYS_LINUX)
+#error This configuration is only suitable for Linux/AArch64 systems
+#endif
+
+static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap)
+{
+  if (cap == MLD_SYS_CAP_SHA3)
+  {
+    uint64_t id_aa64pfr1_el1;
+
+    /* Read ID_AA64PFR1_EL1 system register */
+    __asm__ volatile("mrs %0, id_aa64pfr1_el1" : "=r"(id_aa64pfr1_el1));
+
+    /* Extract SHA3 field (bits 35:32) and check if SHA3 is supported */
+    /* SHA3 field: 0b0000 = not implemented, 0b0001 = SHA3 implemented */
+    uint64_t sha3_field = (id_aa64pfr1_el1 >> 32) & 0xF;
+
+    return (sha3_field == 1) ? 1 : 0;
+  }
+
+  /* Default to 0 (conservative) for unknown capabilities */
+  return 0;
+}
+#endif /* !__ASSEMBLER__ */
+
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_RANDOMIZED_API
+ *
+ * Description: If this option is set, mldsa-native will be built without the
+ *              randomized API functions (crypto_sign_keypair,
+ *              crypto_sign, crypto_sign_signature, and
+ *              crypto_sign_signature_extmu).
+ *              This allows users to build mldsa-native without providing a
+ *              randombytes() implementation if they only need the
+ *              internal deterministic API
+ *              (crypto_sign_keypair_internal, crypto_sign_signature_internal).
+ *
+ *              NOTE: This option is incompatible with MLD_CONFIG_KEYGEN_PCT
+ *              as the current PCT implementation requires
+ *              crypto_sign_signature().
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_RANDOMIZED_API */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT
+ *
+ * Description: Compliance with @[FIPS140_3_IG, p.87] requires a
+ *              Pairwise Consistency Test (PCT) to be carried out on a freshly
+ *              generated keypair before it can be exported.
+ *
+ *              Set this option if such a check should be implemented.
+ *              In this case, crypto_sign_keypair_internal and
+ *              crypto_sign_keypair will return a non-zero error code if the
+ *              PCT failed.
+ *
+ *              NOTE: This feature will drastically lower the performance of
+ *              key generation.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+ *
+ * Description: If this option is set, the user must provide a runtime
+ *              function `static inline int mld_break_pct() { ... }` to
+ *              indicate whether the PCT should be made fail.
+ *
+ *              This option only has an effect if MLD_CONFIG_KEYGEN_PCT is set.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+   #if !defined(__ASSEMBLER__)
+   #include "sys.h"
+   static MLD_INLINE int mld_break_pct(void)
+   {
+       ... return 0/1 depending on whether PCT should be broken ...
+   }
+   #endif
+*/
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_INTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of internal API.
+ *
+ *              The primary use case for this option are single-CU builds,
+ *              in which case this option can be set to `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_INTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_EXTERNAL_API_QUALIFIER
+ *
+ * Description: If set, this option provides an additional function
+ *              qualifier to be added to declarations of mldsa-native's
+ *              public API.
+ *
+ *              The primary use case for this option are single-CU builds
+ *              where the public API exposed by mldsa-native is wrapped by
+ *              another API in the consuming application. In this case,
+ *              even mldsa-native's public API can be marked `static`.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_EXTERNAL_API_QUALIFIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_CT_TESTING_ENABLED
+ *
+ * Description: If set, mldsa-native annotates data as secret / public using
+ *              valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and
+ *              VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret-
+ *              dependent control flow of variable time execution (depending
+ *              on the exact version of valgrind installed).
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_CT_TESTING_ENABLED */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly.
+ *
+ *              By default, inline assembly is used to implement value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *              Inline assembly is also used to implement a secure zeroization
+ *              function on non-Windows platforms. If this option is set and
+ *              the target platform is not Windows, you MUST set
+ *              MLD_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization
+ *              function.
+ *
+ *              If this option is set, MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 and
+ *              and MLD_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no
+ *              native backends will be used.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_NO_ASM_VALUE_BARRIER
+ *
+ * Description: If this option is set, mldsa-native will be built without
+ *              use of native code or inline assembly for value barriers.
+ *
+ *              By default, inline assembly (if available) is used to implement
+ *              value barriers.
+ *              Without inline assembly, mldsa-native will use a global volatile
+ *              'opt blocker' instead; see ct.h.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_NO_ASM_VALUE_BARRIER */
+
+/******************************************************************************
+ * Name:        MLD_CONFIG_SERIAL_FIPS202_ONLY
+ *
+ * Description: Set this to use a FIPS202 implementation with global state
+ *              that supports only one active Keccak computation at a time
+ *              (e.g. some hardware accelerators).
+ *
+ *              If this option is set, ML-DSA will use FIPS202 operations
+ *              serially, ensuring that only one SHAKE context is active
+ *              at any given time.
+ *
+ *              This allows offloading Keccak computations to a hardware
+ *              accelerator that holds only a single Keccak state locally,
+ *              rather than requiring support for multiple concurrent
+ *              Keccak states.
+ *
+ *              NOTE: Depending on the target CPU, this may reduce
+ *              performance when using software FIPS202 implementations.
+ *              Only enable this when you have to.
+ *
+ *****************************************************************************/
+/* #define MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+/*************************  Config internals  ********************************/
+
+/* Default namespace
+ *
+ * Don't change this. If you need a different namespace, re-define
+ * MLD_CONFIG_NAMESPACE_PREFIX above instead, and remove the following.
+ *
+ * The default MLDSA namespace is
+ *
+ *   PQCP_MLDSA_NATIVE_MLDSA<LEVEL>_
+ *
+ * e.g., PQCP_MLDSA_NATIVE_MLDSA44_
+ */
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA44
+#elif MLD_CONFIG_PARAMETER_SET == 65
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA65
+#elif MLD_CONFIG_PARAMETER_SET == 87
+#define MLD_DEFAULT_NAMESPACE_PREFIX PQCP_MLDSA_NATIVE_MLDSA87
+#endif
+
+#endif /* !MLD_CONFIG_H */

From 23a88fc87ab6a1ecaf0a1fb00c4488973fbd0b8d Mon Sep 17 00:00:00 2001
From: willieyz <willie.zhao@chelpis.com>
Date: Thu, 13 Nov 2025 17:02:47 +0800
Subject: [PATCH 11/11] ec2_functests: Remove TODO and enable the
 config-variations test

Signed-off-by: willieyz <willie.zhao@chelpis.com>

add changes

Signed-off-by: willieyz <willie.zhao@chelpis.com>
---
 .github/workflows/ci.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 46e7ad9bc..fc29027b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -513,32 +513,28 @@ jobs:
             ec2_volume_size: 20
             compile_mode: native
             opt: all
-            # TODO: This config do not exist for now, will be available after PR #607 ("Add Runtime Dispatch") is merged.
-            # config_variations: 'native-cap-CPUID_AVX2'
+            config_variations: 'native-cap-CPUID_AVX2'
           - name: Intel Xeon 4th gen (t3)
             ec2_instance_type: t3.small
             ec2_ami: ubuntu-latest (x86_64)
             ec2_volume_size: 20
             compile_mode: native
             opt: all
-            # TODO: This config do not exist for now, will be available after PR #607 ("Add Runtime Dispatch") is merged.
-            # config_variations: 'native-cap-CPUID_AVX2'
+            config_variations: 'native-cap-CPUID_AVX2'
           - name: Graviton2 (c6g.medium)
             ec2_instance_type: c6g.medium
             ec2_ami: ubuntu-latest (aarch64)
             ec2_volume_size: 20
             compile_mode: native
             opt: all
-            # TODO: This config do not exist for now, will be available after PR #607 ("Add Runtime Dispatch") is merged.
-            # config_variations: 'native-cap-ON native-cap-OFF native-cap-ID_AA64PFR1_EL1'
+            config_variations: 'native-cap-ON native-cap-OFF native-cap-ID_AA64PFR1_EL1'
           - name: Graviton3 (c7g.medium)
             ec2_instance_type: c7g.medium
             ec2_ami: ubuntu-latest (aarch64)
             ec2_volume_size: 20
             compile_mode: native
             opt: all
-            # TODO: This config do not exist for now, will be available after PR #607 ("Add Runtime Dispatch") is merged.
-            # config_variations: 'native-cap-ID_AA64PFR1_EL1'
+            config_variations: 'native-cap-ID_AA64PFR1_EL1'
     name: Platform tests  (${{ matrix.target.name }})
     permissions:
       contents: 'read'