flintlib · vneiger · Nov 1, 2025 · Nov 1, 2025 · Nov 1, 2025 · Nov 2, 2025
diff --git a/doc/source/nmod_poly.rst b/doc/source/nmod_poly.rst
@@ -1225,6 +1225,11 @@ Division
     The algorithm used is to call :func:`div_newton_n` and then multiply out
     and compute the remainder.
 
+
+Division with special divisors
+--------------------------------------------------------------------------------
+
+
 .. function:: ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod)
 
     Sets ``(Q, len-1)`` to the quotient of ``(A, len)`` on division
@@ -1237,6 +1242,56 @@ Division
     Sets `Q` to the quotient of `A` on division by `(x - c)`, and returns
     the remainder, equal to the value of `A` evaluated at `c`.
 
+.. function:: void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod)
+
+    Sets the first `n` coefficients of ``RQ`` to the remainder of
+    ``(A, len)`` on division by `x^n - c`, and the next ``len - n``
+    coefficients (from index `n` to index ``len-1``) to the
+    quotient of this division. `A` and `RQ` are allowed to be the
+    same, but may not overlap partially in any other way.
+    Constraints: `len \ge n > 0`, and `c` is reduced modulo
+    ``mod.n``.
+
+.. function:: void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod)
+
+    Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = 1`,
+    that is, divisor is `x^n - 1`.
+
+.. function:: void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, nmod_t mod)
+
+    Identical to :func:`_nmod_poly_divrem_xnmc` but specialized for `c = -1`,
+    that is, divisor is `x^n + 1`.
+
+.. function:: void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+
+    Identical to :func:`_nmod_poly_divrem_xnmc` but exploiting the precomputed
+    ``c_precomp`` obtained via :func:`n_mulmod_precomp_shoup` for faster
+    modular multiplications by `c`. Additional constraint: the modulus ``modn``
+    must be less than `2^{\mathtt{FLINT\_BITS} - 1}`. 
+
+.. function:: void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+
+    Identical to :func:`_nmod_poly_divrem_xnmc_precomp` but handling modular
+    reductions lazily. Precisely, if all coefficients of `A` are less than or
+    equal to `m`, the input requirement is `m + 2n \le
+    2^{\mathtt{FLINT\_BITS}}`, and the output coefficients of ``RQ`` are in
+    `[0, m+2n)` and equal to the sought values modulo `n`. In particular the
+    coefficients of `A` need not be reduced modulo ``n``, and the output may
+    not be either. However, the value ``c`` should be reduced modulo `n`.
+
+    In the case where `m = n-1` (coefficients of `A` are reduced modulo
+    `n`), then the above leads to the requirement `3n-1 \le
+    2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits,
+    and `n \le 1431655765` for 32 bits), and reducing the output coefficients
+    just amounts to subtracting either `n` or `2n` to each of them.
+
+.. function:: void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, ulong n, ulong c)
+
+    Sets `Q` and `R` to the quotient and remainder of `A` on division by `x^n -
+    c`. Constraints: `n` is nonzero and ``c`` is reduced modulo the modulus of
+    `A`. Incorporates specialized code for the cases `c \in \{-1,0,1\}`, and
+    calls variants with precomputation on `c` when possible.
+
 
 Divisibility testing
 --------------------------------------------------------------------------------
@@ -1317,14 +1372,14 @@ Evaluation
 .. function:: ulong _nmod_poly_evaluate_nmod_precomp_lazy(nn_srcptr poly, slong len, ulong c, ulong c_precomp, ulong n)
 
     Evaluates ``poly`` at the value ``c`` modulo ``n``, with lazy reductions
-    modulo `n`. Precisely, if all coefficients of ``poly`` are less than `m`,
-    the input requirement is `m \le 2^{\mathtt{FLINT\_BITS}} - 2n + 1`, and the
-    output value is in `[0, m+2n-1)` and equal to the sought evaluation modulo
-    `n`. In particular the coefficients of ``poly`` need not be reduced modulo
-    ``n``, and the output may not be either. However, the value ``c`` should be
-    reduced modulo `n`.
-
-    In the case where `m = n` (coefficients of ``poly`` are reduced modulo
+    modulo `n`. Precisely, if all coefficients of ``poly`` are less than or
+    equal to `m`, the input requirement is `m + 2n \le
+    2^{\mathtt{FLINT\_BITS}}`, and the output value is in `[0, m+2n)` and equal
+    to the sought evaluation modulo `n`. In particular the coefficients of
+    ``poly`` need not be reduced modulo ``n``, and the output may not be
+    either. However, the value ``c`` should be reduced modulo `n`.
+
+    In the case where `m = n-1` (coefficients of ``poly`` are reduced modulo
     `n`), then the above leads to the requirement `3n-1 \le
     2^{\mathtt{FLINT\_BITS}}` (this is `n \le 6148914691236517205` for 64 bits,
     and `n \le 1431655765` for 32 bits), and reducing the output just amounts

diff --git a/src/nmod_poly.h b/src/nmod_poly.h
@@ -502,10 +502,18 @@ void nmod_poly_div_newton_n_preinv (nmod_poly_t Q, const nmod_poly_t A, const nm
 void _nmod_poly_divrem_newton_n_preinv (nn_ptr Q, nn_ptr R, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nn_srcptr Binv, slong lenBinv, nmod_t mod);
 void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t Binv);
 
-ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod);
+/* Division with special divisors  *******************************************/
 
+ulong _nmod_poly_div_root(nn_ptr Q, nn_srcptr A, slong len, ulong c, nmod_t mod);
 ulong nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, ulong c);
 
+void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod);
+void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn);
+void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn);
+void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c);
+
 /* Divisibility testing  *****************************************************/
 
 int _nmod_poly_divides_classical(nn_ptr Q, nn_srcptr A, slong lenA, nn_srcptr B, slong lenB, nmod_t mod);

diff --git a/src/nmod_poly/divrem_xnmc.c b/src/nmod_poly/divrem_xnmc.c
@@ -0,0 +1,285 @@
+/*
+    Copyright (C) 2025 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+/* #include "flint.h" */
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "ulong_extras.h"
+
+/* division by x**n - 1 */
+void _nmod_poly_divrem_xnm1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+    /* assumes len >= n */
+    slong i;
+    ulong j, r;
+
+    if (RQ != A)
+        for (j = 0; j < n; j++)
+            RQ[len-n+j] = A[len-n+j];
+
+    r = len % n;
+    i = len - r - n;  /* multiple of n, >= 0 by assumption */
+
+    for (j = 0; j < r; j++)
+        RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn);
+
+    i -= n;
+    while (i >= 0)
+    {
+        for (j = 0; j < n; j++)
+            RQ[i+j] = n_addmod(RQ[i+n+j], A[i+j], modn);
+        i -= n;
+    }
+}
+
+/* division by x**n + 1 */
+void _nmod_poly_divrem_xnp1(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong modn)
+{
+    /* assumes len >= n */
+    slong i;
+    ulong j, r;
+
+    if (RQ != A)
+        for (j = 0; j < n; j++)
+            RQ[len-n+j] = A[len-n+j];
+
+    r = len % n;
+    i = len - r - n;  /* multiple of n, >= 0 by assumption */
+
+    for (j = 0; j < r; j++)
+        RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn);
+
+    i -= n;
+    while (i >= 0)
+    {
+        for (j = 0; j < n; j++)
+            RQ[i+j] = n_submod(A[i+j], RQ[i+n+j], modn);
+        i -= n;
+    }
+}
+
+/* division by x**n - c, general variant */
+void _nmod_poly_divrem_xnmc(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, nmod_t mod)
+{
+    /* assumes len >= n */
+    slong i;
+    ulong j, r, val;
+
+    if (RQ != A)
+        for (j = 0; j < n; j++)
+            RQ[len-n+j] = A[len-n+j];
+
+    r = len % n;
+    i = len - r - n;  /* multiple of n, >= 0 by assumption */
+
+    for (j = 0; j < r; j++)
+    {
+        val = nmod_mul(RQ[i+n+j], c, mod);
+        RQ[i+j] = n_addmod(val, A[i+j], mod.n);
+    }
+
+    i -= n;
+    while (i >= 0)
+    {
+        for (j = 0; j < n; j++)
+        {
+            val = nmod_mul(RQ[i+n+j], c, mod);
+            RQ[i+j] = n_addmod(val, A[i+j], mod.n);
+        }
+        i -= n;
+    }
+}
+
+/* division by x**n - c, with precomputation on c */
+/* constraint: modn < 2**(FLINT_BITS-1) */
+void _nmod_poly_divrem_xnmc_precomp(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+    /* assumes len >= n */
+    slong i;
+    ulong j, r, val;
+
+    if (RQ != A)
+        for (j = 0; j < n; j++)
+            RQ[len-n+j] = A[len-n+j];
+
+    r = len % n;
+    i = len - r - n;  /* multiple of n, >= 0 by assumption */
+
+    for (j = 0; j < r; j++)
+    {
+        val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn);
+        RQ[i+j] = n_addmod(val, A[i+j], modn);
+    }
+
+    i -= n;
+    while (i >= 0)
+    {
+        for (j = 0; j < n; j++)
+        {
+            val = n_mulmod_shoup(c, RQ[i+n+j], c_precomp, modn);
+            RQ[i+j] = n_addmod(val, A[i+j], modn);
+        }
+        i -= n;
+    }
+}
+
+/* division by x**n - c, lazy with precomputation */
+/* constraint: max(A) + 2*modn <= 2**FLINT_BITS */
+/* coeff bounds: in [0, max(A)] | out [0, max(A) + 2*modn) */
+void _nmod_poly_divrem_xnmc_precomp_lazy(nn_ptr RQ, nn_srcptr A, slong len, ulong n, ulong c, ulong c_precomp, ulong modn)
+{
+    /* assumes len >= n */
+    slong i;
+    ulong j, r, val, p_hi, p_lo;
+
+    if (RQ != A)
+        for (j = 0; j < n; j++)
+            RQ[len-n+j] = A[len-n+j];
+
+    r = len % n;
+    i = len - r - n;  /* multiple of n, >= 0 by assumption */
+
+    for (j = 0; j < r; j++)
+    {
+        /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+        val = RQ[i+n+j];
+        umul_ppmm(p_hi, p_lo, c_precomp, val);
+        val = c * val - p_hi * modn;
+        /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */
+        RQ[i+j] = val + A[i+j];
+    }
+
+    i -= n;
+    while (i >= 0)
+    {
+        for (j = 0; j < n; j++)
+        {
+            /* computes either val = (c*val mod n) or val = (c*val mod n) + n */
+            val = RQ[i+n+j];
+            umul_ppmm(p_hi, p_lo, c_precomp, val);
+            val = c * val - p_hi * modn;
+            /* lazy addition, yields RQ[i+j] in [0..k+2n), where max(RQ) <= k */
+            RQ[i+j] = val + A[i+j];
+        }
+        i -= n;
+    }
+}
+
+void nmod_poly_divrem_xnmc(nmod_poly_t Q, nmod_poly_t R, nmod_poly_t A, ulong n, ulong c)
+{
+    const ulong len = A->length;
+
+    if (len <= n)
+    {
+        nmod_poly_zero(Q);
+        nmod_poly_set(R, A);
+        return;
+    }
+
+    if (c == 0)
+    {
+        nmod_poly_set_trunc(R, A, n);
+        nmod_poly_shift_right(Q, A, n);
+        return;
+    }
+
+    int lazy = 0;
+    nn_ptr RQ = _nmod_vec_init(len);
+
+    /* perform division */
+    if (c == 1)
+        _nmod_poly_divrem_xnm1(RQ, A->coeffs, len, n, A->mod.n);
+
+    else if (c == A->mod.n - 1)
+        _nmod_poly_divrem_xnp1(RQ, A->coeffs, len, n, A->mod.n);
+
+    /* if degree below the n_mulmod_shoup threshold, */
+    /* or if modulus forbids n_mulmod_shoup usage, use general */
+#if FLINT_MULMOD_SHOUP_THRESHOLD <= 2
+    else if (A->mod.norm == 0)  /* here A->length >= threshold */
+#else
+    else if ((A->length < FLINT_MULMOD_SHOUP_THRESHOLD)
+           || (A->mod.norm == 0))
+#endif
+    {
+        _nmod_poly_divrem_xnmc(RQ, A->coeffs, len, n, c, A->mod);
+    }
+
+    else
+    {
+        const ulong modn = A->mod.n;
+        const ulong c_precomp = n_mulmod_precomp_shoup(c, modn);
+
+        /* if 3*mod.n - 1 <= 2**FLINT_BITS, use precomp+lazy variant */
+#if FLINT_BITS == 64
+        if (modn <= UWORD(6148914691236517205))
+#else /* FLINT_BITS == 32 */
+        if (modn <= UWORD(1431655765))
+#endif
+        {
+            lazy = 1;
+            _nmod_poly_divrem_xnmc_precomp_lazy(RQ, A->coeffs, len, n, c, c_precomp, modn);
+        }
+
+        /* use n_mulmod_shoup, non-lazy variant */
+        else
+        {
+            _nmod_poly_divrem_xnmc_precomp(RQ, A->coeffs, len, n, c, c_precomp, modn);
+        }
+    }
+
+    /* copy remainder R */
+    nmod_poly_fit_length(R, n);
+    if (lazy)
+    {
+        /* correct excess */
+        const ulong modn = A->mod.n;
+        for (ulong i = 0; i < n; i++)
+        {
+            ulong v = RQ[i];
+            if (v >= 2*modn)
+                v -= 2*modn;
+            else if (v >= modn)
+                v -= modn;
+            R->coeffs[i] = v;
+        }
+    }
+    else
+    {
+        _nmod_vec_set(R->coeffs, RQ, n);
+    }
+    _nmod_poly_set_length(R, n);
+    _nmod_poly_normalise(R);
+
+    /* copy quotient Q */
+    nmod_poly_fit_length(Q, len - n);
+    if (lazy)
+    {
+        /* correct excess */
+        const ulong modn = A->mod.n;
+        for (ulong i = 0; i < len - n; i++)
+        {
+            ulong v = RQ[n+i];
+            if (v >= 2*modn)
+                v -= 2*modn;
+            else if (v >= modn)
+                v -= modn;
+            Q->coeffs[i] = v;
+        }
+    }
+    else
+    {
+        _nmod_vec_set(Q->coeffs, RQ + n, len - n);
+    }
+    _nmod_poly_set_length(Q, len - n);
+
+    _nmod_vec_clear(RQ);
+}