Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
88 commits
Select commit Hold shift + click to select a range
650088b
Add topk
dcherian Jul 27, 2024
889be0c
Negative k
dcherian Jul 28, 2024
996ff2a
dask support
dcherian Jul 28, 2024
776d233
test
dcherian Jul 28, 2024
a5eb7b9
wip
dcherian Jul 28, 2024
4fa9a4c
fix
dcherian Jul 28, 2024
4b04fde
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 28, 2024
93800aa
Handle dtypes.NA properly for datetime/timedelta
dcherian Jul 31, 2024
80c67f4
Fix
dcherian Jul 31, 2024
7056d18
Merge branch 'main' into topk
dcherian Aug 7, 2024
44f5f3f
Merge branch 'main' into topk
dcherian Jan 7, 2025
c924017
Fixes
dcherian Jan 7, 2025
7a794ba
one more fix
dcherian Jan 7, 2025
eec4dd4
fix
dcherian Jan 7, 2025
6ac9a1f
one more fix
dcherian Jan 7, 2025
83594e8
Fixes.
dcherian Jan 7, 2025
740f85f
WIP
dcherian Jan 7, 2025
5d64fd9
Merge branch 'main' into topk
dcherian Jan 7, 2025
e177efd
fixes
dcherian Jan 7, 2025
9393470
fix
dcherian Jan 7, 2025
17eb915
cleanup
dcherian Jan 7, 2025
dc0df3e
works?
dcherian Jan 7, 2025
83ae5d8
fix quantile
dcherian Jan 7, 2025
95d20b8
optimize xrutils.topk
dcherian Jan 7, 2025
0b9fafc
Merge branch 'main' into topk
dcherian Jan 8, 2025
caa98b8
Update tests/test_properties.py
dcherian Jan 8, 2025
820d46c
generalize new_dims_func
dcherian Jan 13, 2025
17a4d5d
Merge branch 'main' into topk
dcherian Jan 13, 2025
6aa923a
Revert "generalize new_dims_func"
dcherian Jan 13, 2025
16b0bac
Merge branch 'main' into topk
dcherian Jan 13, 2025
2c6d486
Support bool
dcherian Jan 13, 2025
0dcd87c
more skipping
dcherian Jan 13, 2025
9b874ea
fix
dcherian Jan 14, 2025
adebbec
more xfail
dcherian Jan 15, 2025
ace2af5
Merge branch 'main' into topk
dcherian Jan 19, 2025
4f35230
cleanup
dcherian Jan 19, 2025
cd2f150
one more xfail
dcherian Jan 19, 2025
70e6f22
typing
dcherian Jan 19, 2025
5d45603
minor docs
dcherian Jan 19, 2025
096f6b9
disable log in CI
dcherian Jan 19, 2025
0277cb9
Fix boolean
dcherian Jan 19, 2025
6c7e84a
bool -> bool_
dcherian Jan 20, 2025
43c3408
update int limits
dcherian Jan 20, 2025
01eabfb
fix rtd
dcherian Jan 20, 2025
6e4ce69
Add note
dcherian Jan 20, 2025
4500c7e
Merge branch 'main' into topk
dcherian Jan 24, 2025
8f60477
Add unit test
dcherian Jan 24, 2025
15fcfa1
WIP
dcherian Jan 24, 2025
a5bcc5b
fix
dcherian Jan 24, 2025
489c843
Merge branch 'main' into topk
dcherian Mar 18, 2025
91e1d07
Switch DUMMY_AXIS to 0
dcherian Mar 18, 2025
2d868fe
More support for edge cases
dcherian Mar 18, 2025
d244d60
minor
dcherian Mar 18, 2025
8319f7f
[WIP] failing test
dcherian Mar 18, 2025
d21eec5
Merge branch 'main' into topk
dcherian Jul 16, 2025
dfb1e88
fix expected
dcherian Mar 26, 2025
8b31f5d
Revert "[WIP] failing test"
dcherian Mar 26, 2025
fce4f2b
[revert] failing test
dcherian Mar 26, 2025
0f7ee05
fix
dcherian Jul 16, 2025
4c3e6d3
Fix topk extraction for groups with fewer than k elements
dcherian Nov 30, 2025
902e60e
Add nantopk function for NaN-aware topk combine
dcherian Nov 30, 2025
6e7a035
Configure topk aggregation for dask map-reduce
dcherian Nov 30, 2025
e2c7e42
Force simple_combine for aggregations with new_dims_func
dcherian Nov 30, 2025
0893ba0
Raise NotImplementedError for topk with reindex=False
dcherian Nov 30, 2025
31591ba
Add comprehensive unit tests for topk with NaN handling
dcherian Nov 30, 2025
04740fa
Remove temporary and auto-generated files from git tracking
dcherian Nov 30, 2025
f61c856
Remove docs/oisst.ipynb from git tracking
dcherian Nov 30, 2025
a8f07a5
Merge main into topk branch
dcherian Nov 30, 2025
37d2cc7
Fix mypy error in topk function
dcherian Nov 30, 2025
cbf9596
move _DUMMY_AXIS to dask.py
dcherian Dec 1, 2025
bacda97
Avoid squeezing
dcherian Dec 1, 2025
8d1606d
Properly handle empty input
dcherian Dec 1, 2025
35a8a22
Fix combining
dcherian Dec 1, 2025
2ca52b1
Fix axis parameter for single-element tuples in _simple_combine
dcherian Dec 1, 2025
0193434
Fix topk tokenization to include finalize_kwargs
dcherian Dec 1, 2025
15aa174
FIx test
dcherian Dec 1, 2025
f82c3bd
Fix test_groupby_reduce_all for topk with chunked arrays
dcherian Dec 1, 2025
6716e63
Fix _var_combine to handle integer axis parameter
dcherian Dec 1, 2025
a863f5b
revert a change
dcherian Dec 1, 2025
b38b606
Fix must_use_simple_combine logic for argmax/argmin
dcherian Dec 1, 2025
17b179d
Update CLAUDE.md with implementation details learned
dcherian Dec 1, 2025
ac1d9e1
Revert test_groupby_reduce_all parameterize to match main
dcherian Dec 1, 2025
73ba70c
tweak
dcherian Dec 1, 2025
16bdec3
Remove Claude settings files from git tracking
dcherian Dec 1, 2025
5785714
Revert "tweak"
dcherian Dec 1, 2025
dd1c03a
Fix must_use_simple_combine logic to use num_new_vector_dims
dcherian Dec 1, 2025
de431d2
Fix xarray topk support and add tests
dcherian Dec 1, 2025
c640f2e
Move topk tests to test_xarray.py
dcherian Dec 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
id: status
run: |
uv run --no-dev python -c "import xarray; xarray.show_versions()" || true
uv run --no-dev pytest --durations=20 --durations-min=0.5 -n auto --cov=./ --cov-report=xml --hypothesis-profile ci
uv run --no-dev pytest --durations=20 --durations-min=0.5 -n auto --cov=./ --cov-report=xml --hypothesis-profile ci --log-disable=flox
- name: Upload code coverage to Codecov
uses: codecov/codecov-action@v5.5.1
with:
Expand Down
19 changes: 19 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,22 @@ venv.bak/

# Git worktrees
worktrees/

# Auto-generated version file
flox/_version.py

# Temporary files
Untitled.ipynb
*.rej
*.py.rej
mutmut-cache
.mutmut-cache
mydask.png
profile.json
profile.html
test.png
uv.lock
devel/

# Claude Code
.claude/
67 changes: 67 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,70 @@ asv preview
- Integration testing with xarray upstream development branch
- **Python Support**: Minimum version 3.11 (updated from 3.10)
- **Git Worktrees**: `worktrees/` directory is ignored for development workflows
- **Running Tests**: Always use `uv run pytest` to run tests (not just `pytest`)

## Key Implementation Details

### Map-Reduce Combine Strategies (`flox/dask.py`)

There are two strategies for combining intermediate results in dask's tree reduction:

1. **`_simple_combine`**: Used for most reductions. Tree-reduces the reduction itself (not the groupby-reduction) for performance. Requirements:

- All blocks must contain all groups after blockwise step (reindex.blockwise=True)
- Must know expected_groups
- Inserts DUMMY_AXIS=-2 via `_expand_dims`, reduces along it, then squeezes it out
- Used when: not an arg reduction, not first/last with non-float dtype, and labels are known

1. **`_grouped_combine`**: More general solution that tree-reduces the groupby-reduction itself. Used for:

- Arg reductions (argmax, argmin, etc.)
- When labels are unknown (dask arrays without expected_groups)
- First/last reductions with non-float dtypes

### Aggregations with New Dimensions

Some aggregations add new dimensions to the output (e.g., topk, quantile):

- **`new_dims_func`**: Function that returns tuple of Dim objects for new dimensions
- These MUST use `_simple_combine` because intermediate results have an extra dimension that needs to be reduced along DUMMY_AXIS
- Check if `new_dims_func(**finalize_kwargs)` returns non-empty tuple to determine if aggregation actually adds dimensions
- **Note**: argmax/argmin have `new_dims_func` but return empty tuple, so they use `_grouped_combine`

### topk Implementation

The topk aggregation is special:

- Uses `_simple_combine` (has non-empty new_dims_func)
- First intermediate (topk values) combines along axis 0, not DUMMY_AXIS
- Does NOT squeeze out DUMMY_AXIS in final aggregate step
- `_expand_dims` only expands non-topk intermediates (the second one, nanlen)

### Axis Parameter Handling

- **`_simple_combine`**: Always receives axis as tuple (e.g., `(-2,)` for DUMMY_AXIS)
- **numpy functions**: Most accept both tuple and integer axis (e.g., np.max, np.sum)
- **Exception**: argmax/argmin don't accept tuple axis, but these use `_grouped_combine`
- **Custom functions**: Like `_var_combine` should normalize axis to tuple if needed for iteration

### Test Organization

- **`test_groupby_reduce_all`**: Comprehensive test for all aggregations with various parameters (nby, chunks, etc.)

- Tests both with and without NaN handling
- For topk: sorts results along axis 0 before comparison (k dimension is at axis 0)
- Uses `np.moveaxis` not `np.swapaxes` for topk to avoid swapping other dimensions

- **`test_groupby_reduce_axis_subset_against_numpy`**: Tests reductions over subsets of axes

- Compares dask results against numpy results
- Tests various axis combinations: None, single int, tuples
- Skip arg reductions with axis=None or multiple axes (not supported)

### Common Pitfalls

1. **Axis transformations for topk**: Use `np.moveaxis(expected, src, 0)` not `np.swapaxes(expected, src, 0)` to move k dimension to position 0 without reordering other dimensions

1. **new_dims_func checking**: Check if it returns non-empty dimensions, not just if it exists (argmax has one that returns `()`)

1. **Axis parameter types**: Custom combine functions should handle both tuple and integer axis by normalizing at the start
1 change: 1 addition & 0 deletions devel
15 changes: 8 additions & 7 deletions docs/source/aggregations.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,16 @@ the `func` kwarg:
- `"mean"`, `"nanmean"`
- `"var"`, `"nanvar"`
- `"std"`, `"nanstd"`
- `"argmin"`
- `"argmax"`
- `"argmin"`, `"nanargmin"`
- `"argmax"`, `"nanargmax"`
- `"first"`, `"nanfirst"`
- `"last"`, `"nanlast"`
- `"median"`, `"nanmedian"`
- `"mode"`, `"nanmode"`
- `"quantile"`, `"nanquantile"`
- `"topk"`

```{tip}
We would like to add support for `cumsum`, `cumprod` ([issue](https://github.com/xarray-contrib/flox/issues/91)). Contributions are welcome!
```

## Custom Aggregations
## Custom Reductions

`flox` also allows you to specify a custom Aggregation (again inspired by dask.dataframe),
though this might not be fully functional at the moment. See `aggregations.py` for examples.
Expand All @@ -46,3 +43,7 @@ mean = Aggregation(
final_fill_value=np.nan,
)
```

## Custom Scans

Coming soon!
153 changes: 107 additions & 46 deletions flox/aggregate_flox.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,32 @@ def _lerp(a, b, *, t, dtype, out=None):
return out


def quantile_(array, inv_idx, *, q, axis, skipna, group_idx, dtype=None, out=None):
def quantile_or_topk(
array,
inv_idx,
*,
q=None,
k=None,
axis,
skipna,
group_idx,
dtype=None,
out=None,
fill_value=None,
):
assert q is not None or k is not None
assert axis == -1

inv_idx = np.concatenate((inv_idx, [array.shape[-1]]))

array_validmask = notnull(array)
actual_sizes = np.add.reduceat(array_validmask, inv_idx[:-1], axis=axis)
newshape = (1,) * (array.ndim - 1) + (inv_idx.size - 1,)
full_sizes = np.reshape(np.diff(inv_idx), newshape)
nanmask = full_sizes != actual_sizes
if k is not None:
nanmask = actual_sizes < abs(k)
else:
full_sizes = np.reshape(np.diff(inv_idx), newshape)
nanmask = full_sizes != actual_sizes

# The approach here is to use (complex_array.partition) because
# 1. The full np.lexsort((array, labels), axis=-1) is slow and unnecessary
Expand All @@ -72,36 +90,63 @@ def quantile_(array, inv_idx, *, q, axis, skipna, group_idx, dtype=None, out=Non
# So we determine which indices we need using the fact that NaNs get sorted to the end.
# This *was* partly inspired by https://krstn.eu/np.nanpercentile()-there-has-to-be-a-faster-way/
# but not any more now that I use partition and avoid replacing NaNs
qin = q
q = np.atleast_1d(qin)
q = np.reshape(q, (len(q),) + (1,) * array.ndim)
if k is not None:
is_scalar_param = False
param = np.sort(np.arange(abs(k)) * np.sign(k))
else:
is_scalar_param = is_scalar(q)
param = np.atleast_1d(q)
param = np.reshape(param, (param.size,) + (1,) * array.ndim)

# This is numpy's method="linear"
# TODO: could support all the interpolations here
offset = actual_sizes.cumsum(axis=-1)
actual_sizes -= 1
virtual_index = q * actual_sizes
# virtual_index is relative to group starts, so now offset that
virtual_index[..., 1:] += offset[..., :-1]

is_scalar_q = is_scalar(qin)
if is_scalar_q:
virtual_index = virtual_index.squeeze(axis=0)
idxshape = array.shape[:-1] + (actual_sizes.shape[-1],)
else:
idxshape = (q.shape[0],) + array.shape[:-1] + (actual_sizes.shape[-1],)
# For topk(.., k=+1 or -1), we always return the singleton dimension.
idxshape = (param.shape[0],) + array.shape[:-1] + (actual_sizes.shape[-1],)

lo_ = np.floor(
virtual_index,
casting="unsafe",
out=np.empty(virtual_index.shape, dtype=np.int64),
)
hi_ = np.ceil(
virtual_index,
casting="unsafe",
out=np.empty(virtual_index.shape, dtype=np.int64),
)
kth = np.unique(np.concatenate([lo_.reshape(-1), hi_.reshape(-1)]))
if q is not None:
# This is numpy's method="linear"
# TODO: could support all the interpolations here
actual_sizes -= 1
virtual_index = param * actual_sizes
# virtual_index is relative to group starts, so now offset that
virtual_index[..., 1:] += offset[..., :-1]

if is_scalar_param:
virtual_index = virtual_index.squeeze(axis=0)
idxshape = array.shape[:-1] + (actual_sizes.shape[-1],)

lo_ = np.floor(virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64))
hi_ = np.ceil(virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64))
kth = np.unique(np.concatenate([lo_.reshape(-1), hi_.reshape(-1)]))

else:
virtual_index = (actual_sizes - k) if k > 0 else (np.zeros_like(actual_sizes) + abs(k) - 1)
# virtual_index is relative to group starts, so now offset that
virtual_index[..., 1:] += offset[..., :-1]
k_offset = param.reshape((abs(k),) + (1,) * virtual_index.ndim)
lo_ = k_offset + virtual_index[np.newaxis, ...]
# For groups with fewer than k elements, clamp extraction indices to valid range
# and mark out-of-bounds positions for filling with fill_value.
# Compute group boundaries: starts = [0, offset[:-1]], ends = offset
# We prepend 0 to offset[:-1] to get group start positions
group_starts = np.insert(offset[..., :-1], 0, 0, axis=-1)

# Mark positions outside group boundaries (before clamping to detect invalid indices)
# Broadcasting happens implicitly in comparison
badmask = (lo_ < group_starts) | (lo_ >= offset)

# Clamp lo_ in-place to [group_starts, array.shape[axis]-1]
# Using out= avoids intermediate array allocations
np.clip(lo_, group_starts, array.shape[axis] - 1, out=lo_)
# Note: we don't include nanmask here because for intermediate chunk results,
# we want to keep partial results. nanmask is used separately for final output.
# kth must include ALL indices we'll extract, not just the starting index per group.
# np.partition only guarantees correct values at kth positions; other positions may
# have elements from different groups due to how introselect works with complex numbers.
kth = np.unique(np.concatenate([np.unique(offset), np.unique(lo_)]))
kth = kth[kth >= 0]
kth[kth >= array.shape[axis]] = array.shape[axis] - 1

# partition the complex array in-place
labels_broadcast = np.broadcast_to(group_idx, array.shape)
Expand All @@ -111,20 +156,33 @@ def quantile_(array, inv_idx, *, q, axis, skipna, group_idx, dtype=None, out=Non
# a simple (labels + 1j * array) will yield `nan+inf * 1j` instead of `0 + inf * j`
cmplx.real = labels_broadcast
cmplx.partition(kth=kth, axis=-1)
if is_scalar_q:
a_ = cmplx.imag
else:
a_ = np.broadcast_to(cmplx.imag, (q.shape[0],) + array.shape)

# get bounds, Broadcast to (num quantiles, ..., num labels)
loval = np.take_along_axis(a_, np.broadcast_to(lo_, idxshape), axis=axis)
hival = np.take_along_axis(a_, np.broadcast_to(hi_, idxshape), axis=axis)
a_ = cmplx.imag
if not is_scalar_param:
a_ = np.broadcast_to(cmplx.imag, (param.shape[0],) + array.shape)

# TODO: could support all the interpolations here
gamma = np.broadcast_to(virtual_index, idxshape) - lo_
result = _lerp(loval, hival, t=gamma, out=out, dtype=dtype)
if not skipna and np.any(nanmask):
result[..., nanmask] = np.nan
if array.dtype.kind in "Mm":
a_ = a_.view(array.dtype)

loval = np.take_along_axis(a_, np.broadcast_to(lo_, idxshape), axis=axis)
if q is not None:
# get bounds, Broadcast to (num quantiles, ..., num labels)
hival = np.take_along_axis(a_, np.broadcast_to(hi_, idxshape), axis=axis)

# TODO: could support all the interpolations here
gamma = np.broadcast_to(virtual_index, idxshape) - lo_
result = _lerp(loval, hival, t=gamma, out=out, dtype=dtype)
if not skipna and np.any(nanmask):
result[..., nanmask] = fill_value
else:
result = loval
if badmask.any():
result[badmask] = fill_value

if k is not None:
result = result.astype(dtype, copy=False)
if out is not None:
np.copyto(out, result)
return result


Expand Down Expand Up @@ -158,12 +216,14 @@ def _np_grouped_op(

if out is None:
q = kwargs.get("q", None)
if q is None:
k = kwargs.get("k", None)
if q is None and k is None:
out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
else:
nq = len(np.atleast_1d(q))
nq = len(np.atleast_1d(q)) if q is not None else abs(k)
out = np.full((nq,) + array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
kwargs["group_idx"] = group_idx
kwargs["fill_value"] = fill_value

if (len(uniques) == size) and (uniques == np.arange(size, like=aux)).all():
# The previous version of this if condition
Expand Down Expand Up @@ -200,10 +260,11 @@ def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs):
nanmax = partial(_nan_grouped_op, func=max, fillna=dtypes.NINF)
min = partial(_np_grouped_op, op=np.minimum.reduceat)
nanmin = partial(_nan_grouped_op, func=min, fillna=dtypes.INF)
quantile = partial(_np_grouped_op, op=partial(quantile_, skipna=False))
nanquantile = partial(_np_grouped_op, op=partial(quantile_, skipna=True))
median = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=False))
nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=True))
topk = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=True))
quantile = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=False))
nanquantile = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=True))
median = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_or_topk, skipna=False))
nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_or_topk, skipna=True))
# TODO: all, any


Expand Down
Loading
Loading