From 6a7b9324b82f098e603157bd4f154555b0b11c28 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 24 Jul 2024 22:11:36 -0600 Subject: [PATCH 1/7] Add first, last property test Closes #29 --- flox/aggregations.py | 15 +++-- flox/xrdtypes.py | 1 + tests/__init__.py | 117 +++++++++++++++++++++++++++++++++++---- tests/test_properties.py | 62 +++++++++++++++------ 4 files changed, 162 insertions(+), 33 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 0de8d5ff..c69bbc2e 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -69,6 +69,9 @@ def generic_aggregate( if func == "identity": return array + if func in ["nanfirst", "nanlast"] and array.dtype.kind in "US": + func = func[3:] + if engine == "flox": try: method = getattr(aggregate_flox, func) @@ -144,6 +147,8 @@ def _maybe_promote_int(dtype) -> np.dtype: def _get_fill_value(dtype, fill_value): """Returns dtype appropriate infinity. Returns +Inf equivalent for None.""" + if fill_value in [None, dtypes.NA] and dtype.kind in "US": + return "" if fill_value == dtypes.INF or fill_value is None: return dtypes.get_pos_infinity(dtype, max_for_int=True) if fill_value == dtypes.NINF: @@ -516,10 +521,10 @@ def _pick_second(*x): final_dtype=np.intp, ) -first = Aggregation("first", chunk=None, combine=None, fill_value=0) -last = Aggregation("last", chunk=None, combine=None, fill_value=0) -nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=np.nan) -nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=np.nan) +first = Aggregation("first", chunk=None, combine=None, fill_value=None) +last = Aggregation("last", chunk=None, combine=None, fill_value=None) +nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=dtypes.NA) +nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=dtypes.NA) all_ = Aggregation( "all", @@ -808,7 +813,7 @@ def _initialize_aggregation( ) final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value) - if agg.name not in ["min", "max", "nanmin", "nanmax"]: + if agg.name not in ["first", "last", "nanfirst", "nanlast", "min", "max", "nanmin", "nanmax"]: final_dtype = _maybe_promote_int(final_dtype) agg.dtype = { "user": dtype, # Save to automatically choose an engine diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index 78ba5101..bad498b1 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -100,6 +100,7 @@ def get_pos_infinity(dtype, max_for_int=False): if issubclass(dtype.type, np.integer): if max_for_int: + dtype = np.int64 if dtype.kind in "Mm" else dtype return np.iinfo(dtype).max else: return np.inf diff --git a/tests/__init__.py b/tests/__init__.py index bdbb33be..07cd7ae5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -101,17 +101,26 @@ def assert_equal(a, b, tolerance=None): else: tolerance = {} - if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type): - # sometimes it's nice to see values and shapes - # rather than being dropped into some file in dask - np.testing.assert_allclose(a, b, **tolerance) - # does some validation of the dask graph - da.utils.assert_eq(a, b, equal_nan=True) + # Always run the numpy comparison first, so that we get nice error messages with dask. 
+ # sometimes it's nice to see values and shapes + # rather than being dropped into some file in dask + if a.dtype != b.dtype: + raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})") + + if has_dask: + a_eager = a.compute() if isinstance(a, dask_array_type) else a + b_eager = b.compute() if isinstance(b, dask_array_type) else b else: - if a.dtype != b.dtype: - raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})") + a_eager, b_eager = a, b - np.testing.assert_allclose(a, b, equal_nan=True, **tolerance) + if a.dtype.kind in "SUMm": + np.testing.assert_equal(a_eager, b_eager) + else: + np.testing.assert_allclose(a_eager, b_eager, equal_nan=True, **tolerance) + + if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type): + # does some validation of the dask graph + dask_assert_eq(a, b, equal_nan=True) def assert_equal_tuple(a, b): @@ -119,7 +128,7 @@ def assert_equal_tuple(a, b): assert len(a) == len(b) for a_, b_ in zip(a, b): - assert type(a_) == type(b_) + assert type(a_) is type(b_) if isinstance(a_, np.ndarray): np.testing.assert_array_equal(a_, b_) else: @@ -156,3 +165,91 @@ def assert_equal_tuple(a, b): "quantile", "nanquantile", ) + tuple(SCIPY_STATS_FUNCS) + + +def dask_assert_eq( + a, + b, + check_shape=True, + check_graph=True, + check_meta=True, + check_chunks=True, + check_ndim=True, + check_type=True, + check_dtype=True, + equal_nan=True, + scheduler="sync", + **kwargs, +): + """dask.array.utils.assert_eq modified to skip value checks. Their code is buggy for some dtypes. + We just check values through numpy and care about validating the graph in this function.""" + from dask.array.utils import _get_dt_meta_computed + + a_original = a + b_original = b + + if isinstance(a, (list, int, float)): + a = np.array(a) + if isinstance(b, (list, int, float)): + b = np.array(b) + + a, adt, a_meta, a_computed = _get_dt_meta_computed( + a, + check_shape=check_shape, + check_graph=check_graph, + check_chunks=check_chunks, + check_ndim=check_ndim, + scheduler=scheduler, + ) + b, bdt, b_meta, b_computed = _get_dt_meta_computed( + b, + check_shape=check_shape, + check_graph=check_graph, + check_chunks=check_chunks, + check_ndim=check_ndim, + scheduler=scheduler, + ) + + if check_type: + _a = a if a.shape else a.item() + _b = b if b.shape else b.item() + assert type(_a) is type(_b), f"a and b have different types (a: {type(_a)}, b: {type(_b)})" + if check_meta: + if hasattr(a, "_meta") and hasattr(b, "_meta"): + dask_assert_eq(a._meta, b._meta) + if hasattr(a_original, "_meta"): + msg = ( + f"compute()-ing 'a' changes its number of dimensions " + f"(before: {a_original._meta.ndim}, after: {a.ndim})" + ) + assert a_original._meta.ndim == a.ndim, msg + if a_meta is not None: + msg = ( + f"compute()-ing 'a' changes its type " + f"(before: {type(a_original._meta)}, after: {type(a_meta)})" + ) + assert type(a_original._meta) is type(a_meta), msg + if not (np.isscalar(a_meta) or np.isscalar(a_computed)): + msg = ( + f"compute()-ing 'a' results in a different type than implied by its metadata " + f"(meta: {type(a_meta)}, computed: {type(a_computed)})" + ) + assert type(a_meta) is type(a_computed), msg + if hasattr(b_original, "_meta"): + msg = ( + f"compute()-ing 'b' changes its number of dimensions " + f"(before: {b_original._meta.ndim}, after: {b.ndim})" + ) + assert b_original._meta.ndim == b.ndim, msg + if b_meta is not None: + msg = ( + f"compute()-ing 'b' changes its type " + f"(before: {type(b_original._meta)}, 
after: {type(b_meta)})" + ) + assert type(b_original._meta) is type(b_meta), msg + if not (np.isscalar(b_meta) or np.isscalar(b_computed)): + msg = ( + f"compute()-ing 'b' results in a different type than implied by its metadata " + f"(meta: {type(b_meta)}, computed: {type(b_computed)})" + ) + assert type(b_meta) is type(b_computed), msg diff --git a/tests/test_properties.py b/tests/test_properties.py index 0d75e2a3..b74264b8 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -59,6 +59,12 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: func_st = st.sampled_from( [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] ) +numeric_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st +) +all_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() +) def by_arrays(shape): @@ -81,13 +87,7 @@ def not_overflowing_array(array) -> bool: return result -@given( - array=npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st - ), - dtype=by_dtype_st, - func=func_st, -) +@given(array=numeric_arrays, dtype=by_dtype_st, func=func_st) def test_groupby_reduce(array, dtype, func): # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) @@ -149,17 +149,7 @@ def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: @st.composite -def chunked_arrays( - draw, - *, - chunks=chunks, - arrays=npst.arrays( - elements={"allow_subnormal": False}, - shape=npst.array_shapes(max_side=10), - dtype=array_dtype_st, - ), - from_array=dask.array.from_array, -): +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): array = draw(arrays) chunks = draw(chunks(shape=array.shape)) @@ -216,6 +206,7 @@ def test_scans(data, array, func): @given(data=st.data(), array=chunked_arrays()) def test_ffill_bfill_reverse(data, array): + # TODO: test NaT and timedelta, datetime assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -230,3 +221,38 @@ def reverse(arr): backward = groupby_scan(a, by, func="bfill") forward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="ffill")) assert_equal(forward_reversed, backward) + + +@given( + data=st.data(), + array=chunked_arrays(arrays=all_arrays), + func=st.sampled_from(["first", "last", "nanfirst", "nanlast"]), +) +def test_first_last(data, array, func): + by = data.draw(by_arrays(shape=(array.shape[-1],))) + + INVERSES = {"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"} + MATES = {"first": "nanfirst", "last": "nanlast", "nanfirst": "first", "nanlast": "last"} + inverse = INVERSES[func] + mate = MATES[func] + + if func in ["first", "last"]: + array = array.rechunk((*array.chunks[:-1], -1)) + + for arr in [array, array.compute()]: + forward, fg = groupby_reduce(arr, by, func=func, engine="flox") + reverse, rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") + + assert forward.dtype == reverse.dtype + assert forward.dtype == arr.dtype + + assert_equal(fg, rg) + assert_equal(forward, reverse) + + if arr.dtype.kind == "f" and not np.isnan(array.compute()).any(): + if mate in ["first", "last"]: + array = array.rechunk((*array.chunks[:-1], -1)) + + first, _ = groupby_reduce(array, by, func=func, engine="flox") + second, _ = groupby_reduce(array, by, func=mate, 
engine="flox") + assert_equal(first, second) From 22140ebaf1a353f4292ebf2fba4807756533f5bd Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 26 Jul 2024 20:32:35 -0600 Subject: [PATCH 2/7] Add more array types for property tests --- tests/__init__.py | 2 +- tests/test_properties.py | 141 +++++++++++++++++++++++++++------------ 2 files changed, 101 insertions(+), 42 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 07cd7ae5..3c3ba1da 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -113,7 +113,7 @@ def assert_equal(a, b, tolerance=None): else: a_eager, b_eager = a, b - if a.dtype.kind in "SUMm": + if a.dtype.kind in "SUMmO": np.testing.assert_equal(a_eager, b_eager) else: np.testing.assert_allclose(a_eager, b_eager, equal_nan=True, **tolerance) diff --git a/tests/test_properties.py b/tests/test_properties.py index b74264b8..8bb5abde 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -3,7 +3,9 @@ pytest.importorskip("hypothesis") pytest.importorskip("dask") +pytest.importorskip("cftime") +import cftime import dask import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -66,11 +68,55 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() ) +calendars = st.sampled_from( + [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "365_day", + "360_day", + "julian", + "all_leap", + "366_day", + ] +) + + +@st.composite +def units(draw, *, calendar: str): + choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] + if calendar == "360_day": + choices += ["months"] + elif calendar == "noleap": + choices += ["common_years"] + time_units = draw(st.sampled_from(choices)) + + dt = draw(st.datetimes()) + year, month, day = dt.year, dt.month, dt.day + if calendar == "360_day": + month %= 30 + return f"{time_units} since {year}-{month}-{day}" -def by_arrays(shape): - return npst.arrays( - dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), - shape=shape, + +@st.composite +def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): + if elements is None: + elements = {"min_value": -10_000, "max_value": 10_000} + cal = draw(calendars) + values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) + unit = draw(units(calendar=cal)) + return cftime.num2date(values, units=unit, calendar=cal) + + +def by_arrays(shape, *, elements=None): + return st.one_of( + npst.arrays( + dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), + shape=shape, + elements=elements, + ), + cftime_arrays(shape=shape, elements=elements), ) @@ -87,8 +133,43 @@ def not_overflowing_array(array) -> bool: return result -@given(array=numeric_arrays, dtype=by_dtype_st, func=func_st) -def test_groupby_reduce(array, dtype, func): +@st.composite +def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + chunks = [] + for size in shape: + if size > 1: + nchunks = draw(st.integers(min_value=1, max_value=size - 1)) + dividers = sorted( + set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) + ) + chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) + else: + chunks.append((1,)) + return tuple(chunks) + + +@st.composite +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): + array = draw(arrays) + chunks = 
draw(chunks(shape=array.shape)) + + if array.dtype.kind in "cf": + nan_idx = draw( + st.lists( + st.integers(min_value=0, max_value=array.shape[-1] - 1), + max_size=array.shape[-1] - 1, + unique=True, + ) + ) + if nan_idx: + array[..., nan_idx] = np.nan + + return from_array(array, chunks=chunks) + + +# TODO: migrate to by_arrays but with constant value +@given(data=st.data(), array=numeric_arrays, func=func_st) +def test_groupby_reduce(data, array, func): # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) # TODO: fix var for complex numbers upstream @@ -97,7 +178,19 @@ def test_groupby_reduce(array, dtype, func): assume("arg" not in func and not np.any(np.isnan(array).ravel())) axis = -1 - by = np.ones((array.shape[-1],), dtype=dtype) + by = data.draw( + by_arrays( + elements={ + "alphabet": st.just("a"), + "min_value": 1, + "max_value": 1, + "min_size": 1, + "max_size": 1, + }, + shape=array.shape[-1], + ) + ) + assert len(np.unique(by)) == 1 kwargs = {"q": 0.8} if "quantile" in func else {} flox_kwargs = {} with np.errstate(invalid="ignore", divide="ignore"): @@ -133,40 +226,6 @@ def test_groupby_reduce(array, dtype, func): assert_equal(expected, actual, tolerance) -@st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: - chunks = [] - for size in shape: - if size > 1: - nchunks = draw(st.integers(min_value=1, max_value=size - 1)) - dividers = sorted( - set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) - ) - chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) - else: - chunks.append((1,)) - return tuple(chunks) - - -@st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): - array = draw(arrays) - chunks = draw(chunks(shape=array.shape)) - - if array.dtype.kind in "cf": - nan_idx = draw( - st.lists( - st.integers(min_value=0, max_value=array.shape[-1] - 1), - max_size=array.shape[-1] - 1, - unique=True, - ) - ) - if nan_idx: - array[..., nan_idx] = np.nan - - return from_array(array, chunks=chunks) - - @given( data=st.data(), array=chunked_arrays(), From 6fbd0fdac68ff591bc1f194fbf399590decb7be0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 26 Jul 2024 21:30:23 -0600 Subject: [PATCH 3/7] Refactor hypothesis strategies --- tests/strategies.py | 125 +++++++++++++++++++++++++++++++++++++++ tests/test_properties.py | 120 +------------------------------------ 2 files changed, 127 insertions(+), 118 deletions(-) create mode 100644 tests/strategies.py diff --git a/tests/strategies.py b/tests/strategies.py new file mode 100644 index 00000000..aa24bce9 --- /dev/null +++ b/tests/strategies.py @@ -0,0 +1,125 @@ +import cftime +import dask +import hypothesis.extra.numpy as npst +import hypothesis.strategies as st +import numpy as np + +from . 
import ALL_FUNCS, SCIPY_STATS_FUNCS + + +def supported_dtypes() -> st.SearchStrategy[np.dtype]: + return ( + npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=", sizes=(32, 64)) + | npst.complex_number_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") + | npst.unicode_string_dtypes(endianness="=") + ) + + +# TODO: stop excluding everything but U +array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU") +by_dtype_st = supported_dtypes() + +NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list( + SCIPY_STATS_FUNCS +) +SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"] + +func_st = st.sampled_from( + [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] +) +numeric_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st +) +all_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() +) + + +calendars = st.sampled_from( + [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "365_day", + "360_day", + "julian", + "all_leap", + "366_day", + ] +) + + +@st.composite +def units(draw, *, calendar: str): + choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] + if calendar == "360_day": + choices += ["months"] + elif calendar == "noleap": + choices += ["common_years"] + time_units = draw(st.sampled_from(choices)) + + dt = draw(st.datetimes()) + year, month, day = dt.year, dt.month, dt.day + if calendar == "360_day": + day = min(day, 30) + return f"{time_units} since {year}-{month}-{day}" + + +@st.composite +def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): + if elements is None: + elements = {"min_value": -10_000, "max_value": 10_000} + cal = draw(calendars) + values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) + unit = draw(units(calendar=cal)) + return cftime.num2date(values, units=unit, calendar=cal) + + +def by_arrays(shape, *, elements=None): + return st.one_of( + npst.arrays( + dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), + shape=shape, + elements=elements, + ), + cftime_arrays(shape=shape, elements=elements), + ) + + +@st.composite +def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + chunks = [] + for size in shape: + if size > 1: + nchunks = draw(st.integers(min_value=1, max_value=size - 1)) + dividers = sorted( + set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) + ) + chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) + else: + chunks.append((1,)) + return tuple(chunks) + + +@st.composite +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): + array = draw(arrays) + chunks = draw(chunks(shape=array.shape)) + + if array.dtype.kind in "cf": + nan_idx = draw( + st.lists( + st.integers(min_value=0, max_value=array.shape[-1] - 1), + max_size=array.shape[-1] - 1, + unique=True, + ) + ) + if nan_idx: + array[..., nan_idx] = np.nan + + return from_array(array, chunks=chunks) diff --git a/tests/test_properties.py b/tests/test_properties.py index 8bb5abde..cf9b410b 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -5,9 +5,7 @@ pytest.importorskip("dask") pytest.importorskip("cftime") -import cftime 
import dask -import hypothesis.extra.numpy as npst import hypothesis.strategies as st import numpy as np from hypothesis import assume, given, note @@ -15,7 +13,8 @@ import flox from flox.core import groupby_reduce, groupby_scan -from . import ALL_FUNCS, SCIPY_STATS_FUNCS, assert_equal +from . import assert_equal +from .strategies import all_arrays, by_arrays, chunked_arrays, func_st, numeric_arrays dask.config.set(scheduler="sync") @@ -32,10 +31,6 @@ def bfill(array, axis, dtype=None): )[::-1] -NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list( - SCIPY_STATS_FUNCS -) -SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"] NUMPY_SCAN_FUNCS = { "nancumsum": np.nancumsum, "ffill": ffill, @@ -43,83 +38,6 @@ def bfill(array, axis, dtype=None): } # "cumsum": np.cumsum, -def supported_dtypes() -> st.SearchStrategy[np.dtype]: - return ( - npst.integer_dtypes(endianness="=") - | npst.unsigned_integer_dtypes(endianness="=") - | npst.floating_dtypes(endianness="=", sizes=(32, 64)) - | npst.complex_number_dtypes(endianness="=") - | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") - | npst.unicode_string_dtypes(endianness="=") - ) - - -# TODO: stop excluding everything but U -array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU") -by_dtype_st = supported_dtypes() -func_st = st.sampled_from( - [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] -) -numeric_arrays = npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st -) -all_arrays = npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() -) - -calendars = st.sampled_from( - [ - "standard", - "gregorian", - "proleptic_gregorian", - "noleap", - "365_day", - "360_day", - "julian", - "all_leap", - "366_day", - ] -) - - -@st.composite -def units(draw, *, calendar: str): - choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] - if calendar == "360_day": - choices += ["months"] - elif calendar == "noleap": - choices += ["common_years"] - time_units = draw(st.sampled_from(choices)) - - dt = draw(st.datetimes()) - year, month, day = dt.year, dt.month, dt.day - if calendar == "360_day": - month %= 30 - return f"{time_units} since {year}-{month}-{day}" - - -@st.composite -def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): - if elements is None: - elements = {"min_value": -10_000, "max_value": 10_000} - cal = draw(calendars) - values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) - unit = draw(units(calendar=cal)) - return cftime.num2date(values, units=unit, calendar=cal) - - -def by_arrays(shape, *, elements=None): - return st.one_of( - npst.arrays( - dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), - shape=shape, - elements=elements, - ), - cftime_arrays(shape=shape, elements=elements), - ) - - def not_overflowing_array(array) -> bool: if array.dtype.kind == "f": info = np.finfo(array.dtype) @@ -133,40 +51,6 @@ def not_overflowing_array(array) -> bool: return result -@st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: - chunks = [] - for size in shape: - if size > 1: - nchunks = draw(st.integers(min_value=1, max_value=size - 1)) - dividers = sorted( - set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) - ) - chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + 
dividers))) - else: - chunks.append((1,)) - return tuple(chunks) - - -@st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): - array = draw(arrays) - chunks = draw(chunks(shape=array.shape)) - - if array.dtype.kind in "cf": - nan_idx = draw( - st.lists( - st.integers(min_value=0, max_value=array.shape[-1] - 1), - max_size=array.shape[-1] - 1, - unique=True, - ) - ) - if nan_idx: - array[..., nan_idx] = np.nan - - return from_array(array, chunks=chunks) - - # TODO: migrate to by_arrays but with constant value @given(data=st.data(), array=numeric_arrays, func=func_st) def test_groupby_reduce(data, array, func): From 39d8dcf992e4c1fdddb13ff32b9580742bd8d688 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 06:23:17 -0600 Subject: [PATCH 4/7] Type strategies --- tests/strategies.py | 41 +++++++++++++++++++++++++++++++++------- tests/test_properties.py | 35 +++++++++++++++++----------------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index aa24bce9..fcee1c8e 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from typing import Any, Callable + import cftime import dask import hypothesis.extra.numpy as npst @@ -6,6 +10,8 @@ from . import ALL_FUNCS, SCIPY_STATS_FUNCS +Chunks = tuple[tuple[int, ...], ...] + def supported_dtypes() -> st.SearchStrategy[np.dtype]: return ( @@ -38,7 +44,6 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() ) - calendars = st.sampled_from( [ "standard", @@ -55,7 +60,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: @st.composite -def units(draw, *, calendar: str): +def units(draw, *, calendar: str) -> str: choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] if calendar == "360_day": choices += ["months"] @@ -67,20 +72,36 @@ def units(draw, *, calendar: str): year, month, day = dt.year, dt.month, dt.day if calendar == "360_day": day = min(day, 30) + if calendar in ["360_day", "365_day", "noleap"] and month == 2 and day == 29: + day = 28 + return f"{time_units} since {year}-{month}-{day}" @st.composite -def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): +def cftime_arrays( + draw: st.DrawFn, + *, + shape: tuple[int, ...], + calendars: st.SearchStrategy[str] = calendars, + elements: dict[str, Any] | None = None, +) -> np.ndarray[Any, Any]: if elements is None: - elements = {"min_value": -10_000, "max_value": 10_000} + elements = {} + elements.setdefault("min_value", -10_000) + elements.setdefault("max_value", 10_000) cal = draw(calendars) values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) unit = draw(units(calendar=cal)) return cftime.num2date(values, units=unit, calendar=cal) -def by_arrays(shape, *, elements=None): +def by_arrays( + shape: tuple[int, ...], *, elements: dict[str, Any] | None = None +) -> st.SearchStrategy[np.ndarray[Any, Any]]: + if elements is None: + elements = {} + elements.setdefault("alphabet", st.characters(exclude_categories=("C",))) return st.one_of( npst.arrays( dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), @@ -92,7 +113,7 @@ def by_arrays(shape, *, elements=None): @st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: +def chunks(draw: st.DrawFn, *, shape: tuple[int, ...]) -> Chunks: chunks 
= [] for size in shape: if size > 1: @@ -107,7 +128,13 @@ def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: @st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): +def chunked_arrays( + draw: st.DrawFn, + *, + chunks: Callable[..., st.SearchStrategy[Chunks]] = chunks, + arrays=all_arrays, + from_array: Callable = dask.array.from_array, +) -> dask.array.Array: array = draw(arrays) chunks = draw(chunks(shape=array.shape)) diff --git a/tests/test_properties.py b/tests/test_properties.py index cf9b410b..0fadf87d 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,3 +1,5 @@ +from typing import Any, Callable + import pandas as pd import pytest @@ -14,7 +16,7 @@ from flox.core import groupby_reduce, groupby_scan from . import assert_equal -from .strategies import all_arrays, by_arrays, chunked_arrays, func_st, numeric_arrays +from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays dask.config.set(scheduler="sync") @@ -31,14 +33,14 @@ def bfill(array, axis, dtype=None): )[::-1] -NUMPY_SCAN_FUNCS = { +NUMPY_SCAN_FUNCS: dict[str, Callable] = { "nancumsum": np.nancumsum, "ffill": ffill, "bfill": bfill, } # "cumsum": np.cumsum, -def not_overflowing_array(array) -> bool: +def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: if array.dtype.kind == "f": info = np.finfo(array.dtype) elif array.dtype.kind in ["i", "u"]: @@ -51,9 +53,8 @@ def not_overflowing_array(array) -> bool: return result -# TODO: migrate to by_arrays but with constant value @given(data=st.data(), array=numeric_arrays, func=func_st) -def test_groupby_reduce(data, array, func): +def test_groupby_reduce(data, array, func: str) -> None: # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) # TODO: fix var for complex numbers upstream @@ -71,14 +72,14 @@ def test_groupby_reduce(data, array, func): "min_size": 1, "max_size": 1, }, - shape=array.shape[-1], + shape=(array.shape[-1],), ) ) assert len(np.unique(by)) == 1 kwargs = {"q": 0.8} if "quantile" in func else {} - flox_kwargs = {} + flox_kwargs: dict[str, Any] = {} with np.errstate(invalid="ignore", divide="ignore"): - actual, _ = groupby_reduce( + actual, *_ = groupby_reduce( array, by, func=func, axis=axis, engine="numpy", **flox_kwargs, finalize_kwargs=kwargs ) @@ -112,10 +113,10 @@ def test_groupby_reduce(data, array, func): @given( data=st.data(), - array=chunked_arrays(), + array=chunked_arrays(arrays=numeric_arrays), func=st.sampled_from(tuple(NUMPY_SCAN_FUNCS)), ) -def test_scans(data, array, func): +def test_scans(data, array: dask.array.Array, func: str) -> None: assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -148,7 +149,7 @@ def test_scans(data, array, func): @given(data=st.data(), array=chunked_arrays()) -def test_ffill_bfill_reverse(data, array): +def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None: # TODO: test NaT and timedelta, datetime assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -168,10 +169,10 @@ def reverse(arr): @given( data=st.data(), - array=chunked_arrays(arrays=all_arrays), + array=chunked_arrays(), func=st.sampled_from(["first", "last", "nanfirst", "nanlast"]), ) -def test_first_last(data, array, func): +def test_first_last(data, array: dask.array.Array, func: str) -> None: by = data.draw(by_arrays(shape=(array.shape[-1],))) INVERSES = 
{"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"} @@ -183,8 +184,8 @@ def test_first_last(data, array, func): array = array.rechunk((*array.chunks[:-1], -1)) for arr in [array, array.compute()]: - forward, fg = groupby_reduce(arr, by, func=func, engine="flox") - reverse, rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") + forward, *fg = groupby_reduce(arr, by, func=func, engine="flox") + reverse, *rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") assert forward.dtype == reverse.dtype assert forward.dtype == arr.dtype @@ -196,6 +197,6 @@ def test_first_last(data, array, func): if mate in ["first", "last"]: array = array.rechunk((*array.chunks[:-1], -1)) - first, _ = groupby_reduce(array, by, func=func, engine="flox") - second, _ = groupby_reduce(array, by, func=mate, engine="flox") + first, *_ = groupby_reduce(array, by, func=func, engine="flox") + second, *_ = groupby_reduce(array, by, func=mate, engine="flox") assert_equal(first, second) From f752309789664bc30ab9ae889e779f757abce360 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 21:56:50 -0600 Subject: [PATCH 5/7] Fix overflowing skipping to handle NaNs Remove from ffill, bfill Fix fill_value for datetime64 isnan to isnull --- flox/aggregate_flox.py | 2 +- flox/aggregations.py | 6 +++++- flox/xrdtypes.py | 5 +++-- tests/test_properties.py | 8 +++++--- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index 31caa849..7174552c 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -239,7 +239,7 @@ def ffill(group_idx, array, *, axis, **kwargs): (group_starts,) = flag.nonzero() # https://stackoverflow.com/questions/41190852/most-efficient-way-to-forward-fill-nan-values-in-numpy-array - mask = np.isnan(array) + mask = isnull(array) # modified from the SO answer, just reset the index at the start of every group! mask[..., np.asarray(group_starts)] = False diff --git a/flox/aggregations.py b/flox/aggregations.py index c69bbc2e..63abb0ea 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -158,7 +158,11 @@ def _get_fill_value(dtype, fill_value): return np.nan # This is madness, but npg checks that fill_value is compatible # with array dtype even if the fill_value is never used. - elif np.issubdtype(dtype, np.integer): + elif ( + np.issubdtype(dtype, np.integer) + or np.issubdtype(dtype, np.timedelta64) + or np.issubdtype(dtype, np.datetime64) + ): return dtypes.get_neg_infinity(dtype, min_for_int=True) else: return None diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index bad498b1..2d6ce369 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -125,8 +125,9 @@ def get_neg_infinity(dtype, min_for_int=False): fill_value : positive infinity value corresponding to this dtype. """ - if np.issubdtype(dtype, (np.timedelta64, np.datetime64)): - return dtype.type(np.iinfo(np.int64).min + 1) + if is_datetime_like(dtype): + unit, _ = np.datetime_data(dtype) + return dtype.type(np.iinfo(np.int64).min + 1, unit) if issubclass(dtype.type, np.floating): return -np.inf diff --git a/tests/test_properties.py b/tests/test_properties.py index 0fadf87d..1f495a36 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -14,6 +14,7 @@ import flox from flox.core import groupby_reduce, groupby_scan +from flox.xrutils import notnull from . 
import assert_equal from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays @@ -48,6 +49,8 @@ def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: else: return True + array = array.ravel() + array = array[notnull(array)] result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) # note(f"returning {result}, {array.min()} vs {info.min}, {array.max()} vs {info.max}") return result @@ -117,7 +120,8 @@ def test_groupby_reduce(data, array, func: str) -> None: func=st.sampled_from(tuple(NUMPY_SCAN_FUNCS)), ) def test_scans(data, array: dask.array.Array, func: str) -> None: - assume(not_overflowing_array(np.asarray(array))) + if "cum" in func: + assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) axis = array.ndim - 1 @@ -150,8 +154,6 @@ def test_scans(data, array: dask.array.Array, func: str) -> None: @given(data=st.data(), array=chunked_arrays()) def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None: - # TODO: test NaT and timedelta, datetime - assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) def reverse(arr): From 84cb5d8d3b14fe94634aaebbf13ff1a0a31d6d2b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 22:11:37 -0600 Subject: [PATCH 6/7] Fix ffill, bfill bugs --- flox/aggregations.py | 4 ++-- flox/core.py | 35 ++++++++++++++++++++--------------- tests/test_core.py | 30 +++++++++++++++++++++++++++--- tests/test_properties.py | 21 +++++++++++++-------- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 63abb0ea..98d91622 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -654,8 +654,8 @@ def __post_init__(self): def reverse(a: AlignedArrays) -> AlignedArrays: - a.group_idx = a.group_idx[::-1] - a.array = a.array[::-1] + a.group_idx = a.group_idx[..., ::-1] + a.array = a.array[..., ::-1] return a diff --git a/flox/core.py b/flox/core.py index 307345c0..9e3dee3d 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2788,20 +2788,29 @@ def groupby_scan( if by_.shape[-1] == 1 or by_.shape == grp_shape: return array.astype(agg.dtype) + # Made a design choice here to have `preprocess` handle both array and group_idx + # Example: for reversing, we need to reverse the whole array, not just reverse + # each block independently + inp = AlignedArrays(array=array, group_idx=by_) + if agg.preprocess: + inp = agg.preprocess(inp) + if not has_dask: - final_state = chunk_scan( - AlignedArrays(array=array, group_idx=by_), axis=single_axis, agg=agg, dtype=agg.dtype - ) - return extract_array(final_state) + final_state = chunk_scan(inp, axis=single_axis, agg=agg, dtype=agg.dtype) + result = _finalize_scan(final_state) else: - return dask_groupby_scan(array, by_, axes=axis_, agg=agg) + result = dask_groupby_scan(inp.array, inp.group_idx, axes=axis_, agg=agg) + + # Made a design choice here to have `postprocess` handle both array and group_idx + out = AlignedArrays(array=result, group_idx=by_) + if agg.finalize: + out = agg.finalize(out) + return out.array def chunk_scan(inp: AlignedArrays, *, axis: int, agg: Scan, dtype=None, keepdims=None) -> ScanState: assert axis == inp.array.ndim - 1 - if agg.preprocess: - inp = agg.preprocess(inp) # I don't think we need to re-factorize here unless we are grouping by a dask array accumulated = generic_aggregate( inp.group_idx, @@ -2813,8 +2822,6 @@ def chunk_scan(inp: AlignedArrays, *, axis: int, agg: Scan, 
dtype=None, keepdims fill_value=agg.identity, ) result = AlignedArrays(array=accumulated, group_idx=inp.group_idx) - if agg.finalize: - result = agg.finalize(result) return ScanState(result=result, state=None) @@ -2840,10 +2847,9 @@ def _zip(group_idx: np.ndarray, array: np.ndarray) -> AlignedArrays: return AlignedArrays(group_idx=group_idx, array=array) -def extract_array(block: ScanState, finalize: Callable | None = None) -> np.ndarray: +def _finalize_scan(block: ScanState) -> np.ndarray: assert block.result is not None - result = finalize(block.result) if finalize is not None else block.result - return result.array + return block.result.array def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: @@ -2859,9 +2865,8 @@ def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: array, by = _unify_chunks(array, by) # 1. zip together group indices & array - to_map = _zip if agg.preprocess is None else tlz.compose(agg.preprocess, _zip) zipped = map_blocks( - to_map, by, array, dtype=array.dtype, meta=array._meta, name="groupby-scan-preprocess" + _zip, by, array, dtype=array.dtype, meta=array._meta, name="groupby-scan-preprocess" ) scan_ = partial(chunk_scan, agg=agg) @@ -2882,7 +2887,7 @@ def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: ) # 3. Unzip and extract the final result array, discard groups - result = map_blocks(extract_array, accumulated, dtype=agg.dtype, finalize=agg.finalize) + result = map_blocks(_finalize_scan, accumulated, dtype=agg.dtype) assert result.chunks == array.chunks diff --git a/tests/test_core.py b/tests/test_core.py index 41e77461..d0f1fe94 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -187,11 +187,11 @@ def test_groupby_reduce( assert_equal(expected_result, result) -def gen_array_by(size, func): +def gen_array_by(size, func: str): by = np.ones(size[-1]) rng = np.random.default_rng(12345) array = rng.random(tuple(6 if s == 1 else s for s in size)) - if "nan" in func and "nanarg" not in func: + if ("nan" in func or "fill" in func) and "nanarg" not in func: array[[1, 4, 5], ...] 
= np.nan
     elif "nanarg" in func and len(size) > 1:
         array[[1, 4, 5], 1] = np.nan
@@ -1810,7 +1810,7 @@ def test_nanlen_string(dtype, engine):
     assert_equal(expected, actual)
 
 
-def test_scans():
+def test_cumsum():
     array = np.array([1, 1, 1], dtype=np.uint64)
     by = np.array([0] * array.shape[-1])
     kwargs = {"func": "nancumsum", "axis": -1}
@@ -1823,3 +1823,27 @@
     da = dask.array.from_array(array, chunks=2)
     actual = groupby_scan(da, by, **kwargs)
     assert_equal(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "chunks",
+    [
+        pytest.param(-1, marks=requires_dask),
+        pytest.param(3, marks=requires_dask),
+        pytest.param(4, marks=requires_dask),
+    ],
+)
+@pytest.mark.parametrize("size", ((1, 12), (12,), (12, 9)))
+@pytest.mark.parametrize("add_nan_by", [True, False])
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_ffill_bfill(chunks, size, add_nan_by, func):
+    array, by = gen_array_by(size, func)
+    if chunks:
+        array = dask.array.from_array(array, chunks=chunks)
+    if add_nan_by:
+        by[0:3] = np.nan
+    by = tuple(by)
+
+    expected = flox.groupby_scan(array.compute(), by, func=func)
+    actual = flox.groupby_scan(array, by, func=func)
+    assert_equal(expected, actual)
diff --git a/tests/test_properties.py b/tests/test_properties.py
index 1f495a36..85fe4692 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -159,14 +159,19 @@ def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None:
     def reverse(arr):
         return arr[..., ::-1]
 
-    for a in (array, array.compute()):
-        forward = groupby_scan(a, by, func="ffill")
-        backward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="bfill"))
-        assert_equal(forward, backward_reversed)
-
-        backward = groupby_scan(a, by, func="bfill")
-        forward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="ffill"))
-        assert_equal(forward_reversed, backward)
+    forward = groupby_scan(array, by, func="ffill")
+    as_numpy = groupby_scan(array.compute(), by, func="ffill")
+    assert_equal(forward, as_numpy)
+
+    backward = groupby_scan(array, by, func="bfill")
+    as_numpy = groupby_scan(array.compute(), by, func="bfill")
+    assert_equal(backward, as_numpy)
+
+    backward_reversed = reverse(groupby_scan(reverse(array), reverse(by), func="bfill"))
+    assert_equal(forward, backward_reversed)
+
+    forward_reversed = reverse(groupby_scan(reverse(array), reverse(by), func="ffill"))
+    assert_equal(forward_reversed, backward)

From b30ea7512b551943243e3f06ab77df7b65a123ef Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Sun, 28 Jul 2024 06:37:35 -0600
Subject: [PATCH 7/7] Testing improvements. 1. Record duration 2. Speed up some tests 3. 
Silence logging --- .github/workflows/ci.yaml | 2 +- flox/aggregations.py | 4 ++-- flox/core.py | 2 +- tests/conftest.py | 12 ++++++++++ tests/test_core.py | 2 +- tests/test_properties.py | 5 ++++- tests/test_xarray.py | 47 +++++++++++++++++++++++---------------- 7 files changed, 49 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6c478b86..f3633fc6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -62,7 +62,7 @@ jobs: - name: Run Tests id: status run: | - pytest -n auto --cov=./ --cov-report=xml --hypothesis-profile ci + pytest --durations=20 --durations-min=0.5 -n auto --cov=./ --cov-report=xml --hypothesis-profile ci - name: Upload code coverage to Codecov uses: codecov/codecov-action@v4.5.0 with: diff --git a/flox/aggregations.py b/flox/aggregations.py index 98d91622..51e650a6 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -76,7 +76,7 @@ def generic_aggregate( try: method = getattr(aggregate_flox, func) except AttributeError: - logger.debug(f"Couldn't find {func} for engine='flox'. Falling back to numpy") + # logger.debug(f"Couldn't find {func} for engine='flox'. Falling back to numpy") method = get_npg_aggregation(func, engine="numpy") elif engine == "numbagg": @@ -94,7 +94,7 @@ def generic_aggregate( method = getattr(aggregate_numbagg, func) except AttributeError: - logger.debug(f"Couldn't find {func} for engine='numbagg'. Falling back to numpy") + # logger.debug(f"Couldn't find {func} for engine='numbagg'. Falling back to numpy") method = get_npg_aggregation(func, engine="numpy") elif engine in ["numpy", "numba"]: diff --git a/flox/core.py b/flox/core.py index 9e3dee3d..8eaf54b7 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2011,7 +2011,7 @@ def _validate_reindex( reindex = True assert isinstance(reindex, bool) - logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa + # logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa return reindex diff --git a/tests/conftest.py b/tests/conftest.py index 1ccc20a1..b3a0ab93 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,3 +28,15 @@ ) def engine(request): return request.param + + +@pytest.fixture( + scope="module", + params=[ + "flox", + "numpy", + pytest.param("numbagg", marks=requires_numbagg), + ], +) +def engine_no_numba(request): + return request.param diff --git a/tests/test_core.py b/tests/test_core.py index d0f1fe94..e12e695d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -187,7 +187,7 @@ def test_groupby_reduce( assert_equal(expected_result, result) -def gen_array_by(size, func: str): +def gen_array_by(size, func): by = np.ones(size[-1]) rng = np.random.default_rng(12345) array = rng.random(tuple(6 if s == 1 else s for s in size)) diff --git a/tests/test_properties.py b/tests/test_properties.py index 85fe4692..26150519 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Callable import pandas as pd @@ -51,7 +52,9 @@ def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: array = array.ravel() array = array[notnull(array)] - result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) # note(f"returning {result}, {array.min()} vs {info.min}, {array.max()} vs {info.max}") return 
result diff --git a/tests/test_xarray.py b/tests/test_xarray.py index ecd63bbf..11b2e23c 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -22,13 +22,9 @@ dask.config.set(scheduler="sync") -try: - # test against legacy xarray implementation - xr.set_options(use_flox=False) -except ValueError: - pass - - +# test against legacy xarray implementation +# avoid some compilation overhead +xr.set_options(use_flox=False, use_numbagg=False) tolerance64 = {"rtol": 1e-15, "atol": 1e-18} np.random.seed(123) @@ -37,7 +33,8 @@ @pytest.mark.parametrize("min_count", [None, 1, 3]) @pytest.mark.parametrize("add_nan", [True, False]) @pytest.mark.parametrize("skipna", [True, False]) -def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): +def test_xarray_reduce(skipna, add_nan, min_count, engine_no_numba, reindex): + engine = engine_no_numba if skipna is False and min_count is not None: pytest.skip() @@ -57,7 +54,13 @@ def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): expected = da.groupby("labels").sum(skipna=skipna, min_count=min_count) actual = xarray_reduce( - da, "labels", func="sum", skipna=skipna, min_count=min_count, engine=engine, reindex=reindex + da, + "labels", + func="sum", + skipna=skipna, + min_count=min_count, + engine=engine, + reindex=reindex, ) assert_equal(expected, actual) @@ -85,9 +88,10 @@ def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): # TODO: sort @pytest.mark.parametrize("pass_expected_groups", [True, False]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): +def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine_no_numba): if chunk and pass_expected_groups is False: pytest.skip() + engine = engine_no_numba arr = np.ones((4, 12)) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) @@ -131,9 +135,10 @@ def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): @pytest.mark.parametrize("pass_expected_groups", [True, False]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine): +def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine_no_numba): if chunk and pass_expected_groups is False: pytest.skip() + engine = engine_no_numba arr = np.ones((2, 12)) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) @@ -187,7 +192,8 @@ def test_validate_expected_groups(expected_groups): @requires_cftime @requires_dask -def test_xarray_reduce_single_grouper(engine): +def test_xarray_reduce_single_grouper(engine_no_numba): + engine = engine_no_numba # DataArray ds = xr.Dataset( {"Tair": (("time", "x", "y"), dask.array.ones((36, 205, 275), chunks=(9, -1, -1)))}, @@ -293,7 +299,8 @@ def test_rechunk_for_blockwise(inchunks, expected): # TODO: dim=None, dim=Ellipsis, groupby unindexed dim -def test_groupby_duplicate_coordinate_labels(engine): +def test_groupby_duplicate_coordinate_labels(engine_no_numba): + engine = engine_no_numba # fix for http://stackoverflow.com/questions/38065129 array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])]) expected = xr.DataArray([3, 3], [("x", [1, 2])]) @@ -301,7 +308,8 @@ def test_groupby_duplicate_coordinate_labels(engine): assert_equal(expected, actual) -def test_multi_index_groupby_sum(engine): +def test_multi_index_groupby_sum(engine_no_numba): + engine = 
engine_no_numba # regression test for xarray GH873 ds = xr.Dataset( {"foo": (("x", "y", "z"), np.ones((3, 4, 2)))}, @@ -327,7 +335,8 @@ def test_multi_index_groupby_sum(engine): @pytest.mark.parametrize("chunks", (None, pytest.param(2, marks=requires_dask))) -def test_xarray_groupby_bins(chunks, engine): +def test_xarray_groupby_bins(chunks, engine_no_numba): + engine = engine_no_numba array = xr.DataArray([1, 1, 1, 1, 1], dims="x") labels = xr.DataArray([1, 1.5, 1.9, 2, 3], dims="x", name="labels") @@ -495,11 +504,11 @@ def test_alignment_error(): @pytest.mark.parametrize("dtype_out", [np.float64, "float64", np.dtype("float64")]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_dtype(add_nan, chunk, dtype, dtype_out, engine): - if engine == "numbagg": +def test_dtype(add_nan, chunk, dtype, dtype_out, engine_no_numba): + if engine_no_numba == "numbagg": # https://github.com/numbagg/numbagg/issues/121 pytest.skip() - + engine = engine_no_numba xp = dask.array if chunk else np data = xp.linspace(0, 1, 48, dtype=dtype).reshape((4, 12)) @@ -707,7 +716,7 @@ def test_multiple_quantiles(q, chunk, by_ndim, skipna): da = xr.DataArray(array, dims=("x", *dims)) by = xr.DataArray(labels, dims=dims, name="by") - actual = xarray_reduce(da, by, func="quantile", skipna=skipna, q=q) + actual = xarray_reduce(da, by, func="quantile", skipna=skipna, q=q, engine="flox") with xr.set_options(use_flox=False): expected = da.groupby(by).quantile(q, skipna=skipna) xr.testing.assert_allclose(expected, actual)
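
Note on the invariant exercised by test_first_last (this note is not part of the patches above): "first" over an array must equal "last" over the reversed array with reversed labels, and both must preserve the input dtype, which is why patch 1 exempts first/last/nanfirst/nanlast from _maybe_promote_int. A minimal standalone sketch of that duality follows, assuming numpy, dask, and a flox build containing these patches; the helper name check_first_last_duality is illustrative only and not part of flox's API.

import dask.array
import numpy as np

from flox.core import groupby_reduce

INVERSES = {"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"}


def check_first_last_duality(array, by, func):
    # Reducing with `func` must match reducing the reversed array
    # (and reversed labels) with the inverse function.
    inverse = INVERSES[func]
    forward, forward_groups = groupby_reduce(array, by, func=func, engine="flox")
    reverse, reverse_groups = groupby_reduce(
        array[..., ::-1], by[..., ::-1], func=inverse, engine="flox"
    )
    np.testing.assert_array_equal(forward_groups, reverse_groups)
    # assert_array_equal treats NaNs at matching positions as equal, so
    # this check also covers the plain first/last variants.
    np.testing.assert_array_equal(forward, reverse)
    # first/last must preserve the input dtype (the promotion exemption
    # added in patch 1).
    assert forward.dtype == array.dtype


array = np.array([[1.0, np.nan, 3.0, 4.0], [5.0, 6.0, np.nan, 8.0]])
by = np.array([0, 0, 1, 1])
check_first_last_duality(array, by, func="nanfirst")

# Plain first/last on dask arrays need a single chunk along the reduced
# axis, mirroring the rechunk in test_first_last.
darr = dask.array.from_array(array, chunks=(1, 4))
check_first_last_duality(darr, by, func="first")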