From 6a7b9324b82f098e603157bd4f154555b0b11c28 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 24 Jul 2024 22:11:36 -0600 Subject: [PATCH 1/7] Add first, last property test Closes #29 --- flox/aggregations.py | 15 +++-- flox/xrdtypes.py | 1 + tests/__init__.py | 117 +++++++++++++++++++++++++++++++++++---- tests/test_properties.py | 62 +++++++++++++++------ 4 files changed, 162 insertions(+), 33 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 0de8d5ff..c69bbc2e 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -69,6 +69,9 @@ def generic_aggregate( if func == "identity": return array + if func in ["nanfirst", "nanlast"] and array.dtype.kind in "US": + func = func[3:] + if engine == "flox": try: method = getattr(aggregate_flox, func) @@ -144,6 +147,8 @@ def _maybe_promote_int(dtype) -> np.dtype: def _get_fill_value(dtype, fill_value): """Returns dtype appropriate infinity. Returns +Inf equivalent for None.""" + if fill_value in [None, dtypes.NA] and dtype.kind in "US": + return "" if fill_value == dtypes.INF or fill_value is None: return dtypes.get_pos_infinity(dtype, max_for_int=True) if fill_value == dtypes.NINF: @@ -516,10 +521,10 @@ def _pick_second(*x): final_dtype=np.intp, ) -first = Aggregation("first", chunk=None, combine=None, fill_value=0) -last = Aggregation("last", chunk=None, combine=None, fill_value=0) -nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=np.nan) -nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=np.nan) +first = Aggregation("first", chunk=None, combine=None, fill_value=None) +last = Aggregation("last", chunk=None, combine=None, fill_value=None) +nanfirst = Aggregation("nanfirst", chunk="nanfirst", combine="nanfirst", fill_value=dtypes.NA) +nanlast = Aggregation("nanlast", chunk="nanlast", combine="nanlast", fill_value=dtypes.NA) all_ = Aggregation( "all", @@ -808,7 +813,7 @@ def _initialize_aggregation( ) final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value) - if agg.name not in ["min", "max", "nanmin", "nanmax"]: + if agg.name not in ["first", "last", "nanfirst", "nanlast", "min", "max", "nanmin", "nanmax"]: final_dtype = _maybe_promote_int(final_dtype) agg.dtype = { "user": dtype, # Save to automatically choose an engine diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index 78ba5101..bad498b1 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -100,6 +100,7 @@ def get_pos_infinity(dtype, max_for_int=False): if issubclass(dtype.type, np.integer): if max_for_int: + dtype = np.int64 if dtype.kind in "Mm" else dtype return np.iinfo(dtype).max else: return np.inf diff --git a/tests/__init__.py b/tests/__init__.py index bdbb33be..07cd7ae5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -101,17 +101,26 @@ def assert_equal(a, b, tolerance=None): else: tolerance = {} - if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type): - # sometimes it's nice to see values and shapes - # rather than being dropped into some file in dask - np.testing.assert_allclose(a, b, **tolerance) - # does some validation of the dask graph - da.utils.assert_eq(a, b, equal_nan=True) + # Always run the numpy comparison first, so that we get nice error messages with dask. 
+ # sometimes it's nice to see values and shapes + # rather than being dropped into some file in dask + if a.dtype != b.dtype: + raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})") + + if has_dask: + a_eager = a.compute() if isinstance(a, dask_array_type) else a + b_eager = b.compute() if isinstance(b, dask_array_type) else b else: - if a.dtype != b.dtype: - raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})") + a_eager, b_eager = a, b - np.testing.assert_allclose(a, b, equal_nan=True, **tolerance) + if a.dtype.kind in "SUMm": + np.testing.assert_equal(a_eager, b_eager) + else: + np.testing.assert_allclose(a_eager, b_eager, equal_nan=True, **tolerance) + + if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type): + # does some validation of the dask graph + dask_assert_eq(a, b, equal_nan=True) def assert_equal_tuple(a, b): @@ -119,7 +128,7 @@ def assert_equal_tuple(a, b): assert len(a) == len(b) for a_, b_ in zip(a, b): - assert type(a_) == type(b_) + assert type(a_) is type(b_) if isinstance(a_, np.ndarray): np.testing.assert_array_equal(a_, b_) else: @@ -156,3 +165,91 @@ def assert_equal_tuple(a, b): "quantile", "nanquantile", ) + tuple(SCIPY_STATS_FUNCS) + + +def dask_assert_eq( + a, + b, + check_shape=True, + check_graph=True, + check_meta=True, + check_chunks=True, + check_ndim=True, + check_type=True, + check_dtype=True, + equal_nan=True, + scheduler="sync", + **kwargs, +): + """dask.array.utils.assert_eq modified to skip value checks. Their code is buggy for some dtypes. + We just check values through numpy and care about validating the graph in this function.""" + from dask.array.utils import _get_dt_meta_computed + + a_original = a + b_original = b + + if isinstance(a, (list, int, float)): + a = np.array(a) + if isinstance(b, (list, int, float)): + b = np.array(b) + + a, adt, a_meta, a_computed = _get_dt_meta_computed( + a, + check_shape=check_shape, + check_graph=check_graph, + check_chunks=check_chunks, + check_ndim=check_ndim, + scheduler=scheduler, + ) + b, bdt, b_meta, b_computed = _get_dt_meta_computed( + b, + check_shape=check_shape, + check_graph=check_graph, + check_chunks=check_chunks, + check_ndim=check_ndim, + scheduler=scheduler, + ) + + if check_type: + _a = a if a.shape else a.item() + _b = b if b.shape else b.item() + assert type(_a) is type(_b), f"a and b have different types (a: {type(_a)}, b: {type(_b)})" + if check_meta: + if hasattr(a, "_meta") and hasattr(b, "_meta"): + dask_assert_eq(a._meta, b._meta) + if hasattr(a_original, "_meta"): + msg = ( + f"compute()-ing 'a' changes its number of dimensions " + f"(before: {a_original._meta.ndim}, after: {a.ndim})" + ) + assert a_original._meta.ndim == a.ndim, msg + if a_meta is not None: + msg = ( + f"compute()-ing 'a' changes its type " + f"(before: {type(a_original._meta)}, after: {type(a_meta)})" + ) + assert type(a_original._meta) is type(a_meta), msg + if not (np.isscalar(a_meta) or np.isscalar(a_computed)): + msg = ( + f"compute()-ing 'a' results in a different type than implied by its metadata " + f"(meta: {type(a_meta)}, computed: {type(a_computed)})" + ) + assert type(a_meta) is type(a_computed), msg + if hasattr(b_original, "_meta"): + msg = ( + f"compute()-ing 'b' changes its number of dimensions " + f"(before: {b_original._meta.ndim}, after: {b.ndim})" + ) + assert b_original._meta.ndim == b.ndim, msg + if b_meta is not None: + msg = ( + f"compute()-ing 'b' changes its type " + f"(before: {type(b_original._meta)}, 
after: {type(b_meta)})" + ) + assert type(b_original._meta) is type(b_meta), msg + if not (np.isscalar(b_meta) or np.isscalar(b_computed)): + msg = ( + f"compute()-ing 'b' results in a different type than implied by its metadata " + f"(meta: {type(b_meta)}, computed: {type(b_computed)})" + ) + assert type(b_meta) is type(b_computed), msg diff --git a/tests/test_properties.py b/tests/test_properties.py index 0d75e2a3..b74264b8 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -59,6 +59,12 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: func_st = st.sampled_from( [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] ) +numeric_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st +) +all_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() +) def by_arrays(shape): @@ -81,13 +87,7 @@ def not_overflowing_array(array) -> bool: return result -@given( - array=npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st - ), - dtype=by_dtype_st, - func=func_st, -) +@given(array=numeric_arrays, dtype=by_dtype_st, func=func_st) def test_groupby_reduce(array, dtype, func): # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) @@ -149,17 +149,7 @@ def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: @st.composite -def chunked_arrays( - draw, - *, - chunks=chunks, - arrays=npst.arrays( - elements={"allow_subnormal": False}, - shape=npst.array_shapes(max_side=10), - dtype=array_dtype_st, - ), - from_array=dask.array.from_array, -): +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): array = draw(arrays) chunks = draw(chunks(shape=array.shape)) @@ -216,6 +206,7 @@ def test_scans(data, array, func): @given(data=st.data(), array=chunked_arrays()) def test_ffill_bfill_reverse(data, array): + # TODO: test NaT and timedelta, datetime assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -230,3 +221,38 @@ def reverse(arr): backward = groupby_scan(a, by, func="bfill") forward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="ffill")) assert_equal(forward_reversed, backward) + + +@given( + data=st.data(), + array=chunked_arrays(arrays=all_arrays), + func=st.sampled_from(["first", "last", "nanfirst", "nanlast"]), +) +def test_first_last(data, array, func): + by = data.draw(by_arrays(shape=(array.shape[-1],))) + + INVERSES = {"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"} + MATES = {"first": "nanfirst", "last": "nanlast", "nanfirst": "first", "nanlast": "last"} + inverse = INVERSES[func] + mate = MATES[func] + + if func in ["first", "last"]: + array = array.rechunk((*array.chunks[:-1], -1)) + + for arr in [array, array.compute()]: + forward, fg = groupby_reduce(arr, by, func=func, engine="flox") + reverse, rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") + + assert forward.dtype == reverse.dtype + assert forward.dtype == arr.dtype + + assert_equal(fg, rg) + assert_equal(forward, reverse) + + if arr.dtype.kind == "f" and not np.isnan(array.compute()).any(): + if mate in ["first", "last"]: + array = array.rechunk((*array.chunks[:-1], -1)) + + first, _ = groupby_reduce(array, by, func=func, engine="flox") + second, _ = groupby_reduce(array, by, func=mate, 
engine="flox") + assert_equal(first, second) From 22140ebaf1a353f4292ebf2fba4807756533f5bd Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 26 Jul 2024 20:32:35 -0600 Subject: [PATCH 2/7] Add more array types for property tests --- tests/__init__.py | 2 +- tests/test_properties.py | 141 +++++++++++++++++++++++++++------------ 2 files changed, 101 insertions(+), 42 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 07cd7ae5..3c3ba1da 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -113,7 +113,7 @@ def assert_equal(a, b, tolerance=None): else: a_eager, b_eager = a, b - if a.dtype.kind in "SUMm": + if a.dtype.kind in "SUMmO": np.testing.assert_equal(a_eager, b_eager) else: np.testing.assert_allclose(a_eager, b_eager, equal_nan=True, **tolerance) diff --git a/tests/test_properties.py b/tests/test_properties.py index b74264b8..8bb5abde 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -3,7 +3,9 @@ pytest.importorskip("hypothesis") pytest.importorskip("dask") +pytest.importorskip("cftime") +import cftime import dask import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -66,11 +68,55 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() ) +calendars = st.sampled_from( + [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "365_day", + "360_day", + "julian", + "all_leap", + "366_day", + ] +) + + +@st.composite +def units(draw, *, calendar: str): + choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] + if calendar == "360_day": + choices += ["months"] + elif calendar == "noleap": + choices += ["common_years"] + time_units = draw(st.sampled_from(choices)) + + dt = draw(st.datetimes()) + year, month, day = dt.year, dt.month, dt.day + if calendar == "360_day": + month %= 30 + return f"{time_units} since {year}-{month}-{day}" -def by_arrays(shape): - return npst.arrays( - dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), - shape=shape, + +@st.composite +def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): + if elements is None: + elements = {"min_value": -10_000, "max_value": 10_000} + cal = draw(calendars) + values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) + unit = draw(units(calendar=cal)) + return cftime.num2date(values, units=unit, calendar=cal) + + +def by_arrays(shape, *, elements=None): + return st.one_of( + npst.arrays( + dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), + shape=shape, + elements=elements, + ), + cftime_arrays(shape=shape, elements=elements), ) @@ -87,8 +133,43 @@ def not_overflowing_array(array) -> bool: return result -@given(array=numeric_arrays, dtype=by_dtype_st, func=func_st) -def test_groupby_reduce(array, dtype, func): +@st.composite +def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + chunks = [] + for size in shape: + if size > 1: + nchunks = draw(st.integers(min_value=1, max_value=size - 1)) + dividers = sorted( + set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) + ) + chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) + else: + chunks.append((1,)) + return tuple(chunks) + + +@st.composite +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): + array = draw(arrays) + chunks = 
draw(chunks(shape=array.shape)) + + if array.dtype.kind in "cf": + nan_idx = draw( + st.lists( + st.integers(min_value=0, max_value=array.shape[-1] - 1), + max_size=array.shape[-1] - 1, + unique=True, + ) + ) + if nan_idx: + array[..., nan_idx] = np.nan + + return from_array(array, chunks=chunks) + + +# TODO: migrate to by_arrays but with constant value +@given(data=st.data(), array=numeric_arrays, func=func_st) +def test_groupby_reduce(data, array, func): # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) # TODO: fix var for complex numbers upstream @@ -97,7 +178,19 @@ def test_groupby_reduce(array, dtype, func): assume("arg" not in func and not np.any(np.isnan(array).ravel())) axis = -1 - by = np.ones((array.shape[-1],), dtype=dtype) + by = data.draw( + by_arrays( + elements={ + "alphabet": st.just("a"), + "min_value": 1, + "max_value": 1, + "min_size": 1, + "max_size": 1, + }, + shape=array.shape[-1], + ) + ) + assert len(np.unique(by)) == 1 kwargs = {"q": 0.8} if "quantile" in func else {} flox_kwargs = {} with np.errstate(invalid="ignore", divide="ignore"): @@ -133,40 +226,6 @@ def test_groupby_reduce(array, dtype, func): assert_equal(expected, actual, tolerance) -@st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: - chunks = [] - for size in shape: - if size > 1: - nchunks = draw(st.integers(min_value=1, max_value=size - 1)) - dividers = sorted( - set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) - ) - chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) - else: - chunks.append((1,)) - return tuple(chunks) - - -@st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): - array = draw(arrays) - chunks = draw(chunks(shape=array.shape)) - - if array.dtype.kind in "cf": - nan_idx = draw( - st.lists( - st.integers(min_value=0, max_value=array.shape[-1] - 1), - max_size=array.shape[-1] - 1, - unique=True, - ) - ) - if nan_idx: - array[..., nan_idx] = np.nan - - return from_array(array, chunks=chunks) - - @given( data=st.data(), array=chunked_arrays(), From 6fbd0fdac68ff591bc1f194fbf399590decb7be0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 26 Jul 2024 21:30:23 -0600 Subject: [PATCH 3/7] Refactor hypothesis strategies --- tests/strategies.py | 125 +++++++++++++++++++++++++++++++++++++++ tests/test_properties.py | 120 +------------------------------------ 2 files changed, 127 insertions(+), 118 deletions(-) create mode 100644 tests/strategies.py diff --git a/tests/strategies.py b/tests/strategies.py new file mode 100644 index 00000000..aa24bce9 --- /dev/null +++ b/tests/strategies.py @@ -0,0 +1,125 @@ +import cftime +import dask +import hypothesis.extra.numpy as npst +import hypothesis.strategies as st +import numpy as np + +from . 
import ALL_FUNCS, SCIPY_STATS_FUNCS + + +def supported_dtypes() -> st.SearchStrategy[np.dtype]: + return ( + npst.integer_dtypes(endianness="=") + | npst.unsigned_integer_dtypes(endianness="=") + | npst.floating_dtypes(endianness="=", sizes=(32, 64)) + | npst.complex_number_dtypes(endianness="=") + | npst.datetime64_dtypes(endianness="=") + | npst.timedelta64_dtypes(endianness="=") + | npst.unicode_string_dtypes(endianness="=") + ) + + +# TODO: stop excluding everything but U +array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU") +by_dtype_st = supported_dtypes() + +NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list( + SCIPY_STATS_FUNCS +) +SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"] + +func_st = st.sampled_from( + [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] +) +numeric_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st +) +all_arrays = npst.arrays( + elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() +) + + +calendars = st.sampled_from( + [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "365_day", + "360_day", + "julian", + "all_leap", + "366_day", + ] +) + + +@st.composite +def units(draw, *, calendar: str): + choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] + if calendar == "360_day": + choices += ["months"] + elif calendar == "noleap": + choices += ["common_years"] + time_units = draw(st.sampled_from(choices)) + + dt = draw(st.datetimes()) + year, month, day = dt.year, dt.month, dt.day + if calendar == "360_day": + day = min(day, 30) + return f"{time_units} since {year}-{month}-{day}" + + +@st.composite +def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): + if elements is None: + elements = {"min_value": -10_000, "max_value": 10_000} + cal = draw(calendars) + values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) + unit = draw(units(calendar=cal)) + return cftime.num2date(values, units=unit, calendar=cal) + + +def by_arrays(shape, *, elements=None): + return st.one_of( + npst.arrays( + dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), + shape=shape, + elements=elements, + ), + cftime_arrays(shape=shape, elements=elements), + ) + + +@st.composite +def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: + chunks = [] + for size in shape: + if size > 1: + nchunks = draw(st.integers(min_value=1, max_value=size - 1)) + dividers = sorted( + set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) + ) + chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + dividers))) + else: + chunks.append((1,)) + return tuple(chunks) + + +@st.composite +def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): + array = draw(arrays) + chunks = draw(chunks(shape=array.shape)) + + if array.dtype.kind in "cf": + nan_idx = draw( + st.lists( + st.integers(min_value=0, max_value=array.shape[-1] - 1), + max_size=array.shape[-1] - 1, + unique=True, + ) + ) + if nan_idx: + array[..., nan_idx] = np.nan + + return from_array(array, chunks=chunks) diff --git a/tests/test_properties.py b/tests/test_properties.py index 8bb5abde..cf9b410b 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -5,9 +5,7 @@ pytest.importorskip("dask") pytest.importorskip("cftime") -import cftime 
import dask -import hypothesis.extra.numpy as npst import hypothesis.strategies as st import numpy as np from hypothesis import assume, given, note @@ -15,7 +13,8 @@ import flox from flox.core import groupby_reduce, groupby_scan -from . import ALL_FUNCS, SCIPY_STATS_FUNCS, assert_equal +from . import assert_equal +from .strategies import all_arrays, by_arrays, chunked_arrays, func_st, numeric_arrays dask.config.set(scheduler="sync") @@ -32,10 +31,6 @@ def bfill(array, axis, dtype=None): )[::-1] -NON_NUMPY_FUNCS = ["first", "last", "nanfirst", "nanlast", "count", "any", "all"] + list( - SCIPY_STATS_FUNCS -) -SKIPPED_FUNCS = ["var", "std", "nanvar", "nanstd"] NUMPY_SCAN_FUNCS = { "nancumsum": np.nancumsum, "ffill": ffill, @@ -43,83 +38,6 @@ def bfill(array, axis, dtype=None): } # "cumsum": np.cumsum, -def supported_dtypes() -> st.SearchStrategy[np.dtype]: - return ( - npst.integer_dtypes(endianness="=") - | npst.unsigned_integer_dtypes(endianness="=") - | npst.floating_dtypes(endianness="=", sizes=(32, 64)) - | npst.complex_number_dtypes(endianness="=") - | npst.datetime64_dtypes(endianness="=") - | npst.timedelta64_dtypes(endianness="=") - | npst.unicode_string_dtypes(endianness="=") - ) - - -# TODO: stop excluding everything but U -array_dtype_st = supported_dtypes().filter(lambda x: x.kind not in "cmMU") -by_dtype_st = supported_dtypes() -func_st = st.sampled_from( - [f for f in ALL_FUNCS if f not in NON_NUMPY_FUNCS and f not in SKIPPED_FUNCS] -) -numeric_arrays = npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=array_dtype_st -) -all_arrays = npst.arrays( - elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() -) - -calendars = st.sampled_from( - [ - "standard", - "gregorian", - "proleptic_gregorian", - "noleap", - "365_day", - "360_day", - "julian", - "all_leap", - "366_day", - ] -) - - -@st.composite -def units(draw, *, calendar: str): - choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] - if calendar == "360_day": - choices += ["months"] - elif calendar == "noleap": - choices += ["common_years"] - time_units = draw(st.sampled_from(choices)) - - dt = draw(st.datetimes()) - year, month, day = dt.year, dt.month, dt.day - if calendar == "360_day": - month %= 30 - return f"{time_units} since {year}-{month}-{day}" - - -@st.composite -def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): - if elements is None: - elements = {"min_value": -10_000, "max_value": 10_000} - cal = draw(calendars) - values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) - unit = draw(units(calendar=cal)) - return cftime.num2date(values, units=unit, calendar=cal) - - -def by_arrays(shape, *, elements=None): - return st.one_of( - npst.arrays( - dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), - shape=shape, - elements=elements, - ), - cftime_arrays(shape=shape, elements=elements), - ) - - def not_overflowing_array(array) -> bool: if array.dtype.kind == "f": info = np.finfo(array.dtype) @@ -133,40 +51,6 @@ def not_overflowing_array(array) -> bool: return result -@st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: - chunks = [] - for size in shape: - if size > 1: - nchunks = draw(st.integers(min_value=1, max_value=size - 1)) - dividers = sorted( - set(draw(st.integers(min_value=1, max_value=size - 1)) for _ in range(nchunks - 1)) - ) - chunks.append(tuple(a - b for a, b in zip(dividers + [size], [0] + 
dividers))) - else: - chunks.append((1,)) - return tuple(chunks) - - -@st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): - array = draw(arrays) - chunks = draw(chunks(shape=array.shape)) - - if array.dtype.kind in "cf": - nan_idx = draw( - st.lists( - st.integers(min_value=0, max_value=array.shape[-1] - 1), - max_size=array.shape[-1] - 1, - unique=True, - ) - ) - if nan_idx: - array[..., nan_idx] = np.nan - - return from_array(array, chunks=chunks) - - # TODO: migrate to by_arrays but with constant value @given(data=st.data(), array=numeric_arrays, func=func_st) def test_groupby_reduce(data, array, func): From 39d8dcf992e4c1fdddb13ff32b9580742bd8d688 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 06:23:17 -0600 Subject: [PATCH 4/7] Type strategies --- tests/strategies.py | 41 +++++++++++++++++++++++++++++++++------- tests/test_properties.py | 35 +++++++++++++++++----------------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/tests/strategies.py b/tests/strategies.py index aa24bce9..fcee1c8e 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from typing import Any, Callable + import cftime import dask import hypothesis.extra.numpy as npst @@ -6,6 +10,8 @@ from . import ALL_FUNCS, SCIPY_STATS_FUNCS +Chunks = tuple[tuple[int, ...], ...] + def supported_dtypes() -> st.SearchStrategy[np.dtype]: return ( @@ -38,7 +44,6 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: elements={"allow_subnormal": False}, shape=npst.array_shapes(), dtype=supported_dtypes() ) - calendars = st.sampled_from( [ "standard", @@ -55,7 +60,7 @@ def supported_dtypes() -> st.SearchStrategy[np.dtype]: @st.composite -def units(draw, *, calendar: str): +def units(draw, *, calendar: str) -> str: choices = ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] if calendar == "360_day": choices += ["months"] @@ -67,20 +72,36 @@ def units(draw, *, calendar: str): year, month, day = dt.year, dt.month, dt.day if calendar == "360_day": day = min(day, 30) + if calendar in ["360_day", "365_day", "noleap"] and month == 2 and day == 29: + day = 28 + return f"{time_units} since {year}-{month}-{day}" @st.composite -def cftime_arrays(draw, *, shape, calendars=calendars, elements=None): +def cftime_arrays( + draw: st.DrawFn, + *, + shape: tuple[int, ...], + calendars: st.SearchStrategy[str] = calendars, + elements: dict[str, Any] | None = None, +) -> np.ndarray[Any, Any]: if elements is None: - elements = {"min_value": -10_000, "max_value": 10_000} + elements = {} + elements.setdefault("min_value", -10_000) + elements.setdefault("max_value", 10_000) cal = draw(calendars) values = draw(npst.arrays(dtype=np.int64, shape=shape, elements=elements)) unit = draw(units(calendar=cal)) return cftime.num2date(values, units=unit, calendar=cal) -def by_arrays(shape, *, elements=None): +def by_arrays( + shape: tuple[int, ...], *, elements: dict[str, Any] | None = None +) -> st.SearchStrategy[np.ndarray[Any, Any]]: + if elements is None: + elements = {} + elements.setdefault("alphabet", st.characters(exclude_categories=("C",))) return st.one_of( npst.arrays( dtype=npst.integer_dtypes(endianness="=") | npst.unicode_string_dtypes(endianness="="), @@ -92,7 +113,7 @@ def by_arrays(shape, *, elements=None): @st.composite -def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: +def chunks(draw: st.DrawFn, *, shape: tuple[int, ...]) -> Chunks: chunks 
= [] for size in shape: if size > 1: @@ -107,7 +128,13 @@ def chunks(draw, *, shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: @st.composite -def chunked_arrays(draw, *, chunks=chunks, arrays=numeric_arrays, from_array=dask.array.from_array): +def chunked_arrays( + draw: st.DrawFn, + *, + chunks: Callable[..., st.SearchStrategy[Chunks]] = chunks, + arrays=all_arrays, + from_array: Callable = dask.array.from_array, +) -> dask.array.Array: array = draw(arrays) chunks = draw(chunks(shape=array.shape)) diff --git a/tests/test_properties.py b/tests/test_properties.py index cf9b410b..0fadf87d 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,3 +1,5 @@ +from typing import Any, Callable + import pandas as pd import pytest @@ -14,7 +16,7 @@ from flox.core import groupby_reduce, groupby_scan from . import assert_equal -from .strategies import all_arrays, by_arrays, chunked_arrays, func_st, numeric_arrays +from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays dask.config.set(scheduler="sync") @@ -31,14 +33,14 @@ def bfill(array, axis, dtype=None): )[::-1] -NUMPY_SCAN_FUNCS = { +NUMPY_SCAN_FUNCS: dict[str, Callable] = { "nancumsum": np.nancumsum, "ffill": ffill, "bfill": bfill, } # "cumsum": np.cumsum, -def not_overflowing_array(array) -> bool: +def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: if array.dtype.kind == "f": info = np.finfo(array.dtype) elif array.dtype.kind in ["i", "u"]: @@ -51,9 +53,8 @@ def not_overflowing_array(array) -> bool: return result -# TODO: migrate to by_arrays but with constant value @given(data=st.data(), array=numeric_arrays, func=func_st) -def test_groupby_reduce(data, array, func): +def test_groupby_reduce(data, array, func: str) -> None: # overflow behaviour differs between bincount and sum (for example) assume(not_overflowing_array(array)) # TODO: fix var for complex numbers upstream @@ -71,14 +72,14 @@ def test_groupby_reduce(data, array, func): "min_size": 1, "max_size": 1, }, - shape=array.shape[-1], + shape=(array.shape[-1],), ) ) assert len(np.unique(by)) == 1 kwargs = {"q": 0.8} if "quantile" in func else {} - flox_kwargs = {} + flox_kwargs: dict[str, Any] = {} with np.errstate(invalid="ignore", divide="ignore"): - actual, _ = groupby_reduce( + actual, *_ = groupby_reduce( array, by, func=func, axis=axis, engine="numpy", **flox_kwargs, finalize_kwargs=kwargs ) @@ -112,10 +113,10 @@ def test_groupby_reduce(data, array, func): @given( data=st.data(), - array=chunked_arrays(), + array=chunked_arrays(arrays=numeric_arrays), func=st.sampled_from(tuple(NUMPY_SCAN_FUNCS)), ) -def test_scans(data, array, func): +def test_scans(data, array: dask.array.Array, func: str) -> None: assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -148,7 +149,7 @@ def test_scans(data, array, func): @given(data=st.data(), array=chunked_arrays()) -def test_ffill_bfill_reverse(data, array): +def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None: # TODO: test NaT and timedelta, datetime assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) @@ -168,10 +169,10 @@ def reverse(arr): @given( data=st.data(), - array=chunked_arrays(arrays=all_arrays), + array=chunked_arrays(), func=st.sampled_from(["first", "last", "nanfirst", "nanlast"]), ) -def test_first_last(data, array, func): +def test_first_last(data, array: dask.array.Array, func: str) -> None: by = data.draw(by_arrays(shape=(array.shape[-1],))) INVERSES = 
{"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"} @@ -183,8 +184,8 @@ def test_first_last(data, array, func): array = array.rechunk((*array.chunks[:-1], -1)) for arr in [array, array.compute()]: - forward, fg = groupby_reduce(arr, by, func=func, engine="flox") - reverse, rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") + forward, *fg = groupby_reduce(arr, by, func=func, engine="flox") + reverse, *rg = groupby_reduce(arr[..., ::-1], by[..., ::-1], func=inverse, engine="flox") assert forward.dtype == reverse.dtype assert forward.dtype == arr.dtype @@ -196,6 +197,6 @@ def test_first_last(data, array, func): if mate in ["first", "last"]: array = array.rechunk((*array.chunks[:-1], -1)) - first, _ = groupby_reduce(array, by, func=func, engine="flox") - second, _ = groupby_reduce(array, by, func=mate, engine="flox") + first, *_ = groupby_reduce(array, by, func=func, engine="flox") + second, *_ = groupby_reduce(array, by, func=mate, engine="flox") assert_equal(first, second) From f752309789664bc30ab9ae889e779f757abce360 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 21:56:50 -0600 Subject: [PATCH 5/7] Fix overflowing skipping to handle NaNs Remove from ffill, bfill Fix fill_value for datetime64 isnan to isnull --- flox/aggregate_flox.py | 2 +- flox/aggregations.py | 6 +++++- flox/xrdtypes.py | 5 +++-- tests/test_properties.py | 8 +++++--- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index 31caa849..7174552c 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -239,7 +239,7 @@ def ffill(group_idx, array, *, axis, **kwargs): (group_starts,) = flag.nonzero() # https://stackoverflow.com/questions/41190852/most-efficient-way-to-forward-fill-nan-values-in-numpy-array - mask = np.isnan(array) + mask = isnull(array) # modified from the SO answer, just reset the index at the start of every group! mask[..., np.asarray(group_starts)] = False diff --git a/flox/aggregations.py b/flox/aggregations.py index c69bbc2e..63abb0ea 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -158,7 +158,11 @@ def _get_fill_value(dtype, fill_value): return np.nan # This is madness, but npg checks that fill_value is compatible # with array dtype even if the fill_value is never used. - elif np.issubdtype(dtype, np.integer): + elif ( + np.issubdtype(dtype, np.integer) + or np.issubdtype(dtype, np.timedelta64) + or np.issubdtype(dtype, np.datetime64) + ): return dtypes.get_neg_infinity(dtype, min_for_int=True) else: return None diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index bad498b1..2d6ce369 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -125,8 +125,9 @@ def get_neg_infinity(dtype, min_for_int=False): fill_value : positive infinity value corresponding to this dtype. """ - if np.issubdtype(dtype, (np.timedelta64, np.datetime64)): - return dtype.type(np.iinfo(np.int64).min + 1) + if is_datetime_like(dtype): + unit, _ = np.datetime_data(dtype) + return dtype.type(np.iinfo(np.int64).min + 1, unit) if issubclass(dtype.type, np.floating): return -np.inf diff --git a/tests/test_properties.py b/tests/test_properties.py index 0fadf87d..1f495a36 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -14,6 +14,7 @@ import flox from flox.core import groupby_reduce, groupby_scan +from flox.xrutils import notnull from . 
import assert_equal from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays @@ -48,6 +49,8 @@ def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: else: return True + array = array.ravel() + array = array[notnull(array)] result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) # note(f"returning {result}, {array.min()} vs {info.min}, {array.max()} vs {info.max}") return result @@ -117,7 +120,8 @@ def test_groupby_reduce(data, array, func: str) -> None: func=st.sampled_from(tuple(NUMPY_SCAN_FUNCS)), ) def test_scans(data, array: dask.array.Array, func: str) -> None: - assume(not_overflowing_array(np.asarray(array))) + if "cum" in func: + assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) axis = array.ndim - 1 @@ -150,8 +154,6 @@ def test_scans(data, array: dask.array.Array, func: str) -> None: @given(data=st.data(), array=chunked_arrays()) def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None: - # TODO: test NaT and timedelta, datetime - assume(not_overflowing_array(np.asarray(array))) by = data.draw(by_arrays(shape=(array.shape[-1],))) def reverse(arr): From 84cb5d8d3b14fe94634aaebbf13ff1a0a31d6d2b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 27 Jul 2024 22:11:37 -0600 Subject: [PATCH 6/7] Fix ffill, bfill bugs --- flox/aggregations.py | 4 ++-- flox/core.py | 35 ++++++++++++++++++++--------------- tests/test_core.py | 30 +++++++++++++++++++++++++++--- tests/test_properties.py | 21 +++++++++++++-------- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 63abb0ea..98d91622 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -654,8 +654,8 @@ def __post_init__(self): def reverse(a: AlignedArrays) -> AlignedArrays: - a.group_idx = a.group_idx[::-1] - a.array = a.array[::-1] + a.group_idx = a.group_idx[..., ::-1] + a.array = a.array[..., ::-1] return a diff --git a/flox/core.py b/flox/core.py index 307345c0..9e3dee3d 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2788,20 +2788,29 @@ def groupby_scan( if by_.shape[-1] == 1 or by_.shape == grp_shape: return array.astype(agg.dtype) + # Made a design choice here to have `preprocess` handle both array and group_idx + # Example: for reversing, we need to reverse the whole array, not just reverse + # each block independently + inp = AlignedArrays(array=array, group_idx=by_) + if agg.preprocess: + inp = agg.preprocess(inp) + if not has_dask: - final_state = chunk_scan( - AlignedArrays(array=array, group_idx=by_), axis=single_axis, agg=agg, dtype=agg.dtype - ) - return extract_array(final_state) + final_state = chunk_scan(inp, axis=single_axis, agg=agg, dtype=agg.dtype) + result = _finalize_scan(final_state) else: - return dask_groupby_scan(array, by_, axes=axis_, agg=agg) + result = dask_groupby_scan(inp.array, inp.group_idx, axes=axis_, agg=agg) + + # Made a design choice here to have `postprocess` handle both array and group_idx + out = AlignedArrays(array=result, group_idx=by_) + if agg.finalize: + out = agg.finalize(out) + return out.array def chunk_scan(inp: AlignedArrays, *, axis: int, agg: Scan, dtype=None, keepdims=None) -> ScanState: assert axis == inp.array.ndim - 1 - if agg.preprocess: - inp = agg.preprocess(inp) # I don't think we need to re-factorize here unless we are grouping by a dask array accumulated = generic_aggregate( inp.group_idx, @@ -2813,8 +2822,6 @@ def chunk_scan(inp: AlignedArrays, *, axis: int, agg: Scan, 
dtype=None, keepdims fill_value=agg.identity, ) result = AlignedArrays(array=accumulated, group_idx=inp.group_idx) - if agg.finalize: - result = agg.finalize(result) return ScanState(result=result, state=None) @@ -2840,10 +2847,9 @@ def _zip(group_idx: np.ndarray, array: np.ndarray) -> AlignedArrays: return AlignedArrays(group_idx=group_idx, array=array) -def extract_array(block: ScanState, finalize: Callable | None = None) -> np.ndarray: +def _finalize_scan(block: ScanState) -> np.ndarray: assert block.result is not None - result = finalize(block.result) if finalize is not None else block.result - return result.array + return block.result.array def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: @@ -2859,9 +2865,8 @@ def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: array, by = _unify_chunks(array, by) # 1. zip together group indices & array - to_map = _zip if agg.preprocess is None else tlz.compose(agg.preprocess, _zip) zipped = map_blocks( - to_map, by, array, dtype=array.dtype, meta=array._meta, name="groupby-scan-preprocess" + _zip, by, array, dtype=array.dtype, meta=array._meta, name="groupby-scan-preprocess" ) scan_ = partial(chunk_scan, agg=agg) @@ -2882,7 +2887,7 @@ def dask_groupby_scan(array, by, axes: T_Axes, agg: Scan) -> DaskArray: ) # 3. Unzip and extract the final result array, discard groups - result = map_blocks(extract_array, accumulated, dtype=agg.dtype, finalize=agg.finalize) + result = map_blocks(_finalize_scan, accumulated, dtype=agg.dtype) assert result.chunks == array.chunks diff --git a/tests/test_core.py b/tests/test_core.py index 41e77461..d0f1fe94 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -187,11 +187,11 @@ def test_groupby_reduce( assert_equal(expected_result, result) -def gen_array_by(size, func): +def gen_array_by(size, func: str): by = np.ones(size[-1]) rng = np.random.default_rng(12345) array = rng.random(tuple(6 if s == 1 else s for s in size)) - if "nan" in func and "nanarg" not in func: + if ("nan" in func or "fill" in func) and "nanarg" not in func: array[[1, 4, 5], ...] 
= np.nan
     elif "nanarg" in func and len(size) > 1:
         array[[1, 4, 5], 1] = np.nan
@@ -1810,7 +1810,7 @@ def test_nanlen_string(dtype, engine):
     assert_equal(expected, actual)
 
 
-def test_scans():
+def test_cumsum():
     array = np.array([1, 1, 1], dtype=np.uint64)
     by = np.array([0] * array.shape[-1])
     kwargs = {"func": "nancumsum", "axis": -1}
@@ -1823,3 +1823,27 @@
     da = dask.array.from_array(array, chunks=2)
     actual = groupby_scan(da, by, **kwargs)
     assert_equal(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "chunks",
+    [
+        pytest.param(-1, marks=requires_dask),
+        pytest.param(3, marks=requires_dask),
+        pytest.param(4, marks=requires_dask),
+    ],
+)
+@pytest.mark.parametrize("size", ((1, 12), (12,), (12, 9)))
+@pytest.mark.parametrize("add_nan_by", [True, False])
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_ffill_bfill(chunks, size, add_nan_by, func):
+    array, by = gen_array_by(size, func)
+    if chunks:
+        array = dask.array.from_array(array, chunks=chunks)
+    if add_nan_by:
+        by[0:3] = np.nan
+    by = tuple(by)
+
+    expected = flox.groupby_scan(array.compute(), by, func=func)
+    actual = flox.groupby_scan(array, by, func=func)
+    assert_equal(expected, actual)
diff --git a/tests/test_properties.py b/tests/test_properties.py
index 1f495a36..85fe4692 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -159,14 +159,19 @@ def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None:
     def reverse(arr):
         return arr[..., ::-1]
 
-    for a in (array, array.compute()):
-        forward = groupby_scan(a, by, func="ffill")
-        backward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="bfill"))
-        assert_equal(forward, backward_reversed)
-
-        backward = groupby_scan(a, by, func="bfill")
-        forward_reversed = reverse(groupby_scan(reverse(a), reverse(by), func="ffill"))
-        assert_equal(forward_reversed, backward)
+    forward = groupby_scan(array, by, func="ffill")
+    as_numpy = groupby_scan(array.compute(), by, func="ffill")
+    assert_equal(forward, as_numpy)
+
+    backward = groupby_scan(array, by, func="bfill")
+    as_numpy = groupby_scan(array.compute(), by, func="bfill")
+    assert_equal(backward, as_numpy)
+
+    backward_reversed = reverse(groupby_scan(reverse(array), reverse(by), func="bfill"))
+    assert_equal(forward, backward_reversed)
+
+    forward_reversed = reverse(groupby_scan(reverse(array), reverse(by), func="ffill"))
+    assert_equal(forward_reversed, backward)

From b30ea7512b551943243e3f06ab77df7b65a123ef Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Sun, 28 Jul 2024 06:37:35 -0600
Subject: [PATCH 7/7] Testing improvements. 1. Record duration 2. Speed up some tests 3. 
Silence logging --- .github/workflows/ci.yaml | 2 +- flox/aggregations.py | 4 ++-- flox/core.py | 2 +- tests/conftest.py | 12 ++++++++++ tests/test_core.py | 2 +- tests/test_properties.py | 5 ++++- tests/test_xarray.py | 47 +++++++++++++++++++++++---------------- 7 files changed, 49 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6c478b86..f3633fc6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -62,7 +62,7 @@ jobs: - name: Run Tests id: status run: | - pytest -n auto --cov=./ --cov-report=xml --hypothesis-profile ci + pytest --durations=20 --durations-min=0.5 -n auto --cov=./ --cov-report=xml --hypothesis-profile ci - name: Upload code coverage to Codecov uses: codecov/codecov-action@v4.5.0 with: diff --git a/flox/aggregations.py b/flox/aggregations.py index 98d91622..51e650a6 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -76,7 +76,7 @@ def generic_aggregate( try: method = getattr(aggregate_flox, func) except AttributeError: - logger.debug(f"Couldn't find {func} for engine='flox'. Falling back to numpy") + # logger.debug(f"Couldn't find {func} for engine='flox'. Falling back to numpy") method = get_npg_aggregation(func, engine="numpy") elif engine == "numbagg": @@ -94,7 +94,7 @@ def generic_aggregate( method = getattr(aggregate_numbagg, func) except AttributeError: - logger.debug(f"Couldn't find {func} for engine='numbagg'. Falling back to numpy") + # logger.debug(f"Couldn't find {func} for engine='numbagg'. Falling back to numpy") method = get_npg_aggregation(func, engine="numpy") elif engine in ["numpy", "numba"]: diff --git a/flox/core.py b/flox/core.py index 9e3dee3d..8eaf54b7 100644 --- a/flox/core.py +++ b/flox/core.py @@ -2011,7 +2011,7 @@ def _validate_reindex( reindex = True assert isinstance(reindex, bool) - logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa + # logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa return reindex diff --git a/tests/conftest.py b/tests/conftest.py index 1ccc20a1..b3a0ab93 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,3 +28,15 @@ ) def engine(request): return request.param + + +@pytest.fixture( + scope="module", + params=[ + "flox", + "numpy", + pytest.param("numbagg", marks=requires_numbagg), + ], +) +def engine_no_numba(request): + return request.param diff --git a/tests/test_core.py b/tests/test_core.py index d0f1fe94..e12e695d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -187,7 +187,7 @@ def test_groupby_reduce( assert_equal(expected_result, result) -def gen_array_by(size, func: str): +def gen_array_by(size, func): by = np.ones(size[-1]) rng = np.random.default_rng(12345) array = rng.random(tuple(6 if s == 1 else s for s in size)) diff --git a/tests/test_properties.py b/tests/test_properties.py index 85fe4692..26150519 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Callable import pandas as pd @@ -51,7 +52,9 @@ def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool: array = array.ravel() array = array[notnull(array)] - result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size))) # note(f"returning {result}, {array.min()} vs {info.min}, {array.max()} vs {info.max}") return 
result diff --git a/tests/test_xarray.py b/tests/test_xarray.py index ecd63bbf..11b2e23c 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -22,13 +22,9 @@ dask.config.set(scheduler="sync") -try: - # test against legacy xarray implementation - xr.set_options(use_flox=False) -except ValueError: - pass - - +# test against legacy xarray implementation +# avoid some compilation overhead +xr.set_options(use_flox=False, use_numbagg=False) tolerance64 = {"rtol": 1e-15, "atol": 1e-18} np.random.seed(123) @@ -37,7 +33,8 @@ @pytest.mark.parametrize("min_count", [None, 1, 3]) @pytest.mark.parametrize("add_nan", [True, False]) @pytest.mark.parametrize("skipna", [True, False]) -def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): +def test_xarray_reduce(skipna, add_nan, min_count, engine_no_numba, reindex): + engine = engine_no_numba if skipna is False and min_count is not None: pytest.skip() @@ -57,7 +54,13 @@ def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): expected = da.groupby("labels").sum(skipna=skipna, min_count=min_count) actual = xarray_reduce( - da, "labels", func="sum", skipna=skipna, min_count=min_count, engine=engine, reindex=reindex + da, + "labels", + func="sum", + skipna=skipna, + min_count=min_count, + engine=engine, + reindex=reindex, ) assert_equal(expected, actual) @@ -85,9 +88,10 @@ def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): # TODO: sort @pytest.mark.parametrize("pass_expected_groups", [True, False]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): +def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine_no_numba): if chunk and pass_expected_groups is False: pytest.skip() + engine = engine_no_numba arr = np.ones((4, 12)) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) @@ -131,9 +135,10 @@ def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): @pytest.mark.parametrize("pass_expected_groups", [True, False]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine): +def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine_no_numba): if chunk and pass_expected_groups is False: pytest.skip() + engine = engine_no_numba arr = np.ones((2, 12)) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) @@ -187,7 +192,8 @@ def test_validate_expected_groups(expected_groups): @requires_cftime @requires_dask -def test_xarray_reduce_single_grouper(engine): +def test_xarray_reduce_single_grouper(engine_no_numba): + engine = engine_no_numba # DataArray ds = xr.Dataset( {"Tair": (("time", "x", "y"), dask.array.ones((36, 205, 275), chunks=(9, -1, -1)))}, @@ -293,7 +299,8 @@ def test_rechunk_for_blockwise(inchunks, expected): # TODO: dim=None, dim=Ellipsis, groupby unindexed dim -def test_groupby_duplicate_coordinate_labels(engine): +def test_groupby_duplicate_coordinate_labels(engine_no_numba): + engine = engine_no_numba # fix for http://stackoverflow.com/questions/38065129 array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])]) expected = xr.DataArray([3, 3], [("x", [1, 2])]) @@ -301,7 +308,8 @@ def test_groupby_duplicate_coordinate_labels(engine): assert_equal(expected, actual) -def test_multi_index_groupby_sum(engine): +def test_multi_index_groupby_sum(engine_no_numba): + engine = 
engine_no_numba # regression test for xarray GH873 ds = xr.Dataset( {"foo": (("x", "y", "z"), np.ones((3, 4, 2)))}, @@ -327,7 +335,8 @@ def test_multi_index_groupby_sum(engine): @pytest.mark.parametrize("chunks", (None, pytest.param(2, marks=requires_dask))) -def test_xarray_groupby_bins(chunks, engine): +def test_xarray_groupby_bins(chunks, engine_no_numba): + engine = engine_no_numba array = xr.DataArray([1, 1, 1, 1, 1], dims="x") labels = xr.DataArray([1, 1.5, 1.9, 2, 3], dims="x", name="labels") @@ -495,11 +504,11 @@ def test_alignment_error(): @pytest.mark.parametrize("dtype_out", [np.float64, "float64", np.dtype("float64")]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) -def test_dtype(add_nan, chunk, dtype, dtype_out, engine): - if engine == "numbagg": +def test_dtype(add_nan, chunk, dtype, dtype_out, engine_no_numba): + if engine_no_numba == "numbagg": # https://github.com/numbagg/numbagg/issues/121 pytest.skip() - + engine = engine_no_numba xp = dask.array if chunk else np data = xp.linspace(0, 1, 48, dtype=dtype).reshape((4, 12)) @@ -707,7 +716,7 @@ def test_multiple_quantiles(q, chunk, by_ndim, skipna): da = xr.DataArray(array, dims=("x", *dims)) by = xr.DataArray(labels, dims=dims, name="by") - actual = xarray_reduce(da, by, func="quantile", skipna=skipna, q=q) + actual = xarray_reduce(da, by, func="quantile", skipna=skipna, q=q, engine="flox") with xr.set_options(use_flox=False): expected = da.groupby(by).quantile(q, skipna=skipna) xr.testing.assert_allclose(expected, actual)
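
Note on the invariant exercised by test_first_last (this note is not part of the patches above): "first" over an array must equal "last" over the reversed array with reversed labels, and both must preserve the input dtype, which is why patch 1 exempts first/last/nanfirst/nanlast from _maybe_promote_int. A minimal standalone sketch of that duality follows, assuming numpy, dask, and a flox build containing these patches; the helper name check_first_last_duality is illustrative only and not part of flox's API.

import dask.array
import numpy as np

from flox.core import groupby_reduce

INVERSES = {"first": "last", "last": "first", "nanfirst": "nanlast", "nanlast": "nanfirst"}


def check_first_last_duality(array, by, func):
    # Reducing with `func` must match reducing the reversed array
    # (and reversed labels) with the inverse function.
    inverse = INVERSES[func]
    forward, forward_groups = groupby_reduce(array, by, func=func, engine="flox")
    reverse, reverse_groups = groupby_reduce(
        array[..., ::-1], by[..., ::-1], func=inverse, engine="flox"
    )
    np.testing.assert_array_equal(forward_groups, reverse_groups)
    # assert_array_equal treats NaNs at matching positions as equal, so
    # this check also covers the plain first/last variants.
    np.testing.assert_array_equal(forward, reverse)
    # first/last must preserve the input dtype (the promotion exemption
    # added in patch 1).
    assert forward.dtype == array.dtype


array = np.array([[1.0, np.nan, 3.0, 4.0], [5.0, 6.0, np.nan, 8.0]])
by = np.array([0, 0, 1, 1])
check_first_last_duality(array, by, func="nanfirst")

# Plain first/last on dask arrays need a single chunk along the reduced
# axis, mirroring the rechunk in test_first_last.
darr = dask.array.from_array(array, chunks=(1, 4))
check_first_last_duality(darr, by, func="first")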