Skip to content

Commit

Permalink
Some benchmarks with arrow strings.
Browse files Browse the repository at this point in the history
  • Loading branch information
j-bennet committed Mar 17, 2023
1 parent 01e9544 commit 72c6fa0
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
42 changes: 42 additions & 0 deletions tests/benchmarks/test_arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from ..utils_test import cluster_memory, timeseries_of_size, wait


def test_unique(small_client, convert_string):
    """Benchmark computing the distinct values of the ``name`` column.

    The dataset is sized from the cluster's memory, and the computation is
    given up to ten minutes to finish.
    """
    timeout = 10 * 60
    frame = timeseries_of_size(cluster_memory(small_client))
    wait(frame.name.unique(), small_client, timeout)


def test_contains(small_client, convert_string):
    """Benchmark a substring match (``str.contains``) on the ``name`` column.

    The dataset is sized from the cluster's memory, and the computation is
    given up to ten minutes to finish.
    """
    memory = cluster_memory(small_client)
    df = timeseries_of_size(memory)
    # String predicates live on the ``.str`` accessor; a Series has no
    # ``contains`` method of its own, so calling it directly raises
    # AttributeError before anything is benchmarked.
    result = df.name.str.contains("a")
    wait(result, small_client, 10 * 60)


def test_startswith(small_client, convert_string):
    """Benchmark a prefix match (``str.startswith``) on the ``name`` column.

    The dataset is sized from the cluster's memory, and the computation is
    given up to ten minutes to finish.
    """
    memory = cluster_memory(small_client)
    df = timeseries_of_size(memory)
    # String predicates live on the ``.str`` accessor; a Series has no
    # ``startswith`` method of its own, so calling it directly raises
    # AttributeError before anything is benchmarked.
    result = df.name.str.startswith("B")
    wait(result, small_client, 10 * 60)


def test_filter(small_client, convert_string):
    """Benchmark filtering a DataFrame down to the rows matching one name.

    The name filtered on is taken from the first row of the dataset, so the
    filter always matches at least that row.
    """
    frame = timeseries_of_size(cluster_memory(small_client))
    first_row = frame.head(1)
    target = first_row.name.iloc[0]  # first name that appears in the data
    matches = frame[frame.name == target]
    wait(matches, small_client, 10 * 60)


def test_value_counts(small_client, convert_string):
    """Benchmark counting occurrences of each value in the ``name`` column.

    The dataset is sized from the cluster's memory, and the computation is
    given up to ten minutes to finish.
    """
    timeout = 10 * 60
    frame = timeseries_of_size(cluster_memory(small_client))
    counts = frame.name.value_counts()
    wait(counts, small_client, timeout)
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,12 @@ def configure_shuffling(shuffle_method):
yield


@pytest.fixture
def convert_string():
    """Set ``dataframe.convert_string`` to True in the dask config for the test.

    The override is applied through a context manager, so it is active only
    while the requesting test runs.
    """
    overrides = {"dataframe.convert_string": True}
    with dask.config.set(overrides):
        yield


# True when the installed ``distributed`` is at least 2023.2.1.
# NOTE(review): presumably the first release that includes
# https://github.com/dask/distributed/pull/7534 -- confirm against the
# distributed changelog.
P2P_RECHUNK_AVAILABLE = Version(distributed.__version__) >= Version("2023.2.1")

Expand Down

0 comments on commit 72c6fa0

Please sign in to comment.