Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Some benchmarks with arrow strings #722

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
58 changes: 58 additions & 0 deletions tests/benchmarks/test_arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pandas as pd
import pytest

from ..utils_test import cluster_memory, timeseries_of_size, wait


@pytest.fixture(params=[True, False], ids=["string[pyarrow]", "object"])
def series_with_client(request, small_client):
    """Yield a persisted ``name`` series together with the client it runs on.

    The fixture is parametrized twice:

    * ``True`` (id ``string[pyarrow]``): the series is cast to
      ``pd.StringDtype("pyarrow")`` before being persisted.
    * ``False`` (id ``object``): the series is persisted with whatever dtype
      ``timeseries_of_size`` produced (presumably ``object`` -- confirm
      against that helper).

    Yields:
        A ``(series, small_client)`` tuple.
    """
    # Size the dataset from the value reported by ``cluster_memory``.
    memory = cluster_memory(small_client)
    df = timeseries_of_size(memory)
    series = df.name
    if request.param:
        series = series.astype(pd.StringDtype("pyarrow"))
    # Persist up front so each benchmark starts from the same persisted series.
    series = series.persist()
    yield series, small_client


def test_unique(series_with_client):
    """Benchmark ``unique()`` on the string series."""
    strings, dask_client = series_with_client
    timeout = 10 * 60  # presumably seconds -- see ``wait`` in utils_test
    wait(strings.unique(), dask_client, timeout)


def test_contains(series_with_client):
    """Benchmark ``str.contains`` with the pattern ``"a"``."""
    strings, dask_client = series_with_client
    matches = strings.str.contains("a")
    timeout = 10 * 60  # presumably seconds -- see ``wait`` in utils_test
    wait(matches, dask_client, timeout)


def test_startswith(series_with_client):
    """Benchmark ``str.startswith`` with the prefix ``"B"``."""
    strings, dask_client = series_with_client
    has_prefix = strings.str.startswith("B")
    timeout = 10 * 60  # presumably seconds -- see ``wait`` in utils_test
    wait(has_prefix, dask_client, timeout)


def test_upper(series_with_client):
    """Benchmark ``str.upper`` on the string series."""
    strings, dask_client = series_with_client
    uppercased = strings.str.upper()
    timeout = 10 * 60  # presumably seconds -- see ``wait`` in utils_test
    wait(uppercased, dask_client, timeout)


def test_filter(series_with_client):
    """How fast can we filter the Series.

    Takes the first value of the series and selects every row equal to it
    with a boolean mask.
    """
    series, client = series_with_client
    # Get the first name that appears. ``.iloc[0]`` is explicit positional
    # access; plain ``[0]`` is a label lookup that only reaches the first row
    # through pandas' deprecated positional fallback on non-integer indexes.
    # NOTE(review): assumes ``head(1)`` returns an in-memory pandas Series.
    name = series.head(1).iloc[0]
    result = series[series == name]
    wait(result, client, 10 * 60)


def test_value_counts(series_with_client):
    """Benchmark ``value_counts()`` on the string values."""
    strings, dask_client = series_with_client
    counts = strings.value_counts()
    timeout = 10 * 60  # presumably seconds -- see ``wait`` in utils_test
    wait(counts, dask_client, timeout)