Skip to content
28 changes: 14 additions & 14 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals._pandas_like.utils import (
NUMPY_VERSION,
align_and_extract_native,
binary_string_sum_fallback,
broadcast_series_to_index,
get_dtype_backend,
import_array_module,
Expand Down Expand Up @@ -399,23 +400,22 @@ def first(self) -> PythonLiteral:
def last(self) -> PythonLiteral:
return self.native.iloc[-1] if len(self.native) else None

def _with_binary(self, op: Callable[..., PandasLikeSeries], other: Any) -> Self:
def _with_binary(self, op: Callable[..., pd.Series], other: Any) -> Self:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch here :)

ser, other_native = align_and_extract_native(self, other)
preserve_broadcast = self._broadcast and getattr(other, "_broadcast", True)
if (
str(self.native.dtype) == "large_string[pyarrow]"
and isinstance(other_native, str)
and op.__name__ == "add"
):
# https://github.com/pandas-dev/pandas/issues/64393
import pyarrow as pa # ignore-banned-import

other_native = pa.scalar(other_native, type=pa.large_string())
return self._with_native(
op(ser, other_native), preserve_broadcast=preserve_broadcast
).alias(self.name)
try:
res = op(ser, other_native)
except Exception:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Can this be more specific (e.g. TypeError rather than Exception)? Observing the test results without this snippet, I only observe TypeErrors but this could be due to a backport from pandas.
  2. For cases when addition will fail no matter what (e.g. adding integers and strings) retrying __add__ in this fashion makes for very long tracebacks due to the nested exceptions (see below)
    • This may be avoided by checking for string datatypes on both arrays before retrying. I think we may even be able to do this check against the narwhals type?
very long tracebacks
import pandas as pd
import narwhals as nw

df = pd.DataFrame({"count": [1, 2, 3], "fruit": ["apple", "banana", "orange"]})
nw.from_native(df).with_columns(concat=nw.col("count") + nw.col("fruit"))

produces

Traceback (most recent call last):
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 988, in _evaluate_op_method
    result = pc.binary_join_element_wise(other, self._pa_array, sep)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pyarrow/compute.py", line 271, in wrapper
    return func.call(args, options, memory_pool)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow/_compute.pyx", line 399, in pyarrow._compute.Function.call
  File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: Function 'binary_join_element_wise' has no kernel matching input types (int64, large_string, large_string)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_pandas_like/series.py", line 407, in _with_binary
    res = op(ser, other_native)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/ops/common.py", line 85, in new_method
    return method(self, other)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arraylike.py", line 190, in __add__
    return self._arith_method(other, operator.add)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/series.py", line 6751, in _arith_method
    return base.IndexOpsMixin._arith_method(self, other, op)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/base.py", line 1644, in _arith_method
    result = ops.arithmetic_op(lvalues, rvalues, op)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/ops/array_ops.py", line 279, in arithmetic_op
    res_values = op(left, right)
                 ^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 845, in __array_ufunc__
    result = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/base.py", line 2704, in __array_ufunc__
    result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas/_libs/ops_dispatch.pyx", line 113, in pandas._libs.ops_dispatch.maybe_dispatch_ufunc_to_dunder_op
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/ops/common.py", line 85, in new_method
    return method(self, other)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arraylike.py", line 194, in __radd__
    return self._arith_method(other, roperator.radd)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 1079, in _arith_method
    result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 990, in _evaluate_op_method
    raise TypeError(
TypeError: operation 'radd' not supported for dtype 'str' with dtype 'int64'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/cameron/repos/opensource/narwhals-dev/t.py", line 5, in <module>
    nw.from_native(df).with_columns(concat=nw.col("count") + nw.col("fruit"))
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/dataframe.py", line 1506, in with_columns
    return super().with_columns(*exprs, **named_exprs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/dataframe.py", line 214, in with_columns
    return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs))
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_pandas_like/dataframe.py", line 503, in with_columns
    columns = self._evaluate_exprs(*exprs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/dataframe.py", line 367, in _evaluate_exprs
    return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs))  # pyright: ignore[reportArgumentType]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/dataframe.py", line 367, in <genexpr>
    return tuple(chain.from_iterable(self._evaluate_expr(expr) for expr in exprs))  # pyright: ignore[reportArgumentType]
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/dataframe.py", line 380, in _evaluate_expr
    result = expr(self)
             ^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/expr.py", line 247, in __call__
    return self._call(df)
           ^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/expr.py", line 651, in <lambda>
    lambda df: [series.alias(name) for series in self(df)],
                                                 ^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/expr.py", line 247, in __call__
    return self._call(df)
           ^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_compliant/expr.py", line 372, in _reuse_series_inner
    series._from_scalar(method(series)) if returns_scalar else method(series)
                                                               ^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_pandas_like/series.py", line 452, in __add__
    return self._with_binary(operator.add, other)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_pandas_like/series.py", line 411, in _with_binary
    res = binary_string_sum_fallback(ser, other_native, pdx)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/narwhals/_pandas_like/utils.py", line 742, in binary_string_sum_fallback
    return left + right.astype(left_dtype)
                  ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/generic.py", line 6541, in astype
    new_data = self._mgr.astype(dtype=dtype, errors=errors)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py", line 611, in astype
    return self.apply("astype", dtype=dtype, errors=errors)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py", line 442, in apply
    applied = getattr(b, f)(**kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py", line 607, in astype
    new_values = astype_array_safe(values, dtype, errors=errors)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py", line 240, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
    values = values.astype(dtype, copy=copy)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_arrow.py", line 336, in astype
    return super().astype(dtype, copy=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/base.py", line 881, in astype
    return np.asarray(self, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 864, in __array__
    return self.to_numpy(dtype=dtype, copy=copy)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cameron/repos/opensource/narwhals-dev/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py", line 1717, in to_numpy
    result = result.astype(dtype, copy=False)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: invalid literal for int() with base 10: 'apple'

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your review!

Regarding making the error more specific, I'd prefer to avoid that in this PR. Currently, this keeps existing behaviour the same, but just changes some previously-failing paths to now pass. If we change the error, then we're changing what may previously have been ValueError into TypeError or AssertionError or something else

Agree with checking the dtypes before the fallback though 👍 Have added that

if op.__name__ == "add":
pdx = self.__native_namespace__()
res = binary_string_sum_fallback(ser, other_native, pdx)
else:
raise
Comment thread
FBruzzesi marked this conversation as resolved.
return self._with_native(res, preserve_broadcast=preserve_broadcast).alias(
self.name
)

def _with_binary_right(self, op: Callable[..., PandasLikeSeries], other: Any) -> Self:
def _with_binary_right(self, op: Callable[..., pd.Series], other: Any) -> Self:
return self._with_binary(lambda x, y: op(y, x), other).alias(self.name)

def __eq__(self, other: object) -> Self: # type: ignore[override]
Expand Down
33 changes: 33 additions & 0 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,3 +708,36 @@ def broadcast_series_to_index(
return series_class(pa_array, index=index, name=native.name)

return series_class(value, index=index, dtype=native.dtype, name=native.name)


def binary_string_sum_fallback(left: pd.Series, right: Any, pdx: Any) -> pd.Series:
    """Retry ``left + right`` after reconciling the operands' dtypes.

    Works around some upstream issues:

    - https://github.com/pandas-dev/pandas/issues/64393
    - https://github.com/pandas-dev/pandas/issues/65220

    Arguments:
        left: Left-hand operand of the addition.
        right: Right-hand operand; either a scalar or a ``pdx.Series``.
        pdx: Namespace whose ``Series`` class is used to recognise a
            Series-valued ``right``.

    Returns:
        The result of the addition after one of the following adjustments:

        - ``left`` is ``large_string[pyarrow]`` and ``right`` is a Python
          ``str``: ``right`` is wrapped in a pyarrow ``large_string`` scalar.
        - ``right`` is a Series and ``left`` has dtype ``object``: ``left`` is
          cast to the dtype of ``right``.
        - ``right`` is a Series, both backing arrays expose
          ``__arrow_array__``, ``left`` is an Arrow ``string`` and ``right`` an
          Arrow ``large_string``: both are cast to
          ``pd.ArrowDtype(pa.large_string())``.
        - ``right`` is any other Series: ``right`` is cast to the dtype of
          ``left``.
        - otherwise: the operands are added unchanged.
    """
    left_dtype = left.dtype
    left_dtype_str = str(left_dtype)
    if left_dtype_str == "large_string[pyarrow]" and isinstance(right, str):
        import pyarrow as pa  # ignore-banned-import

        return left + pa.scalar(right, type=pa.large_string())
    if isinstance(right, pdx.Series):
        right_dtype = right.dtype
        if left_dtype_str == "object":  # pragma: no cover
            # Only for pandas pre 3.0. Anything is better than `object`, so take RHS.
            return left.astype(right_dtype) + right
        if hasattr(left.values, "__arrow_array__") and hasattr(
            right.values, "__arrow_array__"
        ):
            import pyarrow as pa  # ignore-banned-import

            left_arrow = left.values.__arrow_array__().type  # noqa: PD011 # type: ignore[attr-defined]
            right_arrow = right.values.__arrow_array__().type  # noqa: PD011 # type: ignore[attr-defined]
            if pa.types.is_string(left_arrow) and pa.types.is_large_string(right_arrow):
                # https://github.com/pandas-dev/pandas/blob/b00d4f6710ff6c1c80319196657c31c2cf6c70ff/pandas/core/arrays/arrow/array.py#L1064-L1068
                pd_pa_large_string = pd.ArrowDtype(pa.large_string())
                return left.astype(pd_pa_large_string) + right.astype(pd_pa_large_string)
        else:  # pragma: no cover
            pass
        # Give precedence to the left-hand-side dtype.
        return left + right.astype(left_dtype)
    return left + right  # pragma: no cover
81 changes: 81 additions & 0 deletions tests/expr_and_series/pandas_str_dtypes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from __future__ import annotations

from typing import Any

import pytest

import narwhals as nw
from tests.utils import assert_equal_data

pytest.importorskip("pandas", minversion="3.0.0")
pytest.importorskip("pyarrow")

import numpy as np
import pandas as pd
import pyarrow as pa

STRING_DTYPE_NAN = pd.StringDtype("pyarrow", na_value=np.nan) # type: ignore[call-arg]
STRING_DTYPE_NA = pd.StringDtype("pyarrow", na_value=pd.NA) # type: ignore[call-arg]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would there be any purpose to add non-pyarrow backed StringDtypes into this test?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, done, thanks!



@pytest.mark.parametrize(
("left_dtype", "right_dtype", "result_dtype"),
[
(STRING_DTYPE_NAN, STRING_DTYPE_NAN, STRING_DTYPE_NAN),
(STRING_DTYPE_NAN, STRING_DTYPE_NA, STRING_DTYPE_NAN),
(STRING_DTYPE_NAN, pd.ArrowDtype(pa.string()), STRING_DTYPE_NAN),
(STRING_DTYPE_NAN, pd.ArrowDtype(pa.large_string()), STRING_DTYPE_NAN),
(STRING_DTYPE_NA, STRING_DTYPE_NAN, STRING_DTYPE_NA),
(STRING_DTYPE_NA, STRING_DTYPE_NA, STRING_DTYPE_NA),
(STRING_DTYPE_NA, pd.ArrowDtype(pa.string()), STRING_DTYPE_NA),
(STRING_DTYPE_NA, pd.ArrowDtype(pa.large_string()), STRING_DTYPE_NA),
(pd.ArrowDtype(pa.string()), STRING_DTYPE_NAN, pd.ArrowDtype(pa.large_string())),
(pd.ArrowDtype(pa.string()), STRING_DTYPE_NA, pd.ArrowDtype(pa.large_string())),
(
pd.ArrowDtype(pa.string()),
pd.ArrowDtype(pa.string()),
pd.ArrowDtype(pa.string()),
),
(
pd.ArrowDtype(pa.string()),
pd.ArrowDtype(pa.large_string()),
pd.ArrowDtype(pa.large_string()),
),
(
pd.ArrowDtype(pa.large_string()),
STRING_DTYPE_NAN,
pd.ArrowDtype(pa.large_string()),
),
(
pd.ArrowDtype(pa.large_string()),
STRING_DTYPE_NA,
pd.ArrowDtype(pa.large_string()),
),
(
pd.ArrowDtype(pa.large_string()),
pd.ArrowDtype(pa.string()),
pd.ArrowDtype(pa.large_string()),
),
(
pd.ArrowDtype(pa.large_string()),
pd.ArrowDtype(pa.large_string()),
pd.ArrowDtype(pa.large_string()),
),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These cases effectively serve as a handwritten truth table.

Can this be refactored to

  1. parameterize from itertools.product
  2. embed the logic for deriving the appropriate datatype into the test?
    • left_dtype is a pandas.StringDtype -> left_dtype
    • left_dtype is a pandas.ArrowDtype and right_dtype is pandas.StringDtype -> pd.ArrowDtype(pa.large_string())
    • both left/right are ArrowDtype: pa.string if all inputs are pa.string otherwise pa.large_string

This reduces the length of the parameters spec and codifies the expected rules for returning a datatype.

DTYPES = [
    pd.StringDtype("pyarrow", na_value=np.nan),  # type: ignore[call-arg]
    pd.StringDtype("pyarrow", na_value=pd.NA),  # type: ignore[call-arg]
    pd.ArrowDtype(pa.string()),
    pd.ArrowDtype(pa.large_string()),
]

@pytest.mark.parametrize(
    ("left_dtype", "right_dtype"),
    [*product(DTYPES, repeat=2)],
)
def test_pandas_str_types(...):
    ...

    assert_equal_data(res, expected)

    result_dtype = res.to_native()["concat_col"].dtype
    if isinstance(left_dtype, pd.StringDtype):
        expected_dtype = left_dtype

    elif isinstance(left_dtype, pd.ArrowDtype) and isinstance(right_dtype, pd.StringDtype):
        expected_dtype = pd.ArrowDtype(pa.large_string())

    elif isinstance(left_dtype, pd.ArrowDtype) and isinstance(right_dtype, pd.ArrowDtype):
        left_type = left_dtype.pyarrow_dtype
        right_type = right_dtype.pyarrow_dtype

        if pa.types.is_large_string(left_type) or pa.types.is_large_string(right_type):
            expected_dtype = pd.ArrowDtype(pa.large_string())
        else:
            expected_dtype = pd.ArrowDtype(pa.string())

    assert result_dtype == expected_dtype

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for taking a look!

i appreciate the suggestion 🙏 however, i generally really dislike having logic in tests and try to minimise it where possible, I much prefer writing out the test cases by hand. there was a blog post on this a few* years ago which i really liked https://testing.googleblog.com/2014/07/testing-on-toilet-dont-put-logic-in.html

*has it really been 12 years already? 😮

],
)
def test_pandas_str_types(left_dtype: Any, right_dtype: Any, result_dtype: Any) -> None:
    """Check the values and dtype produced by `concat_str` for one pair of string dtypes.

    Builds a two-row frame whose `fruit` column has `left_dtype` and whose
    `new_str_col` column has `right_dtype`, concatenates them through
    `nw.concat_str`, and asserts both the resulting data and that the native
    `concat_col` dtype equals `result_dtype`.
    """
    df = pd.DataFrame({"fruit": ["apple", "banana"]}, dtype=left_dtype)
    # Create the column from a scalar first, then cast it to the dtype under test.
    df["new_str_col"] = "!"
    df["new_str_col"] = df["new_str_col"].astype(right_dtype)  # pyrefly: ignore[missing-attribute] https://github.com/facebook/pyrefly/issues/3299
    res = nw.from_native(df).with_columns(
        concat_col=nw.concat_str([nw.col("fruit"), nw.col("new_str_col")])
    )
    expected = {
        "fruit": ["apple", "banana"],
        "new_str_col": ["!", "!"],
        "concat_col": ["apple!", "banana!"],
    }
    assert_equal_data(res, expected)
    assert res.to_native()["concat_col"].dtype == result_dtype
Loading