Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,13 @@ jobs:
cd $RUNNER_TEMP
pytest -vv --showlocals $PROJECT_CWD

- name: "Re-run tests with pandas 2"
run: |
PROJECT_CWD=$PWD
pip install "pandas>=2,<3"
cd $RUNNER_TEMP
pytest -vv --showlocals $PROJECT_CWD

- name: "Re-run tests without pandas"
run: |
PROJECT_CWD=$PWD
Expand Down
13 changes: 3 additions & 10 deletions examples/incomplete_iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,14 @@


def check_dataframe_deps():
pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
Please `pip install pandas>=1.0,<3.0` to proceed."""
pd_error = """Pandas is required for dataframe functionality.
Please `pip install pandas` to proceed."""

try:
import pandas as pd
import pandas
except ImportError:
raise Exception(pd_error)

from packaging.version import Version

if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
"3.0.0.dev0"
):
raise Exception(pd_error)


# Name of the array to create.
array_name = "incomplete_iteration"
Expand Down
13 changes: 3 additions & 10 deletions examples/parallel_csv_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,14 @@


def check_dataframe_deps():
pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
Please `pip install pandas>=1.0,<3.0` to proceed."""
pd_error = """Pandas is required for dataframe functionality.
Please `pip install pandas` to proceed."""

try:
import pandas as pd
import pandas
except ImportError:
raise Exception(pd_error)

from packaging.version import Version

if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
"3.0.0.dev0"
):
raise Exception(pd_error)


def generate_csvs(csv_folder, count=9, min_length=1, max_length=109):
def make_dataframe(col_size):
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ test = [
"hypothesis",
"psutil",
"pyarrow",
"pandas<3",
"pandas",
"dask[distributed]",
]

Expand Down Expand Up @@ -118,6 +118,6 @@ test-requires = [
"hypothesis",
"psutil",
"pyarrow",
"pandas<3",
"pandas",
]
test-command = "pytest {project}"
54 changes: 25 additions & 29 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,14 @@


def check_dataframe_deps():
pd_error = """Pandas version >= 1.0 and < 3.0 required for dataframe functionality.
Please `pip install pandas>=1.0,<3.0` to proceed."""
pa_error = """PyArrow version >= 1.0 is suggested for dataframe functionality.
Please `pip install pyarrow>=1.0`."""
pd_error = """Pandas is required for dataframe functionality.
Please `pip install pandas` to proceed."""

try:
import pandas as pd
import pandas
except ImportError:
raise Exception(pd_error)

from packaging.version import Version

if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
"3.0.0.dev0"
):
raise Exception(pd_error)

try:
import pyarrow as pa

if Version(pa.__version__) < Version("1.0"):
warnings.warn(pa_error)
except ImportError:
warnings.warn(pa_error)


# Note: 'None' is used to indicate optionality for many of these options
# For example, if the `sparse` argument is unspecified we will default
Expand Down Expand Up @@ -154,7 +137,7 @@ class ColumnInfo:

@classmethod
def from_values(cls, array_like, varlen_types=()):
from pandas import CategoricalDtype
from pandas import CategoricalDtype, StringDtype
from pandas.api import types as pd_types

if pd_types.is_object_dtype(array_like):
Expand All @@ -171,6 +154,16 @@ def from_values(cls, array_like, varlen_types=()):
raise NotImplementedError(
f"{inferred_dtype} inferred dtype not supported (column {array_like.name})"
)
elif hasattr(array_like, "dtype") and isinstance(array_like.dtype, StringDtype):
# Explicit pd.StringDtype() (name="string") is always nullable;
# auto-inferred str (name="str") depends on data
explicit = array_like.dtype.name == "string"
return cls(
np.dtype(np.str_),
repr="string" if explicit else None,
var=True,
nullable=explicit or bool(array_like.isna().any()),
)
elif hasattr(array_like, "dtype") and isinstance(
array_like.dtype, CategoricalDtype
):
Expand Down Expand Up @@ -211,6 +204,14 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):
dtype = pd_types.pandas_dtype(dtype)
# Note: be careful if you rearrange the order of the following checks

# pandas StringDtype (auto-inferred 'str' and explicit 'string')
from pandas import StringDtype

if isinstance(dtype, StringDtype):
repr_val = "string" if dtype.name == "string" else None
nullable = dtype.name == "string"
return cls(np.dtype(np.str_), repr=repr_val, var=True, nullable=nullable)

# extension types
if pd_types.is_extension_array_dtype(dtype):
if libtiledb_version() < (2, 10) and pd_types.is_bool_dtype(dtype):
Expand Down Expand Up @@ -255,12 +256,7 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):

# datetime types
if pd_types.is_datetime64_any_dtype(dtype):
if dtype == "datetime64[ns]":
return cls(dtype)
else:
raise NotImplementedError(
f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
)
return cls(dtype)

# string types
# don't use pd_types.is_string_dtype() because it includes object types too
Expand Down Expand Up @@ -517,8 +513,8 @@ def _df_to_np_arrays(df, column_infos, fillna):
if not column_info.var:
to_numpy_kwargs.update(dtype=column_info.dtype)

if column_info.nullable:
# use default 0/empty for the dtype
if column_info.nullable and column.isna().any():
# Only create nullmap if data actually has nulls
to_numpy_kwargs.update(na_value=column_info.dtype.type())
nullmaps[name] = (~column.isna()).to_numpy(dtype=np.uint8)

Expand Down
8 changes: 8 additions & 0 deletions tiledb/dense_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,14 @@ def _setitem_impl(self, selection, val, nullmaps: dict):

try:
if attr.isvar:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)
if attr.isnullable and name not in nullmaps:
Expand Down
6 changes: 5 additions & 1 deletion tiledb/multirange_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,11 @@ def _update_df_from_meta(
col_dtypes[name] = dtype

if col_dtypes:
df = df.astype(col_dtypes, copy=False)
# Use str instead of '<U0' so pandas uses its native string type
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is our type mapping wrong now? I don't quite follow why we need this change.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment in the code explaining why `str` is used instead of '<U0'.

col_dtypes = {
name: str if dtype == "<U0" else dtype for name, dtype in col_dtypes.items()
}
df = df.astype(col_dtypes)

if index_col:
if index_col is not True:
Expand Down
9 changes: 9 additions & 0 deletions tiledb/sparse_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
attr_val = val[name]

try:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isvar
and attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)

Expand Down
11 changes: 3 additions & 8 deletions tiledb/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,12 @@

def has_pandas():
try:
import pandas as pd
except ImportError:
return False
import pandas

if Version(pd.__version__) < Version("1.0") or Version(pd.__version__) >= Version(
"3.0.0.dev0"
):
return True
except ImportError:
return False

return True


def has_pyarrow():
try:
Expand Down
8 changes: 7 additions & 1 deletion tiledb/tests/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,17 @@ def __len__(self):
return len(self._flat_arrays)

def __getitem__(self, i):
return self._flat_arrays[i]
if isinstance(i, (int, np.integer)):
return self._flat_arrays[i]
return type(self)(self._flat_arrays[i], self._dtype)

@property
def dtype(self):
return self._dtype

def copy(self):
return type(self)(self._flat_arrays, self._dtype)

@property
def ndim(self):
return 1
4 changes: 2 additions & 2 deletions tiledb/tests/test_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_array_schema_enumeration(self):

@pytest.mark.skipif(
not has_pyarrow() or not has_pandas(),
reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
reason="pyarrow>=1.0 and/or pandas not installed",
)
@pytest.mark.parametrize("sparse", [True, False])
@pytest.mark.parametrize("pass_df", [True, False])
Expand Down Expand Up @@ -185,7 +185,7 @@ def test_enum_dtypes(self, dtype, values):
assert enmr.dtype == enmr.values().dtype == dtype
assert_array_equal(enmr.values(), values)

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
def test_from_pandas_dtype_mismatch(self):
import pandas as pd

Expand Down
7 changes: 3 additions & 4 deletions tiledb/tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_examples(self, path):
]
]
if not has_pandas() and path in requires_pd:
pytest.mark.skip("pandas>=1.0,<3.0 not installed")
pytest.mark.skip("pandas not installed")
else:
with tempfile.TemporaryDirectory() as tmpdir:
try:
Expand Down Expand Up @@ -73,10 +73,9 @@ def test_docs(self, capsys):
if failures:
stderr = capsys.readouterr().out
if "No module named 'pandas'" in stderr or (
"Pandas version >= 1.0 and < 3.0 required for dataframe functionality"
in stderr
"Pandas is required for dataframe functionality" in stderr
and not has_pandas()
):
pytest.skip("pandas>=1.0,<3.0 not installed")
pytest.skip("pandas not installed")
else:
pytest.fail(stderr)
6 changes: 3 additions & 3 deletions tiledb/tests/test_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_ch8292(self):
buffers = list(*q._get_buffers().values())
assert buffers[0].nbytes == max_val

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
def test_ch10282_concurrent_multi_index(self):
"""Test concurrent access to a single tiledb.Array using
Array.multi_index and Array.df. We pass an array and slice
Expand Down Expand Up @@ -230,7 +230,7 @@ def test_fix_stats_error_messages(self):

@pytest.mark.skipif(
not has_pandas() and has_pyarrow(),
reason="pandas>=1.0,<3.0 or pyarrow>=1.0 not installed",
reason="pandas or pyarrow>=1.0 not installed",
)
def test_py1078_df_all_empty_strings(self):
uri = self.path()
Expand All @@ -246,7 +246,7 @@ def test_py1078_df_all_empty_strings(self):
with tiledb.open(uri) as arr:
tm.assert_frame_equal(arr.df[:], df)

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
@pytest.mark.parametrize("is_sparse", [True, False])
def test_sc1430_nonexisting_timestamp(self, is_sparse):
path = self.path("nonexisting_timestamp")
Expand Down
2 changes: 1 addition & 1 deletion tiledb/tests/test_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
tm = pd._testing


@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
@pytest.mark.parametrize("mode", ["np", "df"])
@hp.settings(deadline=None, verbosity=hp.Verbosity.verbose)
@hp.given(st.binary())
Expand Down
8 changes: 4 additions & 4 deletions tiledb/tests/test_libtiledb.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def test_array_delete(self):

@pytest.mark.skipif(
not has_pyarrow() or not has_pandas(),
reason="pyarrow>=1.0 and/or pandas>=1.0,<3.0 not installed",
reason="pyarrow>=1.0 and/or pandas not installed",
)
@pytest.mark.parametrize("sparse", [True, False])
@pytest.mark.parametrize("pass_df", [True, False])
Expand Down Expand Up @@ -1784,7 +1784,7 @@ def test_query_real_multi_index(self, fx_sparse_cell_order):
"coords" not in T.query(coords=False).multi_index[-10.0:5.0]
)

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
@pytest.mark.parametrize("dtype", ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8"])
def test_sparse_index_dtypes(self, dtype):
path = self.path()
Expand All @@ -1805,7 +1805,7 @@ def test_sparse_index_dtypes(self, dtype):
assert B[data[1]]["attr"] == data[1]
assert B.multi_index[data[0]]["attr"] == data[0]

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
@pytest.mark.skipif(
tiledb.libtiledb.version() < (2, 10),
reason="TILEDB_BOOL introduced in libtiledb 2.10",
Expand Down Expand Up @@ -3743,7 +3743,7 @@ def test_query_return_incomplete_error(self, sparse):
with self.assertRaises(tiledb.TileDBError):
A.query(return_incomplete=True)[:]

@pytest.mark.skipif(not has_pandas(), reason="pandas>=1.0,<3.0 not installed")
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
@pytest.mark.parametrize(
"use_arrow, return_arrow, indexer",
[
Expand Down
Loading
Loading