Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions examples/incomplete_iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,14 @@


def check_dataframe_deps():
    """Raise if pandas is not importable.

    The example needs pandas for its dataframe functionality; no version
    pinning is enforced — any installed pandas is accepted.

    Raises:
        Exception: with an installation hint when pandas is missing.
    """
    pd_error = """Pandas is required for dataframe functionality.
    Please `pip install pandas` to proceed."""

    try:
        # Import is used purely as an availability probe.
        import pandas  # noqa: F401
    except ImportError:
        raise Exception(pd_error)


# Name of the array to create.
array_name = "incomplete_iteration"
Expand Down
13 changes: 3 additions & 10 deletions examples/parallel_csv_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,14 @@


def check_dataframe_deps():
    """Raise if pandas is not importable.

    The example needs pandas for its dataframe functionality; no version
    pinning is enforced — any installed pandas is accepted.

    Raises:
        Exception: with an installation hint when pandas is missing.
    """
    pd_error = """Pandas is required for dataframe functionality.
    Please `pip install pandas` to proceed."""

    try:
        # Import is used purely as an availability probe.
        import pandas  # noqa: F401
    except ImportError:
        raise Exception(pd_error)


def generate_csvs(csv_folder, count=9, min_length=1, max_length=109):
def make_dataframe(col_size):
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ test = [
"hypothesis",
"psutil",
"pyarrow",
"pandas<3",
"pandas",
"dask[distributed]",
]

Expand Down Expand Up @@ -118,6 +118,6 @@ test-requires = [
"hypothesis",
"psutil",
"pyarrow",
"pandas<3",
"pandas",
]
test-command = "pytest {project}"
54 changes: 25 additions & 29 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,14 @@


def check_dataframe_deps():
    """Raise if pandas is not importable.

    Pandas is a hard requirement for dataframe functionality; no version
    bound is enforced (the former >=1.0,<3.0 gate and the pyarrow version
    warning were dropped along with the new, unpinned install message).

    Raises:
        Exception: with an installation hint when pandas is missing.
    """
    pd_error = """Pandas is required for dataframe functionality.
    Please `pip install pandas` to proceed."""

    try:
        # Import is used purely as an availability probe.
        import pandas  # noqa: F401
    except ImportError:
        raise Exception(pd_error)


# Note: 'None' is used to indicate optionality for many of these options
# For example, if the `sparse` argument is unspecified we will default
Expand Down Expand Up @@ -154,7 +137,7 @@ class ColumnInfo:

@classmethod
def from_values(cls, array_like, varlen_types=()):
from pandas import CategoricalDtype
from pandas import CategoricalDtype, StringDtype
from pandas.api import types as pd_types

if pd_types.is_object_dtype(array_like):
Expand All @@ -171,6 +154,16 @@ def from_values(cls, array_like, varlen_types=()):
raise NotImplementedError(
f"{inferred_dtype} inferred dtype not supported (column {array_like.name})"
)
elif hasattr(array_like, "dtype") and isinstance(array_like.dtype, StringDtype):
# Explicit pd.StringDtype() (name="string") is always nullable;
# auto-inferred str (name="str") depends on data
explicit = array_like.dtype.name == "string"
return cls(
np.dtype(np.str_),
repr="string" if explicit else None,
var=True,
nullable=explicit or bool(array_like.isna().any()),
)
elif hasattr(array_like, "dtype") and isinstance(
array_like.dtype, CategoricalDtype
):
Expand Down Expand Up @@ -211,6 +204,14 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):
dtype = pd_types.pandas_dtype(dtype)
# Note: be careful if you rearrange the order of the following checks

# pandas StringDtype (auto-inferred 'str' and explicit 'string')
from pandas import StringDtype

if isinstance(dtype, StringDtype):
repr_val = "string" if dtype.name == "string" else None
nullable = dtype.name == "string"
return cls(np.dtype(np.str_), repr=repr_val, var=True, nullable=nullable)

# extension types
if pd_types.is_extension_array_dtype(dtype):
if libtiledb_version() < (2, 10) and pd_types.is_bool_dtype(dtype):
Expand Down Expand Up @@ -255,12 +256,7 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):

# datetime types
if pd_types.is_datetime64_any_dtype(dtype):
if dtype == "datetime64[ns]":
return cls(dtype)
else:
raise NotImplementedError(
f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
)
return cls(dtype)

# string types
# don't use pd_types.is_string_dtype() because it includes object types too
Expand Down Expand Up @@ -517,8 +513,8 @@ def _df_to_np_arrays(df, column_infos, fillna):
if not column_info.var:
to_numpy_kwargs.update(dtype=column_info.dtype)

if column_info.nullable:
# use default 0/empty for the dtype
if column_info.nullable and column.isna().any():
# Only create nullmap if data actually has nulls
to_numpy_kwargs.update(na_value=column_info.dtype.type())
nullmaps[name] = (~column.isna()).to_numpy(dtype=np.uint8)

Expand Down
8 changes: 8 additions & 0 deletions tiledb/dense_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,14 @@ def _setitem_impl(self, selection, val, nullmaps: dict):

try:
if attr.isvar:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)
if attr.isnullable and name not in nullmaps:
Expand Down
6 changes: 5 additions & 1 deletion tiledb/multirange_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,11 @@ def _update_df_from_meta(
col_dtypes[name] = dtype

if col_dtypes:
df = df.astype(col_dtypes, copy=False)
# Use str instead of '<U0' so pandas uses its native string type
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is our type-mapping wrong now? I don't quite follow why we need this change

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment in code.

col_dtypes = {
name: str if dtype == "<U0" else dtype for name, dtype in col_dtypes.items()
}
df = df.astype(col_dtypes)

if index_col:
if index_col is not True:
Expand Down
9 changes: 9 additions & 0 deletions tiledb/sparse_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
attr_val = val[name]

try:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isvar
and attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)

Expand Down
11 changes: 3 additions & 8 deletions tiledb/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,12 @@

def has_pandas():
    """Return True if pandas can be imported, False otherwise.

    No version check is performed — mere importability is enough.
    """
    try:
        import pandas  # noqa: F401

        return True
    except ImportError:
        return False


def has_pyarrow():
try:
Expand Down
8 changes: 7 additions & 1 deletion tiledb/tests/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,17 @@ def __len__(self):
return len(self._flat_arrays)

def __getitem__(self, i):
    # Scalar (Python int or numpy integer) index returns the single
    # underlying element; any other index (slice, mask, fancy index) is
    # delegated to the flat storage and rewrapped as a new array of the
    # same extension type so pandas keeps the dtype through selections.
    # (The stale unconditional `return self._flat_arrays[i]` that made
    # the dispatch unreachable has been removed.)
    if isinstance(i, (int, np.integer)):
        return self._flat_arrays[i]
    return type(self)(self._flat_arrays[i], self._dtype)

@property
def dtype(self):
    # The ExtensionDtype instance this array was constructed with.
    return self._dtype

def copy(self):
    # Return a new array of the same type; the underlying flat arrays
    # object is passed through as-is (shallow copy), only the wrapper
    # is new.
    return type(self)(self._flat_arrays, self._dtype)

@property
def ndim(self):
    # This array type is one-dimensional by construction.
    return 1
57 changes: 25 additions & 32 deletions tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,32 +204,34 @@ def test_implemented(self, type_specs, info_dtype, info_repr, info_nullable):

def test_object_dtype(self):
self.assertColumnInfo(
ColumnInfo.from_values(pd.Series(["hello", "world"])), np.dtype("<U")
ColumnInfo.from_values(pd.Series(["hello", "world"], dtype=object)),
np.dtype("<U"),
)
self.assertColumnInfo(
ColumnInfo.from_values(pd.Series([b"hello", b"world"])), np.dtype("S")
ColumnInfo.from_values(pd.Series([b"hello", b"world"], dtype=object)),
np.dtype("S"),
)
for s in ["hello", b"world"], ["hello", 1], [b"hello", 1]:
pytest.raises(NotImplementedError, ColumnInfo.from_values, pd.Series(s))

def test_string_dtype(self):
    """ColumnInfo mapping for pandas string-typed Series."""
    # Auto-inferred str type: non-nullable when data has no nulls
    info = ColumnInfo.from_values(pd.Series(["hello", "world"]))
    assert info.dtype == np.dtype("<U")
    assert info.var is True
    assert info.nullable is False
    # With nulls: pandas 3+ auto-infers StringDtype which preserves null info;
    # pandas 2 uses object dtype where null detection happens in the write path
    s = pd.Series(["hello", None])
    info = ColumnInfo.from_values(s)
    assert info.dtype == np.dtype("<U")
    assert info.var is True
    # nullable flag must track whether the dtype itself carries null info
    assert info.nullable is isinstance(s.dtype, pd.StringDtype)

unsupported_type_specs = [
[np.float16, "f2"],
[np.complex64, "c8"],
[np.complex128, "c16"],
[np.datetime64, "<M8", "datetime64"],
[
"<M8[Y]",
"<M8[M]",
"<M8[W]",
"<M8[h]",
"<M8[m]",
"<M8[s]",
"<M8[ms]",
"<M8[us]",
"<M8[ps]",
"<M8[fs]",
"<M8[as]",
],
]
if hasattr(np, "float128"):
unsupported_type_specs.append([np.float128, "f16"])
Expand Down Expand Up @@ -443,7 +445,7 @@ def test_dataframe_basic_rt1_manual(self):
times = df["time"]
cccc = df["cccc"]

df = df.drop(columns=["time", "cccc"], axis=1)
df = df.drop(columns=["time", "cccc"])
A[s_ichars, times, cccc] = df.to_dict(orient="series")

with tiledb.SparseArray(uri) as A:
Expand Down Expand Up @@ -603,12 +605,12 @@ def test_dataframe_index_to_sparse_dims(self):

# ensure that all column which will be used as string dim index
# is sorted, because that is how it will be returned
if df.dtypes[col] == "O":
if pd.api.types.is_string_dtype(df.dtypes[col]):
df.sort_values(col, inplace=True)

# also ensure that string columns are converted to bytes
# b/c only TILEDB_ASCII supported for string dimension
if isinstance(df[col][0], str):
if isinstance(df[col].iloc[0], str):
df[col] = [x.encode("UTF-8") for x in df[col]]

new_df = df.drop_duplicates(subset=col)
Expand Down Expand Up @@ -1446,13 +1448,7 @@ def try_rt(name, df, pq_args={}):
tdb_uri = os.path.join(uri, f"{name}.tdb")
pq_uri = os.path.join(uri, f"{name}.pq")

df.to_parquet(
pq_uri,
# this is required to losslessly serialize timestamps
# until Parquet 2.0 is default.
use_deprecated_int96_timestamps=True,
**pq_args,
)
df.to_parquet(pq_uri, **pq_args)

tiledb.from_parquet(str(tdb_uri), str(pq_uri))
df_bk = tiledb.open_dataframe(tdb_uri)
Expand Down Expand Up @@ -1995,9 +1991,8 @@ def test_datetime64_days_dtype_read_sc25572(checked_path):
assert_dict_arrays_equal(array[:], data)
df_received = array.df[:]
df_received = df_received.set_index("d1")
tm.assert_frame_equal(
original_df, df_received, check_datetimelike_compat=True, check_dtype=False
)
# TileDB returns datetime.date objects for datetime64[D], convert both to strings
tm.assert_frame_equal(original_df.astype(str), df_received.astype(str))


def test_datetime64_days_dtype_write_sc25572(checked_path):
Expand All @@ -2024,9 +2019,7 @@ def test_datetime64_days_dtype_write_sc25572(checked_path):
with tiledb.open(uri, "r") as array:
assert_dict_arrays_equal(array[:], data)
df_received = array.df[:]
tm.assert_frame_equal(
original_df, df_received, check_datetimelike_compat=True, check_dtype=False
)
tm.assert_frame_equal(original_df, df_received, check_dtype=False)


def test_datetime64_days_dtype_read_out_of_range_sc25572(checked_path):
Expand Down