Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class ColumnInfo:

@classmethod
def from_values(cls, array_like, varlen_types=()):
from pandas import CategoricalDtype
from pandas import CategoricalDtype, StringDtype
from pandas.api import types as pd_types

if pd_types.is_object_dtype(array_like):
Expand All @@ -171,6 +171,16 @@ def from_values(cls, array_like, varlen_types=()):
raise NotImplementedError(
f"{inferred_dtype} inferred dtype not supported (column {array_like.name})"
)
elif hasattr(array_like, "dtype") and isinstance(array_like.dtype, StringDtype):
# Explicit pd.StringDtype() (name="string") is always nullable;
# auto-inferred str (name="str") depends on data
explicit = array_like.dtype.name == "string"
return cls(
np.dtype(np.str_),
repr="string" if explicit else None,
var=True,
nullable=explicit or bool(array_like.isna().any()),
)
elif hasattr(array_like, "dtype") and isinstance(
array_like.dtype, CategoricalDtype
):
Expand Down Expand Up @@ -211,6 +221,14 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):
dtype = pd_types.pandas_dtype(dtype)
# Note: be careful if you rearrange the order of the following checks

# pandas StringDtype (auto-inferred 'str' and explicit 'string')
from pandas import StringDtype

if isinstance(dtype, StringDtype):
repr_val = "string" if dtype.name == "string" else None
nullable = dtype.name == "string"
return cls(np.dtype(np.str_), repr=repr_val, var=True, nullable=nullable)

# extension types
if pd_types.is_extension_array_dtype(dtype):
if libtiledb_version() < (2, 10) and pd_types.is_bool_dtype(dtype):
Expand Down Expand Up @@ -255,12 +273,7 @@ def from_dtype(cls, dtype, column_name, varlen_types=()):

# datetime types
if pd_types.is_datetime64_any_dtype(dtype):
if dtype == "datetime64[ns]":
return cls(dtype)
else:
raise NotImplementedError(
f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
)
return cls(dtype)

# string types
# don't use pd_types.is_string_dtype() because it includes object types too
Expand Down Expand Up @@ -517,8 +530,8 @@ def _df_to_np_arrays(df, column_infos, fillna):
if not column_info.var:
to_numpy_kwargs.update(dtype=column_info.dtype)

if column_info.nullable:
# use default 0/empty for the dtype
if column_info.nullable and column.isna().any():
# Only create nullmap if data actually has nulls
to_numpy_kwargs.update(na_value=column_info.dtype.type())
nullmaps[name] = (~column.isna()).to_numpy(dtype=np.uint8)

Expand Down
8 changes: 8 additions & 0 deletions tiledb/dense_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,14 @@ def _setitem_impl(self, selection, val, nullmaps: dict):

try:
if attr.isvar:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)
if attr.isnullable and name not in nullmaps:
Expand Down
6 changes: 5 additions & 1 deletion tiledb/multirange_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,11 @@ def _update_df_from_meta(
col_dtypes[name] = dtype

if col_dtypes:
df = df.astype(col_dtypes, copy=False)
# Use str instead of '<U0' so pandas uses its native string type
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is our type-mapping wrong now? I don't quite follow why we need this change

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment in code.

col_dtypes = {
name: str if dtype == "<U0" else dtype for name, dtype in col_dtypes.items()
}
df = df.astype(col_dtypes)

if index_col:
if index_col is not True:
Expand Down
9 changes: 9 additions & 0 deletions tiledb/sparse_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ def _setitem_impl_sparse(self, selection, val, nullmaps: dict):
attr_val = val[name]

try:
# Capture null mask before np.asarray() loses pandas NA info
if (
attr.isvar
and attr.isnullable
and name not in nullmaps
and hasattr(attr_val, "isna")
):
nullmaps[name] = (~attr_val.isna()).to_numpy(dtype=np.uint8)

# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)

Expand Down
8 changes: 7 additions & 1 deletion tiledb/tests/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,17 @@ def __len__(self):
return len(self._flat_arrays)

def __getitem__(self, i):
return self._flat_arrays[i]
if isinstance(i, (int, np.integer)):
return self._flat_arrays[i]
return type(self)(self._flat_arrays[i], self._dtype)

@property
def dtype(self):
    """The dtype object describing this array's element type."""
    return self._dtype

def copy(self):
    """Return a new instance of the same class sharing this array's
    flat arrays and dtype (a shallow copy)."""
    cls = type(self)
    return cls(self._flat_arrays, self._dtype)

@property
def ndim(self):
    """Number of array dimensions; this container is always 1-D."""
    return 1