From 554cbfb9c3511c02f64c4fc76e84220a4c92ea4c Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Fri, 6 Mar 2026 19:47:30 +0100 Subject: [PATCH 1/5] Resolve several errors/warnings --- cpp/apidoc/Doxyfile | 1 + .../compute/kernels/scalar_arithmetic.cc | 4 ++-- .../arrow/compute/kernels/vector_pairwise.cc | 20 +++++++++---------- docs/source/cpp/env_vars.rst | 2 +- docs/source/format/CanonicalExtensions.rst | 5 +++++ docs/source/format/Security.rst | 4 ++-- python/pyarrow/_azurefs.pyx | 1 + python/pyarrow/_compute.pyx | 4 ++-- python/pyarrow/array.pxi | 3 ++- python/pyarrow/parquet/core.py | 4 ++++ python/pyarrow/table.pxi | 4 ++-- 11 files changed, 32 insertions(+), 20 deletions(-) diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index 9cff530791be..9b5a750652ae 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2476,6 +2476,7 @@ PREDEFINED = __attribute__(x)= \ __declspec(x)= \ ARROW_ACERO_EXPORT= \ ARROW_ARG_UNUSED(x)=x \ + ARROW_CUDA_EXPORT= \ ARROW_DEPRECATED(x)= \ ARROW_DS_EXPORT= \ ARROW_ENGINE_EXPORT= \ diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 03c9422809b8..f09e209e81df 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -1347,13 +1347,13 @@ const FunctionDoc atan2_doc{"Compute the inverse tangent of y/x", {"y", "x"}}; const FunctionDoc atanh_doc{"Compute the inverse hyperbolic tangent", - ("NaN is returned for input values x with |x| > 1.\n" + ("NaN is returned for input values x with \\|x\\| > 1.\n" "At x = +/- 1, returns +/- infinity.\n" "To raise an error instead, see \"atanh_checked\"."), {"x"}}; const FunctionDoc atanh_checked_doc{"Compute the inverse hyperbolic tangent", - ("Input values x with |x| >= 1.0 raise an error\n" + ("Input values x with \\|x\\| >= 1.0 raise an error\n" "to return NaN instead, see \"atanh\"."), {"x"}}; diff --git 
a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 2c61afcc25ab..51d6f959acfc 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -111,23 +111,23 @@ Status PairwiseExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) const FunctionDoc pairwise_diff_doc( "Compute first order difference of an array", - ("Computes the first order difference of an array, It internally calls \n" - "the scalar function \"subtract\" to compute \n differences, so its \n" - "behavior and supported types are the same as \n" - "\"subtract\". The period can be specified in :struct:`PairwiseOptions`.\n" + ("Computes the first order difference of an array. It internally calls\n" + "the scalar function \"subtract\" to compute differences, so its\n" + "behavior and supported types are the same as\n" + "\"subtract\". The period can be specified in `PairwiseOptions`.\n" "\n" - "Results will wrap around on integer overflow. Use function \n" + "Results will wrap around on integer overflow. Use function\n" "\"pairwise_diff_checked\" if you want overflow to return an error."), {"input"}, "PairwiseOptions"); const FunctionDoc pairwise_diff_checked_doc( "Compute first order difference of an array", - ("Computes the first order difference of an array, It internally calls \n" - "the scalar function \"subtract_checked\" (or the checked variant) to compute \n" - "differences, so its behavior and supported types are the same as \n" - "\"subtract_checked\". The period can be specified in :struct:`PairwiseOptions`.\n" + ("Computes the first order difference of an array. It internally calls\n" + "the scalar function \"subtract_checked\" (or the checked variant) to compute\n" + "differences, so its behavior and supported types are the same as\n" + "\"subtract_checked\". The period can be specified in `PairwiseOptions`.\n" "\n" - "This function returns an error on overflow. 
For a variant that doesn't \n" + "This function returns an error on overflow. For a variant that doesn't\n" "fail on overflow, use function \"pairwise_diff\"."), {"input"}, "PairwiseOptions"); diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 6ee6993e2ba7..c6f5ee60324e 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -94,7 +94,7 @@ that changing their value later will have an effect. value "1"; use "0" to disable. If enabled, at-fork handlers make Arrow C++ compatible with the use of the - ``fork()`` system call, such as by Python's :python:mod:`multiprocessing`, + ``fork()`` system call, such as by Python's :py:mod:`multiprocessing`, but at the expense of executing `potentially unsafe code `__ in a forked child process if the parent process is multi-threaded. diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5de0da8354b7..467c7d11ec89 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -546,6 +546,11 @@ Primitive Type Mappings | UUID extension type | UUID | +----------------------+------------------------+ +.. toctree:: + :maxdepth: 1 + + CanonicalExtensions/Examples + .. _timestamp_with_offset_extension: Timestamp With Offset diff --git a/docs/source/format/Security.rst b/docs/source/format/Security.rst index 0c117fe1e21d..e14f07143ce6 100644 --- a/docs/source/format/Security.rst +++ b/docs/source/format/Security.rst @@ -52,7 +52,7 @@ Columnar Format Invalid data ------------ -The Arrow :ref:`columnar format <_format_columnar>` is an efficient binary +The Arrow :ref:`columnar format <format_columnar>` is an efficient binary representation with a focus on performance and efficiency. While the format does not store raw pointers, the contents of Arrow buffers are often combined and converted to pointers into the process' address space. @@ -165,7 +165,7 @@ have bugs anyway. 
IPC Format ========== -The :ref:`IPC format <_ipc-message-format>` is a serialization format for the +The :ref:`IPC format <ipc-message-format>` is a serialization format for the columnar format with associated metadata. Reading an IPC stream or file from an untrusted source comes with similar caveats as reading the Arrow columnar format. diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index deb58b0aed84..865aabceceee 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -67,6 +67,7 @@ cdef class AzureFileSystem(FileSystem): user-assigned managed identity and need to explicitly specify which one (e.g., if the resource has multiple user-assigned identities). For system-assigned managed identities, this parameter is typically not required. + client_secret : str, default None Client secret for Azure Active Directory authentication. Must be provided together with `tenant_id` and `client_id` to use ClientSecretCredential. diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index c80e4f9316a7..137b034d6ffc 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -3160,7 +3160,7 @@ def register_vector_function(func, function_name, function_doc, in_types, out_ty all arguments are scalar, else it must return an Array. To define a varargs function, pass a callable that takes - *args. The last in_type will be the type of all varargs + ``*args``. The last in_type will be the type of all varargs arguments. function_name : str Name of the function. There should only be one function @@ -3241,7 +3241,7 @@ def register_aggregate_function(func, function_name, function_doc, in_types, out in_types defined. It must return a Scalar matching the out_type. To define a varargs function, pass a callable that takes - *args. The in_type needs to match in type of inputs when + ``*args``. The in_type needs to match in type of inputs when the function gets called. function_name : str Name of the function. 
This name must be unique, i.e., diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 78db88e88409..2c1597e21586 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2557,6 +2557,7 @@ cdef class BaseListArray(Array): -------- Basic logical list-array's flatten + >>> import pyarrow as pa >>> values = [1, 2, 3, 4] >>> offsets = [2, 1, 0] @@ -4840,7 +4841,7 @@ cdef class Bool8Array(ExtensionArray): def from_numpy(obj): """ Convert numpy array to a bool8 extension array without making a copy. - The input array must be 1-dimensional, with either bool_ or int8 dtype. + The input array must be 1-dimensional, with either ``bool_`` or ``int8`` dtype. Parameters ---------- diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 354f18124b53..013503563b33 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -898,6 +898,7 @@ def _sanitize_table(table, new_schema, flavor): store_decimal_as_integer : bool, default False Allow decimals with 1 <= precision <= 18 to be stored as integers. In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. - int64: for 10 <= precision <= 18. - fixed_len_byte_array: precision is limited by the array size. @@ -907,6 +908,7 @@ def _sanitize_table(table, new_schema, flavor): By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. - int64: for 10 <= precision <= 18. - fixed_len_byte_array: for precision > 18. @@ -927,6 +929,7 @@ def _sanitize_table(table, new_schema, flavor): before any Parquet encodings). A `dict` can be passed to adjust the chunker parameters with the following keys: + - `min_chunk_size`: minimum chunk size in bytes, default 256 KiB The rolling hash will not be updated until this size is reached for each chunk. 
Note that all data sent through the hash function is counted towards the chunk @@ -945,6 +948,7 @@ def _sanitize_table(table, new_schema, flavor): balance between deduplication ratio and fragmentation. Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the expense of fragmentation. + write_time_adjusted_to_utc : bool, default False Set the value of isAdjustedTOUTC when writing a TIME column. If True, this tells the Parquet reader that the TIME columns diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5ca7762dec8b..2e04fa75b8b7 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3629,7 +3629,7 @@ cdef class RecordBatch(_Tabular): b: [10,20,30,40,null] Convert a RecordBatch to row-major Tensor with null values - written as ``NaN``s + written as NaN values >>> batch.to_tensor(null_to_nan=True) @@ -5779,7 +5779,7 @@ cdef class Table(_Tabular): ------- Table - Example + Examples -------- >>> import pyarrow as pa >>> t1 = pa.table({'id': [1, 3, 2, 3, 3], From afe3b2a7708bfa73a207a37e9c58d2f6f72a46d0 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Tue, 10 Mar 2026 01:04:12 +0100 Subject: [PATCH 2/5] Fix split parsing --- cpp/src/arrow/result.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index a5e4f55db0f7..564e2848bf3a 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -228,8 +228,8 @@ class [[nodiscard]] Result : public util::EqualityComparable<Result<T>> { } /// Templatized constructor which constructs a `Result<T>` by moving the - /// contents of a `Result<U>`. `T` must be implicitly constructible from `U - /// &&`. + /// contents of a `Result<U>`. `T` must be implicitly constructible from + /// `U&&`. /// /// Sets `other` to contain a non-OK status with a `StatusError::Invalid` /// error code. 
From 6edd3a113810fb4c506b45779239a92fb7f324c0 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Mon, 16 Mar 2026 12:55:50 +0100 Subject: [PATCH 3/5] Resolve more errors --- ci/scripts/python_build.sh | 3 +++ docs/source/implementations.rst | 2 +- docs/source/python/index.rst | 2 +- python/pyarrow/_azurefs.pyx | 1 + python/pyarrow/parquet/core.py | 1 + 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index f8c1af3982dd..e2377f994478 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -106,6 +106,9 @@ if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then rm -rf "${python_build_dir}/cpp/examples" mkdir -p "${python_build_dir}/cpp" cp -a "${arrow_dir}/cpp/examples" "${python_build_dir}/cpp/" + rm -rf "${python_build_dir}/python/examples" + mkdir -p "${python_build_dir}/python" + cp -a "${arrow_dir}/python/examples" "${python_build_dir}/python/" rm -rf "${python_build_dir}/ci" cp -a "${arrow_dir}/ci/" "${python_build_dir}/" export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml diff --git a/docs/source/implementations.rst b/docs/source/implementations.rst index 44f851332135..562576655062 100644 --- a/docs/source/implementations.rst +++ b/docs/source/implementations.rst @@ -113,7 +113,7 @@ The source files for the Cookbook are maintained in the C++ C GLib Go - Java + Java JavaScript Julia MATLAB diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 337769f246ee..d31b844cc879 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -67,5 +67,5 @@ files into Arrow structures. 
env_vars api getting_involved - benchmarks + ../developers/benchmarks Python cookbook diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index 865aabceceee..58896fbd5e66 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -59,6 +59,7 @@ cdef class AzureFileSystem(FileSystem): client_id : str, default None The client ID (Application ID) for Azure Active Directory authentication. Its interpretation depends on the credential type being used: + - For `ClientSecretCredential`: It is the Application (client) ID of your registered Azure AD application (Service Principal). It must be provided together with `tenant_id` and `client_secret` to use ClientSecretCredential. diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 013503563b33..60c9f5ac8849 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -859,6 +859,7 @@ def _sanitize_table(table, new_schema, flavor): item; } } + encryption_properties : FileEncryptionProperties, default None File encryption properties for Parquet Modular Encryption. If None, no encryption will be done. 
From 79c71f05792e6d7eae1e9b59e9e39db9704ebe81 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Mon, 16 Mar 2026 15:56:51 +0100 Subject: [PATCH 4/5] Retract fix for warning: download file not readable --- ci/scripts/python_build.sh | 3 --- docs/source/python/index.rst | 1 - 2 files changed, 4 deletions(-) diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index e2377f994478..f8c1af3982dd 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -106,9 +106,6 @@ if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then rm -rf "${python_build_dir}/cpp/examples" mkdir -p "${python_build_dir}/cpp" cp -a "${arrow_dir}/cpp/examples" "${python_build_dir}/cpp/" - rm -rf "${python_build_dir}/python/examples" - mkdir -p "${python_build_dir}/python" - cp -a "${arrow_dir}/python/examples" "${python_build_dir}/python/" rm -rf "${python_build_dir}/ci" cp -a "${arrow_dir}/ci/" "${python_build_dir}/" export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index d31b844cc879..cef8998ed4cd 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -67,5 +67,4 @@ files into Arrow structures. env_vars api getting_involved - ../developers/benchmarks Python cookbook From 0ba9b90f23e132b3aef7650a83dbe2e7b563c8fb Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Tue, 17 Mar 2026 16:59:17 +0100 Subject: [PATCH 5/5] CanonicalExtensions/Examples navigation w/o new link or subtitle --- docs/source/format/CanonicalExtensions.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 467c7d11ec89..c6cd8f3ea13a 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -546,11 +546,6 @@ Primitive Type Mappings | UUID extension type | UUID | +----------------------+------------------------+ -.. 
toctree:: - :maxdepth: 1 - - CanonicalExtensions/Examples - .. _timestamp_with_offset_extension: Timestamp With Offset @@ -595,4 +590,10 @@ Arrow extension types for representing vector geometries. It is well known within the Arrow geospatial subcommunity. The GeoArrow specification is not yet finalized. +.. toctree:: + :maxdepth: 1 + :hidden: + + CanonicalExtensions/Examples + .. _rfc8259: https://datatracker.ietf.org/doc/html/rfc8259