diff --git a/cpp/modmesh/buffer/pymod/buffer_pymod.cpp b/cpp/modmesh/buffer/pymod/buffer_pymod.cpp
index 72666256c..9fea9a3e7 100644
--- a/cpp/modmesh/buffer/pymod/buffer_pymod.cpp
+++ b/cpp/modmesh/buffer/pymod/buffer_pymod.cpp
@@ -28,12 +28,40 @@
 #include // Must be the first include.
 
+#include
+
 namespace modmesh
 {
 
 namespace python
 {
 
+namespace
+{
+
+char const * simd_feature_name()
+{
+    using namespace modmesh::simd::detail;
+    switch (detect_simd())
+    {
+    case SIMD_NONE: return "NONE";
+    case SIMD_NEON: return "NEON";
+    case SIMD_SSE: return "SSE";
+    case SIMD_SSE2: return "SSE2";
+    case SIMD_SSE3: return "SSE3";
+    case SIMD_SSSE3: return "SSSE3";
+    case SIMD_SSE41: return "SSE41";
+    case SIMD_SSE42: return "SSE42";
+    case SIMD_AVX: return "AVX";
+    case SIMD_AVX2: return "AVX2";
+    case SIMD_AVX512: return "AVX512";
+    case SIMD_UNKNOWN: return "UNKNOWN";
+    }
+    return "UNKNOWN";
+}
+
+} // namespace
+
 struct buffer_pymod_tag;
 
 template <>
@@ -52,6 +80,14 @@ void initialize_buffer(pybind11::module & mod)
         wrap_ConcreteBuffer(mod);
         wrap_SimpleArray(mod);
         wrap_SimpleArrayPlex(mod);
+
+        // Reports the runtime-detected SIMD feature so pytest can verify that
+        // NEON dispatch is active on aarch64. Without this guard, a regression
+        // that silently routes everything to the scalar path would still pass
+        // every correctness check. Kept under an underscore-prefixed name
+        // because detect_simd() only meaningfully reflects the dispatched
+        // backend on aarch64 today; on other targets it would mislead users.
+        mod.def("_simd_feature", &simd_feature_name);
     };
 
     OneTimeInitializer<buffer_pymod_tag>::me()(mod, initialize_impl);
diff --git a/cpp/modmesh/simd/neon/neon.hpp b/cpp/modmesh/simd/neon/neon.hpp
index 9680dfb88..ead3f6ab2 100644
--- a/cpp/modmesh/simd/neon/neon.hpp
+++ b/cpp/modmesh/simd/neon/neon.hpp
@@ -28,9 +28,12 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include
-#include
+#include
+#include
+
 #include
+#include <concepts>
+#include <functional>
 
 #if defined(__aarch64__)
 #include <arm_neon.h>
@@ -90,216 +93,163 @@ inline constexpr size_t get_recommended_alignment()
 
 } /* end namespace detail */
 
 #if defined(__aarch64__)
-template <typename T, std::enable_if_t<!type::has_vectype<T>> * = nullptr>
-const T * check_between(T const * start, T const * end, T const & min_val, T const & max_val)
+// SFINAE helpers for vectorized operations.
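+// Each functor's call operator is constrained by a trailing decltype on the
+// corresponding NEON intrinsic, so std::invocable<VecOp, vec_t, vec_t> is
+// false whenever the intrinsic lacks an overload for the lane type (e.g.
+// vmulq on 64-bit integer lanes), and transform_binary below falls back to
+// the scalar path for that type.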
+struct vec_add
 {
-    return generic::check_between(start, end, min_val, max_val);
-}
-
-template <typename T, std::enable_if_t<type::has_vectype<T>> * = nullptr>
-const T * check_between(T const * start, T const * end, T const & min_val, T const & max_val)
+    template <typename V>
+    static auto operator()(V a, V b) -> decltype(vaddq(a, b)) { return vaddq(a, b); }
+};
+struct vec_sub
 {
-    using vec_t = type::vector_t<T>;
-    using cmpvec_t = type::vector_t<uint64_t>;
-    constexpr size_t N_lane = type::vector_lane<T>;
-
-#ifndef NDEBUG
-    constexpr size_t alignment = detail::get_recommended_alignment<T>();
-    detail::check_alignment(start, alignment, "check_between start");
-#endif
-
-    vec_t max_vec = vdupq(max_val);
-    vec_t min_vec = vdupq(min_val);
-    vec_t data_vec = {};
-    cmpvec_t cmp_vec = {};
-    T const * ret = NULL;
-
-    T const * ptr = start;
-    for (; ptr <= end - N_lane; ptr += N_lane)
-    {
-        data_vec = vld1q(ptr);
-        cmp_vec = (cmpvec_t)vcgeq(data_vec, max_vec);
-        if (vgetq<0>(cmp_vec) ||
-            vgetq<1>(cmp_vec))
-        {
-            goto OUT_OF_RANGE;
-        }
-
-        cmp_vec = (cmpvec_t)vcltq(data_vec, min_vec);
-        if (vgetq<0>(cmp_vec) ||
-            vgetq<1>(cmp_vec))
-        {
-            goto OUT_OF_RANGE;
-        }
-    }
-
-    if (ptr != end)
-    {
-        ret = generic::check_between(ptr, end, min_val, max_val);
-    }
-
-    return ret;
-
-OUT_OF_RANGE:
-    T cmp_val[N_lane] = {};
-    T * cmp = cmp_val;
-    vst1q(cmp_val, cmp_vec);
-
-    for (size_t i = 0; i < N_lane; ++i, ++cmp)
-    {
-        if (*cmp)
-        {
-            return ptr + i;
-        }
-    }
-    return ptr;
-}
+    template <typename V>
+    static auto operator()(V a, V b) -> decltype(vsubq(a, b)) { return vsubq(a, b); }
+};
+struct vec_mul
+{
+    template <typename V>
+    static auto operator()(V a, V b) -> decltype(vmulq(a, b)) { return vmulq(a, b); }
+};
+struct vec_div
+{
+    template <typename V>
+    static auto operator()(V a, V b) -> decltype(vdivq(a, b)) { return vdivq(a, b); }
+};
 
-template <typename T>
-void add(T * dest, T const * dest_end, T const * src1, T const * src2)
+template <typename T, std::invocable<T, T> ScalarOp, typename VecOp>
+void transform_binary(T * dest, T const * dest_end, T const * src1, T const * src2, ScalarOp scalar_op, VecOp vec_op)
 {
-    if constexpr (!(type::has_vectype<T>))
+    if constexpr (!type::has_vectype<T>)
     {
-        return generic::add(dest, dest_end, src1, src2);
+        generic::transform_binary(dest, dest_end, src1, src2, scalar_op);
     }
     else
     {
         using vec_t = type::vector_t<T>;
-        constexpr size_t N_lane = type::vector_lane<T>;
+        if constexpr (!std::invocable<VecOp, vec_t, vec_t>)
+        {
+            generic::transform_binary(dest, dest_end, src1, src2, scalar_op);
+        }
+        else
+        {
+            constexpr size_t N_lane = type::vector_lane<T>;
 
 #ifndef NDEBUG
-        constexpr size_t alignment = detail::get_recommended_alignment<T>();
-        detail::check_alignment(dest, alignment, "add dest");
-        detail::check_alignment(src1, alignment, "add src1");
-        detail::check_alignment(src2, alignment, "add src2");
+            constexpr size_t alignment = detail::get_recommended_alignment<T>();
+            detail::check_alignment(dest, alignment, "transform_binary dest");
+            detail::check_alignment(src1, alignment, "transform_binary src1");
+            detail::check_alignment(src2, alignment, "transform_binary src2");
 #endif
 
-        vec_t src1_vec;
-        vec_t src2_vec;
-        vec_t res_vec;
-        T * ptr = dest;
-        for (; ptr <= dest_end - N_lane; ptr += N_lane, src1 += N_lane, src2 += N_lane)
-        {
-            src1_vec = vld1q(src1);
-            src2_vec = vld1q(src2);
-            res_vec = vaddq(src1_vec, src2_vec);
-            vst1q(ptr, res_vec);
-        }
-        if (ptr != dest_end)
-        {
-            generic::add(ptr, dest_end, src1, src2);
+            // Compare on remaining length, not `dest_end - N_lane`: the latter
+            // forms a pointer before the buffer (UB) on sub-lane inputs.
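+            // (With N_lane = 4 and a 3-element buffer, `dest_end - N_lane`
+            // would point one element before `dest`.)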
+            T * ptr = dest;
+            while (static_cast<size_t>(dest_end - ptr) >= N_lane)
+            {
+                vec_t v1 = vld1q(src1);
+                vec_t v2 = vld1q(src2);
+                vst1q(ptr, vec_op(v1, v2));
+                ptr += N_lane;
+                src1 += N_lane;
+                src2 += N_lane;
+            }
+            while (ptr < dest_end)
+            {
+                *ptr = scalar_op(*src1, *src2);
+                ++ptr;
+                ++src1;
+                ++src2;
+            }
         }
     }
 }
 
 template <typename T>
-void sub(T * dest, T const * dest_end, T const * src1, T const * src2)
+inline void add(T * dest, T const * dest_end, T const * src1, T const * src2)
 {
-    if constexpr (!(type::has_vectype<T>))
-    {
-        return generic::sub(dest, dest_end, src1, src2);
-    }
-    else
-    {
-        using vec_t = type::vector_t<T>;
-        constexpr size_t N_lane = type::vector_lane<T>;
-
-#ifndef NDEBUG
-        constexpr size_t alignment = detail::get_recommended_alignment<T>();
-        detail::check_alignment(dest, alignment, "sub dest");
-        detail::check_alignment(src1, alignment, "sub src1");
-        detail::check_alignment(src2, alignment, "sub src2");
-#endif
-
-        vec_t src1_vec;
-        vec_t src2_vec;
-        vec_t res_vec;
-        T * ptr = dest;
-        for (; ptr <= dest_end - N_lane; ptr += N_lane, src1 += N_lane, src2 += N_lane)
-        {
-            src1_vec = vld1q(src1);
-            src2_vec = vld1q(src2);
-            res_vec = vsubq(src1_vec, src2_vec);
-            vst1q(ptr, res_vec);
-        }
-        if (ptr != dest_end)
-        {
-            generic::sub(ptr, dest_end, src1, src2);
-        }
-    }
+    transform_binary(dest, dest_end, src1, src2, std::plus{}, vec_add{});
 }
 
 template <typename T>
-void mul(T * dest, T const * dest_end, T const * src1, T const * src2)
+inline void sub(T * dest, T const * dest_end, T const * src1, T const * src2)
 {
-    if constexpr (!((type::vector_lane<T> > 2)))
-    {
-        return generic::mul(dest, dest_end, src1, src2);
-    }
-    else
-    {
-        using vec_t = type::vector_t<T>;
-        constexpr size_t N_lane = type::vector_lane<T>;
+    transform_binary(dest, dest_end, src1, src2, std::minus{}, vec_sub{});
+}
 
-#ifndef NDEBUG
-        constexpr size_t alignment = detail::get_recommended_alignment<T>();
-        detail::check_alignment(dest, alignment, "mul dest");
-        detail::check_alignment(src1, alignment, "mul src1");
-        detail::check_alignment(src2, alignment, "mul src2");
-#endif
+template <typename T>
+inline void mul(T * dest, T const * dest_end, T const * src1, T const * src2)
+{
+    transform_binary(dest, dest_end, src1, src2, std::multiplies{}, vec_mul{});
+}
 
-        vec_t src1_vec;
-        vec_t src2_vec;
-        vec_t res_vec;
-        T * ptr = dest;
-        for (; ptr <= dest_end - N_lane; ptr += N_lane, src1 += N_lane, src2 += N_lane)
-        {
-            src1_vec = vld1q(src1);
-            src2_vec = vld1q(src2);
-            res_vec = vmulq(src1_vec, src2_vec);
-            vst1q(ptr, res_vec);
-        }
-        if (ptr != dest_end)
-        {
-            generic::mul(ptr, dest_end, src1, src2);
-        }
-    }
+template <typename T>
+inline void div(T * dest, T const * dest_end, T const * src1, T const * src2)
+{
+    transform_binary(dest, dest_end, src1, src2, std::divides{}, vec_div{});
 }
 
 template <typename T>
-void div(T * dest, T const * dest_end, T const * src1, T const * src2)
+const T * check_between(T const * start, T const * end, T const & min_val, T const & max_val)
 {
-    if constexpr (!(std::is_floating_point_v<T>))
+    if constexpr (!type::has_vectype<T>)
     {
-        return generic::div(dest, dest_end, src1, src2);
+        return generic::check_between(start, end, min_val, max_val);
    }
     else
     {
         using vec_t = type::vector_t<T>;
+        using cmpvec_t = type::vector_t<uint64_t>;
         constexpr size_t N_lane = type::vector_lane<T>;
 
 #ifndef NDEBUG
         constexpr size_t alignment = detail::get_recommended_alignment<T>();
-        detail::check_alignment(dest, alignment, "div dest");
-        detail::check_alignment(src1, alignment, "div src1");
-        detail::check_alignment(src2, alignment, "div src2");
+        detail::check_alignment(start, alignment, "check_between start");
 #endif
 
-        vec_t src1_vec;
-        vec_t src2_vec;
-        vec_t res_vec;
-        T * ptr = dest;
-        for (; ptr <= dest_end - N_lane; ptr += N_lane, src1 += N_lane, src2 += N_lane)
+        vec_t max_vec = vdupq(max_val);
+        vec_t min_vec = vdupq(min_val);
+
+        // Vector loop runs while a full lane still fits. The remaining-count
+        // form keeps the condition valid for buffers shorter than one lane.
+        T const * ptr = start;
+        while (static_cast<size_t>(end - ptr) >= N_lane)
         {
-            src1_vec = vld1q(src1);
-            src2_vec = vld1q(src2);
-            res_vec = vdivq(src1_vec, src2_vec);
-            vst1q(ptr, res_vec);
+            vec_t data_vec = vld1q(ptr);
+
+            // Inspect both bounds in one pass so the lowest-index failing lane
+            // wins; callers report this pointer as the first out-of-range
+            // element.
+            cmpvec_t ge_vec = (cmpvec_t)vcgeq(data_vec, max_vec);
+            cmpvec_t lt_vec = (cmpvec_t)vcltq(data_vec, min_vec);
+            bool out_of_range = vgetq<0>(ge_vec) || vgetq<1>(ge_vec) || vgetq<0>(lt_vec) || vgetq<1>(lt_vec);
+
+            if (out_of_range)
+            {
+                T ge_val[N_lane] = {};
+                T lt_val[N_lane] = {};
+                vst1q(ge_val, ge_vec);
+                vst1q(lt_val, lt_vec);
+                for (size_t i = 0; i < N_lane; ++i)
+                {
+                    if (ge_val[i] || lt_val[i])
+                    {
+                        return ptr + i;
+                    }
+                }
+                return ptr;
+            }
+
+            ptr += N_lane;
         }
-        if (ptr != dest_end)
+
+        // Tail scalar loop for remaining elements.
+        for (; ptr < end; ++ptr)
         {
-            generic::div(ptr, dest_end, src1, src2);
+            if (*ptr < min_val || *ptr > max_val)
+            {
+                return ptr;
+            }
         }
+        return nullptr;
     }
 }
diff --git a/cpp/modmesh/simd/neon/neon_type.hpp b/cpp/modmesh/simd/neon/neon_type.hpp
index aca38ed53..285c3c82a 100644
--- a/cpp/modmesh/simd/neon/neon_type.hpp
+++ b/cpp/modmesh/simd/neon/neon_type.hpp
@@ -30,8 +30,8 @@
 
 #if defined(__aarch64__)
 
-#include
 #include
+#include
 
 namespace modmesh
 {
@@ -133,7 +133,7 @@ template <typename T>
 inline constexpr size_t vector_lane = detail::vector<T>::N_lane;
 
 template <typename T>
-inline constexpr size_t has_vectype = detail::vector<T>::N_lane > 0;
+inline constexpr bool has_vectype = detail::vector<T>::N_lane > 0;
 
 } /* namespace type */
diff --git a/cpp/modmesh/simd/simd_generic.hpp b/cpp/modmesh/simd/simd_generic.hpp
index fafb3a2de..e1f6db139 100644
--- a/cpp/modmesh/simd/simd_generic.hpp
+++ b/cpp/modmesh/simd/simd_generic.hpp
@@ -28,6 +28,9 @@
  * POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include <concepts>
+#include <functional>
+
 namespace modmesh
 {
@@ -40,26 +43,23 @@ namespace generic
 
 template <typename T>
 const T * check_between(T const * start, T const * end, T const & min_val, T const & max_val)
 {
-    T const * ptr = start;
-    while (ptr < end)
+    for (T const * ptr = start; ptr < end; ++ptr)
     {
-        T idx = *ptr;
-        if (idx < min_val || idx > max_val)
+        if (*ptr < min_val || *ptr > max_val)
         {
             return ptr;
         }
-        ++ptr;
     }
     return nullptr;
 }
 
-template <typename T>
-void add(T * dest, T const * dest_end, T const * src1, T const * src2)
+template <typename T, std::invocable<T, T> ScalarOp>
+inline void transform_binary(T * dest, T const * dest_end, T const * src1, T const * src2, ScalarOp scalar_op)
 {
     T * ptr = dest;
     while (ptr < dest_end)
     {
-        *ptr = *src1 + *src2;
+        *ptr = scalar_op(*src1, *src2);
         ++ptr;
         ++src1;
         ++src2;
@@ -67,42 +67,27 @@ void add(T * dest, T const * dest_end, T const * src1, T const * src2)
 }
 
 template <typename T>
-void sub(T * dest, T const * dest_end, T const * src1, T const * src2)
+inline void add(T * dest, T const * dest_end, T const * src1, T const * src2)
 {
-    T * ptr = dest;
-    while (ptr < dest_end)
-    {
-        *ptr = *src1 - *src2;
-        ++ptr;
-        ++src1;
-        ++src2;
-    }
+    transform_binary(dest, dest_end, src1, src2, std::plus{});
 }
 
 template <typename T>
-void mul(T * dest, T const * dest_end, T const * src1, T const * src2)
+inline void sub(T * dest, T const * dest_end, T const * src1, T const * src2)
 {
-    T * ptr = dest;
-    while (ptr < dest_end)
-    {
-        *ptr = *src1 * *src2;
-        ++ptr;
-        ++src1;
-        ++src2;
-    }
+    transform_binary(dest, dest_end, src1, src2, std::minus{});
+}
+
+template <typename T>
+inline void mul(T * dest, T const * dest_end, T const * src1, T const * src2)
+{
+    transform_binary(dest, dest_end, src1, src2, std::multiplies{});
+}
 
 template <typename T>
-void div(T * dest, T const * dest_end, T const * src1, T const * src2)
+inline void div(T * dest, T const * dest_end, T const * src1, T const * src2)
 {
-    T * ptr = dest;
-    while (ptr < dest_end)
-    {
-        *ptr = *src1 / *src2;
-        ++ptr;
-        ++src1;
-        ++src2;
-    }
+    transform_binary(dest, dest_end, src1, src2, std::divides{});
 }
 
 template <typename T>
diff --git a/tests/test_simd.py b/tests/test_simd.py
new file mode 100644
index 000000000..cbce7e1ba
--- /dev/null
+++ b/tests/test_simd.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2026, An-Chi Liu
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# - Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+import platform
+import unittest
+
+import numpy as np
+
+import modmesh
+
+
+class SimdDispatchTC(unittest.TestCase):
+    # Without this guard, missing NEON dispatch on aarch64 would silently route
+    # every SIMD operation to the scalar path -- correctness tests would still
+    # pass and the regression would be invisible.
+    def test_neon_active_on_aarch64(self):
+        # _simd_feature is intentionally private: the underlying detector only
+        # reflects the dispatched backend on aarch64 today, so it is reached
+        # through the C++ module rather than the public modmesh namespace.
+        feature = modmesh.core._impl._simd_feature()
+        if platform.machine() in ("arm64", "aarch64"):
+            self.assertEqual(feature, "NEON")
+        else:
+            self.skipTest("_simd_feature() = " + feature)
+
+
+class SimdTransformBinaryTC(unittest.TestCase):
+    # Each n targets a distinct SIMD code path (int32: 4 lanes per block):
+    #   n=1,3 -- below one lane width: pure scalar path, no vector block
+    #   n=4   -- exactly one block: no scalar tail
+    #   n=5   -- one block + 1-element tail
+    #   n=8   -- two blocks: no scalar tail
+    #   n=17  -- four blocks + 1-element tail: multi-block with remainder
+    # n=0 is omitted because SimpleArray does not accept zero-length shapes.
+    def test_add_int32_covers_all_shapes(self):
+        for n in (1, 3, 4, 5, 8, 17):
+            a_vals = np.arange(n, dtype=np.int32)
+            b_vals = np.array([2 * i + 1 for i in range(n)], dtype=np.int32)
+            a = modmesh.SimpleArrayInt32(array=a_vals)
+            b = modmesh.SimpleArrayInt32(array=b_vals)
+            out = a.add_simd(b)
+            for i in range(n):
+                self.assertEqual(
+                    out[i], int(a_vals[i]) + int(b_vals[i]),
+                    msg="n=%d i=%d" % (n, i))
+
+    def test_sub_mul_div_float(self):
+        # one NEON float lane (4) + 3-element tail
+        n = 7
+        a_vals = np.array([float(i + 10) for i in range(n)], dtype=np.float32)
+        b_vals = np.array([float(i + 1) for i in range(n)], dtype=np.float32)
+        a = modmesh.SimpleArrayFloat32(array=a_vals)
+        b = modmesh.SimpleArrayFloat32(array=b_vals)
+
+        sub_out = a.sub_simd(b)
+        mul_out = a.mul_simd(b)
+        div_out = a.div_simd(b)
+        for i in range(n):
+            self.assertAlmostEqual(sub_out[i], a_vals[i] - b_vals[i], places=6)
+            self.assertAlmostEqual(mul_out[i], a_vals[i] * b_vals[i], places=6)
+            self.assertAlmostEqual(div_out[i], a_vals[i] / b_vals[i], places=6)
+
+    # vmulq has no int64 overload; SFINAE in the NEON path must route int64
+    # multiply to the scalar generic implementation.
+    def test_int64_mul_falls_back_to_generic(self):
+        a = modmesh.SimpleArrayInt64(
+            array=np.array([1, 2, 3, 4, 5], dtype=np.int64))
+        b = modmesh.SimpleArrayInt64(
+            array=np.array([10, 20, 30, 40, 50], dtype=np.int64))
+        out = a.mul_simd(b)
+        expected = [10, 40, 90, 160, 250]
+        for i, want in enumerate(expected):
+            self.assertEqual(out[i], want)
+
+# vim: set ff=unix fenc=utf8 et sw=4 ts=4 sts=4:
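
Reviewer note, not part of the patch: the dispatch in transform_binary hinges
on expression SFINAE plus std::invocable. The sketch below reproduces the idea
in isolation, using hypothetical stand-in types (float32x4_stub, int64x2_stub,
vmulq_stub) instead of the real NEON vectors and intrinsics, so it compiles
with -std=c++20 on any host.

#include <concepts>
#include <cstdint>
#include <iostream>

struct float32x4_stub { float v[4]; };       // pretend vmulq exists for this
struct int64x2_stub { std::int64_t v[2]; };  // ... but not for this one

float32x4_stub vmulq_stub(float32x4_stub a, float32x4_stub b)
{
    float32x4_stub r{};
    for (int i = 0; i < 4; ++i) { r.v[i] = a.v[i] * b.v[i]; }
    return r;
}

struct vec_mul_stub
{
    // Expression SFINAE: the call operator exists only when
    // vmulq_stub(a, b) is well-formed for the vector type V.
    template <typename V>
    auto operator()(V a, V b) const -> decltype(vmulq_stub(a, b))
    {
        return vmulq_stub(a, b);
    }
};

int main()
{
    // The same queries transform_binary evaluates with if constexpr:
    std::cout << std::boolalpha
              << std::invocable<vec_mul_stub, float32x4_stub, float32x4_stub> << '\n' // true
              << std::invocable<vec_mul_stub, int64x2_stub, int64x2_stub> << '\n'; // false
}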