diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
index 1a6098d..67000a8 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
+++ b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
@@ -228,6 +228,63 @@ public:
    }
 
    std::vector<std::string> GetBlasRoutines() override { return { std::string("Copy"), std::string("Axpy") }; }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ BATCHNORM_KERNEL_ALPAKA\n";
+      op += "struct BatchNormKernel {\n";
+      op += SP + "template <typename TAcc, typename T>\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + "   TAcc const& acc,\n";
+      op += SP + "   T const* __restrict__ x, T* __restrict__ y,\n";
+      op += SP + "   T const* __restrict__ mean, T const* __restrict__ scale,\n";
+      op += SP + "   T const* __restrict__ bias, std::size_t n) const {\n";
+      op += SP + SP + "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + "if (idx < n) {\n";
+      op += SP + SP + SP + "y[idx] = (x[idx] - mean[idx]) * scale[idx] + bias[idx];\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return "BatchNormKernel batchNormKernel;\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShapeX.empty()) {
+         throw std::runtime_error(
+            "TMVA SOFIE BatchNormalization called to Generate_GPU_ALPAKA without being initialized first");
+      }
+
+      std::stringstream out;
+      size_t n = ConvertShapeToLength(fShapeX);
+
+      out << "\n//------ BATCHNORM_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNX << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNX << " = Vec::all(Idx{" << n << "});\n";
+      out << SP << "alpaka::KernelCfg<Acc> const kernelCfg_" << fNX
+          << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n";
+      out << SP << "auto const workDiv_" << fNX
+          << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX
+          << ", devAcc, batchNormKernel,"
+          << " alpaka::getPtrNative(deviceBuf_" << fNX << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNY << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNMean << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNScale << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNB << "),"
+          << " static_cast<std::size_t>(" << n << "));\n";
+      out << SP << "alpaka::exec<Acc>(queue, workDiv_" << fNX << ", batchNormKernel,"
+          << " alpaka::getPtrNative(deviceBuf_" << fNX << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNY << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNMean << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNScale << "),"
+          << " alpaka::getPtrNative(deviceBuf_" << fNB << "),"
+          << " static_cast<std::size_t>(" << n << "));\n";
+      return out.str();
+   }
 };
 
 }//SOFIE
diff --git a/src/SOFIE_core/test/BatchNormModelGenerator.py b/src/SOFIE_core/test/BatchNormModelGenerator.py
new file mode 100644
index 0000000..91bfa9a
--- /dev/null
+++ b/src/SOFIE_core/test/BatchNormModelGenerator.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""Generate BatchNorm.onnx and print reference output values for BatchNorm.ref.hxx.
+
+Model: nn.BatchNorm2d(2) in eval mode, input shape (1, 2, 2, 2).
+"""
+
+import torch
+import torch.nn as nn
+
+
+def main():
+    bn = nn.BatchNorm2d(2)
+    bn.eval()
+
+    bn.weight.data = torch.tensor([1.0, 2.0])  # scale per channel
+    bn.bias.data = torch.tensor([0.0, 0.5])  # bias per channel
+    bn.running_mean = torch.tensor([0.5, 3.0])
+    bn.running_var = torch.tensor([1.0, 4.0])
+
+    # Input: batch=1, C=2, H=2, W=2
+    x = torch.tensor([[[[1., 2.], [3., 4.]],
+                       [[5., 6.], [7., 8.]]]])
+
+    with torch.no_grad():
+        y = bn(x)
+
+    flat = y.flatten().tolist()
+    print("Reference output (8 floats):")
+    print(", ".join(f"{v:.6f}f" for v in flat))
+
+    torch.onnx.export(
+        bn,
+        x,
+        "BatchNorm.onnx",
+        opset_version=13,
+        input_names=["X"],
+        output_names=["Y"],
+    )
+    print("Exported BatchNorm.onnx")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
index 1303251..853a5a6 100644
--- a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
+++ b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -52,6 +52,9 @@
 #include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx"
 #include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx"
 
+#include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/BatchNorm.ref.hxx"
+
 #include <cmath>
 #include <cstddef>
 #include <vector>
@@ -1094,3 +1097,41 @@ TEST_F(SofieAlpakaTest, GatherND_Batch)
     for (size_t i = 0; i < expected.size(); ++i)
         EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
 }
+
+TEST_F(SofieAlpakaTest, BatchNorm)
+{
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr size_t INPUT_SIZE = 8; // (1, 2, 2, 2)
+    constexpr size_t OUTPUT_SIZE = 8;
+
+    // scale=[1,2], bias=[0,0.5], mean=[0.5,3], var=[1,4]
+    std::vector<float> input({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f });
+
+    auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{INPUT_SIZE}));
+    float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+    for (Idx i = 0; i < INPUT_SIZE; ++i) A_ptr[i] = input[i];
+
+    auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{INPUT_SIZE}));
+    alpaka::memcpy(queue, A_d, A);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{OUTPUT_SIZE}));
+
+    {
+        SOFIE_BatchNorm::Session session("BatchNorm_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(A_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+    float *correct = BatchNorm_ExpectedOutput::outputs;
+
+    for (size_t i = 0; i < OUTPUT_SIZE; ++i) {
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE)
+            << "Mismatch at index " << i
+            << ": got " << res_ptr[i] << ", expected " << correct[i];
+    }
+}
diff --git a/src/SOFIE_core/test/input_models/BatchNorm.onnx b/src/SOFIE_core/test/input_models/BatchNorm.onnx
new file mode 100644
index 0000000..0f8a659
Binary files /dev/null and b/src/SOFIE_core/test/input_models/BatchNorm.onnx differ
diff --git a/src/SOFIE_core/test/input_models/references/BatchNorm.ref.hxx b/src/SOFIE_core/test/input_models/references/BatchNorm.ref.hxx
new file mode 100644
index 0000000..212bce9
--- /dev/null
+++ b/src/SOFIE_core/test/input_models/references/BatchNorm.ref.hxx
@@ -0,0 +1,7 @@
+namespace BatchNorm_ExpectedOutput {
+    // Input shape (1,2,2,2), scale=[1,2], bias=[0,0.5], mean=[0.5,3], var=[1,4]
+    float outputs[] = {
+        0.499997f, 1.499992f, 2.499988f, 3.499982f,
+        2.499998f, 3.499996f, 4.499995f, 5.499994f
+    };
+} // namespace BatchNorm_ExpectedOutput