Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,63 @@ public:
}

std::vector<std::string> GetBlasRoutines() override { return { std::string("Copy"), std::string("Axpy") }; }

std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
   // Emits the device-side ALPAKA functor for BatchNormalization (inference form):
   //   y[i] = (x[i] - mean[i]) * scale[i] + bias[i]
   // NOTE(review): the kernel indexes mean/scale/bias per element, so it assumes
   // those tensors were broadcast to the full input length (and that scale already
   // folds in 1/sqrt(var + eps)) before launch — confirm against session init.
   std::stringstream kernel;
   kernel << "\n//------ BATCHNORM_KERNEL_ALPAKA\n";
   kernel << "struct BatchNormKernel {\n";
   kernel << SP << "template<typename TAcc, typename T>\n";
   kernel << SP << "ALPAKA_FN_ACC void operator()(\n";
   kernel << SP << " TAcc const& acc,\n";
   kernel << SP << " T const* __restrict__ x, T* __restrict__ y,\n";
   kernel << SP << " T const* __restrict__ mean, T const* __restrict__ scale,\n";
   kernel << SP << " T const* __restrict__ bias, std::size_t n) const {\n";
   kernel << SP << SP << "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
   kernel << SP << SP << "if (idx < n) {\n";
   kernel << SP << SP << SP << "y[idx] = (x[idx] - mean[idx]) * scale[idx] + bias[idx];\n";
   kernel << SP << SP << "}\n";
   kernel << SP << "}\n";
   kernel << "};\n";
   return kernel.str();
}

std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
   // Declares the single functor instance that every generated BatchNorm launch reuses.
   std::string definition{"BatchNormKernel batchNormKernel;\n"};
   return definition;
}

std::string Generate_GPU_ALPAKA(std::string OpName) override {
   // Emits the host-side ALPAKA code that configures the work division and
   // launches BatchNormKernel over the flattened input tensor.
   OpName = "op_" + OpName;
   if (fShapeX.empty()) {
      throw std::runtime_error(
         "TMVA SOFIE BatchNormalization called to Generate_GPU_ALPAKA without being initialized first");
   }

   std::stringstream out;
   size_t n = ConvertShapeToLength(fShapeX);

   out << "\n//------ BATCHNORM_GPU_ALPAKA\n";
   // Suffix the generated locals with the unique operator name (OpName) rather than
   // the input tensor name: two operators consuming the same tensor would otherwise
   // redefine elementsPerThread_<tensor> etc. in the same generated scope. This also
   // makes the "op_" prefix computed above actually used.
   out << SP << "auto const elementsPerThread_" << OpName << " = Vec::all(static_cast<Idx>(1));\n";
   out << SP << "auto const elementsPerGrid_" << OpName << " = Vec::all(Idx{" << n << "});\n";
   out << SP << "alpaka::KernelCfg<Acc> const kernelCfg_" << OpName
       << " = {elementsPerGrid_" << OpName << ", elementsPerThread_" << OpName << "};\n";
   out << SP << "auto const workDiv_" << OpName
       << " = alpaka::getValidWorkDiv(kernelCfg_" << OpName
       << ", devAcc, batchNormKernel,"
       << " alpaka::getPtrNative(deviceBuf_" << fNX << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNY << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNMean << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNScale << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNB << "),"
       << " static_cast<Idx>(" << n << "));\n";
   out << SP << "alpaka::exec<Acc>(queue, workDiv_" << OpName << ", batchNormKernel,"
       << " alpaka::getPtrNative(deviceBuf_" << fNX << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNY << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNMean << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNScale << "),"
       << " alpaka::getPtrNative(deviceBuf_" << fNB << "),"
       << " static_cast<Idx>(" << n << "));\n";
   return out.str();
}
};

}//SOFIE
Expand Down
43 changes: 43 additions & 0 deletions src/SOFIE_core/test/BatchNormModelGenerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""Generate BatchNorm.onnx and print reference output values for BatchNorm.ref.hxx.

Model: nn.BatchNorm2d(2) in eval mode, input shape (1, 2, 2, 2).
"""

Comment on lines +2 to +6
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How will this file be executed? Also, since you provide the .onnx file anyway, how will this script be useful?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for reviewing.
This file is standalone; I included it for reproducibility, so that the model and the reference values can be regenerated and verified independently. Running `python3 BatchNormModelGenerator.py` produces the .onnx file and prints the reference values.

import torch
import torch.nn as nn


def main():
    """Build an eval-mode BatchNorm2d(2) model, print the reference outputs
    used by BatchNorm.ref.hxx, and export the model to BatchNorm.onnx."""
    model = nn.BatchNorm2d(2)
    model.eval()

    # Per-channel affine parameters and frozen running statistics.
    model.weight.data = torch.tensor([1.0, 2.0])  # scale per channel
    model.bias.data = torch.tensor([0.0, 0.5])    # bias per channel
    model.running_mean = torch.tensor([0.5, 3.0])
    model.running_var = torch.tensor([1.0, 4.0])

    # Input: batch=1, C=2, H=2, W=2
    sample = torch.tensor([[[[1., 2.], [3., 4.]],
                            [[5., 6.], [7., 8.]]]])

    with torch.no_grad():
        reference = model(sample)

    print("Reference output (8 floats):")
    print(", ".join(f"{v:.6f}f" for v in reference.flatten().tolist()))

    torch.onnx.export(
        model,
        sample,
        "BatchNorm.onnx",
        opset_version=13,
        input_names=["X"],
        output_names=["Y"],
    )
    print("Exported BatchNorm.onnx")


# Standalone entry point: regenerates BatchNorm.onnx and prints the reference
# values that back test/input_models/references/BatchNorm.ref.hxx.
if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx"
#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx"

#include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
#include "input_models/references/BatchNorm.ref.hxx"

#include <alpaka/alpaka.hpp>
#include <cuda_runtime.h>
#include <nvml.h>
Expand Down Expand Up @@ -1094,3 +1097,41 @@ TEST_F(SofieAlpakaTest, GatherND_Batch)
for (size_t i = 0; i < expected.size(); ++i)
EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
}

// End-to-end test of the generated ALPAKA BatchNorm operator on the CUDA backend:
// stages a fixed (1,2,2,2) input on the host, copies it to the device, runs the
// generated session, copies the result back, and compares it element-wise against
// the PyTorch-generated reference values from BatchNorm.ref.hxx.
TEST_F(SofieAlpakaTest, BatchNorm)
{
   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
   constexpr size_t INPUT_SIZE = 8; // (1, 2, 2, 2)
   constexpr size_t OUTPUT_SIZE = 8;

   // Model constants baked into the .dat weights:
   // scale=[1,2], bias=[0,0.5], mean=[0.5,3], var=[1,4]
   std::vector<float> input({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f });

   // Stage the input in a pinned/host buffer first ...
   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{INPUT_SIZE}));
   float *A_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(A));
   for (Idx i = 0; i < INPUT_SIZE; ++i) A_ptr[i] = input[i];

   // ... then transfer it to the device before inference.
   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{INPUT_SIZE}));
   alpaka::memcpy(queue, A_d, A);
   alpaka::wait(queue);

   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{OUTPUT_SIZE}));

   // Scope the session so its device-side buffers are released before the checks.
   {
      SOFIE_BatchNorm::Session<alpaka::TagGpuCudaRt> session("BatchNorm_FromONNX_GPU_ALPAKA.dat");
      auto result = session.infer(A_d);
      alpaka::wait(queue);
      cudaDeviceSynchronize(); // NOTE(review): likely redundant after alpaka::wait(queue) — confirm
      alpaka::memcpy(queue, result_h, result);
      alpaka::wait(queue);
   }

   float *res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
   float *correct = BatchNorm_ExpectedOutput::outputs;

   // Element-wise comparison against the PyTorch reference within tolerance.
   for (size_t i = 0; i < OUTPUT_SIZE; ++i) {
      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE)
         << "Mismatch at index " << i
         << ": got " << res_ptr[i] << ", expected " << correct[i];
   }
}
Binary file added src/SOFIE_core/test/input_models/BatchNorm.onnx
Binary file not shown.
7 changes: 7 additions & 0 deletions src/SOFIE_core/test/input_models/references/BatchNorm.ref.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Reference output for the SOFIE ALPAKA BatchNorm test, generated by
// test/BatchNormModelGenerator.py (PyTorch BatchNorm2d in eval mode).
namespace BatchNorm_ExpectedOutput {
// Input shape (1,2,2,2), scale=[1,2], bias=[0,0.5], mean=[0.5,3], var=[1,4]
// C++17 inline variable: without `inline`, including this header from more
// than one translation unit defines `outputs` twice and violates the ODR.
// Kept non-const because the existing test binds it to a non-const float*.
inline float outputs[] = {
    0.499997f, 1.499992f, 2.499988f, 3.499982f,
    2.499998f, 3.499996f, 4.499995f, 5.499994f
};
} // namespace BatchNorm_ExpectedOutput