diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 708479f79..f3e44fd1e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,6 +97,54 @@ jobs: name: katago-macos-opencl path: cpp/katago + build-macos-metal: + runs-on: macos-latest + permissions: + contents: read + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + brew install ninja zlib libzip protobuf abseil + + - name: Cache CMake build + uses: actions/cache@v4 + with: + path: | + cpp/CMakeCache.txt + cpp/CMakeFiles + cpp/build.ninja + cpp/.ninja_deps + cpp/.ninja_log + key: ${{ runner.os }}-cmake-metal-${{ hashFiles('**/CMakeLists.txt') }} + restore-keys: | + ${{ runner.os }}-cmake-metal- + + - name: Configure CMake + working-directory: cpp + run: | + cmake . -G Ninja -DUSE_BACKEND=METAL -DCMAKE_BUILD_TYPE=Release + + - name: Build + working-directory: cpp + run: | + ninja + + - name: Run tests + working-directory: cpp + run: | + ./katago runtests + + - name: Upload artifact + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + uses: actions/upload-artifact@v4 + with: + name: katago-macos-metal + path: cpp/katago + build-windows: runs-on: windows-latest permissions: diff --git a/.gitignore b/.gitignore index 2e933d553..5e8ac9f95 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,4 @@ cpp/.ninja_log cpp/build.ninja cpp/KataGoSwift.* cpp/include/KataGoSwift/KataGoSwift-swift.h +cpp/external/katagocoreml/proto/ diff --git a/Compiling.md b/Compiling.md index 648fea548..a03302281 100644 --- a/Compiling.md +++ b/Compiling.md @@ -118,7 +118,7 @@ As also mentioned in the instructions below but repeated here for visibility, if * If using OpenCL, you will want to verify that KataGo is picking up the correct device (e.g. 
some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`). ## MacOS - * TLDR: + * TLDR (Metal backend - recommended for most users, hybrid CPU+GPU+Neural Engine for maximum throughput): ``` git clone https://github.com/lightvector/KataGo.git cd KataGo/cpp @@ -132,6 +132,7 @@ As also mentioned in the instructions below but repeated here for visibility, if * CMake with a minimum version of 3.18.2: `brew install cmake`. * AppleClang and Swift compilers: `xcode-select --install`. * If using the Metal backend, [Ninja](https://ninja-build.org): `brew install ninja` + * If using the Metal backend, protobuf and abseil: `brew install protobuf abseil` * libzip: `brew install libzip`. * If you want to do self-play training and research, probably Google perftools `brew install gperftools` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine. * If compiling to contribute to public distributed training runs, OpenSSL is required (`brew install openssl`). 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 254d23233..c11628c33 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ endif() set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training") set(USE_BACKEND CACHE STRING "Neural net backend") string(TOUPPER "${USE_BACKEND}" USE_BACKEND) -set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN) +set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN METAL) set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc") set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe") @@ -97,7 +97,7 @@ elseif(USE_BACKEND STREQUAL "TENSORRT") message(FATAL_ERROR "Combining USE_CACHE_TENSORRT_PLAN with BUILD_DISTRIBUTED is not supported - it would consume excessive disk space and might worsen performance every time models are updated. Use only one at a time in a given build of KataGo.") endif() elseif(USE_BACKEND STREQUAL "METAL") - message(STATUS "-DUSE_BACKEND=METAL, using Metal backend.") + message(STATUS "-DUSE_BACKEND=METAL, using Metal backend with hybrid MPSGraph + CoreML execution.") if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja") message(FATAL_ERROR "Bidirectional C++ Interop requires Ninja generator. Have ${CMAKE_GENERATOR}") endif() @@ -107,6 +107,7 @@ elseif(USE_BACKEND STREQUAL "METAL") if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") message(FATAL_ERROR "Project requires building with AppleClang. 
Have ${CMAKE_CXX_COMPILER_ID}") endif() + add_subdirectory(external/katagocoreml) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules") include(InitializeSwift) include(AddSwift) @@ -115,11 +116,11 @@ elseif(USE_BACKEND STREQUAL "METAL") neuralnet/metalbackend.cpp ) add_library(KataGoSwift STATIC - neuralnet/metalbackend.swift) + neuralnet/metalbackend.swift + neuralnet/metallayers.swift) _swift_generate_cxx_header( KataGoSwift - "${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h" - SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/neuralnet/metalbackend.swift") + "${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h") target_include_directories(KataGoSwift PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include") set_target_properties(KataGoSwift PROPERTIES Swift_MODULE_NAME "KataGoSwift") target_compile_options(KataGoSwift PUBLIC @@ -399,9 +400,12 @@ elseif(USE_BACKEND STREQUAL "TENSORRT") target_link_libraries(katago CUDA::cudart_static ${TENSORRT_LIBRARY}) elseif(USE_BACKEND STREQUAL "METAL") target_compile_definitions(katago PRIVATE USE_METAL_BACKEND) - target_link_libraries(katago KataGoSwift) + target_link_libraries(katago KataGoSwift katagocoreml + ${KATAGOCOREML_DEP_LDFLAGS} + "-framework MetalPerformanceShaders" + "-framework MetalPerformanceShadersGraph") if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") - message(WARNING "You are currently running cmake on an Intel-based processor. It is known that running KataGo in this configuration may encounter performance issues. It is recommended to switch to a cmake version designed for ARM64 architecture for optimal performance.") + message(WARNING "Metal backend may not work optimally on Intel. 
ARM64 architecture is recommended.") endif() elseif(USE_BACKEND STREQUAL "OPENCL") target_compile_definitions(katago PRIVATE USE_OPENCL_BACKEND) diff --git a/cpp/configs/analysis_example.cfg b/cpp/configs/analysis_example.cfg index edc5e8726..a94c1537f 100644 --- a/cpp/configs/analysis_example.cfg +++ b/cpp/configs/analysis_example.cfg @@ -224,15 +224,33 @@ nnRandomize = true # ------------------------------ # These only apply when using the METAL version of KataGo. -# For one Metal instance: KataGo will automatically use the default device. -# metalDeviceToUse = 0 - -# For two Metal instance: Uncomment these options, AND set numNNServerThreadsPerModel = 2 above. -# This will create two Metal instances, best overlapping the GPU and CPU execution. +# Metal backend dispatch is configured via numNNServerThreadsPerModel and metalDeviceToUseThread. +# Device index values: +# 0 = GPU only (MPSGraph) - default +# 100 = ANE only (CoreML, runs on CPU + Apple Neural Engine) +# +# Mux mode (recommended): 4 pipelined server threads (2x GPU + 2x ANE). +# Set nnMaxBatchSize to half of numSearchThreads for optimal pipelining. +# +# Example: mux mode (best throughput) +# numNNServerThreadsPerModel = 4 +# metalDeviceToUseThread0 = 0 +# metalDeviceToUseThread1 = 0 +# metalDeviceToUseThread2 = 100 +# metalDeviceToUseThread3 = 100 +# +# Example: GPU-only mode (default) +# numNNServerThreadsPerModel = 1 # metalDeviceToUseThread0 = 0 -# metalDeviceToUseThread1 = 1 +# +# Example: ANE-only mode +# numNNServerThreadsPerModel = 1 +# metalDeviceToUseThread0 = 100 +# +# Default (no config): 1 server thread, GPU-only mode (gpuIdx = 0). -# The pattern continues for additional Metal instances. +# FP16 precision (default true). Set to false for exact FP32 inference (slower). 
+# metalUseFP16 = true # OpenCL-specific GPU settings-------------------------------------- diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg index cfa720bf3..7247ed8b0 100644 --- a/cpp/configs/gtp_example.cfg +++ b/cpp/configs/gtp_example.cfg @@ -460,15 +460,33 @@ searchFactorWhenWinningThreshold = 0.95 # ------------------------------ # These only apply when using the METAL version of KataGo. -# For one Metal instance: KataGo will automatically use the default device. -# metalDeviceToUse = 0 - -# For two Metal instance: Uncomment these options, AND set numNNServerThreadsPerModel = 2 above. -# This will create two Metal instances, best overlapping the GPU and CPU execution. +# Metal backend dispatch is configured via numNNServerThreadsPerModel and metalDeviceToUseThread. +# Device index values: +# 0 = GPU only (MPSGraph) - default +# 100 = ANE only (CoreML, runs on CPU + Apple Neural Engine) +# +# Mux mode (recommended): 4 pipelined server threads (2x GPU + 2x ANE). +# Set nnMaxBatchSize to half of numSearchThreads for optimal pipelining. +# +# Example: mux mode (best throughput) +# numNNServerThreadsPerModel = 4 # metalDeviceToUseThread0 = 0 -# metalDeviceToUseThread1 = 1 +# metalDeviceToUseThread1 = 0 +# metalDeviceToUseThread2 = 100 +# metalDeviceToUseThread3 = 100 +# +# Example: GPU-only mode (default) +# numNNServerThreadsPerModel = 1 +# metalDeviceToUseThread0 = 0 +# +# Example: ANE-only mode +# numNNServerThreadsPerModel = 1 +# metalDeviceToUseThread0 = 100 +# +# Default (no config): 1 server thread, GPU-only mode (gpuIdx = 0). -# The pattern continues for additional Metal instances. +# FP16 precision (default true). Set to false for exact FP32 inference (slower). 
+# metalUseFP16 = true # ------------------------------ # OpenCL GPU settings diff --git a/cpp/external/katagocoreml/CMakeLists.txt b/cpp/external/katagocoreml/CMakeLists.txt new file mode 100644 index 000000000..a3cdc9e16 --- /dev/null +++ b/cpp/external/katagocoreml/CMakeLists.txt @@ -0,0 +1,132 @@ +# katagocoreml - KataGo to Core ML Converter (vendored) +# Simplified build for use as a subdirectory of KataGo. +# +# Note: We deliberately avoid linking against CMake's abseil/protobuf targets +# (find_package(absl)) because their INTERFACE_LINK_LIBRARIES propagate +# "-Wl,-framework,CoreFoundation" to the final executable, which swiftc +# (used as the linker for the katago Swift/C++ hybrid) does not understand. +# Instead, we use pkg-config for include/link flags, which produces +# swiftc-compatible flags like "-framework CoreFoundation". + +# ============================================================================ +# External Dependencies +# ============================================================================ + +find_package(ZLIB REQUIRED) +find_package(Protobuf REQUIRED) # Needed for protoc executable and include dirs +find_package(PkgConfig REQUIRED) +set(KATAGOCOREML_ABSEIL_MODULES + absl_base absl_log absl_log_internal_check_op absl_log_internal_message + absl_hash absl_strings absl_status absl_statusor +) +pkg_check_modules(KATAGOCOREML_ABSEIL REQUIRED ${KATAGOCOREML_ABSEIL_MODULES}) + +# Export link flags to parent scope for the final executable +pkg_check_modules(KATAGOCOREML_ALL_DEPS REQUIRED protobuf ${KATAGOCOREML_ABSEIL_MODULES}) +set(KATAGOCOREML_DEP_LDFLAGS ${KATAGOCOREML_ALL_DEPS_LDFLAGS} PARENT_SCOPE) + +# ============================================================================ +# Proto Files (compile from source) +# ============================================================================ + +set(COREMLTOOLS_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/vendor") +set(PROTO_DIR "${COREMLTOOLS_ROOT}/mlmodel/format") +set(PROTO_GENERATED_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/proto") +file(MAKE_DIRECTORY ${PROTO_GENERATED_DIR}) + +# Get all proto files (GLOB does not re-run on file additions; re-run cmake +# manually if proto files are added or removed). +file(GLOB PROTO_FILES "${PROTO_DIR}/*.proto") + +# Generate C++ from all proto files +set(PROTO_SRCS) +set(PROTO_HDRS) + +foreach(PROTO_FILE ${PROTO_FILES}) + get_filename_component(PROTO_NAME ${PROTO_FILE} NAME_WE) + set(PROTO_SRC "${PROTO_GENERATED_DIR}/${PROTO_NAME}.pb.cc") + set(PROTO_HDR "${PROTO_GENERATED_DIR}/${PROTO_NAME}.pb.h") + list(APPEND PROTO_SRCS ${PROTO_SRC}) + list(APPEND PROTO_HDRS ${PROTO_HDR}) + + add_custom_command( + OUTPUT ${PROTO_SRC} ${PROTO_HDR} + COMMAND ${Protobuf_PROTOC_EXECUTABLE} + ARGS --cpp_out=${PROTO_GENERATED_DIR} + -I${PROTO_DIR} + ${PROTO_FILE} + DEPENDS ${PROTO_FILE} + COMMENT "Generating C++ from ${PROTO_NAME}.proto" + VERBATIM + ) +endforeach() + +# ============================================================================ +# MILBlob Sources (vendored from coremltools) +# ============================================================================ + +set(MILBLOB_DIR "${COREMLTOOLS_ROOT}/mlmodel/src/MILBlob") +set(MILBLOB_SRCS + "${MILBLOB_DIR}/Blob/FileWriter.cpp" + "${MILBLOB_DIR}/Blob/StorageWriter.cpp" + "${MILBLOB_DIR}/Blob/StorageReader.cpp" + "${MILBLOB_DIR}/Blob/MMapFileReader.cpp" + "${MILBLOB_DIR}/Blob/MMapFileReaderFactory.cpp" + "${MILBLOB_DIR}/SubByteTypes.cpp" + "${MILBLOB_DIR}/Fp8.cpp" + "${MILBLOB_DIR}/Fp16.cpp" +) + +# ============================================================================ +# ModelPackage Sources (vendored from coremltools) +# ============================================================================ + +set(MODELPACKAGE_DIR "${COREMLTOOLS_ROOT}/modelpackage/src") +set(MODELPACKAGE_SRCS + "${MODELPACKAGE_DIR}/ModelPackage.cpp" + "${MODELPACKAGE_DIR}/utils/JsonMap.cpp" +) + +# ============================================================================ +# KataGoCoreML Library 
Sources +# ============================================================================ + +set(KATAGOCOREML_SRCS + src/parser/KataGoParser.cpp + src/builder/MILBuilder.cpp + src/builder/Operations.cpp + src/serializer/CoreMLSerializer.cpp + src/serializer/WeightSerializer.cpp + src/Converter.cpp +) + +# ============================================================================ +# Library Target +# ============================================================================ + +add_library(katagocoreml STATIC + ${KATAGOCOREML_SRCS} + ${PROTO_SRCS} + ${MILBLOB_SRCS} + ${MODELPACKAGE_SRCS} +) + +target_include_directories(katagocoreml + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${PROTO_GENERATED_DIR} + ${MILBLOB_DIR}/.. + ${MODELPACKAGE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../nlohmann_json + ${COREMLTOOLS_ROOT}/deps/FP16/include + ${Protobuf_INCLUDE_DIRS} + ${KATAGOCOREML_ABSEIL_INCLUDE_DIRS} +) + +# Only link ZLIB as a CMake target (no swiftc-incompatible flags). +# Protobuf and abseil are linked via pkg-config LDFLAGS in the parent. +target_link_libraries(katagocoreml PRIVATE ZLIB::ZLIB) + +target_compile_definitions(katagocoreml PRIVATE APPLE_BUILD=1) diff --git a/cpp/external/katagocoreml/LICENSE b/cpp/external/katagocoreml/LICENSE new file mode 100644 index 000000000..747217958 --- /dev/null +++ b/cpp/external/katagocoreml/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2025, Chin-Chang Yang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cpp/external/katagocoreml/NOTICE b/cpp/external/katagocoreml/NOTICE new file mode 100644 index 000000000..4fcb99679 --- /dev/null +++ b/cpp/external/katagocoreml/NOTICE @@ -0,0 +1,106 @@ +katagocoreml-cpp +Copyright (c) 2025, Chin-Chang Yang + +This project includes third-party software components with their own licenses: + +================================================================================ +Core ML Proto Definitions, MILBlob, and ModelPackage +From: Apple coremltools (https://github.com/apple/coremltools) +License: BSD-3-Clause +================================================================================ + +Copyright (c) 2017-2022, Apple Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder(s) nor the names of any contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +================================================================================ +KataGo Neural Network Models (tests/models/) +================================================================================ + +**g170 series models (g170-b6c96, g170e-b10c128):** +From: KataGo project (https://github.com/lightvector/KataGo) +License: CC0 (Public Domain) +These are from the oldest KataGo training runs and are released into the public +domain. No restrictions on use. + +**b5c192nbt-distilled model:** +Copyright (c) 2025, Chin-Chang Yang +License: BSD-3-Clause (same as this project) +This is a custom-trained model included for testing purposes. 
+ +================================================================================ +nlohmann/json +From: https://github.com/nlohmann/json +License: MIT +================================================================================ + +Copyright (c) 2013-2021 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +================================================================================ +FP16 +From: https://github.com/Maratyszcza/FP16 +License: MIT +================================================================================ + +Copyright (c) 2017 Facebook Inc. 
+Copyright (c) 2017 Georgia Institute of Technology +Copyright 2019 Google LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/cpp/external/katagocoreml/include/katagocoreml/KataGoConverter.hpp b/cpp/external/katagocoreml/include/katagocoreml/KataGoConverter.hpp new file mode 100644 index 000000000..ca89f3323 --- /dev/null +++ b/cpp/external/katagocoreml/include/katagocoreml/KataGoConverter.hpp @@ -0,0 +1,51 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "katagocoreml/Options.hpp" +#include + +namespace katagocoreml { + +/// Main converter class for KataGo to Core ML conversion +class KataGoConverter { +public: + /// Supported KataGo model versions + static constexpr int MIN_SUPPORTED_VERSION = 8; + static constexpr int MAX_SUPPORTED_VERSION = 16; + + /// Convert KataGo model file to Core ML mlpackage + /// + /// @param input_path Path to .bin or .bin.gz KataGo model file + /// @param output_path Path for output .mlpackage directory + /// @param options Conversion options + /// @throws std::runtime_error on conversion failure + static void convert( + const std::string& input_path, + const std::string& output_path, + const ConversionOptions& options = ConversionOptions{} + ); + + /// Get model information without full conversion + /// + /// @param input_path Path to .bin or .bin.gz KataGo model file + /// @return ModelInfo structure with model metadata + /// @throws std::runtime_error if file cannot be parsed + static ModelInfo getModelInfo(const std::string& input_path); + + /// Check if a model version is supported + /// + /// @param version KataGo model version number + /// @return true if version is supported + static bool isVersionSupported(int version) { + return version >= MIN_SUPPORTED_VERSION && version <= MAX_SUPPORTED_VERSION; + } + + /// Get library version string + static std::string getVersion() { + return "1.1.0"; + } +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/include/katagocoreml/Options.hpp 
b/cpp/external/katagocoreml/include/katagocoreml/Options.hpp new file mode 100644 index 000000000..11cfdb5e2 --- /dev/null +++ b/cpp/external/katagocoreml/include/katagocoreml/Options.hpp @@ -0,0 +1,117 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include + +namespace katagocoreml { + +/// Conversion options for KataGo to Core ML conversion +struct ConversionOptions { + /// Board width (default: 19) + int board_x_size = 19; + + /// Board height (default: 19) + int board_y_size = 19; + + /// Optimize for full board (skip mask operations) + /// Provides ~6.5% inference speedup but requires all positions valid + bool optimize_identity_mask = false; + + /// Compute precision: "FLOAT32" or "FLOAT16" + std::string compute_precision = "FLOAT32"; + + /// Use FLOAT16 for model inputs/outputs (instead of FLOAT32) + /// Only effective when compute_precision="FLOAT16" + /// When true with compute_precision="FLOAT16", creates pure FP16 model + /// When false (default), uses FP32 I/O with FP16 internal computation + /// Has no effect when compute_precision="FLOAT32" + bool use_fp16_io = false; + + /// Core ML specification version (default: 6 for iOS 15+) + int specification_version = 6; + + /// KataGo model version (set internally during conversion) + int model_version = 0; + + /// Metadata encoder version (0 = no encoder, >0 = has encoder) + int meta_encoder_version = 0; + + /// Number of metadata input channels (192 for human SL networks) + int num_input_meta_channels = 0; + + /// Number of spatial input channels (set internally, typically 22) + int num_input_channels = 0; + + /// Number of global input channels (set internally, typically 19) + int num_input_global_channels = 0; + + /// Minimum batch size for inference (must be >= 1) + /// Default: 1 (single sample inference) + int min_batch_size = 1; + + /// Maximum batch size for inference + /// If equal to min_batch_size, uses fixed batch size 
+ /// If greater than min_batch_size, enables dynamic batch support + /// If <= 0, allows unlimited batch size (unbounded) + /// Default: 1 (fixed single batch, backward compatible) + int max_batch_size = 1; + + /// Author name (who ran the converter) - optional, set via CLI + std::string author; + + /// License for the model - optional, set via CLI + /// Typical values: "MIT", "CC0", "BSD-3-Clause" + std::string license; + + /// Source KataGo model filename (set internally) + std::string source_filename; + + /// Number of residual blocks (set internally) + int num_blocks = 0; + + /// Trunk channel width (set internally) + int trunk_channels = 0; + + /// Model name from KataGo binary (set internally) + std::string model_name; + + /// Check if dynamic batch is enabled + /// Dynamic batch allows variable batch sizes at runtime + bool isDynamicBatch() const { + return min_batch_size != max_batch_size || max_batch_size <= 0; + } +}; + +/// Information about a KataGo model (without full conversion) +struct ModelInfo { + /// Model name from file header + std::string name; + + /// KataGo model version (8-16) + int version = 0; + + /// Number of spatial input channels (typically 22) + int num_input_channels = 0; + + /// Number of global input channels (typically 19) + int num_input_global_channels = 0; + + /// Number of residual blocks + int num_blocks = 0; + + /// Trunk channel width + int trunk_channels = 0; + + /// Whether model has SGF metadata encoder (human SL networks) + bool has_metadata_encoder = false; + + /// Number of policy output channels (1, 2, or 4 depending on version) + int num_policy_channels = 0; + + /// Number of score value channels (4 or 6 depending on version) + int num_score_value_channels = 0; +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/include/katagocoreml/Version.hpp b/cpp/external/katagocoreml/include/katagocoreml/Version.hpp new file mode 100644 index 000000000..9cb663283 --- /dev/null +++ 
b/cpp/external/katagocoreml/include/katagocoreml/Version.hpp @@ -0,0 +1,8 @@ +#pragma once + +namespace katagocoreml { +constexpr const char* VERSION = "1.1.0"; +constexpr int VERSION_MAJOR = 1; +constexpr int VERSION_MINOR = 1; +constexpr int VERSION_PATCH = 0; +} diff --git a/cpp/external/katagocoreml/src/Converter.cpp b/cpp/external/katagocoreml/src/Converter.cpp new file mode 100644 index 000000000..cb6ca80d9 --- /dev/null +++ b/cpp/external/katagocoreml/src/Converter.cpp @@ -0,0 +1,106 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "katagocoreml/KataGoConverter.hpp" +#include "parser/KataGoParser.hpp" +#include "builder/MILBuilder.hpp" +#include "serializer/CoreMLSerializer.hpp" +#include +#include + +namespace katagocoreml { + +void KataGoConverter::convert(const std::string& input_path, + const std::string& output_path, + const ConversionOptions& options) { + // Validate board sizes + if (options.board_x_size < 2 || options.board_x_size > 37) { + throw std::invalid_argument("board_x_size must be in range [2, 37]"); + } + if (options.board_y_size < 2 || options.board_y_size > 37) { + throw std::invalid_argument("board_y_size must be in range [2, 37]"); + } + + // Validate batch sizes + if (options.min_batch_size < 1) { + throw std::invalid_argument("min_batch_size must be at least 1"); + } + if (options.max_batch_size > 0 && options.max_batch_size < options.min_batch_size) { + throw std::invalid_argument("max_batch_size must be >= min_batch_size or <= 0 for unlimited"); + } + + // Parse KataGo model + KataGoParser parser(input_path); + KataGoModelDesc model = parser.parse(); + + // Determine if using FP16 precision + bool use_fp16 = (options.compute_precision == "FLOAT16"); + + // Validate configuration: use_fp16_io requires FP16 compute + if (options.use_fp16_io && !use_fp16) { + throw std::invalid_argument("use_fp16_io requires compute_precision=\"FLOAT16\""); + } + + // Build MIL program 
+ MILBuilder builder(model, + options.board_x_size, + options.board_y_size, + options.optimize_identity_mask, + use_fp16, + options.min_batch_size, + options.max_batch_size, + options.use_fp16_io); + auto program = builder.build(); + + // Get weights from builder + auto weights = builder.getWeights(); + std::vector weights_copy(weights.begin(), weights.end()); + + // Update options with model metadata for serialization + ConversionOptions final_options = options; + final_options.model_version = model.model_version; + final_options.meta_encoder_version = model.meta_encoder_version; + final_options.num_input_meta_channels = model.num_input_meta_channels; + final_options.num_input_channels = model.num_input_channels; + final_options.num_input_global_channels = model.num_input_global_channels; + + // Add model architecture info for metadata + final_options.num_blocks = model.trunk.num_blocks; + final_options.trunk_channels = model.trunk.trunk_num_channels; + final_options.model_name = model.name; + + // Extract filename from input path + if (final_options.source_filename.empty()) { + std::filesystem::path p(input_path); + final_options.source_filename = p.filename().string(); + } + + // FLOAT16 I/O requires specification version >= 7 (iOS 16+) + if (final_options.use_fp16_io && final_options.specification_version < 7) { + final_options.specification_version = 7; + } + + // Serialize to .mlpackage + CoreMLSerializer serializer(final_options.specification_version); + serializer.serialize(program.get(), weights_copy, output_path, final_options); +} + +ModelInfo KataGoConverter::getModelInfo(const std::string& input_path) { + KataGoParser parser(input_path); + KataGoModelDesc model = parser.parse(); + + ModelInfo info; + info.name = model.name; + info.version = model.model_version; + info.num_input_channels = model.num_input_channels; + info.num_input_global_channels = model.num_input_global_channels; + info.num_blocks = model.trunk.num_blocks; + info.trunk_channels = 
model.trunk.trunk_num_channels; + info.has_metadata_encoder = model.meta_encoder_version > 0; + info.num_policy_channels = model.num_policy_channels; + info.num_score_value_channels = model.num_score_value_channels; + + return info; +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/builder/MILBuilder.cpp b/cpp/external/katagocoreml/src/builder/MILBuilder.cpp new file mode 100644 index 000000000..db0c6c4b1 --- /dev/null +++ b/cpp/external/katagocoreml/src/builder/MILBuilder.cpp @@ -0,0 +1,2126 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "MILBuilder.hpp" +#include "MILBlob/Fp16.hpp" +#include + +// Include generated protobuf headers +#include "MIL.pb.h" + +namespace katagocoreml { + +MILBuilder::MILBuilder(const KataGoModelDesc& model, + int board_x_size, + int board_y_size, + bool optimize_identity_mask, + bool use_fp16, + int min_batch_size, + int max_batch_size, + bool use_fp16_io) + : m_model(model) + , m_board_x_size(board_x_size) + , m_board_y_size(board_y_size) + , m_optimize_identity_mask(optimize_identity_mask) + , m_use_fp16(use_fp16) + , m_use_fp16_io(use_fp16_io) + , m_min_batch_size(min_batch_size) + , m_max_batch_size(max_batch_size) + , m_weight_dtype(use_fp16 + ? 
CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32) + , m_ops(board_x_size, board_y_size, optimize_identity_mask) + , m_var_counter(0) {} + +void MILBuilder::setBatchDimension(CoreML::Specification::MILSpec::TensorType* tensor_type) { + auto* dim = tensor_type->add_dimensions(); + if (m_min_batch_size == m_max_batch_size && m_max_batch_size > 0) { + // Fixed batch size + dim->mutable_constant()->set_size(m_min_batch_size); + } else { + // Dynamic batch size - use UnknownDimension + dim->mutable_unknown()->set_variadic(false); + } +} + +std::string MILBuilder::genVarName(const std::string& prefix) { + return prefix + "_" + std::to_string(m_var_counter++); +} + +std::unique_ptr MILBuilder::build() { + auto program = std::make_unique(); + program->set_version(1); + + // Create main function + auto& functions = *program->mutable_functions(); + auto& main_func = functions["main"]; + main_func.set_opset("CoreML5"); + + // Create main block + auto& blocks = *main_func.mutable_block_specializations(); + auto& main_block = blocks["CoreML5"]; + + // Define inputs + // spatial_input: [batch, num_input_ch, board_y, board_x] + auto* spatial_input = main_func.add_inputs(); + spatial_input->set_name("spatial_input"); + auto* spatial_type = spatial_input->mutable_type()->mutable_tensortype(); + spatial_type->set_datatype(m_use_fp16 && m_use_fp16_io + ? 
CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32); + spatial_type->set_rank(4); + setBatchDimension(spatial_type); + spatial_type->add_dimensions()->mutable_constant()->set_size(m_model.num_input_channels); + spatial_type->add_dimensions()->mutable_constant()->set_size(m_board_y_size); + spatial_type->add_dimensions()->mutable_constant()->set_size(m_board_x_size); + + // global_input: [batch, num_global_ch] + auto* global_input = main_func.add_inputs(); + global_input->set_name("global_input"); + auto* global_type = global_input->mutable_type()->mutable_tensortype(); + global_type->set_datatype(m_use_fp16 && m_use_fp16_io + ? CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32); + global_type->set_rank(2); + setBatchDimension(global_type); + global_type->add_dimensions()->mutable_constant()->set_size(m_model.num_input_global_channels); + + // input_mask: [batch, 1, board_y, board_x] + auto* mask_input = main_func.add_inputs(); + mask_input->set_name("input_mask"); + auto* mask_type = mask_input->mutable_type()->mutable_tensortype(); + mask_type->set_datatype(m_use_fp16 && m_use_fp16_io + ? CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32); + mask_type->set_rank(4); + setBatchDimension(mask_type); + mask_type->add_dimensions()->mutable_constant()->set_size(1); + mask_type->add_dimensions()->mutable_constant()->set_size(m_board_y_size); + mask_type->add_dimensions()->mutable_constant()->set_size(m_board_x_size); + + // Optional meta_input for human SL networks + std::string meta_input_name; + if (m_model.meta_encoder_version > 0 && m_model.num_input_meta_channels > 0) { + auto* meta_input = main_func.add_inputs(); + meta_input->set_name("meta_input"); + auto* meta_type = meta_input->mutable_type()->mutable_tensortype(); + meta_type->set_datatype(m_use_fp16 && m_use_fp16_io + ? 
CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32); + meta_type->set_rank(2); + setBatchDimension(meta_type); + meta_type->add_dimensions()->mutable_constant()->set_size(m_model.num_input_meta_channels); + meta_input_name = "meta_input"; + } + + // For FP16 mode with FP32 I/O, add cast operations after inputs + std::string spatial_name = "spatial_input"; + std::string global_name = "global_input"; + std::string mask_name = "input_mask"; + std::string meta_name = meta_input_name; + + if (m_use_fp16 && !m_use_fp16_io) { + // Cast spatial_input: [1, num_input_ch, H, W] fp32 -> fp16 + addCastOp(&main_block, "spatial_input", "spatial_input_cast_fp16", "fp16", + {1, m_model.num_input_channels, m_board_y_size, m_board_x_size}); + spatial_name = "spatial_input_cast_fp16"; + + // Cast global_input: [1, num_global_ch] fp32 -> fp16 + addCastOp(&main_block, "global_input", "global_input_cast_fp16", "fp16", + {1, m_model.num_input_global_channels}); + global_name = "global_input_cast_fp16"; + + // Cast input_mask: [1, 1, H, W] fp32 -> fp16 + addCastOp(&main_block, "input_mask", "input_mask_cast_fp16", "fp16", + {1, 1, m_board_y_size, m_board_x_size}); + mask_name = "input_mask_cast_fp16"; + + // Cast meta_input if present + if (!meta_input_name.empty()) { + addCastOp(&main_block, "meta_input", "meta_input_cast_fp16", "fp16", + {1, m_model.num_input_meta_channels}); + meta_name = "meta_input_cast_fp16"; + } + } + + // Build the network + const std::string* meta_ptr = meta_name.empty() ? 
nullptr : &meta_name; + std::string trunk_out = buildTrunk(&main_block, spatial_name, global_name, mask_name, meta_ptr); + + // Build heads + std::string policy_out, pass_out; + buildPolicyHead(&main_block, trunk_out, mask_name, policy_out, pass_out); + + std::string value_out, ownership_out, score_value_out; + buildValueHead(&main_block, trunk_out, mask_name, value_out, ownership_out, score_value_out); + + // For FP16 mode with FP32 I/O, add cast operations to convert outputs back to FP32 + std::string final_policy_out = policy_out; + std::string final_pass_out = pass_out; + std::string final_value_out = value_out; + std::string final_ownership_out = ownership_out; + std::string final_score_value_out = score_value_out; + + if (m_use_fp16 && !m_use_fp16_io) { + const auto& ph = m_model.policy_head; + const auto& vh = m_model.value_head; + + // Cast policy_p2_conv: [1, p2_out_channels, H, W] fp16 -> fp32 + final_policy_out = "policy_p2_conv"; + addCastOp(&main_block, policy_out, final_policy_out, "fp32", + {1, ph.p2_conv.out_channels, m_board_y_size, m_board_x_size}); + + // Cast pass output: [1, 2] fp16 -> fp32 + final_pass_out = "policy_pass"; // Python uses policy_pass for all versions + int pass_out_channels = ph.gpool_to_pass_mul2.has_value() + ? 
ph.gpool_to_pass_mul2->out_channels + : ph.gpool_to_pass_mul.out_channels; + addCastOp(&main_block, pass_out, final_pass_out, "fp32", + {1, pass_out_channels}); + + // Cast value_v3_bias: [1, 3] fp16 -> fp32 + final_value_out = "value_v3_bias"; + addCastOp(&main_block, value_out, final_value_out, "fp32", + {1, vh.v3_mul.out_channels}); + + // Cast ownership: [1, 1, H, W] fp16 -> fp32 + final_ownership_out = "value_ownership_conv"; + addCastOp(&main_block, ownership_out, final_ownership_out, "fp32", + {1, vh.v_ownership_conv.out_channels, m_board_y_size, m_board_x_size}); + + // Cast score_value: [1, num_score_value_channels] fp16 -> fp32 + final_score_value_out = "value_sv3_bias"; + addCastOp(&main_block, score_value_out, final_score_value_out, "fp32", + {1, vh.sv3_mul.out_channels}); + } + + // Set block outputs + main_block.add_outputs(final_policy_out); + main_block.add_outputs(final_pass_out); + main_block.add_outputs(final_value_out); + main_block.add_outputs(final_ownership_out); + main_block.add_outputs(final_score_value_out); + + return program; +} + +// ============================================================================ +// MIL Operation Helpers +// ============================================================================ + +void MILBuilder::addConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + const std::vector& data, + const std::vector& shape) { + // Register weight for blob storage + m_ops.registerWeight(name, data, shape); + + // Add const operation + auto* op = block->add_operations(); + op->set_type("const"); + + // "name" attribute (matching Python structure) + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(name); + + // "val" attribute with type and blob reference + auto& val_attr = 
(*op->mutable_attributes())["val"]; + auto* val_type = val_attr.mutable_type()->mutable_tensortype(); + val_type->set_datatype(m_weight_dtype); + val_type->set_rank(static_cast(shape.size())); + for (int64_t dim : shape) { + val_type->add_dimensions()->mutable_constant()->set_size(dim); + } + auto* blob_val = val_attr.mutable_blobfilevalue(); + blob_val->set_filename("@model_path/weights/weight.bin"); + // Offset will be set during serialization + + // Set output + auto* output = op->add_outputs(); + output->set_name(name); + auto* output_type = output->mutable_type()->mutable_tensortype(); + output_type->set_datatype(m_weight_dtype); + output_type->set_rank(static_cast(shape.size())); + for (int64_t dim : shape) { + output_type->add_dimensions()->mutable_constant()->set_size(dim); + } +} + +// Helper: Add INT32 array const op (for axes, shape) +void MILBuilder::addIntArrayConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + const std::vector& values) { + auto* op = block->add_operations(); + op->set_type("const"); + + // "name" attribute + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(name); + + // "val" attribute with INT32 type + auto& val_attr = (*op->mutable_attributes())["val"]; + auto* val_type = val_attr.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + val_type->set_rank(1); + val_type->add_dimensions()->mutable_constant()->set_size(static_cast(values.size())); + auto* ints = val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_ints(); + for (int32_t v : values) { + ints->add_values(v); + } + + // Output + auto* output = op->add_outputs(); + output->set_name(name); + auto* out_type = output->mutable_type()->mutable_tensortype(); + 
out_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + out_type->set_rank(1); + out_type->add_dimensions()->mutable_constant()->set_size(static_cast(values.size())); +} + +// Helper: Add BOOL scalar const op (for keep_dims) +void MILBuilder::addBoolScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + bool value) { + auto* op = block->add_operations(); + op->set_type("const"); + + // "name" attribute + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(name); + + // "val" attribute with BOOL type (rank 0 = scalar) + auto& val_attr = (*op->mutable_attributes())["val"]; + auto* val_type = val_attr.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::BOOL); + val_type->set_rank(0); + val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_bools()->add_values(value); + + // Output + auto* output = op->add_outputs(); + output->set_name(name); + auto* out_type = output->mutable_type()->mutable_tensortype(); + out_type->set_datatype(CoreML::Specification::MILSpec::DataType::BOOL); + out_type->set_rank(0); +} + +// Helper: Add FLOAT32 scalar const op (for y values in sub/mul) +void MILBuilder::addFloatScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + float value) { + auto* op = block->add_operations(); + op->set_type("const"); + + // "name" attribute + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(name); + + // "val" attribute with appropriate dtype (rank 0 = scalar) + auto& val_attr = 
(*op->mutable_attributes())["val"]; + auto* val_type = val_attr.mutable_type()->mutable_tensortype(); + val_type->set_datatype(m_weight_dtype); + val_type->set_rank(0); + + if (m_use_fp16) { + // For FP16, use bytes storage with FP16 representation + MILBlob::Fp16 fp16_val = MILBlob::Fp16::FromFloat(value); + std::string bytes_data(reinterpret_cast(&fp16_val.bytes), sizeof(fp16_val.bytes)); + val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_bytes()->set_values(bytes_data); + } else { + // For FP32, use floats storage + val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_floats()->add_values(value); + } + + // Output + auto* output = op->add_outputs(); + output->set_name(name); + auto* out_type = output->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(0); +} + +// Helper: Add INT32 scalar const op (for concat axis) +void MILBuilder::addIntScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + int32_t value) { + auto* op = block->add_operations(); + op->set_type("const"); + + // "name" attribute + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(name); + + // "val" attribute with INT32 type (rank 0 = scalar) + auto& val_attr = (*op->mutable_attributes())["val"]; + auto* val_type = val_attr.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + val_type->set_rank(0); + val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_ints()->add_values(value); + + // Output + auto* output = op->add_outputs(); + output->set_name(name); + auto* out_type = output->mutable_type()->mutable_tensortype(); + out_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + out_type->set_rank(0); +} + 
+// Helper: Add cast operation for dtype conversion +void MILBuilder::addCastOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& output, + const std::string& dtype, + const std::vector& shape) { + // Create dtype const (STRING type) + std::string dtype_name = output + "_dtype_0"; + { + auto* op = block->add_operations(); + op->set_type("const"); + + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(dtype_name); + + auto& val_attr = (*op->mutable_attributes())["val"]; + val_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + val_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(dtype); + + auto* out = op->add_outputs(); + out->set_name(dtype_name); + out->mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + } + + // Create cast operation + auto* op = block->add_operations(); + op->set_type("cast"); + + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["dtype"].add_arguments()->set_name(dtype_name); + + // Set output with target dtype + auto* out = op->add_outputs(); + out->set_name(output); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(dtype == "fp16" + ? 
CoreML::Specification::MILSpec::DataType::FLOAT16 + : CoreML::Specification::MILSpec::DataType::FLOAT32); + tt->set_rank(static_cast(shape.size())); + // First dimension is batch - use setBatchDimension + setBatchDimension(tt); + // Remaining dimensions are constant + for (size_t i = 1; i < shape.size(); i++) { + tt->add_dimensions()->mutable_constant()->set_size(shape[i]); + } +} + +void MILBuilder::addConvOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const ConvLayerDesc& layer, + const std::string& output) { + // Create const operations for all parameters (matching Python structure) + std::string weight_name = output + "_weight_0"; + std::string pad_type_name = output + "_pad_type_0"; + std::string dilations_name = output + "_dilations_0"; + std::string strides_name = output + "_strides_0"; + std::string groups_name = output + "_groups_0"; + std::string pad_name = output + "_pad_0"; + + // Add weight constant + addConstOp(block, weight_name, layer.weights, layer.getWeightShape()); + + // Add pad_type constant ("same") - STRING type + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(pad_type_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + val.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + val.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values("same"); + // Output + auto* out = const_op->add_outputs(); + out->set_name(pad_type_name); + out->mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + } + + // Add dilations constant - INT32 type, 
shape [2] + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(dilations_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + auto* val_type = val.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + val_type->set_rank(1); + val_type->add_dimensions()->mutable_constant()->set_size(2); + auto* int_vals = val.mutable_immediatevalue()->mutable_tensor()->mutable_ints(); + int_vals->add_values(layer.dilation_y); + int_vals->add_values(layer.dilation_x); + // Output + auto* out = const_op->add_outputs(); + out->set_name(dilations_name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + tt->set_rank(1); + tt->add_dimensions()->mutable_constant()->set_size(2); + } + + // Add strides constant - INT32 type, shape [2] + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(strides_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + auto* val_type = val.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + val_type->set_rank(1); + val_type->add_dimensions()->mutable_constant()->set_size(2); + auto* int_vals = val.mutable_immediatevalue()->mutable_tensor()->mutable_ints(); + 
int_vals->add_values(1); + int_vals->add_values(1); + // Output + auto* out = const_op->add_outputs(); + out->set_name(strides_name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + tt->set_rank(1); + tt->add_dimensions()->mutable_constant()->set_size(2); + } + + // Add groups constant (always 1 for standard convolution) - INT32 scalar type + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(groups_name); + // "val" attribute with type (scalar) + auto& val = (*const_op->mutable_attributes())["val"]; + val.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::INT32); + val.mutable_immediatevalue()->mutable_tensor()->mutable_ints()->add_values(1); + // Output + auto* out = const_op->add_outputs(); + out->set_name(groups_name); + out->mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::INT32); + } + + // Add pad constant [0, 0, 0, 0] for "same" padding - INT32 type, shape [4] + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(pad_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + auto* val_type = val.mutable_type()->mutable_tensortype(); + val_type->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + val_type->set_rank(1); + 
val_type->add_dimensions()->mutable_constant()->set_size(4); + auto* int_vals = val.mutable_immediatevalue()->mutable_tensor()->mutable_ints(); + int_vals->add_values(0); + int_vals->add_values(0); + int_vals->add_values(0); + int_vals->add_values(0); + // Output + auto* out = const_op->add_outputs(); + out->set_name(pad_name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(CoreML::Specification::MILSpec::DataType::INT32); + tt->set_rank(1); + tt->add_dimensions()->mutable_constant()->set_size(4); + } + + // Add conv operation referencing all const parameters + auto* op = block->add_operations(); + op->set_type("conv"); + + // Inputs - reference const operations + auto& inputs = *op->mutable_inputs(); + inputs["dilations"].add_arguments()->set_name(dilations_name); + inputs["groups"].add_arguments()->set_name(groups_name); + inputs["pad"].add_arguments()->set_name(pad_name); + inputs["pad_type"].add_arguments()->set_name(pad_type_name); + inputs["strides"].add_arguments()->set_name(strides_name); + inputs["weight"].add_arguments()->set_name(weight_name); + inputs["x"].add_arguments()->set_name(input); + + // Output with dimensions [batch, out_channels, height, width] + auto* out = op->add_outputs(); + out->set_name(output); + auto* out_type = out->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(4); + setBatchDimension(out_type); + out_type->add_dimensions()->mutable_constant()->set_size(layer.out_channels); + out_type->add_dimensions()->mutable_constant()->set_size(m_board_y_size); + out_type->add_dimensions()->mutable_constant()->set_size(m_board_x_size); +} + +// Helper: Set output tensor type with 4D shape [batch, C, H, W] +void MILBuilder::setTensorOutput4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels, int height, int width) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* tt = out->mutable_type()->mutable_tensortype(); + 
tt->set_datatype(m_weight_dtype); + tt->set_rank(4); + setBatchDimension(tt); + tt->add_dimensions()->mutable_constant()->set_size(channels); + tt->add_dimensions()->mutable_constant()->set_size(height); + tt->add_dimensions()->mutable_constant()->set_size(width); +} + +// Helper: Set output tensor type with 2D shape [batch, C] +void MILBuilder::setTensorOutput2D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(m_weight_dtype); + tt->set_rank(2); + setBatchDimension(tt); + tt->add_dimensions()->mutable_constant()->set_size(channels); +} + +// Helper: Set output tensor type with 4D shape [batch, C, 1, 1] for pooled results +void MILBuilder::setTensorOutputPooled4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(m_weight_dtype); + tt->set_rank(4); + setBatchDimension(tt); + tt->add_dimensions()->mutable_constant()->set_size(channels); + tt->add_dimensions()->mutable_constant()->set_size(1); + tt->add_dimensions()->mutable_constant()->set_size(1); +} + +// Helper: Set output tensor type with 4D shape [batch, 1, 1, 1] (for mask operations) +void MILBuilder::setTensorOutputMask4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(m_weight_dtype); + tt->set_rank(4); + setBatchDimension(tt); + tt->add_dimensions()->mutable_constant()->set_size(1); + tt->add_dimensions()->mutable_constant()->set_size(1); + tt->add_dimensions()->mutable_constant()->set_size(1); +} + +// Helper: Set output tensor type with 4D shape [batch, 1, H, W] (for mask spatial operations) +void 
MILBuilder::setTensorOutputMaskSpatial4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int height, int width) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* tt = out->mutable_type()->mutable_tensortype(); + tt->set_datatype(m_weight_dtype); + tt->set_rank(4); + setBatchDimension(tt); + tt->add_dimensions()->mutable_constant()->set_size(1); + tt->add_dimensions()->mutable_constant()->set_size(height); + tt->add_dimensions()->mutable_constant()->set_size(width); +} + +void MILBuilder::addBatchNormActivationOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const BatchNormLayerDesc& bn, + const ActivationLayerDesc& act, + const std::string& mask, + const std::string& output) { + // BN: x * scale + bias + std::string scale_name = output + "_bn_scale"; + std::string bias_name = output + "_bn_bias"; + + // Reshape scale/bias to [1, C, 1, 1] + std::vector bn_shape = {1, static_cast(bn.num_channels), 1, 1}; + addConstOp(block, scale_name, bn.merged_scale, bn_shape); + addConstOp(block, bias_name, bn.merged_bias, bn_shape); + + // Mul: x * scale + std::string scaled_name = output + "_scaled"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(scale_name); + setTensorOutput4D(op, scaled_name, bn.num_channels, m_board_y_size, m_board_x_size); + } + + // Add: scaled + bias + std::string biased_name = output + "_biased"; + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(scaled_name); + inputs["y"].add_arguments()->set_name(bias_name); + setTensorOutput4D(op, biased_name, bn.num_channels, m_board_y_size, m_board_x_size); + } + + std::string bn_output = biased_name; + + // Apply mask if not optimizing + if (!m_optimize_identity_mask) { + std::string masked_name = output 
+ "_masked"; + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bn_output); + inputs["y"].add_arguments()->set_name(mask); + setTensorOutput4D(op, masked_name, bn.num_channels, m_board_y_size, m_board_x_size); + bn_output = masked_name; + } + + // Activation + if (act.activation_type == ActivationType::Identity) { + // Identity: just rename + // In MIL we need to copy + auto* op = block->add_operations(); + op->set_type("identity"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bn_output); + setTensorOutput4D(op, output, bn.num_channels, m_board_y_size, m_board_x_size); + } else if (act.activation_type == ActivationType::ReLU) { + auto* op = block->add_operations(); + op->set_type("relu"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bn_output); + setTensorOutput4D(op, output, bn.num_channels, m_board_y_size, m_board_x_size); + } else if (act.activation_type == ActivationType::Mish) { + addMishOps(block, bn_output, output, 4, bn.num_channels); + } +} + +void MILBuilder::addMishOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& output, + int rank, + int channels) { + // Mish: x / (1 + 2 / (e * (e + 2))) + // e = exp(x) + // + // rank and channels are used to set output type info: + // - rank=4: spatial tensors [1, C, H, W] (uses m_board_y_size, m_board_x_size) + // - rank=2: vector tensors [1, C] + + auto setOutputType = [this, rank, channels](CoreML::Specification::MILSpec::Operation* op, const std::string& name) { + auto* out = op->add_outputs(); + out->set_name(name); + auto* out_type = out->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(rank); + setBatchDimension(out_type); + out_type->add_dimensions()->mutable_constant()->set_size(channels); + if (rank == 4) { + 
out_type->add_dimensions()->mutable_constant()->set_size(m_board_y_size); + out_type->add_dimensions()->mutable_constant()->set_size(m_board_x_size); + } + }; + + std::string e = output + "_exp"; + std::string ep2 = output + "_ep2"; + std::string emep2 = output + "_emep2"; + std::string tdemep2 = output + "_tdemep2"; + std::string optdemep2 = output + "_optdemep2"; + + // Create scalar constants for Mish computation + std::string const_one = output + "_const_1"; + std::string const_two = output + "_const_2"; + addFloatScalarConstOp(block, const_one, 1.0f); + addFloatScalarConstOp(block, const_two, 2.0f); + + // e = exp(x) + { + auto* op = block->add_operations(); + op->set_type("exp"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + setOutputType(op, e); + } + + // ep2 = e + 2 + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(e); + inputs["y"].add_arguments()->set_name(const_two); + setOutputType(op, ep2); + } + + // emep2 = e * ep2 + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(e); + inputs["y"].add_arguments()->set_name(ep2); + setOutputType(op, emep2); + } + + // tdemep2 = 2 / emep2 + { + auto* op = block->add_operations(); + op->set_type("real_div"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(const_two); + inputs["y"].add_arguments()->set_name(emep2); + setOutputType(op, tdemep2); + } + + // optdemep2 = 1 + tdemep2 + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(const_one); + inputs["y"].add_arguments()->set_name(tdemep2); + setOutputType(op, optdemep2); + } + + // output = x / optdemep2 + { + auto* op = block->add_operations(); + op->set_type("real_div"); + auto& inputs = *op->mutable_inputs(); + 
inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(optdemep2); + setOutputType(op, output); + } +} + +void MILBuilder::addMatMulOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatMulLayerDesc& layer, + const std::string& output) { + // Create const operations for all parameters (matching Python structure) + std::string weight_name = output + "_y_0"; + std::string transpose_x_name = output + "_transpose_x_0"; + std::string transpose_y_name = output + "_transpose_y_0"; + + // Add weight constant + addConstOp(block, weight_name, layer.weights, layer.getWeightShape()); + + // Add transpose_x constant (false) - BOOL type + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(transpose_x_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + val.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::BOOL); + val.mutable_immediatevalue()->mutable_tensor()->mutable_bools()->add_values(false); + // Output + auto* out = const_op->add_outputs(); + out->set_name(transpose_x_name); + out->mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::BOOL); + } + + // Add transpose_y constant (false) - BOOL type + { + auto* const_op = block->add_operations(); + const_op->set_type("const"); + // "name" attribute + auto& name_attr = (*const_op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + 
name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(transpose_y_name); + // "val" attribute with type + auto& val = (*const_op->mutable_attributes())["val"]; + val.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::BOOL); + val.mutable_immediatevalue()->mutable_tensor()->mutable_bools()->add_values(false); + // Output + auto* out = const_op->add_outputs(); + out->set_name(transpose_y_name); + out->mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::BOOL); + } + + // Add matmul operation + auto* op = block->add_operations(); + op->set_type("matmul"); + auto& inputs = *op->mutable_inputs(); + inputs["transpose_x"].add_arguments()->set_name(transpose_x_name); + inputs["transpose_y"].add_arguments()->set_name(transpose_y_name); + inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(weight_name); + + // Output with 2D shape [batch, out_channels] + auto* out = op->add_outputs(); + out->set_name(output); + auto* out_type = out->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(2); + setBatchDimension(out_type); + out_type->add_dimensions()->mutable_constant()->set_size(layer.out_channels); +} + +void MILBuilder::addMatBiasOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatBiasLayerDesc& layer, + const std::string& output) { + // Add bias constant + std::string bias_name = output + "_bias"; + std::vector shape = {static_cast(layer.num_channels)}; + addConstOp(block, bias_name, layer.weights, shape); + + // Add add operation + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(bias_name); + + // Output with 2D shape [batch, num_channels] (same as matmul output) + auto* out = op->add_outputs(); + 
out->set_name(output); + auto* out_type = out->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(2); + setBatchDimension(out_type); + out_type->add_dimensions()->mutable_constant()->set_size(layer.num_channels); +} + +void MILBuilder::addLinearOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatMulLayerDesc& matmul, + const MatBiasLayerDesc& bias, + const std::string& output) { + // Create const operations for weight and bias (matching Python's linear op structure) + // Core ML linear expects weights in [out_channels, in_channels] format + // KataGo matmul stores weights in [in_channels, out_channels] format + // We need to transpose the weights to match Python's fuse_matmul_weight_bias pass + std::string weight_name = output + "_weight_0"; + std::string bias_name = output + "_bias_0"; + + // Transpose weights from [in_channels, out_channels] to [out_channels, in_channels] + const int in_ch = matmul.in_channels; + const int out_ch = matmul.out_channels; + std::vector transposed_weights(matmul.weights.size()); + for (int i = 0; i < in_ch; ++i) { + for (int j = 0; j < out_ch; ++j) { + // Original: weights[i * out_ch + j] (row-major [in_ch, out_ch]) + // Transposed: weights[j * in_ch + i] (row-major [out_ch, in_ch]) + transposed_weights[j * in_ch + i] = matmul.weights[i * out_ch + j]; + } + } + + // Add transposed weight constant with shape [out_channels, in_channels] + std::vector transposed_shape = {static_cast(out_ch), static_cast(in_ch)}; + addConstOp(block, weight_name, transposed_weights, transposed_shape); + + // Add bias constant + std::vector bias_shape = {static_cast(bias.num_channels)}; + addConstOp(block, bias_name, bias.weights, bias_shape); + + // Add linear operation + auto* op = block->add_operations(); + op->set_type("linear"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + 
inputs["weight"].add_arguments()->set_name(weight_name); + inputs["bias"].add_arguments()->set_name(bias_name); + + // Output with 2D shape [batch, out_channels] + auto* out = op->add_outputs(); + out->set_name(output); + auto* out_type = out->mutable_type()->mutable_tensortype(); + out_type->set_datatype(m_weight_dtype); + out_type->set_rank(2); + setBatchDimension(out_type); + out_type->add_dimensions()->mutable_constant()->set_size(matmul.out_channels); +} + +void MILBuilder::addGlobalPoolingOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& mask, + int channels, + const std::string& output) { + // KataGo global pooling produces: [mean, mean_scaled, max] + // mean_scaled = mean * (sqrt(count) - 14) * 0.1 + + if (m_optimize_identity_mask) { + // Optimized path: use precomputed constants + const auto& mc = m_ops.getMaskConstants(); + + // Mean pooling: sum / count + std::string sum_name = output + "_sum"; + std::string sum_axes = sum_name + "_axes_0"; + std::string sum_keep_dims = sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, sum_axes, {2, 3}); + addBoolScalarConstOp(block, sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["axes"].add_arguments()->set_name(sum_axes); + inputs["keep_dims"].add_arguments()->set_name(sum_keep_dims); + setTensorOutputPooled4D(op, sum_name, channels); + } + + std::string mean_name = output + "_mean"; + std::string mean_y = mean_name + "_y_0"; + addFloatScalarConstOp(block, mean_y, mc.mask_sum_reciprocal); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sum_name); + inputs["y"].add_arguments()->set_name(mean_y); + setTensorOutputPooled4D(op, mean_name, channels); + } + + // Max pooling + std::string max_name = output + "_max"; + std::string 
max_axes = max_name + "_axes_0"; + std::string max_keep_dims = max_name + "_keep_dims_0"; + addIntArrayConstOp(block, max_axes, {2, 3}); + addBoolScalarConstOp(block, max_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_max"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["axes"].add_arguments()->set_name(max_axes); + inputs["keep_dims"].add_arguments()->set_name(max_keep_dims); + setTensorOutputPooled4D(op, max_name, channels); + } + + // Mean scaled = mean * constant + std::string mean_scaled_name = output + "_mean_scaled"; + std::string mean_scaled_y = mean_scaled_name + "_y_0"; + addFloatScalarConstOp(block, mean_scaled_y, mc.mask_sum_sqrt_s14_m01); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(mean_scaled_y); + setTensorOutputPooled4D(op, mean_scaled_name, channels); + } + + // Squeeze spatial dimensions: [N, C, 1, 1] -> [N, C] + std::string mean_flat = output + "_mean_flat"; + std::string mean_flat_axes = mean_flat + "_axes_0"; + addIntArrayConstOp(block, mean_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["axes"].add_arguments()->set_name(mean_flat_axes); + setTensorOutput2D(op, mean_flat, channels); + } + + std::string mean_scaled_flat = output + "_mean_scaled_flat"; + std::string mean_scaled_flat_axes = mean_scaled_flat + "_axes_0"; + addIntArrayConstOp(block, mean_scaled_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_scaled_name); + inputs["axes"].add_arguments()->set_name(mean_scaled_flat_axes); + setTensorOutput2D(op, mean_scaled_flat, channels); + } + + 
std::string max_flat = output + "_max_flat"; + std::string max_flat_axes = max_flat + "_axes_0"; + addIntArrayConstOp(block, max_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(max_name); + inputs["axes"].add_arguments()->set_name(max_flat_axes); + setTensorOutput2D(op, max_flat, channels); + } + + // Concatenate: [mean, mean_scaled, max] + std::string concat_axis = output + "_concat_axis_0"; + std::string concat_interleave = output + "_concat_interleave_0"; + addIntScalarConstOp(block, concat_axis, 1); + addBoolScalarConstOp(block, concat_interleave, false); + { + auto* op = block->add_operations(); + op->set_type("concat"); + auto& inputs = *op->mutable_inputs(); + inputs["values"].add_arguments()->set_name(mean_flat); + inputs["values"].add_arguments()->set_name(mean_scaled_flat); + inputs["values"].add_arguments()->set_name(max_flat); + inputs["axis"].add_arguments()->set_name(concat_axis); + inputs["interleave"].add_arguments()->set_name(concat_interleave); + setTensorOutput2D(op, output, channels * 3); + } + } else { + // Full path with mask operations + // Count valid positions (mask is [1, 1, H, W], output is [1, 1, 1, 1]) + std::string mask_sum_name = output + "_mask_sum"; + std::string mask_sum_axes = mask_sum_name + "_axes_0"; + std::string mask_sum_keep_dims = mask_sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, mask_sum_axes, {2, 3}); + addBoolScalarConstOp(block, mask_sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mask); + inputs["axes"].add_arguments()->set_name(mask_sum_axes); + inputs["keep_dims"].add_arguments()->set_name(mask_sum_keep_dims); + setTensorOutputMask4D(op, mask_sum_name); + } + + // Masked input: [1, C, H, W] * [1, 1, H, W] -> [1, C, H, W] + std::string masked_name = output + 
"_masked"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(mask); + setTensorOutput4D(op, masked_name, channels, m_board_y_size, m_board_x_size); + } + + // Sum masked values: [1, C, H, W] -> [1, C, 1, 1] + std::string sum_name = output + "_sum"; + std::string sum_axes = sum_name + "_axes_0"; + std::string sum_keep_dims = sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, sum_axes, {2, 3}); + addBoolScalarConstOp(block, sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(masked_name); + inputs["axes"].add_arguments()->set_name(sum_axes); + inputs["keep_dims"].add_arguments()->set_name(sum_keep_dims); + setTensorOutputPooled4D(op, sum_name, channels); + } + + // Mean = sum / count: [1, C, 1, 1] / [1, 1, 1, 1] -> [1, C, 1, 1] + std::string mean_name = output + "_mean"; + { + auto* op = block->add_operations(); + op->set_type("real_div"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sum_name); + inputs["y"].add_arguments()->set_name(mask_sum_name); + setTensorOutputPooled4D(op, mean_name, channels); + } + + // Max pooling (with mask adjustment) + // mask_minus_one: [1, 1, H, W] - scalar -> [1, 1, H, W] + std::string mask_minus_one = output + "_mask_minus_one"; + std::string mask_minus_one_y = mask_minus_one + "_y_0"; + addFloatScalarConstOp(block, mask_minus_one_y, 1.0f); + { + auto* op = block->add_operations(); + op->set_type("sub"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mask); + inputs["y"].add_arguments()->set_name(mask_minus_one_y); + setTensorOutputMaskSpatial4D(op, mask_minus_one, m_board_y_size, m_board_x_size); + } + + // x_for_max: [1, C, H, W] + [1, 1, H, W] -> [1, C, H, W] + std::string x_for_max = output + 
"_x_for_max"; + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(masked_name); + inputs["y"].add_arguments()->set_name(mask_minus_one); + setTensorOutput4D(op, x_for_max, channels, m_board_y_size, m_board_x_size); + } + + // max: [1, C, H, W] -> [1, C, 1, 1] + std::string max_name = output + "_max"; + std::string max_axes = max_name + "_axes_0"; + std::string max_keep_dims = max_name + "_keep_dims_0"; + addIntArrayConstOp(block, max_axes, {2, 3}); + addBoolScalarConstOp(block, max_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_max"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(x_for_max); + inputs["axes"].add_arguments()->set_name(max_axes); + inputs["keep_dims"].add_arguments()->set_name(max_keep_dims); + setTensorOutputPooled4D(op, max_name, channels); + } + + // Mean scaled = mean * (sqrt(count) - 14) * 0.1 + // sqrt_mask_sum: [1, 1, 1, 1] -> [1, 1, 1, 1] + std::string sqrt_mask_sum = output + "_sqrt_mask_sum"; + { + auto* op = block->add_operations(); + op->set_type("sqrt"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mask_sum_name); + setTensorOutputMask4D(op, sqrt_mask_sum); + } + + // sqrt_m14: [1, 1, 1, 1] - scalar -> [1, 1, 1, 1] + std::string sqrt_m14 = output + "_sqrt_m14"; + std::string sqrt_m14_y = sqrt_m14 + "_y_0"; + addFloatScalarConstOp(block, sqrt_m14_y, 14.0f); + { + auto* op = block->add_operations(); + op->set_type("sub"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_mask_sum); + inputs["y"].add_arguments()->set_name(sqrt_m14_y); + setTensorOutputMask4D(op, sqrt_m14); + } + + // scaled_factor: [1, 1, 1, 1] * scalar -> [1, 1, 1, 1] + std::string scaled_factor = output + "_scaled_factor"; + std::string scaled_factor_y = scaled_factor + "_y_0"; + addFloatScalarConstOp(block, scaled_factor_y, 0.1f); + { + auto* 
op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_m14); + inputs["y"].add_arguments()->set_name(scaled_factor_y); + setTensorOutputMask4D(op, scaled_factor); + } + + // mean_scaled: [1, C, 1, 1] * [1, 1, 1, 1] -> [1, C, 1, 1] + std::string mean_scaled = output + "_mean_scaled"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(scaled_factor); + setTensorOutputPooled4D(op, mean_scaled, channels); + } + + // Squeeze spatial dimensions: [1, C, 1, 1] -> [1, C] + std::string mean_flat = output + "_mean_flat"; + std::string mean_flat_axes = mean_flat + "_axes_0"; + addIntArrayConstOp(block, mean_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["axes"].add_arguments()->set_name(mean_flat_axes); + setTensorOutput2D(op, mean_flat, channels); + } + + std::string mean_scaled_flat = output + "_mean_scaled_flat"; + std::string mean_scaled_flat_axes = mean_scaled_flat + "_axes_0"; + addIntArrayConstOp(block, mean_scaled_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_scaled); + inputs["axes"].add_arguments()->set_name(mean_scaled_flat_axes); + setTensorOutput2D(op, mean_scaled_flat, channels); + } + + std::string max_flat = output + "_max_flat"; + std::string max_flat_axes = max_flat + "_axes_0"; + addIntArrayConstOp(block, max_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(max_name); + inputs["axes"].add_arguments()->set_name(max_flat_axes); + setTensorOutput2D(op, 
max_flat, channels); + } + + // Concatenate: [mean, mean_scaled, max] -> [1, 3*C] + std::string concat_axis = output + "_concat_axis_0"; + std::string concat_interleave = output + "_concat_interleave_0"; + addIntScalarConstOp(block, concat_axis, 1); + addBoolScalarConstOp(block, concat_interleave, false); + { + auto* op = block->add_operations(); + op->set_type("concat"); + auto& inputs = *op->mutable_inputs(); + inputs["values"].add_arguments()->set_name(mean_flat); + inputs["values"].add_arguments()->set_name(mean_scaled_flat); + inputs["values"].add_arguments()->set_name(max_flat); + inputs["axis"].add_arguments()->set_name(concat_axis); + inputs["interleave"].add_arguments()->set_name(concat_interleave); + setTensorOutput2D(op, output, channels * 3); + } + } +} + +void MILBuilder::addGlobalPoolingValueOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& mask, + int channels, + const std::string& output) { + // KataGo value head global pooling produces: [mean, mean_scaled, mean_f3] + // mean_scaled = mean * (sqrt(count) - 14) * 0.1 + // mean_f3 = mean * ((sqrt(count) - 14)^2 * 0.01 - 0.1) + + if (m_optimize_identity_mask) { + // Optimized path: use precomputed constants + const auto& mc = m_ops.getMaskConstants(); + + // Mean pooling: sum / count -> [1, C, 1, 1] + std::string sum_name = output + "_sum"; + std::string sum_axes = sum_name + "_axes_0"; + std::string sum_keep_dims = sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, sum_axes, {2, 3}); + addBoolScalarConstOp(block, sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["axes"].add_arguments()->set_name(sum_axes); + inputs["keep_dims"].add_arguments()->set_name(sum_keep_dims); + setTensorOutputPooled4D(op, sum_name, channels); + } + + std::string mean_name = output + "_mean"; + std::string mean_y = mean_name + "_y_0"; 
+ addFloatScalarConstOp(block, mean_y, mc.mask_sum_reciprocal); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sum_name); + inputs["y"].add_arguments()->set_name(mean_y); + setTensorOutputPooled4D(op, mean_name, channels); + } + + // Mean scaled = mean * constant -> [1, C, 1, 1] + std::string mean_scaled_name = output + "_mean_scaled"; + std::string mean_scaled_y = mean_scaled_name + "_y_0"; + addFloatScalarConstOp(block, mean_scaled_y, mc.mask_sum_sqrt_s14_m01); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(mean_scaled_y); + setTensorOutputPooled4D(op, mean_scaled_name, channels); + } + + // Mean feature 3 = mean * constant -> [1, C, 1, 1] + std::string mean_f3_name = output + "_mean_f3"; + std::string mean_f3_y = mean_f3_name + "_y_0"; + addFloatScalarConstOp(block, mean_f3_y, mc.mask_sum_sqrt_s14_m01_sq_s01); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(mean_f3_y); + setTensorOutputPooled4D(op, mean_f3_name, channels); + } + + // Squeeze spatial dimensions: [N, C, 1, 1] -> [N, C] + std::string mean_flat = output + "_mean_flat"; + std::string mean_flat_axes = mean_flat + "_axes_0"; + addIntArrayConstOp(block, mean_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["axes"].add_arguments()->set_name(mean_flat_axes); + setTensorOutput2D(op, mean_flat, channels); + } + + std::string mean_scaled_flat = output + "_mean_scaled_flat"; + std::string mean_scaled_flat_axes = mean_scaled_flat + "_axes_0"; + addIntArrayConstOp(block, 
mean_scaled_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_scaled_name); + inputs["axes"].add_arguments()->set_name(mean_scaled_flat_axes); + setTensorOutput2D(op, mean_scaled_flat, channels); + } + + std::string mean_f3_flat = output + "_mean_f3_flat"; + std::string mean_f3_flat_axes = mean_f3_flat + "_axes_0"; + addIntArrayConstOp(block, mean_f3_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_f3_name); + inputs["axes"].add_arguments()->set_name(mean_f3_flat_axes); + setTensorOutput2D(op, mean_f3_flat, channels); + } + + // Concatenate: [mean, mean_scaled, mean_f3] -> [1, 3*C] + std::string concat_axis = output + "_concat_axis_0"; + std::string concat_interleave = output + "_concat_interleave_0"; + addIntScalarConstOp(block, concat_axis, 1); + addBoolScalarConstOp(block, concat_interleave, false); + { + auto* op = block->add_operations(); + op->set_type("concat"); + auto& inputs = *op->mutable_inputs(); + inputs["values"].add_arguments()->set_name(mean_flat); + inputs["values"].add_arguments()->set_name(mean_scaled_flat); + inputs["values"].add_arguments()->set_name(mean_f3_flat); + inputs["axis"].add_arguments()->set_name(concat_axis); + inputs["interleave"].add_arguments()->set_name(concat_interleave); + setTensorOutput2D(op, output, channels * 3); + } + } else { + // Full path with mask operations + // Count valid positions: [1, 1, H, W] -> [1, 1, 1, 1] + std::string mask_sum_name = output + "_mask_sum"; + std::string mask_sum_axes = mask_sum_name + "_axes_0"; + std::string mask_sum_keep_dims = mask_sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, mask_sum_axes, {2, 3}); + addBoolScalarConstOp(block, mask_sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& 
inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mask); + inputs["axes"].add_arguments()->set_name(mask_sum_axes); + inputs["keep_dims"].add_arguments()->set_name(mask_sum_keep_dims); + setTensorOutputMask4D(op, mask_sum_name); + } + + // Masked input: [1, C, H, W] * [1, 1, H, W] -> [1, C, H, W] + std::string masked_name = output + "_masked"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(input); + inputs["y"].add_arguments()->set_name(mask); + setTensorOutput4D(op, masked_name, channels, m_board_y_size, m_board_x_size); + } + + // Sum masked values: [1, C, H, W] -> [1, C, 1, 1] + std::string sum_name = output + "_sum"; + std::string sum_axes = sum_name + "_axes_0"; + std::string sum_keep_dims = sum_name + "_keep_dims_0"; + addIntArrayConstOp(block, sum_axes, {2, 3}); + addBoolScalarConstOp(block, sum_keep_dims, true); + { + auto* op = block->add_operations(); + op->set_type("reduce_sum"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(masked_name); + inputs["axes"].add_arguments()->set_name(sum_axes); + inputs["keep_dims"].add_arguments()->set_name(sum_keep_dims); + setTensorOutputPooled4D(op, sum_name, channels); + } + + // Mean = sum / count: [1, C, 1, 1] / [1, 1, 1, 1] -> [1, C, 1, 1] + std::string mean_name = output + "_mean"; + { + auto* op = block->add_operations(); + op->set_type("real_div"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sum_name); + inputs["y"].add_arguments()->set_name(mask_sum_name); + setTensorOutputPooled4D(op, mean_name, channels); + } + + // Compute (sqrt(count) - 14): [1, 1, 1, 1] -> [1, 1, 1, 1] + std::string sqrt_mask_sum = output + "_sqrt_mask_sum"; + { + auto* op = block->add_operations(); + op->set_type("sqrt"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mask_sum_name); + setTensorOutputMask4D(op, 
sqrt_mask_sum); + } + + std::string sqrt_m14 = output + "_sqrt_m14"; + std::string sqrt_m14_y = sqrt_m14 + "_y_0"; + addFloatScalarConstOp(block, sqrt_m14_y, 14.0f); + { + auto* op = block->add_operations(); + op->set_type("sub"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_mask_sum); + inputs["y"].add_arguments()->set_name(sqrt_m14_y); + setTensorOutputMask4D(op, sqrt_m14); + } + + // Feature 2: Mean * (sqrt(count) - 14) * 0.1 + // scaled_factor: [1, 1, 1, 1] * scalar -> [1, 1, 1, 1] + std::string scaled_factor = output + "_scaled_factor"; + std::string scaled_factor_y = scaled_factor + "_y_0"; + addFloatScalarConstOp(block, scaled_factor_y, 0.1f); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_m14); + inputs["y"].add_arguments()->set_name(scaled_factor_y); + setTensorOutputMask4D(op, scaled_factor); + } + + // mean_scaled: [1, C, 1, 1] * [1, 1, 1, 1] -> [1, C, 1, 1] + std::string mean_scaled = output + "_mean_scaled"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(scaled_factor); + setTensorOutputPooled4D(op, mean_scaled, channels); + } + + // Feature 3: Mean * ((sqrt(count) - 14)^2 * 0.01 - 0.1) + // sqrt_m14_sq: [1, 1, 1, 1] * [1, 1, 1, 1] -> [1, 1, 1, 1] + std::string sqrt_m14_sq = output + "_sqrt_m14_sq"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_m14); + inputs["y"].add_arguments()->set_name(sqrt_m14); + setTensorOutputMask4D(op, sqrt_m14_sq); + } + + // sq_01: [1, 1, 1, 1] * scalar -> [1, 1, 1, 1] + std::string sq_01 = output + "_sq_01"; + std::string sq_01_y = sq_01 + "_y_0"; + addFloatScalarConstOp(block, sq_01_y, 0.01f); + { + auto* op = 
block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sqrt_m14_sq); + inputs["y"].add_arguments()->set_name(sq_01_y); + setTensorOutputMask4D(op, sq_01); + } + + // f3_factor: [1, 1, 1, 1] - scalar -> [1, 1, 1, 1] + std::string f3_factor = output + "_f3_factor"; + std::string f3_factor_y = f3_factor + "_y_0"; + addFloatScalarConstOp(block, f3_factor_y, 0.1f); + { + auto* op = block->add_operations(); + op->set_type("sub"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(sq_01); + inputs["y"].add_arguments()->set_name(f3_factor_y); + setTensorOutputMask4D(op, f3_factor); + } + + // mean_f3: [1, C, 1, 1] * [1, 1, 1, 1] -> [1, C, 1, 1] + std::string mean_f3 = output + "_mean_f3"; + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["y"].add_arguments()->set_name(f3_factor); + setTensorOutputPooled4D(op, mean_f3, channels); + } + + // Squeeze spatial dimensions: [1, C, 1, 1] -> [1, C] + std::string mean_flat = output + "_mean_flat"; + std::string mean_flat_axes = mean_flat + "_axes_0"; + addIntArrayConstOp(block, mean_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_name); + inputs["axes"].add_arguments()->set_name(mean_flat_axes); + setTensorOutput2D(op, mean_flat, channels); + } + + std::string mean_scaled_flat = output + "_mean_scaled_flat"; + std::string mean_scaled_flat_axes = mean_scaled_flat + "_axes_0"; + addIntArrayConstOp(block, mean_scaled_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_scaled); + inputs["axes"].add_arguments()->set_name(mean_scaled_flat_axes); + setTensorOutput2D(op, 
mean_scaled_flat, channels); + } + + std::string mean_f3_flat = output + "_mean_f3_flat"; + std::string mean_f3_flat_axes = mean_f3_flat + "_axes_0"; + addIntArrayConstOp(block, mean_f3_flat_axes, {2, 3}); + { + auto* op = block->add_operations(); + op->set_type("squeeze"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(mean_f3); + inputs["axes"].add_arguments()->set_name(mean_f3_flat_axes); + setTensorOutput2D(op, mean_f3_flat, channels); + } + + // Concatenate: [mean, mean_scaled, mean_f3] -> [1, 3*C] + std::string concat_axis = output + "_concat_axis_0"; + std::string concat_interleave = output + "_concat_interleave_0"; + addIntScalarConstOp(block, concat_axis, 1); + addBoolScalarConstOp(block, concat_interleave, false); + { + auto* op = block->add_operations(); + op->set_type("concat"); + auto& inputs = *op->mutable_inputs(); + inputs["values"].add_arguments()->set_name(mean_flat); + inputs["values"].add_arguments()->set_name(mean_scaled_flat); + inputs["values"].add_arguments()->set_name(mean_f3_flat); + inputs["axis"].add_arguments()->set_name(concat_axis); + inputs["interleave"].add_arguments()->set_name(concat_interleave); + setTensorOutput2D(op, output, channels * 3); + } + } +} + +// ============================================================================ +// Network Component Builders +// ============================================================================ + +std::string MILBuilder::buildTrunk(CoreML::Specification::MILSpec::Block* block, + const std::string& spatial_input, + const std::string& global_input, + const std::string& mask, + const std::string* meta_input) { + const auto& trunk = m_model.trunk; + + // Initial conv + std::string x = genVarName("trunk_init_conv"); + addConvOp(block, spatial_input, trunk.initial_conv, x); + + // Global projection + std::string global_bias = genVarName("trunk_global_proj"); + addMatMulOp(block, global_input, trunk.initial_matmul, global_bias); + + // Reshape global bias 
to [batch, C, 1, 1] + // Create shape const first (matching Python structure) + std::string global_bias_reshaped = genVarName("trunk_global_reshape"); + std::string reshape_shape_name = global_bias_reshaped + "_shape_0"; + // Use -1 for batch to infer from input, explicit channel count + addIntArrayConstOp(block, reshape_shape_name, {-1, static_cast(trunk.initial_conv.out_channels), 1, 1}); + { + auto* op = block->add_operations(); + op->set_type("reshape"); + // "name" attribute + auto& name_attr = (*op->mutable_attributes())["name"]; + name_attr.mutable_type()->mutable_tensortype()->set_datatype( + CoreML::Specification::MILSpec::DataType::STRING); + name_attr.mutable_immediatevalue()->mutable_tensor()->mutable_strings()->add_values(global_bias_reshaped); + // Inputs + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(global_bias); + inputs["shape"].add_arguments()->set_name(reshape_shape_name); + // Output with dimensions [batch, C, 1, 1] + setTensorOutputPooled4D(op, global_bias_reshaped, trunk.initial_conv.out_channels); + } + + // Add global bias + std::string x_with_global = genVarName("trunk_add_global"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(x); + inputs["y"].add_arguments()->set_name(global_bias_reshaped); + // Output with 4D shape [batch, C, H, W] + setTensorOutput4D(op, x_with_global, trunk.trunk_num_channels, m_board_y_size, m_board_x_size); + } + x = x_with_global; + + // Add metadata bias if present + if (trunk.sgf_metadata_encoder.has_value() && meta_input != nullptr) { + std::string meta_bias = buildSGFMetadataEncoder(block, *meta_input, *trunk.sgf_metadata_encoder); + + // Reshape meta bias + std::string meta_bias_reshaped = genVarName("trunk_meta_reshape"); + std::string meta_bias_shape_name = meta_bias_reshaped + "_shape_0"; + // Use -1 for batch to infer from input, explicit channel count + 
addIntArrayConstOp(block, meta_bias_shape_name, {-1, static_cast(trunk.trunk_num_channels), 1, 1}); + { + auto* op = block->add_operations(); + op->set_type("reshape"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(meta_bias); + inputs["shape"].add_arguments()->set_name(meta_bias_shape_name); + // Output with 4D shape [batch, C, 1, 1] + setTensorOutputPooled4D(op, meta_bias_reshaped, trunk.trunk_num_channels); + } + + // Add meta bias + std::string x_with_meta = genVarName("trunk_add_meta"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(x); + inputs["y"].add_arguments()->set_name(meta_bias_reshaped); + // Output with 4D shape [batch, C, H, W] + setTensorOutput4D(op, x_with_meta, trunk.trunk_num_channels, m_board_y_size, m_board_x_size); + } + x = x_with_meta; + } + + // Apply initial mask + std::string x_masked = genVarName("trunk_init_mask"); + { + auto* op = block->add_operations(); + op->set_type("mul"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(x); + inputs["y"].add_arguments()->set_name(mask); + // Output with 4D shape [batch, C, H, W] + setTensorOutput4D(op, x_masked, trunk.trunk_num_channels, m_board_y_size, m_board_x_size); + } + x = x_masked; + + // Process residual blocks + for (size_t i = 0; i < trunk.blocks.size(); i++) { + const auto& entry = trunk.blocks[i]; + std::string prefix = "trunk_block_" + std::to_string(i); + + if (entry.block_kind == ORDINARY_BLOCK_KIND) { + const auto& block_desc = std::get(*entry.block); + x = buildResidualBlock(block, x, block_desc, mask, prefix); + } else if (entry.block_kind == GLOBAL_POOLING_BLOCK_KIND) { + const auto& block_desc = std::get(*entry.block); + x = buildGlobalPoolingResidualBlock(block, x, block_desc, mask, prefix); + } else if (entry.block_kind == NESTED_BOTTLENECK_BLOCK_KIND) { + const auto& block_desc = std::get(*entry.block); + x = 
buildNestedBottleneckBlock(block, x, block_desc, mask, prefix); + } + } + + // Trunk tip + std::string trunk_out = genVarName("trunk_tip"); + addBatchNormActivationOps(block, x, trunk.trunk_tip_bn, trunk.trunk_tip_activation, mask, trunk_out); + + return trunk_out; +} + +std::string MILBuilder::buildResidualBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const ResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix) { + // Pre BN + activation + std::string pre_out = genVarName(prefix + "_pre"); + addBatchNormActivationOps(block, input, block_desc.pre_bn, block_desc.pre_activation, mask, pre_out); + + // First conv + std::string conv1_out = genVarName(prefix + "_conv1"); + addConvOp(block, pre_out, block_desc.regular_conv, conv1_out); + + // Mid BN + activation + std::string mid_out = genVarName(prefix + "_mid"); + addBatchNormActivationOps(block, conv1_out, block_desc.mid_bn, block_desc.mid_activation, mask, mid_out); + + // Second conv + std::string conv2_out = genVarName(prefix + "_conv2"); + addConvOp(block, mid_out, block_desc.final_conv, conv2_out); + + // Residual add + std::string output = genVarName(prefix + "_residual"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(conv2_out); + inputs["y"].add_arguments()->set_name(input); + // Set proper 4D output type [1, C, H, W] + setTensorOutput4D(op, output, block_desc.final_conv.out_channels, m_board_y_size, m_board_x_size); + } + + return output; +} + +std::string MILBuilder::buildGlobalPoolingResidualBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const GlobalPoolingResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix) { + // Pre BN + activation + std::string pre_out = genVarName(prefix + "_pre"); + addBatchNormActivationOps(block, input, block_desc.pre_bn, block_desc.pre_activation, 
mask, pre_out); + + // Regular conv + std::string regular_out = genVarName(prefix + "_regular"); + addConvOp(block, pre_out, block_desc.regular_conv, regular_out); + + // Gpool conv + std::string gpool_conv_out = genVarName(prefix + "_gpool_conv"); + addConvOp(block, pre_out, block_desc.gpool_conv, gpool_conv_out); + + // Gpool BN + activation + std::string gpool_bn_out = genVarName(prefix + "_gpool_bn"); + addBatchNormActivationOps(block, gpool_conv_out, block_desc.gpool_bn, block_desc.gpool_activation, mask, gpool_bn_out); + + // Global pooling + std::string gpool_features = genVarName(prefix + "_gpool_features"); + addGlobalPoolingOps(block, gpool_bn_out, mask, block_desc.gpool_conv.out_channels, gpool_features); + + // Project to bias + std::string gpool_bias = genVarName(prefix + "_gpool_bias"); + addMatMulOp(block, gpool_features, block_desc.gpool_to_bias_mul, gpool_bias); + + // Reshape bias + std::string gpool_bias_reshaped = genVarName(prefix + "_gpool_bias_reshape"); + std::string gpool_bias_reshape_shape = gpool_bias_reshaped + "_shape_0"; + // Use -1 for batch to infer from input, explicit channel count + addIntArrayConstOp(block, gpool_bias_reshape_shape, {-1, static_cast(block_desc.regular_conv.out_channels), 1, 1}); + { + auto* op = block->add_operations(); + op->set_type("reshape"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(gpool_bias); + inputs["shape"].add_arguments()->set_name(gpool_bias_reshape_shape); + // Output is [batch, regular_conv.out_channels, 1, 1] + setTensorOutputPooled4D(op, gpool_bias_reshaped, block_desc.regular_conv.out_channels); + } + + // Add bias to regular path + std::string combined = genVarName(prefix + "_combined"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(regular_out); + inputs["y"].add_arguments()->set_name(gpool_bias_reshaped); + // Output is [1, regular_conv.out_channels, H, W] + 
setTensorOutput4D(op, combined, block_desc.regular_conv.out_channels, m_board_y_size, m_board_x_size); + } + + // Mid BN + activation + std::string mid_out = genVarName(prefix + "_mid"); + addBatchNormActivationOps(block, combined, block_desc.mid_bn, block_desc.mid_activation, mask, mid_out); + + // Final conv + std::string final_conv_out = genVarName(prefix + "_final"); + addConvOp(block, mid_out, block_desc.final_conv, final_conv_out); + + // Residual add + std::string output = genVarName(prefix + "_residual"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(final_conv_out); + inputs["y"].add_arguments()->set_name(input); + // Set proper 4D output type [1, C, H, W] + setTensorOutput4D(op, output, block_desc.final_conv.out_channels, m_board_y_size, m_board_x_size); + } + + return output; +} + +std::string MILBuilder::buildNestedBottleneckBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const NestedBottleneckResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix) { + // Pre BN + activation + std::string pre_out = genVarName(prefix + "_pre"); + addBatchNormActivationOps(block, input, block_desc.pre_bn, block_desc.pre_activation, mask, pre_out); + + // Pre conv (bottleneck reduction) + std::string pre_conv_out = genVarName(prefix + "_pre_conv"); + addConvOp(block, pre_out, block_desc.pre_conv, pre_conv_out); + + std::string x = pre_conv_out; + + // Process nested blocks + for (size_t i = 0; i < block_desc.blocks.size(); i++) { + const auto& entry = block_desc.blocks[i]; + std::string nested_prefix = prefix + "_nested_" + std::to_string(i); + + if (entry.block_kind == ORDINARY_BLOCK_KIND) { + const auto& nested = std::get(*entry.block); + x = buildResidualBlock(block, x, nested, mask, nested_prefix); + } else if (entry.block_kind == GLOBAL_POOLING_BLOCK_KIND) { + const auto& nested = std::get(*entry.block); + 
x = buildGlobalPoolingResidualBlock(block, x, nested, mask, nested_prefix); + } + } + + // Post BN + activation + std::string post_out = genVarName(prefix + "_post"); + addBatchNormActivationOps(block, x, block_desc.post_bn, block_desc.post_activation, mask, post_out); + + // Post conv (bottleneck expansion) + std::string post_conv_out = genVarName(prefix + "_post_conv"); + addConvOp(block, post_out, block_desc.post_conv, post_conv_out); + + // Residual add + std::string output = genVarName(prefix + "_residual"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(post_conv_out); + inputs["y"].add_arguments()->set_name(input); + // Set proper 4D output type [1, C, H, W] + setTensorOutput4D(op, output, block_desc.post_conv.out_channels, m_board_y_size, m_board_x_size); + } + + return output; +} + +void MILBuilder::buildPolicyHead(CoreML::Specification::MILSpec::Block* block, + const std::string& trunk_out, + const std::string& mask, + std::string& policy_out, + std::string& pass_out) { + const auto& ph = m_model.policy_head; + + // P1 conv + std::string p1 = genVarName("policy_p1"); + addConvOp(block, trunk_out, ph.p1_conv, p1); + + // G1 conv + BN + activation + std::string g1_conv = genVarName("policy_g1_conv"); + addConvOp(block, trunk_out, ph.g1_conv, g1_conv); + + std::string g1 = genVarName("policy_g1"); + addBatchNormActivationOps(block, g1_conv, ph.g1_bn, ph.g1_activation, mask, g1); + + // Global pooling on G1 + std::string g1_pooled = genVarName("policy_g1_pool"); + addGlobalPoolingOps(block, g1, mask, ph.g1_conv.out_channels, g1_pooled); + + // Project to spatial bias + std::string gpool_bias = genVarName("policy_gpool_bias"); + addMatMulOp(block, g1_pooled, ph.gpool_to_bias_mul, gpool_bias); + + // Reshape bias + std::string gpool_bias_reshaped = genVarName("policy_gpool_bias_reshape"); + std::string policy_gpool_reshape_shape = gpool_bias_reshaped + "_shape_0"; 
+ // Use -1 for batch to infer from input, explicit channel count + addIntArrayConstOp(block, policy_gpool_reshape_shape, {-1, static_cast(ph.p1_conv.out_channels), 1, 1}); + { + auto* op = block->add_operations(); + op->set_type("reshape"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(gpool_bias); + inputs["shape"].add_arguments()->set_name(policy_gpool_reshape_shape); + // Output is [batch, p1_conv.out_channels, 1, 1] + setTensorOutputPooled4D(op, gpool_bias_reshaped, ph.p1_conv.out_channels); + } + + // Add bias to P1 + std::string p1_biased = genVarName("policy_p1_biased"); + { + auto* op = block->add_operations(); + op->set_type("add"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(p1); + inputs["y"].add_arguments()->set_name(gpool_bias_reshaped); + // Output is [1, p1_conv.out_channels, H, W] + setTensorOutput4D(op, p1_biased, ph.p1_conv.out_channels, m_board_y_size, m_board_x_size); + } + + // P1 BN + activation + std::string p1_activated = genVarName("policy_p1_act"); + addBatchNormActivationOps(block, p1_biased, ph.p1_bn, ph.p1_activation, mask, p1_activated); + + // P2 conv -> policy output + // Mixed precision uses _fp16 suffix for this intermediate op; cast ops later rename to base name + policy_out = (m_use_fp16 && !m_use_fp16_io) ? 
"policy_p2_conv_fp16" : "policy_p2_conv"; + addConvOp(block, p1_activated, ph.p2_conv, policy_out); + + // Pass move + if (ph.gpool_to_pass_mul2.has_value()) { + // v15+: two-layer pass (first layer fused matmul+bias -> linear) + std::string pass_biased = genVarName("policy_pass_biased"); + addLinearOp(block, g1_pooled, ph.gpool_to_pass_mul, *ph.gpool_to_pass_bias, pass_biased); + + // Activation + std::string pass_activated = genVarName("policy_pass_act"); + if (ph.pass_activation->activation_type == ActivationType::ReLU) { + auto* op = block->add_operations(); + op->set_type("relu"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(pass_biased); + setTensorOutput2D(op, pass_activated, ph.gpool_to_pass_mul.out_channels); + } else if (ph.pass_activation->activation_type == ActivationType::Mish) { + addMishOps(block, pass_biased, pass_activated, 2, ph.gpool_to_pass_mul.out_channels); + } else { + pass_activated = pass_biased; + } + + // Mixed precision: _fp16 intermediate, cast ops rename to base name + pass_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_pass_fp16" : "policy_pass"; + addMatMulOp(block, pass_activated, *ph.gpool_to_pass_mul2, pass_out); + } else { + // Pre-v15: single layer pass + // Mixed precision: _fp16 intermediate, cast ops rename to base name (pre-v15) + pass_out = (m_use_fp16 && !m_use_fp16_io) ? 
"policy_pass_fp16" : "policy_pass"; + addMatMulOp(block, g1_pooled, ph.gpool_to_pass_mul, pass_out); + } +} + +void MILBuilder::buildValueHead(CoreML::Specification::MILSpec::Block* block, + const std::string& trunk_out, + const std::string& mask, + std::string& value_out, + std::string& ownership_out, + std::string& score_value_out) { + const auto& vh = m_model.value_head; + + // V1 conv + BN + activation + std::string v1_conv = genVarName("value_v1_conv"); + addConvOp(block, trunk_out, vh.v1_conv, v1_conv); + + std::string v1 = genVarName("value_v1"); + addBatchNormActivationOps(block, v1_conv, vh.v1_bn, vh.v1_activation, mask, v1); + + // Global pooling (value head version) + std::string v1_pooled = genVarName("value_v1_pool"); + addGlobalPoolingValueOps(block, v1, mask, vh.v1_conv.out_channels, v1_pooled); + + // V2: linear + activation (fused matmul+bias -> linear) + std::string v2_bias = genVarName("value_v2_bias"); + addLinearOp(block, v1_pooled, vh.v2_mul, vh.v2_bias, v2_bias); + + std::string v2 = genVarName("value_v2"); + if (vh.v2_activation.activation_type == ActivationType::ReLU) { + auto* op = block->add_operations(); + op->set_type("relu"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(v2_bias); + setTensorOutput2D(op, v2, vh.v2_mul.out_channels); + } else if (vh.v2_activation.activation_type == ActivationType::Mish) { + addMishOps(block, v2_bias, v2, 2, vh.v2_mul.out_channels); + } else { + v2 = v2_bias; + } + + // V3: linear -> value output (fused matmul+bias -> linear) + // Mixed precision: _fp16 intermediate, cast ops rename to base name + value_out = (m_use_fp16 && !m_use_fp16_io) ? "value_v3_bias_fp16" : "value_v3_bias"; + addLinearOp(block, v2, vh.v3_mul, vh.v3_bias, value_out); + + // SV3: linear -> score value output (fused matmul+bias -> linear) + // Mixed precision: _fp16 intermediate, cast ops rename to base name + score_value_out = (m_use_fp16 && !m_use_fp16_io) ? 
"value_sv3_bias_fp16" : "value_sv3_bias"; + addLinearOp(block, v2, vh.sv3_mul, vh.sv3_bias, score_value_out); + + // Ownership conv + // Mixed precision: _fp16 intermediate, cast ops rename to base name + ownership_out = (m_use_fp16 && !m_use_fp16_io) ? "value_ownership_conv_fp16" : "value_ownership_conv"; + addConvOp(block, v1, vh.v_ownership_conv, ownership_out); +} + +std::string MILBuilder::buildSGFMetadataEncoder(CoreML::Specification::MILSpec::Block* block, + const std::string& meta_input, + const SGFMetadataEncoderDesc& encoder) { + // Layer 1 (fused matmul+bias -> linear) + std::string bias1 = genVarName("meta_bias1"); + addLinearOp(block, meta_input, encoder.mul1, encoder.bias1, bias1); + + std::string act1 = genVarName("meta_act1"); + if (encoder.act1.activation_type == ActivationType::ReLU) { + auto* op = block->add_operations(); + op->set_type("relu"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bias1); + setTensorOutput2D(op, act1, encoder.mul1.out_channels); + } else if (encoder.act1.activation_type == ActivationType::Mish) { + addMishOps(block, bias1, act1, 2, encoder.mul1.out_channels); + } else { + // Identity activation - create identity op to preserve type information + auto* op = block->add_operations(); + op->set_type("identity"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bias1); + setTensorOutput2D(op, act1, encoder.mul1.out_channels); + } + + // Layer 2 (fused matmul+bias -> linear) + std::string bias2 = genVarName("meta_bias2"); + addLinearOp(block, act1, encoder.mul2, encoder.bias2, bias2); + + std::string act2 = genVarName("meta_act2"); + if (encoder.act2.activation_type == ActivationType::ReLU) { + auto* op = block->add_operations(); + op->set_type("relu"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bias2); + setTensorOutput2D(op, act2, encoder.mul2.out_channels); + } else if (encoder.act2.activation_type == ActivationType::Mish) 
{ + addMishOps(block, bias2, act2, 2, encoder.mul2.out_channels); + } else { + // Identity activation - create identity op to preserve type information + auto* op = block->add_operations(); + op->set_type("identity"); + auto& inputs = *op->mutable_inputs(); + inputs["x"].add_arguments()->set_name(bias2); + setTensorOutput2D(op, act2, encoder.mul2.out_channels); + } + + // Layer 3 (output) + std::string output = genVarName("meta_output"); + addMatMulOp(block, act2, encoder.mul3, output); + + return output; +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/builder/MILBuilder.hpp b/cpp/external/katagocoreml/src/builder/MILBuilder.hpp new file mode 100644 index 000000000..042f9fc16 --- /dev/null +++ b/cpp/external/katagocoreml/src/builder/MILBuilder.hpp @@ -0,0 +1,194 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "../types/KataGoTypes.hpp" +#include "Operations.hpp" +#include "MIL.pb.h" +#include +#include +#include + +namespace katagocoreml { + +/// Builder for constructing MIL programs from KataGo models. +/// Converts a parsed KataGo model description into a MIL protobuf program. 
+class MILBuilder { +public: + MILBuilder(const KataGoModelDesc& model, + int board_x_size, + int board_y_size, + bool optimize_identity_mask, + bool use_fp16 = false, + int min_batch_size = 1, + int max_batch_size = 1, + bool use_fp16_io = false); + + /// Build and return the MIL program protobuf + /// @return Unique pointer to MIL Program protobuf + std::unique_ptr build(); + + /// Get weight entries for blob serialization + const std::vector& getWeights() const { return m_ops.getWeights(); } + + /// Get board dimensions + int getBoardXSize() const { return m_board_x_size; } + int getBoardYSize() const { return m_board_y_size; } + +private: + const KataGoModelDesc& m_model; + int m_board_x_size; + int m_board_y_size; + bool m_optimize_identity_mask; + bool m_use_fp16; + bool m_use_fp16_io; + int m_min_batch_size; + int m_max_batch_size; + CoreML::Specification::MILSpec::DataType m_weight_dtype; + KataGoOps m_ops; + + // Batch size helpers + bool isDynamicBatch() const { + return m_min_batch_size != m_max_batch_size || m_max_batch_size <= 0; + } + void setBatchDimension(CoreML::Specification::MILSpec::TensorType* tensor_type); + + // Tensor output helpers with batch dimension support + void setTensorOutput4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels, int height, int width); + void setTensorOutput2D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels); + void setTensorOutputPooled4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int channels); + void setTensorOutputMask4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name); + void setTensorOutputMaskSpatial4D(CoreML::Specification::MILSpec::Operation* op, + const std::string& name, + int height, int width); + + // Operation name counter for unique names + int m_var_counter = 0; + std::string genVarName(const std::string& prefix); + + // MIL program construction helpers + void 
addConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + const std::vector& data, + const std::vector& shape); + + void addIntArrayConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + const std::vector& values); + + void addBoolScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + bool value); + + void addFloatScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + float value); + + void addIntScalarConstOp(CoreML::Specification::MILSpec::Block* block, + const std::string& name, + int32_t value); + + void addCastOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& output, + const std::string& dtype, + const std::vector& shape); + + void addConvOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const ConvLayerDesc& layer, + const std::string& output); + + void addBatchNormActivationOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const BatchNormLayerDesc& bn, + const ActivationLayerDesc& act, + const std::string& mask, + const std::string& output); + + void addMishOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& output, + int rank, + int channels); + + void addGlobalPoolingOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& mask, + int channels, + const std::string& output); + + void addGlobalPoolingValueOps(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const std::string& mask, + int channels, + const std::string& output); + + void addMatMulOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatMulLayerDesc& layer, + const std::string& output); + + void addMatBiasOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatBiasLayerDesc& layer, + const 
std::string& output); + + void addLinearOp(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const MatMulLayerDesc& matmul, + const MatBiasLayerDesc& bias, + const std::string& output); + + // Network component builders + std::string buildTrunk(CoreML::Specification::MILSpec::Block* block, + const std::string& spatial_input, + const std::string& global_input, + const std::string& mask, + const std::string* meta_input); + + std::string buildResidualBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const ResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix); + + std::string buildGlobalPoolingResidualBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const GlobalPoolingResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix); + + std::string buildNestedBottleneckBlock(CoreML::Specification::MILSpec::Block* block, + const std::string& input, + const NestedBottleneckResidualBlockDesc& block_desc, + const std::string& mask, + const std::string& prefix); + + void buildPolicyHead(CoreML::Specification::MILSpec::Block* block, + const std::string& trunk_out, + const std::string& mask, + std::string& policy_out, + std::string& pass_out); + + void buildValueHead(CoreML::Specification::MILSpec::Block* block, + const std::string& trunk_out, + const std::string& mask, + std::string& value_out, + std::string& ownership_out, + std::string& score_value_out); + + std::string buildSGFMetadataEncoder(CoreML::Specification::MILSpec::Block* block, + const std::string& meta_input, + const SGFMetadataEncoderDesc& encoder); +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/builder/Operations.cpp b/cpp/external/katagocoreml/src/builder/Operations.cpp new file mode 100644 index 000000000..c0c036292 --- /dev/null +++ b/cpp/external/katagocoreml/src/builder/Operations.cpp @@ -0,0 +1,31 @@ +// katagocoreml - Standalone 
C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "Operations.hpp" + +namespace katagocoreml { + +KataGoOps::KataGoOps(int board_x_size, int board_y_size, bool optimize_identity_mask) + : m_board_x_size(board_x_size) + , m_board_y_size(board_y_size) + , m_optimize_identity_mask(optimize_identity_mask) + , m_mask_constants(board_x_size, board_y_size) + , m_op_counter(0) {} + +std::string KataGoOps::registerWeight(const std::string& name, + const std::vector& data, + const std::vector& shape) { + WeightEntry entry; + entry.name = name; + entry.data = data; + entry.shape = shape; + entry.blob_offset = 0; // Will be set during serialization + m_weights.push_back(std::move(entry)); + return name; +} + +std::string KataGoOps::genOpName(const std::string& prefix) { + return prefix + "_" + std::to_string(m_op_counter++); +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/builder/Operations.hpp b/cpp/external/katagocoreml/src/builder/Operations.hpp new file mode 100644 index 000000000..3fc72ad88 --- /dev/null +++ b/cpp/external/katagocoreml/src/builder/Operations.hpp @@ -0,0 +1,77 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "../types/KataGoTypes.hpp" +#include +#include +#include + +namespace katagocoreml { + +/// Weight entry for blob file storage +struct WeightEntry { + std::string name; + std::vector data; + std::vector shape; + uint64_t blob_offset = 0; // Set during serialization +}; + +/// Precomputed constants for identity mask optimization +struct MaskConstants { + float mask_sum = 361.0f; // 19 * 19 + float mask_sum_reciprocal = 1.0f / 361.0f; + float mask_sum_sqrt_s14_m01 = 0.5f; // (sqrt(361) - 14) * 0.1 + float mask_sum_sqrt_s14_m01_sq_s01 = 0.15f; // (0.5^2) - 0.1 + + MaskConstants() = default; + + MaskConstants(int board_x_size, int board_y_size) { + mask_sum = static_cast(board_x_size * board_y_size); + 
mask_sum_reciprocal = 1.0f / mask_sum; + float sqrt_mask_sum = std::sqrt(mask_sum); + mask_sum_sqrt_s14_m01 = (sqrt_mask_sum - 14.0f) * 0.1f; + float sq = mask_sum_sqrt_s14_m01 * mask_sum_sqrt_s14_m01; + mask_sum_sqrt_s14_m01_sq_s01 = sq - 0.1f; + } +}; + +/// KataGo operation builder for MIL program construction +/// This class builds the structure needed for MIL program generation +class KataGoOps { +public: + KataGoOps(int board_x_size, int board_y_size, bool optimize_identity_mask); + + /// Get the board dimensions + int getBoardXSize() const { return m_board_x_size; } + int getBoardYSize() const { return m_board_y_size; } + bool isOptimizeIdentityMask() const { return m_optimize_identity_mask; } + + /// Get precomputed mask constants + const MaskConstants& getMaskConstants() const { return m_mask_constants; } + + /// Register a weight tensor and return its reference name + std::string registerWeight(const std::string& name, + const std::vector& data, + const std::vector& shape); + + /// Get all registered weights + const std::vector& getWeights() const { return m_weights; } + + /// Clear all registered weights + void clearWeights() { m_weights.clear(); } + + /// Generate unique operation name + std::string genOpName(const std::string& prefix); + +private: + int m_board_x_size; + int m_board_y_size; + bool m_optimize_identity_mask; + MaskConstants m_mask_constants; + std::vector m_weights; + int m_op_counter = 0; +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/parser/KataGoParser.cpp b/cpp/external/katagocoreml/src/parser/KataGoParser.cpp new file mode 100644 index 000000000..884add38f --- /dev/null +++ b/cpp/external/katagocoreml/src/parser/KataGoParser.cpp @@ -0,0 +1,573 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "KataGoParser.hpp" +#include +#include +#include +#include +#include +#include + +namespace katagocoreml { + +// 
============================================================================ +// Constructor +// ============================================================================ + +KataGoParser::KataGoParser(const std::string& model_path) + : m_model_path(model_path) {} + +// ============================================================================ +// Version Support +// ============================================================================ + +bool KataGoParser::isVersionSupported(int version) { + for (int v : SUPPORTED_VERSIONS) { + if (v == version) return true; + } + return false; +} + +// ============================================================================ +// File Loading +// ============================================================================ + +void KataGoParser::loadFile() { + // Check if gzip compressed + bool is_gzip = false; + if (m_model_path.size() >= 3) { + std::string ext = m_model_path.substr(m_model_path.size() - 3); + is_gzip = (ext == ".gz"); + } + + if (is_gzip) { + // Read gzipped file + gzFile gz = gzopen(m_model_path.c_str(), "rb"); + if (!gz) { + throw std::runtime_error("Cannot open gzip file: " + m_model_path); + } + + // Read in chunks + m_buffer.clear(); + std::vector chunk(1024 * 1024); // 1MB chunks + int bytes_read; + while ((bytes_read = gzread(gz, chunk.data(), static_cast(chunk.size()))) > 0) { + m_buffer.insert(m_buffer.end(), chunk.begin(), chunk.begin() + bytes_read); + } + + if (bytes_read < 0) { + int errnum; + const char* errmsg = gzerror(gz, &errnum); + gzclose(gz); + throw std::runtime_error("Error reading gzip file: " + std::string(errmsg)); + } + + gzclose(gz); + } else { + // Read regular file + std::ifstream file(m_model_path, std::ios::binary | std::ios::ate); + if (!file) { + throw std::runtime_error("Cannot open file: " + m_model_path); + } + + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + m_buffer.resize(static_cast(size)); + if 
(!file.read(reinterpret_cast(m_buffer.data()), size)) { + throw std::runtime_error("Error reading file: " + m_model_path); + } + } +} + +// ============================================================================ +// Main Parse Function +// ============================================================================ + +KataGoModelDesc KataGoParser::parse() { + loadFile(); + m_pos = 0; + + // Detect if binary format (check for @BIN@ marker) + const std::string bin_marker = "@BIN@"; + auto it = std::search(m_buffer.begin(), m_buffer.end(), + bin_marker.begin(), bin_marker.end()); + m_binary_floats = (it != m_buffer.end()); + + return parseModel(); +} + +// ============================================================================ +// Low-Level Reading Functions +// ============================================================================ + +void KataGoParser::skipWhitespace() { + while (m_pos < m_buffer.size()) { + char c = static_cast(m_buffer[m_pos]); + if (c != ' ' && c != '\t' && c != '\n' && c != '\r') { + break; + } + m_pos++; + } +} + +void KataGoParser::readUntilWhitespace(std::string& out) { + out.clear(); + while (m_pos < m_buffer.size()) { + char c = static_cast(m_buffer[m_pos]); + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { + break; + } + out += c; + m_pos++; + } +} + +std::string KataGoParser::readString() { + skipWhitespace(); + std::string token; + readUntilWhitespace(token); + return token; +} + +int KataGoParser::readInt() { + std::string token = readString(); + return std::stoi(token); +} + +float KataGoParser::readFloat() { + std::string token = readString(); + return std::stof(token); +} + +bool KataGoParser::readBool() { + return readInt() != 0; +} + +std::vector KataGoParser::readFloats(size_t count, const std::string& name) { + std::vector floats(count); + + if (!m_binary_floats) { + // Text format + for (size_t i = 0; i < count; i++) { + floats[i] = readFloat(); + } + } else { + // Binary format - find @BIN@ marker + while 
(m_pos < m_buffer.size()) { + if (m_buffer[m_pos] == '@') { + break; + } + m_pos++; + } + + // Check for @BIN@ header + if (m_pos + 5 > m_buffer.size() || + std::memcmp(&m_buffer[m_pos], "@BIN@", 5) != 0) { + throw std::runtime_error(name + ": expected @BIN@ marker for binary float block"); + } + m_pos += 5; + + // Read binary floats (little-endian) + size_t num_bytes = count * 4; + if (m_pos + num_bytes > m_buffer.size()) { + throw std::runtime_error(name + ": not enough bytes for " + std::to_string(count) + " floats"); + } + + // Copy as little-endian float32 + std::memcpy(floats.data(), &m_buffer[m_pos], num_bytes); + m_pos += num_bytes; + } + + return floats; +} + +// ============================================================================ +// Layer Parsing Functions +// ============================================================================ + +ConvLayerDesc KataGoParser::parseConvLayer() { + ConvLayerDesc layer; + layer.name = readString(); + layer.conv_y_size = readInt(); + layer.conv_x_size = readInt(); + layer.in_channels = readInt(); + layer.out_channels = readInt(); + layer.dilation_y = readInt(); + layer.dilation_x = readInt(); + + // Read weights in file order: [y, x, ic, oc] + size_t num_weights = static_cast(layer.conv_y_size) * layer.conv_x_size * + layer.in_channels * layer.out_channels; + std::vector weights_flat = readFloats(num_weights, layer.name); + + // Transpose from [y, x, ic, oc] to [oc, ic, y, x] + layer.weights.resize(num_weights); + int y_size = layer.conv_y_size; + int x_size = layer.conv_x_size; + int ic = layer.in_channels; + int oc = layer.out_channels; + + for (int out_c = 0; out_c < oc; out_c++) { + for (int in_c = 0; in_c < ic; in_c++) { + for (int y = 0; y < y_size; y++) { + for (int x = 0; x < x_size; x++) { + // Source index: [y, x, ic, oc] + size_t src_idx = static_cast(y) * x_size * ic * oc + + x * ic * oc + + in_c * oc + + out_c; + // Dest index: [oc, ic, y, x] + size_t dst_idx = static_cast(out_c) * ic * y_size * 
x_size + + in_c * y_size * x_size + + y * x_size + + x; + layer.weights[dst_idx] = weights_flat[src_idx]; + } + } + } + } + + return layer; +} + +BatchNormLayerDesc KataGoParser::parseBatchNormLayer() { + BatchNormLayerDesc layer; + layer.name = readString(); + layer.num_channels = readInt(); + layer.epsilon = readFloat(); + layer.has_scale = readBool(); + layer.has_bias = readBool(); + + layer.mean = readFloats(layer.num_channels, layer.name + "/mean"); + layer.variance = readFloats(layer.num_channels, layer.name + "/variance"); + + if (layer.has_scale) { + layer.scale = readFloats(layer.num_channels, layer.name + "/scale"); + } else { + layer.scale.resize(layer.num_channels, 1.0f); + } + + if (layer.has_bias) { + layer.bias = readFloats(layer.num_channels, layer.name + "/bias"); + } else { + layer.bias.resize(layer.num_channels, 0.0f); + } + + // Compute merged scale and bias + layer.merged_scale.resize(layer.num_channels); + layer.merged_bias.resize(layer.num_channels); + for (int i = 0; i < layer.num_channels; i++) { + layer.merged_scale[i] = layer.scale[i] / std::sqrt(layer.variance[i] + layer.epsilon); + layer.merged_bias[i] = layer.bias[i] - layer.merged_scale[i] * layer.mean[i]; + } + + return layer; +} + +ActivationLayerDesc KataGoParser::parseActivationLayer(int model_version) { + ActivationLayerDesc layer; + layer.name = readString(); + + if (model_version >= 11) { + std::string activation_str = readString(); + if (activation_str == "ACTIVATION_IDENTITY") { + layer.activation_type = ActivationType::Identity; + } else if (activation_str == "ACTIVATION_RELU") { + layer.activation_type = ActivationType::ReLU; + } else if (activation_str == "ACTIVATION_MISH") { + layer.activation_type = ActivationType::Mish; + } else { + throw std::runtime_error("Unknown activation type: " + activation_str); + } + } else { + // Pre-v11 models only have ReLU + layer.activation_type = ActivationType::ReLU; + } + + return layer; +} + +MatMulLayerDesc 
KataGoParser::parseMatMulLayer() { + MatMulLayerDesc layer; + layer.name = readString(); + layer.in_channels = readInt(); + layer.out_channels = readInt(); + + // Weights in [ic, oc] order + size_t num_weights = static_cast(layer.in_channels) * layer.out_channels; + layer.weights = readFloats(num_weights, layer.name); + + return layer; +} + +MatBiasLayerDesc KataGoParser::parseMatBiasLayer() { + MatBiasLayerDesc layer; + layer.name = readString(); + layer.num_channels = readInt(); + layer.weights = readFloats(layer.num_channels, layer.name); + + return layer; +} + +// ============================================================================ +// Block Parsing Functions +// ============================================================================ + +ResidualBlockDesc KataGoParser::parseResidualBlock(int model_version) { + ResidualBlockDesc block; + block.name = readString(); + block.pre_bn = parseBatchNormLayer(); + block.pre_activation = parseActivationLayer(model_version); + block.regular_conv = parseConvLayer(); + block.mid_bn = parseBatchNormLayer(); + block.mid_activation = parseActivationLayer(model_version); + block.final_conv = parseConvLayer(); + + return block; +} + +GlobalPoolingResidualBlockDesc KataGoParser::parseGlobalPoolingResidualBlock(int model_version) { + GlobalPoolingResidualBlockDesc block; + block.name = readString(); + block.model_version = model_version; + block.pre_bn = parseBatchNormLayer(); + block.pre_activation = parseActivationLayer(model_version); + block.regular_conv = parseConvLayer(); + block.gpool_conv = parseConvLayer(); + block.gpool_bn = parseBatchNormLayer(); + block.gpool_activation = parseActivationLayer(model_version); + block.gpool_to_bias_mul = parseMatMulLayer(); + block.mid_bn = parseBatchNormLayer(); + block.mid_activation = parseActivationLayer(model_version); + block.final_conv = parseConvLayer(); + + return block; +} + +NestedBottleneckResidualBlockDesc KataGoParser::parseNestedBottleneckBlock(int 
model_version, int trunk_num_channels) { + NestedBottleneckResidualBlockDesc block; + block.name = readString(); + block.num_blocks = readInt(); + + block.pre_bn = parseBatchNormLayer(); + block.pre_activation = parseActivationLayer(model_version); + block.pre_conv = parseConvLayer(); + + block.blocks = parseBlockStack(model_version, block.num_blocks, block.pre_conv.out_channels); + + block.post_bn = parseBatchNormLayer(); + block.post_activation = parseActivationLayer(model_version); + block.post_conv = parseConvLayer(); + + return block; +} + +std::vector KataGoParser::parseBlockStack(int model_version, int num_blocks, int trunk_num_channels) { + std::vector blocks; + blocks.reserve(num_blocks); + + for (int i = 0; i < num_blocks; i++) { + std::string block_kind_name = readString(); + BlockEntry entry; + + if (block_kind_name == "ordinary_block") { + entry.block_kind = ORDINARY_BLOCK_KIND; + entry.block = std::make_shared(parseResidualBlock(model_version)); + } else if (block_kind_name == "gpool_block") { + entry.block_kind = GLOBAL_POOLING_BLOCK_KIND; + entry.block = std::make_shared(parseGlobalPoolingResidualBlock(model_version)); + } else if (block_kind_name == "nested_bottleneck_block") { + entry.block_kind = NESTED_BOTTLENECK_BLOCK_KIND; + entry.block = std::make_shared(parseNestedBottleneckBlock(model_version, trunk_num_channels)); + } else { + throw std::runtime_error("Unknown block kind: " + block_kind_name); + } + + blocks.push_back(std::move(entry)); + } + + return blocks; +} + +// ============================================================================ +// Component Parsing Functions +// ============================================================================ + +SGFMetadataEncoderDesc KataGoParser::parseSGFMetadataEncoder(int model_version, int meta_encoder_version) { + SGFMetadataEncoderDesc encoder; + encoder.name = readString(); + encoder.meta_encoder_version = meta_encoder_version; + encoder.num_input_meta_channels = readInt(); + + 
encoder.mul1 = parseMatMulLayer(); + encoder.bias1 = parseMatBiasLayer(); + encoder.act1 = parseActivationLayer(model_version); + encoder.mul2 = parseMatMulLayer(); + encoder.bias2 = parseMatBiasLayer(); + encoder.act2 = parseActivationLayer(model_version); + encoder.mul3 = parseMatMulLayer(); + + return encoder; +} + +TrunkDesc KataGoParser::parseTrunk(int model_version, int meta_encoder_version) { + TrunkDesc trunk; + trunk.name = readString(); + trunk.model_version = model_version; + trunk.meta_encoder_version = meta_encoder_version; + trunk.num_blocks = readInt(); + trunk.trunk_num_channels = readInt(); + trunk.mid_num_channels = readInt(); + trunk.regular_num_channels = readInt(); + readInt(); // dilatedNumChannels (unused) + trunk.gpool_num_channels = readInt(); + + // Version >= 15 has 6 unused int parameters + if (model_version >= 15) { + for (int i = 0; i < 6; i++) { + readInt(); + } + } + + trunk.initial_conv = parseConvLayer(); + trunk.initial_matmul = parseMatMulLayer(); + + // Parse SGF metadata encoder if present + if (meta_encoder_version > 0) { + trunk.sgf_metadata_encoder = parseSGFMetadataEncoder(model_version, meta_encoder_version); + } + + // Parse residual blocks + trunk.blocks = parseBlockStack(model_version, trunk.num_blocks, trunk.trunk_num_channels); + + trunk.trunk_tip_bn = parseBatchNormLayer(); + trunk.trunk_tip_activation = parseActivationLayer(model_version); + + return trunk; +} + +PolicyHeadDesc KataGoParser::parsePolicyHead(int model_version) { + PolicyHeadDesc head; + head.name = readString(); + head.model_version = model_version; + + head.p1_conv = parseConvLayer(); + head.g1_conv = parseConvLayer(); + head.g1_bn = parseBatchNormLayer(); + head.g1_activation = parseActivationLayer(model_version); + head.gpool_to_bias_mul = parseMatMulLayer(); + head.p1_bn = parseBatchNormLayer(); + head.p1_activation = parseActivationLayer(model_version); + head.p2_conv = parseConvLayer(); + head.gpool_to_pass_mul = parseMatMulLayer(); + + // 
Version >= 15 has additional pass move layers + if (model_version >= 15) { + head.gpool_to_pass_bias = parseMatBiasLayer(); + head.pass_activation = parseActivationLayer(model_version); + head.gpool_to_pass_mul2 = parseMatMulLayer(); + } + + // Determine policy output channels based on version + if (model_version >= 16) { + head.policy_out_channels = 4; + } else if (model_version >= 12) { + head.policy_out_channels = 2; + } else { + head.policy_out_channels = 1; + } + + return head; +} + +ValueHeadDesc KataGoParser::parseValueHead(int model_version) { + ValueHeadDesc head; + head.name = readString(); + head.model_version = model_version; + + head.v1_conv = parseConvLayer(); + head.v1_bn = parseBatchNormLayer(); + head.v1_activation = parseActivationLayer(model_version); + head.v2_mul = parseMatMulLayer(); + head.v2_bias = parseMatBiasLayer(); + head.v2_activation = parseActivationLayer(model_version); + head.v3_mul = parseMatMulLayer(); + head.v3_bias = parseMatBiasLayer(); + head.sv3_mul = parseMatMulLayer(); + head.sv3_bias = parseMatBiasLayer(); + head.v_ownership_conv = parseConvLayer(); + + return head; +} + +// ============================================================================ +// Main Model Parsing +// ============================================================================ + +KataGoModelDesc KataGoParser::parseModel() { + KataGoModelDesc model; + + // Read header + model.name = readString(); + model.model_version = readInt(); + + if (!isVersionSupported(model.model_version)) { + throw std::runtime_error( + "Only KataGo model versions 8-16 are supported, got version " + + std::to_string(model.model_version)); + } + + model.num_input_channels = readInt(); + model.num_input_global_channels = readInt(); + + // Parse post-process params (version >= 13) + if (model.model_version >= 13) { + model.post_process_params.td_score_multiplier = readFloat(); + model.post_process_params.score_mean_multiplier = readFloat(); + 
model.post_process_params.score_stdev_multiplier = readFloat(); + model.post_process_params.lead_multiplier = readFloat(); + model.post_process_params.variance_time_multiplier = readFloat(); + model.post_process_params.shortterm_value_error_multiplier = readFloat(); + model.post_process_params.shortterm_score_error_multiplier = readFloat(); + } + + // Parse meta encoder version (version >= 15) + model.meta_encoder_version = 0; + model.num_input_meta_channels = 0; + if (model.model_version >= 15) { + model.meta_encoder_version = readInt(); + // Read unused params + for (int i = 0; i < 7; i++) { + readInt(); + } + + if (model.meta_encoder_version > 0) { + model.num_input_meta_channels = 192; // SGFMetadata::METADATA_INPUT_NUM_CHANNELS + } + } + + // Parse trunk, policy head, value head + model.trunk = parseTrunk(model.model_version, model.meta_encoder_version); + model.policy_head = parsePolicyHead(model.model_version); + model.value_head = parseValueHead(model.model_version); + + // Determine output channel counts + model.num_policy_channels = model.policy_head.policy_out_channels; + model.num_value_channels = 3; // win, loss, noresult + + if (model.model_version >= 9) { + model.num_score_value_channels = 6; + } else if (model.model_version >= 8) { + model.num_score_value_channels = 4; + } else { + model.num_score_value_channels = 1; + } + + model.num_ownership_channels = 1; + + return model; +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/parser/KataGoParser.hpp b/cpp/external/katagocoreml/src/parser/KataGoParser.hpp new file mode 100644 index 000000000..cbcfdefa8 --- /dev/null +++ b/cpp/external/katagocoreml/src/parser/KataGoParser.hpp @@ -0,0 +1,73 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "../types/KataGoTypes.hpp" +#include +#include +#include + +namespace katagocoreml { + +/// Parser for KataGo neural network model files. 
+/// Supports versions 8-16 models in binary format (.bin, .bin.gz). +class KataGoParser { +public: + /// Supported KataGo model versions + static constexpr std::array SUPPORTED_VERSIONS = {8, 9, 10, 11, 12, 13, 14, 15, 16}; + + /// Constructor + /// @param model_path Path to the KataGo model file (.bin or .bin.gz) + explicit KataGoParser(const std::string& model_path); + + /// Parse the model file and return a structured model description + /// @return KataGoModelDesc containing all model parameters + /// @throws std::runtime_error if the file cannot be read or parsed + KataGoModelDesc parse(); + + /// Check if a version is supported + static bool isVersionSupported(int version); + +private: + std::string m_model_path; + std::vector m_buffer; + size_t m_pos = 0; + bool m_binary_floats = true; + + // Low-level reading functions + void readUntilWhitespace(std::string& out); + void skipWhitespace(); + std::string readString(); + int readInt(); + float readFloat(); + bool readBool(); + std::vector readFloats(size_t count, const std::string& name); + + // Layer parsing functions + ConvLayerDesc parseConvLayer(); + BatchNormLayerDesc parseBatchNormLayer(); + ActivationLayerDesc parseActivationLayer(int model_version); + MatMulLayerDesc parseMatMulLayer(); + MatBiasLayerDesc parseMatBiasLayer(); + + // Block parsing functions + ResidualBlockDesc parseResidualBlock(int model_version); + GlobalPoolingResidualBlockDesc parseGlobalPoolingResidualBlock(int model_version); + NestedBottleneckResidualBlockDesc parseNestedBottleneckBlock(int model_version, int trunk_num_channels); + std::vector parseBlockStack(int model_version, int num_blocks, int trunk_num_channels); + + // Component parsing functions + SGFMetadataEncoderDesc parseSGFMetadataEncoder(int model_version, int meta_encoder_version); + TrunkDesc parseTrunk(int model_version, int meta_encoder_version); + PolicyHeadDesc parsePolicyHead(int model_version); + ValueHeadDesc parseValueHead(int model_version); + + // Main 
model parsing + KataGoModelDesc parseModel(); + + // Helper to load file (handles gzip) + void loadFile(); +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.cpp b/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.cpp new file mode 100644 index 000000000..f271f5526 --- /dev/null +++ b/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.cpp @@ -0,0 +1,289 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "CoreMLSerializer.hpp" +#include "WeightSerializer.hpp" +#include "katagocoreml/Version.hpp" +#include "MIL.pb.h" +#include "Model.pb.h" +#include "FeatureTypes.pb.h" +#include "ModelPackage.hpp" +#include +#include +#include +#include + +namespace katagocoreml { + +CoreMLSerializer::CoreMLSerializer(int spec_version) + : m_spec_version(spec_version) {} + +void CoreMLSerializer::serialize(CoreML::Specification::MILSpec::Program* program, + std::vector& weights, + const std::string& output_path, + const ConversionOptions& options) { + // Create temporary directory for weights + std::filesystem::path temp_dir = std::filesystem::temp_directory_path() / "katagocoreml_weights"; + std::filesystem::create_directories(temp_dir); + std::string weights_dir = temp_dir.string(); + + // Determine if using FP16 precision + bool use_fp16 = (options.compute_precision == "FLOAT16"); + bool use_fp16_io = use_fp16 && options.use_fp16_io; + + // Write weight blob (this sets blob_offset on each WeightEntry) + writeWeightBlob(weights_dir, weights, use_fp16); + + // Update MIL program with calculated blob offsets + updateBlobOffsets(program, weights); + + // Create Model spec wrapping the MIL program + auto model = createModelSpec(program, options); + + // Create .mlpackage + createPackage(output_path, model.get(), weights_dir); + + // Cleanup temp directory + std::filesystem::remove_all(temp_dir); +} + +std::unique_ptr CoreMLSerializer::createModelSpec( 
+ CoreML::Specification::MILSpec::Program* program, + const ConversionOptions& options) { + + auto model = std::make_unique(); + model->set_specificationversion(m_spec_version); + + // Set description + auto* desc = model->mutable_description(); + + // Helper lambda to set up batch dimension (either fixed shape or shape range) + auto setBatchShape = [&options](CoreML::Specification::ArrayFeatureType* array_type, + std::vector other_dims) { + if (options.isDynamicBatch()) { + // Use ShapeRange for dynamic batch + auto* shape_range = array_type->mutable_shaperange(); + + // Batch dimension range + auto* batch_range = shape_range->add_sizeranges(); + batch_range->set_lowerbound(options.min_batch_size); + batch_range->set_upperbound(options.max_batch_size); + + // Other dimensions are fixed + for (int64_t dim : other_dims) { + auto* range = shape_range->add_sizeranges(); + range->set_lowerbound(dim); + range->set_upperbound(dim); + } + + // Also set default shape for batch=min_batch_size + array_type->add_shape(options.min_batch_size); + for (int64_t dim : other_dims) { + array_type->add_shape(dim); + } + } else { + // Fixed batch size + array_type->add_shape(options.min_batch_size); + for (int64_t dim : other_dims) { + array_type->add_shape(dim); + } + } + }; + + // Determine data type for inputs/outputs + auto io_datatype = (options.compute_precision == "FLOAT16" && options.use_fp16_io) + ? 
CoreML::Specification::ArrayFeatureType::FLOAT16 + : CoreML::Specification::ArrayFeatureType::FLOAT32; + + // Add input descriptions + // spatial_input: [batch, num_input_channels, board_y, board_x] + auto* spatial_input = desc->add_input(); + spatial_input->set_name("spatial_input"); + auto* spatial_type = spatial_input->mutable_type()->mutable_multiarraytype(); + spatial_type->set_datatype(io_datatype); + setBatchShape(spatial_type, {options.num_input_channels, options.board_y_size, options.board_x_size}); + + // global_input: [batch, num_input_global_channels] + auto* global_input = desc->add_input(); + global_input->set_name("global_input"); + auto* global_type = global_input->mutable_type()->mutable_multiarraytype(); + global_type->set_datatype(io_datatype); + setBatchShape(global_type, {options.num_input_global_channels}); + + // input_mask: [batch, 1, board_y, board_x] + auto* mask_input = desc->add_input(); + mask_input->set_name("input_mask"); + auto* mask_type = mask_input->mutable_type()->mutable_multiarraytype(); + mask_type->set_datatype(io_datatype); + setBatchShape(mask_type, {1, options.board_y_size, options.board_x_size}); + + // meta_input (optional, for human SL networks with metadata encoder): [batch, num_meta_channels] + if (options.meta_encoder_version > 0 && options.num_input_meta_channels > 0) { + auto* meta_input = desc->add_input(); + meta_input->set_name("meta_input"); + auto* meta_type = meta_input->mutable_type()->mutable_multiarraytype(); + meta_type->set_datatype(io_datatype); + setBatchShape(meta_type, {options.num_input_meta_channels}); + } + + // Add output descriptions (names match Python coremltools converter) + auto* policy_output = desc->add_output(); + policy_output->set_name("policy_p2_conv"); + auto* policy_type = policy_output->mutable_type()->mutable_multiarraytype(); + policy_type->set_datatype(io_datatype); + + auto* pass_output = desc->add_output(); + // Pass output name: Python uses "policy_pass" for all model versions 
+ pass_output->set_name("policy_pass"); + auto* pass_type = pass_output->mutable_type()->mutable_multiarraytype(); + pass_type->set_datatype(io_datatype); + + auto* value_output = desc->add_output(); + value_output->set_name("value_v3_bias"); + auto* value_type = value_output->mutable_type()->mutable_multiarraytype(); + value_type->set_datatype(io_datatype); + + auto* ownership_output = desc->add_output(); + ownership_output->set_name("value_ownership_conv"); + auto* ownership_type = ownership_output->mutable_type()->mutable_multiarraytype(); + ownership_type->set_datatype(io_datatype); + + auto* score_output = desc->add_output(); + score_output->set_name("value_sv3_bias"); + auto* score_type = score_output->mutable_type()->mutable_multiarraytype(); + score_type->set_datatype(io_datatype); + + // Set metadata + auto* metadata = desc->mutable_metadata(); + + // Build enhanced description: "KataGo - 10 blocks, 128 channels (from model.bin.gz)" + std::string description = "KataGo"; + if (options.num_blocks > 0 && options.trunk_channels > 0) { + description += " - " + std::to_string(options.num_blocks) + " blocks, " + + std::to_string(options.trunk_channels) + " channels"; + } else { + description += " neural network model"; + } + if (!options.source_filename.empty()) { + description += " (from " + options.source_filename + ")"; + } + metadata->set_shortdescription(description); + + // Set author if provided + if (!options.author.empty()) { + metadata->set_author(options.author); + } + + // Set license if provided + if (!options.license.empty()) { + metadata->set_license(options.license); + } + + // Set version string to model name + if (!options.model_name.empty()) { + metadata->set_versionstring(options.model_name); + } + + // User-defined metadata + auto& user_meta = *metadata->mutable_userdefined(); + user_meta["board_x_size"] = std::to_string(options.board_x_size); + user_meta["board_y_size"] = std::to_string(options.board_y_size); + user_meta["converter"] = 
"katagocoreml"; + user_meta["converter_version"] = VERSION; + + // Model info + user_meta["model_version"] = std::to_string(options.model_version); + if (options.meta_encoder_version > 0) { + user_meta["meta_encoder_version"] = std::to_string(options.meta_encoder_version); + } + user_meta["optimize_identity_mask"] = options.optimize_identity_mask ? "true" : "false"; + + // Precision info + user_meta["compute_precision"] = options.compute_precision; + user_meta["io_precision"] = options.use_fp16_io ? "FLOAT16" : "FLOAT32"; + + // Set the MIL program (use Swap to transfer ownership) + auto* ml_program = model->mutable_mlprogram(); + ml_program->Swap(program); + + return model; +} + +void CoreMLSerializer::writeWeightBlob(const std::string& weights_dir, + std::vector& weights, + bool use_fp16) { + std::filesystem::create_directories(weights_dir); + std::string blob_path = weights_dir + "/weight.bin"; + WeightSerializer::serialize(weights, blob_path, use_fp16); +} + +void CoreMLSerializer::createPackage(const std::string& output_path, + CoreML::Specification::Model* model, + const std::string& weights_dir) { + // Create package using MPL::ModelPackage + MPL::ModelPackage package(output_path, true, false); + + // Serialize model spec to temp file + std::filesystem::path temp_spec = std::filesystem::temp_directory_path() / "model.mlmodel"; + { + std::ofstream out(temp_spec, std::ios::binary); + if (!out) { + throw std::runtime_error("Failed to create temp model file"); + } + if (!model->SerializeToOstream(&out)) { + throw std::runtime_error("Failed to serialize model spec"); + } + } + + // Set root model + package.setRootModel(temp_spec.string(), "model.mlmodel", "com.apple.CoreML", "KataGo Core ML Model"); + + // Add weights + package.addItem(weights_dir, "weights", "com.apple.CoreML", "Model Weights"); + + // Cleanup temp file + std::filesystem::remove(temp_spec); +} + +void CoreMLSerializer::updateBlobOffsets(CoreML::Specification::MILSpec::Program* program, + const 
std::vector& weights) { + // Build a map from weight name to blob offset + std::unordered_map offset_map; + for (const auto& entry : weights) { + offset_map[entry.name] = entry.blob_offset; + } + + // Navigate through MIL program structure to find all blobfilevalue entries + // Structure: Program -> functions -> blocks -> operations -> attributes["val"] + for (auto& func_pair : *program->mutable_functions()) { + auto& func = func_pair.second; + for (auto& block_pair : *func.mutable_block_specializations()) { + auto& block = block_pair.second; + for (int op_idx = 0; op_idx < block.operations_size(); ++op_idx) { + auto* op = block.mutable_operations(op_idx); + // Check if this is a const operation + if (op->type() == "const") { + // Get the "val" attribute + auto* attrs = op->mutable_attributes(); + auto val_it = attrs->find("val"); + if (val_it != attrs->end()) { + auto& val = val_it->second; + // Check if it's a blobfilevalue + if (val.has_blobfilevalue()) { + // Get the output name to look up the offset + if (op->outputs_size() > 0) { + const std::string& output_name = op->outputs(0).name(); + auto offset_it = offset_map.find(output_name); + if (offset_it != offset_map.end()) { + val.mutable_blobfilevalue()->set_offset(offset_it->second); + } + } + } + } + } + } + } + } +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.hpp b/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.hpp new file mode 100644 index 000000000..2828839d3 --- /dev/null +++ b/cpp/external/katagocoreml/src/serializer/CoreMLSerializer.hpp @@ -0,0 +1,54 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "../builder/MILBuilder.hpp" +#include "katagocoreml/Options.hpp" +#include "Model.pb.h" +#include +#include + +namespace katagocoreml { + +/// Serializes MIL program to Core ML .mlpackage format +class CoreMLSerializer { +public: + /// Constructor + /// 
@param spec_version Core ML specification version (default: 6 for iOS 15+) + explicit CoreMLSerializer(int spec_version = 6); + + /// Serialize MIL program to .mlpackage + /// @param program The MIL program protobuf + /// @param weights Weight entries for blob serialization + /// @param output_path Path for .mlpackage directory + /// @param options Conversion options for metadata + void serialize(CoreML::Specification::MILSpec::Program* program, + std::vector& weights, + const std::string& output_path, + const ConversionOptions& options); + +private: + int m_spec_version; + + /// Create the top-level Model protobuf wrapping the MIL program + std::unique_ptr createModelSpec( + CoreML::Specification::MILSpec::Program* program, + const ConversionOptions& options); + + /// Write weight blob file + void writeWeightBlob(const std::string& weights_dir, + std::vector& weights, + bool use_fp16); + + /// Create .mlpackage directory structure + void createPackage(const std::string& output_path, + CoreML::Specification::Model* model, + const std::string& weights_dir); + + /// Update blob offsets in MIL program after weights are serialized + void updateBlobOffsets(CoreML::Specification::MILSpec::Program* program, + const std::vector& weights); +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/serializer/WeightSerializer.cpp b/cpp/external/katagocoreml/src/serializer/WeightSerializer.cpp new file mode 100644 index 000000000..2ac23a3da --- /dev/null +++ b/cpp/external/katagocoreml/src/serializer/WeightSerializer.cpp @@ -0,0 +1,38 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#include "WeightSerializer.hpp" +#include "MILBlob/Blob/StorageWriter.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Util/Span.hpp" + +namespace katagocoreml { + +size_t WeightSerializer::serialize(std::vector& weights, + const std::string& blob_path, + bool use_fp16) { + MILBlob::Blob::StorageWriter 
writer(blob_path, true); + size_t total_bytes = 0; + + for (auto& entry : weights) { + if (use_fp16) { + // Convert FP32 weights to FP16 + std::vector fp16_data(entry.data.size()); + for (size_t i = 0; i < entry.data.size(); ++i) { + fp16_data[i] = MILBlob::Fp16::FromFloat(entry.data[i]); + } + MILBlob::Util::Span span(fp16_data.data(), fp16_data.size()); + entry.blob_offset = writer.WriteData(span); + total_bytes += entry.data.size() * sizeof(MILBlob::Fp16); + } else { + // Write FP32 weights + MILBlob::Util::Span span(entry.data.data(), entry.data.size()); + entry.blob_offset = writer.WriteData(span); + total_bytes += entry.data.size() * sizeof(float); + } + } + + return total_bytes; +} + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/serializer/WeightSerializer.hpp b/cpp/external/katagocoreml/src/serializer/WeightSerializer.hpp new file mode 100644 index 000000000..e561ff442 --- /dev/null +++ b/cpp/external/katagocoreml/src/serializer/WeightSerializer.hpp @@ -0,0 +1,25 @@ +// katagocoreml - Standalone C++ KataGo to Core ML Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include "../builder/Operations.hpp" +#include +#include + +namespace katagocoreml { + +/// Serializes model weights to MIL blob storage format +class WeightSerializer { +public: + /// Write weights to blob file + /// @param weights Vector of weight entries to serialize + /// @param blob_path Path to output blob file + /// @param use_fp16 If true, convert weights to FLOAT16 + /// @return Total bytes written + static size_t serialize(std::vector& weights, + const std::string& blob_path, + bool use_fp16 = false); +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/src/types/KataGoTypes.hpp b/cpp/external/katagocoreml/src/types/KataGoTypes.hpp new file mode 100644 index 000000000..284b26cd3 --- /dev/null +++ b/cpp/external/katagocoreml/src/types/KataGoTypes.hpp @@ -0,0 +1,297 @@ +// katagocoreml - Standalone C++ KataGo to Core ML 
Converter +// Copyright (c) 2025, Chin-Chang Yang + +#pragma once + +#include +#include +#include +#include +#include + +namespace katagocoreml { + +// ============================================================================ +// Activation Types +// ============================================================================ + +/// Activation function types used in KataGo models +enum class ActivationType : int { + Identity = 0, + ReLU = 1, + Mish = 2 + // MISH_SCALE8 = 12 is internal optimization, treated as Mish +}; + +// ============================================================================ +// Block Kind Constants +// ============================================================================ + +/// Block kind constants (matching KataGo's desc.h) +constexpr int ORDINARY_BLOCK_KIND = 0; +constexpr int GLOBAL_POOLING_BLOCK_KIND = 2; +constexpr int NESTED_BOTTLENECK_BLOCK_KIND = 3; + +// ============================================================================ +// Layer Descriptors +// ============================================================================ + +/// Convolutional layer descriptor +struct ConvLayerDesc { + std::string name; + int conv_y_size = 0; + int conv_x_size = 0; + int in_channels = 0; + int out_channels = 0; + int dilation_y = 1; + int dilation_x = 1; + std::vector weights; // Shape: [out_channels, in_channels, y, x] (OIHW) + + /// Get weight shape as vector + std::vector getWeightShape() const { + return {out_channels, in_channels, conv_y_size, conv_x_size}; + } +}; + +/// Batch normalization layer descriptor +/// KataGo pre-computes merged scale and bias for efficiency: +/// merged_scale = scale / sqrt(variance + epsilon) +/// merged_bias = bias - mean * merged_scale +struct BatchNormLayerDesc { + std::string name; + int num_channels = 0; + float epsilon = 1e-5f; + bool has_scale = true; + bool has_bias = true; + std::vector mean; + std::vector variance; + std::vector scale; + std::vector bias; + std::vector merged_scale; 
// Pre-computed + std::vector merged_bias; // Pre-computed +}; + +/// Activation layer descriptor +struct ActivationLayerDesc { + std::string name; + ActivationType activation_type = ActivationType::ReLU; +}; + +/// Matrix multiplication (fully connected) layer descriptor +/// Computes: output = input @ weights +struct MatMulLayerDesc { + std::string name; + int in_channels = 0; + int out_channels = 0; + std::vector weights; // Shape: [in_channels, out_channels] + + std::vector getWeightShape() const { + return {in_channels, out_channels}; + } +}; + +/// Bias addition layer descriptor +/// Computes: output = input + bias +struct MatBiasLayerDesc { + std::string name; + int num_channels = 0; + std::vector weights; // Shape: [num_channels] +}; + +// ============================================================================ +// Block Descriptors +// ============================================================================ + +/// Forward declarations for recursive block types +struct ResidualBlockDesc; +struct GlobalPoolingResidualBlockDesc; +struct NestedBottleneckResidualBlockDesc; + +/// Block descriptor variant +using BlockDesc = std::variant< + ResidualBlockDesc, + GlobalPoolingResidualBlockDesc, + NestedBottleneckResidualBlockDesc +>; + +/// Block with its kind +struct BlockEntry { + int block_kind = ORDINARY_BLOCK_KIND; + std::shared_ptr block; +}; + +/// Standard residual block descriptor +/// Architecture: +/// input -> preBN -> preActivation -> regularConv -> +/// midBN -> midActivation -> finalConv -> + input +struct ResidualBlockDesc { + std::string name; + BatchNormLayerDesc pre_bn; + ActivationLayerDesc pre_activation; + ConvLayerDesc regular_conv; + BatchNormLayerDesc mid_bn; + ActivationLayerDesc mid_activation; + ConvLayerDesc final_conv; +}; + +/// Global pooling residual block descriptor +/// Similar to ResidualBlock but includes a global pooling path +struct GlobalPoolingResidualBlockDesc { + std::string name; + int model_version = 0; + 
BatchNormLayerDesc pre_bn; + ActivationLayerDesc pre_activation; + ConvLayerDesc regular_conv; + ConvLayerDesc gpool_conv; + BatchNormLayerDesc gpool_bn; + ActivationLayerDesc gpool_activation; + MatMulLayerDesc gpool_to_bias_mul; + BatchNormLayerDesc mid_bn; + ActivationLayerDesc mid_activation; + ConvLayerDesc final_conv; +}; + +/// Nested bottleneck residual block descriptor +/// A bottleneck block that can contain other blocks inside it +struct NestedBottleneckResidualBlockDesc { + std::string name; + int num_blocks = 0; + BatchNormLayerDesc pre_bn; + ActivationLayerDesc pre_activation; + ConvLayerDesc pre_conv; + std::vector blocks; + BatchNormLayerDesc post_bn; + ActivationLayerDesc post_activation; + ConvLayerDesc post_conv; +}; + +// ============================================================================ +// SGF Metadata Encoder (v15+) +// ============================================================================ + +/// SGF metadata encoder descriptor (model version >= 15) +/// Encodes game metadata through a 3-layer MLP +struct SGFMetadataEncoderDesc { + std::string name; + int meta_encoder_version = 0; + int num_input_meta_channels = 0; + MatMulLayerDesc mul1; + MatBiasLayerDesc bias1; + ActivationLayerDesc act1; + MatMulLayerDesc mul2; + MatBiasLayerDesc bias2; + ActivationLayerDesc act2; + MatMulLayerDesc mul3; +}; + +// ============================================================================ +// Network Component Descriptors +// ============================================================================ + +/// Trunk (backbone) network descriptor +struct TrunkDesc { + std::string name; + int model_version = 0; + int num_blocks = 0; + int trunk_num_channels = 0; + int mid_num_channels = 0; + int regular_num_channels = 0; + int gpool_num_channels = 0; + int meta_encoder_version = 0; + ConvLayerDesc initial_conv; + MatMulLayerDesc initial_matmul; + std::optional sgf_metadata_encoder; + std::vector blocks; + BatchNormLayerDesc trunk_tip_bn; + 
ActivationLayerDesc trunk_tip_activation; +}; + +/// Policy head descriptor +struct PolicyHeadDesc { + std::string name; + int model_version = 0; + int policy_out_channels = 0; + ConvLayerDesc p1_conv; + ConvLayerDesc g1_conv; + BatchNormLayerDesc g1_bn; + ActivationLayerDesc g1_activation; + MatMulLayerDesc gpool_to_bias_mul; + BatchNormLayerDesc p1_bn; + ActivationLayerDesc p1_activation; + ConvLayerDesc p2_conv; + MatMulLayerDesc gpool_to_pass_mul; + std::optional gpool_to_pass_bias; // v15+ + std::optional pass_activation; // v15+ + std::optional gpool_to_pass_mul2; // v15+ +}; + +/// Value head descriptor +struct ValueHeadDesc { + std::string name; + int model_version = 0; + ConvLayerDesc v1_conv; + BatchNormLayerDesc v1_bn; + ActivationLayerDesc v1_activation; + MatMulLayerDesc v2_mul; + MatBiasLayerDesc v2_bias; + ActivationLayerDesc v2_activation; + MatMulLayerDesc v3_mul; + MatBiasLayerDesc v3_bias; + MatMulLayerDesc sv3_mul; + MatBiasLayerDesc sv3_bias; + ConvLayerDesc v_ownership_conv; +}; + +// ============================================================================ +// Post-Processing Parameters +// ============================================================================ + +/// Post-processing parameters for model outputs (v13+) +struct ModelPostProcessParams { + float td_score_multiplier = 20.0f; + float score_mean_multiplier = 20.0f; + float score_stdev_multiplier = 20.0f; + float lead_multiplier = 20.0f; + float variance_time_multiplier = 40.0f; + float shortterm_value_error_multiplier = 0.25f; + float shortterm_score_error_multiplier = 30.0f; + float output_scale_multiplier = 1.0f; +}; + +// ============================================================================ +// Complete Model Descriptor +// ============================================================================ + +/// Complete KataGo model descriptor +struct KataGoModelDesc { + std::string name; + std::string sha256; + int model_version = 0; + int num_input_channels = 0; + 
int num_input_global_channels = 0; + int num_input_meta_channels = 0; + int num_policy_channels = 0; + int num_value_channels = 3; // Always 3: win/loss/noresult + int num_score_value_channels = 0; + int num_ownership_channels = 1; // Always 1 + int meta_encoder_version = 0; + ModelPostProcessParams post_process_params; + TrunkDesc trunk; + PolicyHeadDesc policy_head; + ValueHeadDesc value_head; + + /// Get number of policy channels based on model version + static int getPolicyChannels(int version) { + if (version >= 16) return 4; + if (version >= 12) return 2; + return 1; + } + + /// Get number of score value channels based on model version + static int getScoreValueChannels(int version) { + if (version >= 9) return 6; + return 4; + } +}; + +} // namespace katagocoreml diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/LICENSE b/cpp/external/katagocoreml/vendor/deps/FP16/LICENSE new file mode 100644 index 000000000..eabec6c86 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/LICENSE @@ -0,0 +1,11 @@ +The MIT License (MIT) + +Copyright (c) 2017 Facebook Inc. +Copyright (c) 2017 Georgia Institute of Technology +Copyright 2019 Google LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/README.md b/cpp/external/katagocoreml/vendor/deps/FP16/README.md new file mode 100644 index 000000000..6cba15862 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/README.md @@ -0,0 +1,20 @@ +# FP16 +Header-only library for conversion to/from half-precision floating point formats + +## Features + +- Supports IEEE and ARM alternative half-precision floating-point format + - Property converts infinities and NaNs + - Properly converts denormal numbers, even on systems without denormal support +- Header-only library, no installation or build required +- Compatible with C99 and C++11 +- Fully covered with unit tests and microbenchmarks + +## Acknowledgements + +[![HPC Garage logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/hpcgarage.png)](http://hpcgarage.org) +[![Georgia Tech College of Computing logo](https://github.com/Maratyszcza/PeachPy/blob/master/logo/college-of-computing.gif)](http://www.cse.gatech.edu/) + +The library is developed by [Marat Dukhan](http://www.maratdukhan.com) of Georgia Tech. FP16 is a research project at [Richard Vuduc](http://vuduc.org)'s HPC Garage lab in the Georgia Institute of Technology, College of Computing, School of Computational Science and Engineering. + +This material is based upon work supported by the U.S. National Science Foundation (NSF) Award Number 1339745. Any opinions, findings and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect those of NSF. 
diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16.h b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16.h new file mode 100644 index 000000000..9d7366e99 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16.h @@ -0,0 +1,11 @@ +#pragma once +#ifndef FP16_H +#define FP16_H + +#include + +#if defined(PSIMD_H) +#include +#endif + +#endif /* FP16_H */ diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/bitcasts.h b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/bitcasts.h new file mode 100644 index 000000000..86a4e22c4 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/bitcasts.h @@ -0,0 +1,92 @@ +#pragma once +#ifndef FP16_BITCASTS_H +#define FP16_BITCASTS_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include +#elif !defined(__OPENCL_VERSION__) + #include +#endif + +#if defined(__INTEL_COMPILER) + #include +#endif + +#if defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + #include +#endif + + +static inline float fp32_from_bits(uint32_t w) { +#if defined(__OPENCL_VERSION__) + return as_float(w); +#elif defined(__CUDA_ARCH__) + return __uint_as_float((unsigned int) w); +#elif defined(__INTEL_COMPILER) + return _castu32_f32(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyFloatFromInt32((__int32) w); +#else + union { + uint32_t as_bits; + float as_value; + } fp32 = { w }; + return fp32.as_value; +#endif +} + +static inline uint32_t fp32_to_bits(float f) { +#if defined(__OPENCL_VERSION__) + return as_uint(f); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __float_as_uint(f); +#elif defined(__INTEL_COMPILER) + return _castf32_u32(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint32_t) _CopyInt32FromFloat(f); +#else + union { + float as_value; + uint32_t as_bits; + } fp32 = { f }; + return fp32.as_bits; +#endif +} + +static inline double fp64_from_bits(uint64_t w) { +#if 
defined(__OPENCL_VERSION__) + return as_double(w); +#elif defined(__CUDA_ARCH__) + return __longlong_as_double((long long) w); +#elif defined(__INTEL_COMPILER) + return _castu64_f64(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyDoubleFromInt64((__int64) w); +#else + union { + uint64_t as_bits; + double as_value; + } fp64 = { w }; + return fp64.as_value; +#endif +} + +static inline uint64_t fp64_to_bits(double f) { +#if defined(__OPENCL_VERSION__) + return as_ulong(f); +#elif defined(__CUDA_ARCH__) + return (uint64_t) __double_as_longlong(f); +#elif defined(__INTEL_COMPILER) + return _castf64_u64(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint64_t) _CopyInt64FromDouble(f); +#else + union { + double as_value; + uint64_t as_bits; + } fp64 = { f }; + return fp64.as_bits; +#endif +} + +#endif /* FP16_BITCASTS_H */ diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/fp16.h b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/fp16.h new file mode 100644 index 000000000..b95aa15f5 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/fp16.h @@ -0,0 +1,451 @@ +#pragma once +#ifndef FP16_FP16_H +#define FP16_FP16_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include + #include +#elif !defined(__OPENCL_VERSION__) + #include + #include +#endif + +#ifdef _MSC_VER + #include +#endif + +#include + + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. 
+ */ +static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. + * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. + * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift + * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the + * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); + uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? 
renorm_shift - 5 : 0; + /* + * Iff half-precision number has exponent of 15, the addition overflows it into bit 31, + * and the subsequent shift turns the high 9 bits into 1. Thus + * inf_nan_mask == + * 0x7F800000 if the half-precision number had exponent of 15 (i.e. was NaN or infinity) + * 0x00000000 otherwise + */ + const int32_t inf_nan_mask = ((int32_t) (nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. + * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus + * zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa + * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias + * (0x7F for single-precision number less 0xF for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift + * is less than 0x70, this can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. 
+ * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline float fp16_ieee_to_fp32_value(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent + * of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision + * formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number. 
+ * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent + * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested + * by the difference in the exponent bias (see above). + * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of + * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least + * partially IEEE754-compliant implementations. + * + * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not + * operate on denormal inputs, and do not produce denormal results. + */ + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results (always normalized). + * Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. + * First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). 
+ * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input + * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). + * A normalized single-precision floating-point number is represented as: + * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) + * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision + * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. + * + * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number + * is zero, the constructed single-precision number has the value of + * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 + * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of + * the input half-precision number. + */ + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the + * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the + * input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign of the input number. + */ + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in + * IEEE half-precision format, in bit representation. 
+ * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline uint16_t fp16_ieee_from_fp32_value(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +/* + * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. 
+ */ +static inline uint32_t fp16_alt_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. + * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. + * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift + * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the + * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); + uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? 
renorm_shift - 5 : 0; + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. + * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus + * zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa + * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias + * (0x7F for single-precision number less 0xF for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift + * is less than 0x70, this can be combined with step 3. + * 5. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. + * 6. Combine with the sign of the input number. + */ + return sign | (((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) & ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. 
+ */ +static inline float fp16_alt_to_fp32_value(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent + * of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, the exponent is adjusted for the difference in exponent bias between single-precision and half-precision + * formats (0x7F - 0xF = 0x70). This operation never overflows or generates non-finite values, as the largest + * half-precision exponent is 0x1F and after the adjustment is can not exceed 0x8F < 0xFE (largest single-precision + * exponent for non-finite values). + * + * Note that this operation does not handle denormal inputs (where biased exponent == 0). 
However, they also do not + * operate on denormal inputs, and do not produce denormal results. + */ + const uint32_t exp_offset = UINT32_C(0x70) << 23; + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset); + + /* + * Convert denormalized half-precision inputs into single-precision results (always normalized). + * Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. + * First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input + * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). + * A normalized single-precision floating-point number is represented as: + * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) + * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision + * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. + * + * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number + * is zero, the constructed single-precision number has the value of + * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 + * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of + * the input half-precision number. 
+ */ + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the + * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the + * input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign of the input number. + */ + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in + * ARM alternative half-precision format, in bit representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline uint16_t fp16_alt_from_fp32_value(float f) { + const uint32_t w = fp32_to_bits(f); + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t shl1_w = w + w; + + const uint32_t shl1_max_fp16_fp32 = UINT32_C(0x8FFFC000); + const uint32_t shl1_base = shl1_w > shl1_max_fp16_fp32 ? 
shl1_max_fp16_fp32 : shl1_w; + uint32_t shl1_bias = shl1_base & UINT32_C(0xFF000000); + const uint32_t exp_difference = 23 - 10; + const uint32_t shl1_bias_min = (127 - 1 - exp_difference) << 24; + if (shl1_bias < shl1_bias_min) { + shl1_bias = shl1_bias_min; + } + + const float bias = fp32_from_bits((shl1_bias >> 1) + ((exp_difference + 2) << 23)); + const float base = fp32_from_bits((shl1_base >> 1) + (2 << 23)) + bias; + + const uint32_t exp_f = fp32_to_bits(base) >> 13; + return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(base) & UINT32_C(0x00000FFF))); +} + +#endif /* FP16_FP16_H */ diff --git a/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/psimd.h b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/psimd.h new file mode 100644 index 000000000..428ab0651 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/deps/FP16/include/fp16/psimd.h @@ -0,0 +1,131 @@ +#pragma once +#ifndef FP16_PSIMD_H +#define FP16_PSIMD_H + +#if defined(__cplusplus) && (__cplusplus >= 201103L) + #include +#elif !defined(__OPENCL_VERSION__) + #include +#endif + +#include + + +PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) { + const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + + const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); + + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); +#else + const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); +#endif + const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale); + + const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); + const psimd_f32 magic_bias = psimd_splat_f32(0.25f); + const psimd_f32 
denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias); + + const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff; + return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign)); +} + +PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) { + const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half); + + const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 sign_lo = word_lo & sign_mask; + const psimd_u32 sign_hi = word_hi & sign_mask; + const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); + const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); + + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); +#else + const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); +#endif + const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale); + const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale); + + const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); + const psimd_u16 shl1_half = half + half; + const psimd_f32 magic_bias = psimd_splat_f32(0.25f); + const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias); + const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias); + + const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); + const 
psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff; + const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff; + + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo)); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi)); + return result; +} + +PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) { + const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + + const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000)); + const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); + +#if 0 + const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); + const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset; + const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset); + const psimd_s32 exp113_offset = exp112_offset | exp1_offset; + return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset))); +#else + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); + const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); +#else + const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000))); +#endif + return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias))); +#endif +} + +PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) { + const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); + const psimd_u32 word_hi = (psimd_u32) 
psimd_interleave_hi_u16(psimd_zero_u16(), half); + + const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); + const psimd_u32 sign_lo = word_lo & sign_mask; + const psimd_u32 sign_hi = word_hi & sign_mask; + const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); + const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); + +#if 1 + const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); + const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset; + const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset; + const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); + const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset); + const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset); + const psimd_s32 exp113_offset = exp1_offset | exp112_offset; + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset))); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset))); + return result; +#else + const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); + const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset); + const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset); + const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); + psimd_f32x2 result; + result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias))); + result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias))); + return result; +#endif +} + +#endif /* FP16_PSIMD_H */ diff --git a/cpp/external/katagocoreml/vendor/mlmodel/LICENSE.txt 
b/cpp/external/katagocoreml/vendor/mlmodel/LICENSE.txt new file mode 100644 index 000000000..b4570ec56 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/LICENSE.txt @@ -0,0 +1,11 @@ +Copyright (c) 2017, Apple Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder(s) nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/ArrayFeatureExtractor.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/ArrayFeatureExtractor.proto new file mode 100644 index 000000000..d689a3f0e --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/ArrayFeatureExtractor.proto @@ -0,0 +1,19 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * An array feature extractor. + * + * Given an index, extracts the value at that index from its array input. + * Indexes are zero-based. + */ +message ArrayFeatureExtractor { + repeated uint64 extractIndex = 1; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/AudioFeaturePrint.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/AudioFeaturePrint.proto new file mode 100644 index 000000000..8daa3fffa --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/AudioFeaturePrint.proto @@ -0,0 +1,36 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which takes an input audio and outputs array(s) of features + * according to the specified feature types + */ +message AudioFeaturePrint { + + // Specific audio feature print types + + // Sound extracts features useful for identifying the predominant + // sound in audio signal + message Sound { + enum SoundVersion { + SOUND_VERSION_INVALID = 0; + // VERSION_1 is available on iOS,tvOS 15.0+, macOS 12.0+ + // It uses a variable-length input audio sample vector and yields a 512 float feature vector + SOUND_VERSION_1 = 1; + } + + SoundVersion version = 1; + } + + // Audio feature print type + oneof AudioFeaturePrintType { + Sound sound = 20; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/BayesianProbitRegressor.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/BayesianProbitRegressor.proto new file mode 100644 index 000000000..742c99ae8 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/BayesianProbitRegressor.proto @@ -0,0 +1,139 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A Bayesian probit regressor. 
+ * + * The probit regression model is superficially similar to the more commonly known + * logistic regression, with sampling distribution of the model given by + * + * P(y=+1|x,w) = Φ(/β) + * + * where w are the set of weights, + * x are the set of features for the given event, + * β is a model hyper-parameter, and + * Φ is the link function, defined to be the CDF of the normal distribution. + * The weights w[i,j] are Gaussian distributed, with mean μ[i,j] and precision 1/(σ[i,j])^2 + * (where i indexes over features and j indexes over the values for the feature). + * The parameter β scales the steepness of the inverse link function. + * + * (see https://en.wikipedia.org/wiki/Probit_model and https://en.wikipedia.org/wiki/Logistic_regression + * for more details on probit model and logistic regression, respectively) + * + * Input: X + * x represents a set of features, each taking on a discrete value (note that continuous values + * would first need to be discretized). x can be represented as a vector where the index i is + * the feature id and x[i] is the feature value. Alternatively, x can be represented as a matrix + * with 2 columns where the first column indicates the feature id and the second column contains + * the feature values, i.e. x[i,0] is the feature id and x[i,1] is the feature value. + * + * additional input features: + * - "optimism": apply a mean shift to the probability, i.e. 
shift regression mean by o*stdev, + * where o is the "optimism" parameter (see additional output features) + * - "samplingScale": for sampling from posterior, multiply standard deviation by this factor + * - "samplingTruncation": for sampling from posterior, truncate sampling distribution at given multiple of std from mean + * + * Output: Y + * probability P(y|x,w) + * + * additional output features: + * - mean (regression output before applying link function) + * - variance (regression output variance before applying link function) + * - pessimistic probability: P(y|x,w) with a mean shift parameterized by "optimism" feature + * - sampled probability: p ~ P(y|x,w) with standard deviation scaling parametrized by "samplingScale" feature + * and distribution truncated at multiple of standard deviation, + * where multiple parameterized by "samplingTruncation" feature. + * + */ + +message BayesianProbitRegressor { + + /* + * Parameterization of a Gaussian distribution + */ + message Gaussian { + double mean = 1; + double precision = 2; // inverse of the variance + } + + /* + * Weight for a specific feature value + * The weight is represented as a Gaussian distribution + * with a mean and precision (1/variance) to capture + * uncertainty in the weight + */ + message FeatureValueWeight { + uint32 featureValue = 1; + Gaussian featureWeight = 2; + } + + /* + * Feature with associated weights (for different values) + * Each feature has a set of weights for the (discrete) values + * it can take + */ + message FeatureWeight { + uint32 featureId = 1; + repeated FeatureValueWeight weights = 2; + } + + uint32 numberOfFeatures = 1; + + Gaussian bias = 2; // bias term + + /* + * Set of features with associated weights + */ + repeated FeatureWeight features = 3; // feature weights + + /* + * Set this name to be the same as input feature of type multi-array (1D) + * in the model description you want to use as the regression input + */ + string regressionInputFeatureName = 10; + + /* 
+ * Set this name to be the same as optional input feature of type double + * in the model description you want to use as the optimism input + */ + string optimismInputFeatureName = 11; + + /* + * Set this name to be the same as optional input feature of type double + * in the model description you want to use as the samplingScale input + */ + string samplingScaleInputFeatureName = 12; + + /* + * Set this name to be the same as optional input feature of type double + * in the model description you want to use as the samplingBounds input + */ + string samplingTruncationInputFeatureName = 13; + + /* + * name of 'mean' output feature + */ + string meanOutputFeatureName = 20; + + /* + * name of 'variance' output feature + */ + string varianceOutputFeatureName = 21; + + /* + * name of 'pessimistic' output feature + */ + string pessimisticProbabilityOutputFeatureName = 22; + + /* + * name of 'sampled' output feature: samples from the scaled posterior probability distribuiton + */ + string sampledProbabilityOutputFeatureName = 23; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/CategoricalMapping.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/CategoricalMapping.proto new file mode 100644 index 000000000..dcb6eaf9d --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/CategoricalMapping.proto @@ -0,0 +1,38 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * A categorical mapping. + * + * This allows conversion from integers to strings, or from strings to integers. 
+ */ +message CategoricalMapping { + oneof MappingType { + // Conversion from strings to integers + StringToInt64Map stringToInt64Map = 1; + + // Conversion from integer to string + Int64ToStringMap int64ToStringMap = 2; + } + + /* + * The value returned if an input is not contained in the map above. + * If one of these is not set, then an error is raised on an unknown input. + */ + oneof ValueOnUnknown { + // Default output when converting from an integer to a string. + string strValue = 101; + + // Default output when converting from a string to an integer. + int64 int64Value = 102; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/ClassConfidenceThresholding.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/ClassConfidenceThresholding.proto new file mode 100644 index 000000000..173296345 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/ClassConfidenceThresholding.proto @@ -0,0 +1,41 @@ +// Copyright (c) 2022, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* A model to filter classification labels by confidence thresholds. + * + * The model has one input: + * - A multi-array of type FP16, FP32, or FP64 and shape [C], where C + * is the number of classes. + * + * The model has one output: + * - A multi-array of type FP16, FP32, or FP64 and shape [2, C], where + * C is the number of classes. The values in [0, :] is the same as + * the confidence inputs. The values in [1, :] is either 0 or 1, + * where 1 means the class is present and 0 means it is not. + * + * Currently, the model simply takes all the classes. 
+ * + * filteredClassConfidences[0, :] = classConfidences[:] + * filteredClassConfidences[1, :] = 1 + */ + +message ClassConfidenceThresholding { + + /** + * The precision-recall curve for each class label. + * + * The field is optional. When it exists, the number of curves + * must match the number of class labels. + */ + repeated PrecisionRecallCurve precisionRecallCurves = 100; +} + diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/CustomModel.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/CustomModel.proto new file mode 100644 index 000000000..b5a361b10 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/CustomModel.proto @@ -0,0 +1,30 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A parameterized model whose function is defined in code + */ +message CustomModel { + + message CustomModelParamValue { + oneof value { + double doubleValue = 10; + string stringValue = 20; + int32 intValue = 30; + int64 longValue = 40; + bool boolValue = 50; + bytes bytesValue = 60; + } + } + + string className = 10; // The name of the class (conforming to MLCustomModel) corresponding to this model + map parameters = 30; + string description = 40; // An (optional) description provided by the model creator. This information is displayed when viewing the model, but does not affect the model's execution on device. +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/DataStructures.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/DataStructures.proto new file mode 100644 index 000000000..6cd2d1ee6 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/DataStructures.proto @@ -0,0 +1,126 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "FeatureTypes.proto"; + +package CoreML.Specification; + +/* + * A mapping from a string + * to a 64-bit integer. + */ +message StringToInt64Map { + map map = 1; +} + +/* + * A mapping from a 64-bit integer + * to a string. + */ +message Int64ToStringMap { + map map = 1; +} + +/* + * A mapping from a string + * to a double-precision floating point number. + */ +message StringToDoubleMap { + map map = 1; +} + +/* + * A mapping from a 64-bit integer + * to a double-precision floating point number. + */ +message Int64ToDoubleMap { + map map = 1; +} + +/* + * A vector of strings. + */ +message StringVector { + repeated string vector = 1; +} + +/* + * A vector of 64-bit integers. + */ +message Int64Vector { + repeated int64 vector = 1; +} + +/* + * A vector of floating point numbers. + */ +message FloatVector { + repeated float vector = 1; +} + +/* + * A vector of double-precision floating point numbers. + */ +message DoubleVector { + repeated double vector = 1; +} + +/* + * A range of int64 values + */ +message Int64Range { + int64 minValue = 1; + int64 maxValue = 2; +} + +/* + * A set of int64 values + */ +message Int64Set { + repeated int64 values = 1; +} + +/* + * A range of double values + */ +message DoubleRange { + double minValue = 1; + double maxValue = 2; +} + +/** + * Precision/Recall curve. + * + * The syntax comprises two tables, one to look up the confidence value threshold + * for a given precision, and the other for a given recall. 
+ * + * Example: + * ----------------------+----+----+----+----+----+----+----+----+---- + * precisionValues | .1 | .2 | .3 | .4 | .5 | .6 | .7 | + * precisionConfidence | .0 | .0 | .0 | .0 | .1 | .3 | .4 | + * ----------------------+----+----+----+----+----+----+----+----+---- + * + * ----------------------+----+----+----+----+----+----+----+----+---- + * recallValues | .1 | .2 | .3 | .4 | .5 | .6 | .7 | .8 | .9 + * recallConfidence | .7 | .6 | .5 | .4 | .3 | .3 | .2 | .1 | .0 + * ----------------------+----+----+----+----+----+----+----+----+---- + * + * The application expects that, when it filters out samples with + * confidence threshold = 0.1, it gets precision = 0.5. Likewise, + * with threshold = 0.2 it gets recall = 0.7. + * + * The table must have only valid values; do not use `NaN`, `+/- INF`, + * or negative values. The application is responsible for inter/extrapolating + * approprate confidence threshold based on the application's specific need. + */ +message PrecisionRecallCurve { + FloatVector precisionValues = 1; + FloatVector precisionConfidenceThresholds = 2; + FloatVector recallValues = 3; + FloatVector recallConfidenceThresholds = 4; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/DictVectorizer.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/DictVectorizer.proto new file mode 100644 index 000000000..73f6a0c42 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/DictVectorizer.proto @@ -0,0 +1,36 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * Uses an index mapping to convert a dictionary to an array. + * + * The output array will be equal in length to the index mapping vector parameter. 
+ * All keys in the input dictionary must be present in the index mapping vector. + * + * For each item in the input dictionary, insert its value in the output array. + * The position of the insertion is determined by the position of the item's key + * in the index mapping. Any keys not present in the input dictionary, will be + * zero in the output array. + * + * For example: if the ``stringToIndex`` parameter is set to ``["a", "c", "b", "z"]``, + * then an input of ``{"a": 4, "c": 8}`` will produce an output of ``[4, 8, 0, 0]``. + * + */ +message DictVectorizer { + oneof Map { + // String keys to indexes + StringVector stringToIndex = 1; + + // Int keys to indexes + Int64Vector int64ToIndex = 2; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureTypes.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureTypes.proto new file mode 100644 index 000000000..46c51fb67 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureTypes.proto @@ -0,0 +1,233 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * The 64-bit integer feature type. + */ +message Int64FeatureType {} + +/* + * The double-precision floating point number feature type. + */ +message DoubleFeatureType {} + +/* + * The string feature type. + */ +message StringFeatureType {} + + +message SizeRange { + uint64 lowerBound = 1; + int64 upperBound = 2; // negative value means unbound otherwise upperbound is included in range +} + +/* + * The image feature type. 
+ */ +message ImageFeatureType { + // Assumes raw (decompressed) format + enum ColorSpace { + INVALID_COLOR_SPACE = 0; + GRAYSCALE = 10; // 8 bits per pixel + RGB = 20; // 32 bits per pixel: RGBA with A channel ignored + BGR = 30; // 32 bits per pixel: BGRA with A channel ignored + GRAYSCALE_FLOAT16 = 40; // 16 bits float per pixel + } + + message ImageSize { + uint64 width = 1; + uint64 height = 2; + } + + message EnumeratedImageSizes { + repeated ImageSize sizes = 1; + } + + message ImageSizeRange { + SizeRange widthRange = 1; + SizeRange heightRange = 2; + } + + // The required or default image size is width x height + // + // If specificationVersion <= 2 or SizeFlexibility is empty, + // width x height is the required fixed image size + // + // If SizeFlexibility is present, width x height indicate a "default" + // image size which must be consistent with the flexibility specified + + int64 width = 1; + int64 height = 2; + + // For specification version >= 3 you can specify image size flexibility. + + oneof SizeFlexibility { + + // Use enumeratedSizes for a set of distinct fixed sizes + // e.g. portrait or landscape: [80 x 100, 100 x 8] + // + // If the width x height fields above are specified then they must be + // one of the sizes listed. + // + // If width and height are not specified above then the default width + // and height will be enumeratedSizes[0] + // + // Must be non-empty + + EnumeratedImageSizes enumeratedSizes = 21; + + // Use imageSizeRange to allow for ranges of values + // e.g. any image greater than 10 x 20: [10..= 3 you can specify image size flexibility. + + oneof ShapeFlexibility { + + // Use enumeratedShapes for a set of distinct fixed shapes + // + // If the shape field is specified then it must be + // one of the enumerated shapes. 
+ // + // If shape is not specified, the "default" shape will be considered + // enumeratedShapes[0] + // + // Must be non-empty + + EnumeratedShapes enumeratedShapes = 21; + + // Use shapeRange to allow the size of each dimension vary within + // independently specified ranges + // + // If you specify shape above it must fall in the range + // specified in shapeRanges. It will be treated as the default shape. + // + // If you don't specify shape above then the default shape will + // have shape[d] = shapeRange.sizeRanges[d].lowerBound + + ShapeRange shapeRange = 31; + + } + + oneof defaultOptionalValue { + int32 intDefaultValue = 41; + float floatDefaultValue = 51; + double doubleDefaultValue = 61; + } + +} + +/* + * The dictionary feature type. + */ +message DictionaryFeatureType { + /* + * Key/value type tags, with the following restrictions: + * - ``keyType`` must be a hashable type + * - ``valueType`` is assumed to be a ``double`` + */ + oneof KeyType { + Int64FeatureType int64KeyType = 1; + StringFeatureType stringKeyType = 2; + } +} + +/* + * The Sequence feature type. + */ +message SequenceFeatureType { + + /* + * Currently only categorical int64 and String sequences are supported + */ + oneof Type { + Int64FeatureType int64Type = 1; + StringFeatureType stringType = 3; + } + + // Range of allowed size/length/count of sequence + SizeRange sizeRange = 101; +} + +message StateFeatureType { + oneof Type { + ArrayFeatureType arrayType = 1; + } +} + +/* + * A feature, which may be optional. 
+ */ +message FeatureType { + oneof Type { + Int64FeatureType int64Type = 1; + DoubleFeatureType doubleType = 2; + StringFeatureType stringType = 3; + ImageFeatureType imageType = 4; + ArrayFeatureType multiArrayType = 5; + DictionaryFeatureType dictionaryType = 6; + SequenceFeatureType sequenceType = 7; + StateFeatureType stateType = 8; + } + + bool isOptional = 1000; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureVectorizer.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureVectorizer.proto new file mode 100644 index 000000000..94d97474a --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/FeatureVectorizer.proto @@ -0,0 +1,26 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A FeatureVectorizer puts one or more features into a single array. + * + * The ordering of features in the output array is determined by + * ``inputList``. + * + * ``inputDimensions`` is a zero based index. + */ +message FeatureVectorizer { + message InputColumn { + string inputColumn = 1; + uint64 inputDimensions = 2; + } + + repeated InputColumn inputList = 1; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/GLMClassifier.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/GLMClassifier.proto new file mode 100644 index 000000000..66f5befc3 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/GLMClassifier.proto @@ -0,0 +1,43 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * A generalized linear model classifier. + */ +message GLMClassifier { + message DoubleArray { + repeated double value = 1; + } + + enum PostEvaluationTransform { + Logit = 0; + Probit = 1; // Only binary classification is supported for probit + } + + enum ClassEncoding { + ReferenceClass = 0; // First class is the reference class + OneVsRest = 1; // Also called One vs All + } + + repeated DoubleArray weights = 1; + repeated double offset = 2; + PostEvaluationTransform postEvaluationTransform = 3; + ClassEncoding classEncoding = 4; + + /* + * Required class label mapping. + */ + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/GLMRegressor.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/GLMRegressor.proto new file mode 100644 index 000000000..fb46492d0 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/GLMRegressor.proto @@ -0,0 +1,28 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A generalized linear model regressor. 
+ */ +message GLMRegressor { + message DoubleArray { + repeated double value = 1; + } + + enum PostEvaluationTransform { + NoTransform = 0; + Logit = 1; + Probit = 2; + } + + repeated DoubleArray weights = 1; + repeated double offset = 2; + PostEvaluationTransform postEvaluationTransform = 3; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Gazetteer.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Gazetteer.proto new file mode 100644 index 000000000..8dac370e7 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Gazetteer.proto @@ -0,0 +1,43 @@ +// Copyright (c) 2019, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification.CoreMLModels; + +/* +* A model which uses an efficient probabilistic representation +* for assigning labels to a set of strings. +*/ +message Gazetteer { + + /* + * Stores the revision number for the model, revision 2 is available on + * iOS, tvOS 13.0+, macOS 10.15+ + */ + uint32 revision = 1; + + /* + * Stores the language of the model, as specified in BCP-47 format, + * e.g. "en-US". See https://tools.ietf.org/html/bcp47 + */ + string language = 10; + + /* + * Natural Language framework's efficient representation of a gazetter. + */ + bytes modelParameterData = 100; + + /* + * Stores the set of output class labels + */ + oneof ClassLabels { + StringVector stringClassLabels = 200; + } + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Identity.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Identity.proto new file mode 100644 index 000000000..b932fe3d7 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Identity.proto @@ -0,0 +1,18 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * An identity model. + * + * This model returns given inputs as outputs, unchanged. + * Intended to be used for testing purposes. + */ +message Identity { +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Imputer.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Imputer.proto new file mode 100644 index 000000000..ecedb0119 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Imputer.proto @@ -0,0 +1,43 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * A transformer that replaces missing values with a default value, + * such as a statistically-derived value. + * + * If ``ReplaceValue`` is set, then missing values of that type are + * replaced with the corresponding value. + * + * For example: if ``replaceDoubleValue`` is set to ``NaN`` + * and a single ``NaN`` double value is provided as input, + * then it is replaced by ``imputedDoubleValue``. However + * if the input is an array of doubles, then any instances + * of ``NaN`` in the array is replaced with the corresponding + * value in ``imputedDoubleArray``. 
+ */ +message Imputer { + oneof ImputedValue { + double imputedDoubleValue = 1; + int64 imputedInt64Value = 2; + string imputedStringValue = 3; + DoubleVector imputedDoubleArray = 4; + Int64Vector imputedInt64Array = 5; + StringToDoubleMap imputedStringDictionary = 6; + Int64ToDoubleMap imputedInt64Dictionary = 7; + } + + oneof ReplaceValue { + double replaceDoubleValue = 11; + int64 replaceInt64Value = 12; + string replaceStringValue = 13; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/ItemSimilarityRecommender.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/ItemSimilarityRecommender.proto new file mode 100644 index 000000000..eb0292ac6 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/ItemSimilarityRecommender.proto @@ -0,0 +1,74 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + + +/* + * Item Similarity Recommender + * + * The Item Similarity recommender takes as input a list of items and scores, + * then uses that information and a table of item similarities to predict similarity + * scores for all items. By default, the items predicted are most similar to the given + * items but not part of that item set. + * + * The predicted score for a given item k is + * sum_(i in observed items) sim_(k,i) * (score_i - shift_k) + * + * Because only the most similar scores for each item i are stored, + * sim_(k,i) is often zero. + * + * For many models, the score adjustment parameter shift_j is zero -- it's occasionally used + * to counteract global biases for popular items. + * + * + * References: + */ +message ItemSimilarityRecommender { + + /* The items similar to a given base item. 
+ */ + message ConnectedItem { + uint64 itemId = 1; + double similarityScore = 2; + } + + /* The formula for the score of a given model as given above, with shift_k + * parameter given by itemScoreAdjustment, and the similar item list filling in + * all the known sim(k,i) scores for i given by itemID and k given by the itemID parameter in + * the similarItemList. + */ + message SimilarItems { + uint64 itemId = 1; + repeated ConnectedItem similarItemList = 2; + double itemScoreAdjustment = 3; + } + + repeated SimilarItems itemItemSimilarities = 1; + + /* One or none of these are given. If none are given, then the items must number 0, 1, ..., num_items - 1. + * If either is given, the length must be exactly num_items. + */ + StringVector itemStringIds = 2; + Int64Vector itemInt64Ids = 3; + + /* Input parameter names specifying different possible inputs to the recommender. + */ + string itemInputFeatureName = 10; /* Required */ + string numRecommendationsInputFeatureName = 11; /* Optional; defaults to all items if not given.*/ + string itemRestrictionInputFeatureName = 12; /* Optional. */ + string itemExclusionInputFeatureName = 13; /* Optional; defaults to input item list if not given. */ + + /* The predicted outputs. At least one of these must be specified. + */ + string recommendedItemListOutputFeatureName = 20; + string recommendedItemScoreOutputFeatureName = 21; + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/LICENSE.txt b/cpp/external/katagocoreml/vendor/mlmodel/format/LICENSE.txt new file mode 100644 index 000000000..bbcdc9ef8 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/LICENSE.txt @@ -0,0 +1,11 @@ +Copyright (c) 2017, Apple Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder(s) nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/LinkedModel.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/LinkedModel.proto new file mode 100644 index 000000000..7b5263c3a --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/LinkedModel.proto @@ -0,0 +1,40 @@ +// Copyright (c) 2019, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; +import public "Parameters.proto"; + +package CoreML.Specification; + +/* + * A model which wraps another (compiled) model external to this one + */ +message LinkedModel { + + oneof LinkType { + // A model located via a file system path + LinkedModelFile linkedModelFile = 1; + } +} + +// Model is referenced by a model file name and search path +message LinkedModelFile { + + // Model file name: e.g. "MyFetureExtractor.mlmodelc" + StringParameter linkedModelFileName = 1; + + // Search path to find the linked model file + // Multiple paths can be searched using the unix-style path separator ":" + // Each path can be relative (to this model) or absolute + // + // An empty string is the same as the relative search path "." + // which searches in the same location as this model file + // + // There are some special paths which start with $ + // - $BUNDLE_MAIN - Indicates to look in the main bundle + // - $BUNDLE_IDENTIFIER(identifier) - Looks in Bunde with given identifier + StringParameter linkedModelSearchPath = 2; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/MIL.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/MIL.proto new file mode 100644 index 000000000..af7c3e004 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/MIL.proto @@ -0,0 +1,371 @@ +// Copyright (c) 2019, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +/* + * - A Program is the container with following information + * - set of functions: Function defines a program block to be executed + * - A model can have multiple functions defined and will have a single point of entry. 
+ * - A Function consists of + * - List of named inputs and output types + * - A block defining scope for a function - similar to a function in C/C++ + * - A Block consists of + * - List of named inputs and output names + * - Topologically sorted Ops + * - A Op consists of + * - List of named inputs and outputs (name, type) pair + * - Optionally, blocks for Control-Flow + * + * Programs, functions, blocks, ops, and tensor types all can contain an optional set of attributes. + * + * == Identifiers == + * Identifiers, generally used for names and keys, must match the + * regular expression [A-Za-z\_][A-Za-z0-9\_@]* + */ + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification.MILSpec; + +// The top level container. +message Program { + int64 version = 1; + + // Must be unique within the containing program + // Names must be valid identifiers as described above. + map functions = 2; + + string docString = 3; + + // Any other attributes not described by other fields. + // Keys must be valid identifiers as described above. + map attributes = 4; +} + +// A program-level function. +message Function { + + // Function inputs are unordered (name, ValueType) pairs. + // Inputs intended to process images must be rank-4 Float32 tensors. Dimensions + // are interpreted as NCHW, with N == 1 and C being 1 for grayscale and 3 for RGB. + // Names must be valid identifiers as described above. + repeated NamedValueType inputs = 1; + + // The active block is drawn from this named specialization. + // This key must exist in `block_specializations`. + string opset = 2; + + // Named specializations of this function. + // + // Specialization keys are the name of the opset that the + // function specialization is written in. They must be valid + // identifiers as described above. + // + // Outputs from all blocks must match. They define the outputs + // of the function. + // Each block inherits the lexical scope from the function. 
+ map block_specializations = 3; + + // Any other attributes not described by other fields. + // Keys must be valid identifiers as described above. + map attributes = 4; +} + +// A basic block with a single entry and exit in SSA form. +message Block { + // Infrequently used, these are for operators that may need to give + // block-local names to input values (e.g. while_loop). + repeated NamedValueType inputs = 1; + + // The names to give to values returned by this block. They must be + // identifiers as described above. + // + // ValueType of outputs[i] is Operation[j].outputs[k].type where + // i, j and k are indices of block output, block Operation and + // jth operation's output respectively. + // this is due to + // 1. An operation can have more than one output + // 2. Any one of operation's output could be potentially block's output + repeated string outputs = 2; + + repeated Operation operations = 3; + + // Any other attributes not described by other fields. + // Keys must be valid identifiers as described above. + map attributes = 4; +} + +// Argument is list of Binding to either name or value +message Argument { + message Binding { + oneof binding { + // The name of a previously defined value. + string name = 1; + + // A compile time constant. + Value value = 2; + } + } + + repeated Binding arguments = 1; +}; + +// A single operation/node/layer. +message Operation { + // Examples: "convolution", "cropResize". Operation type defines the + // expected inputs and output. + string type = 1; + + // Operator arguments + // + // Key: parameter name + // Value: Argument (list of bindings) + // + // Value is list of argument binding to given parameter + // Binding can be a string name (previous operation output or input given to model/block/function) + // or a Value (known compile time value for given operation) + // Argument can be of length 1 (general) or variable length (e.g. concat layer) + // e.g. {'stride' : ['input_01']} + // e.g. 
{'x' : ['input_01', 'input_02', 'input_03', false]} + map inputs = 2; + + // Names to which to bind values returned by this operation. + // Names must be: + // (*) valid identifiers as described above; and + // (*) unique within the current scope. + repeated NamedValueType outputs = 3; + + // Nested blocks for loops and conditionals. For example, + // a conditional block will have two entries here. + repeated Block blocks = 4; + + // Any other information not captured by other fields. + // Keys must be valid identifiers as described above. + map attributes = 5; +} + +// Named Value parameters +// (name, type) pair +message NamedValueType { + // The name of this parameter; must be a valid identifier as described above. + string name = 1; + + // This parameter's required type. + ValueType type = 2; +} + +/* ======== Types ======= */ + +// Primer: Two fundamental representations of state: +// +// Variable: Variables are NEVER materialized at compile time and are only +// available at run time. Therefore, for Variables we only have ValueType, +// which may have unknown shapes in the IR. Variable encompasses familiar +// concepts such as placeholder, output of an Op. +// +// Value: Values are ALWAYS materialized at compile time, and MAY be modified +// at runtime (e.g., during on-device training). Value describes notions +// such as parameter, attributes of an op. Value is either stored inside +// proto (e.g., attributes) or outside of proto (e.g. parameters) and +// NEVER contains unknown shape in the IR. +// +// Comment(daviddai): A Variable with the potential to be materialized at +// compile time (e.g., through constant propagation) does NOT preclude it to +// be a Variable. Certain Ops such as LoadParameter and Const, their output +// has potential to be materialized at compile time but is still represented +// as Variable. 
+ +// A type of any kind +message ValueType { + oneof type { + TensorType tensorType = 1; + ListType listType = 2; + TupleType tupleType = 3; + DictionaryType dictionaryType = 4; + StateType stateType = 5; + } +} + +// Supported data types +enum DataType { + // Comment: Two schemes of specifying field id: just start with 0 + // without reserving numbers, but keep track of the next field ID. The + // other is assign blocks of ID to int / float / uint etc. + + // 0-10 reserved for special types + UNUSED_TYPE = 0; // not currently in use + BOOL = 1; + STRING = 2; // arbitrary sequence of bytes + + // Floats + FLOAT8E4M3FN = 40; + FLOAT8E5M2 = 41; + FLOAT16 = 10; + FLOAT32 = 11; + FLOAT64 = 12; + BFLOAT16 = 13; + + // Ints + INT8 = 21; + INT16 = 22; + INT32 = 23; + INT64 = 24; + INT4 = 25; + + // UInts + UINT8 = 31; + UINT16 = 32; + UINT32 = 33; + UINT64 = 34; + + UINT4 = 35; + UINT2 = 36; + UINT1 = 37; + UINT6 = 38; + UINT3 = 39; +} + +message TensorType { + // The data type stored in a tensor of this type + DataType dataType = 1; + + // The number of dimensions in the tensor shape. rank == -1 implies + // variable (not fixed) rank + int64 rank = 2; + + // Tensor shape values; must be of length "rank" + repeated Dimension dimensions = 3; + + // Any other tensor type attributes not described by other fields. + // Keys must be valid identifiers in MIL text syntax. + map attributes = 4; +} + +message TupleType { + // Recursively define TupleType from ValueType. + repeated ValueType types = 1; +} + +message ListType { + // The type of element stored in a list of this type + ValueType type = 1; + + // The number of elements in a list of this type. 
May be unknown (variable length) + Dimension length = 2; +} + +// An unordered key-value mapping +message DictionaryType { + ValueType keyType = 1; + ValueType valueType = 2; +} + +message StateType { + ValueType wrappedType = 1; +} + +message Dimension { + oneof dimension { + ConstantDimension constant = 1; + UnknownDimension unknown = 2; + } + + message ConstantDimension { + uint64 size = 1; + } + + message UnknownDimension { + bool variadic = 1; + } +} + +/* ======== Values ======= */ + +// See Variable vs Value primer above. +message Value { + string docString = 1; // optional human-readable texts. + ValueType type = 2; + + // An immediate value stored within the proto + message ImmediateValue { + oneof value { + TensorValue tensor = 1; + TupleValue tuple = 2; + ListValue list = 3; + DictionaryValue dictionary = 4; + } + } + + // Reference to a "blob v2" storage file + message BlobFileValue { + // name of file + string fileName = 1; + + // byte offset to metadata + uint64 offset = 2; + } + + oneof value { + ImmediateValue immediateValue = 3; + BlobFileValue blobFileValue = 5; + } +} + +message TensorValue { + oneof value { + RepeatedFloats floats = 1; + RepeatedInts ints = 2; + RepeatedBools bools = 3; + RepeatedStrings strings = 4; + RepeatedLongInts longInts = 5; + RepeatedDoubles doubles = 6; + RepeatedBytes bytes = 7; + } + + message RepeatedFloats { + repeated float values = 1 [packed = true]; + } + + message RepeatedDoubles { + repeated double values = 1 [packed = true]; + } + + message RepeatedInts { + repeated int32 values = 1 [packed = true]; + } + + message RepeatedLongInts { + repeated int64 values = 1 [packed = true]; + } + + message RepeatedBools { + repeated bool values = 1 [packed = true]; + } + + message RepeatedStrings { + repeated string values = 1; + } + + message RepeatedBytes { + bytes values = 1; + } +} + +message TupleValue { + // Comment: TupleValue is recursively defined from Value. 
+ repeated Value values = 1; +} + +message ListValue { + repeated Value values = 1; +} + +message DictionaryValue { + message KeyValuePair { + Value key = 1; + Value value = 2; + } + repeated KeyValuePair values = 1; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Model.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Model.proto new file mode 100644 index 000000000..d44f0b3e9 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Model.proto @@ -0,0 +1,415 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +/* + * A Core ML model consists of a specification version + * and a model description, + * and can be any one of the following types: + * + * Neural Networks + * - `NeuralNetwork` + * + * Regressors + * - ``GLMRegressor`` + * - ``SupportVectorRegressor`` + * - ``TreeEnsembleRegressor`` + * - ``NeuralNetworkRegressor`` + * - ``BayesianProbitRegressor`` + * + * Classifiers + * - `NeuralNetworkClassifier` + * - `TreeEnsembleClassifier` + * - `GLMClassifier` + * - `SupportVectorClassifier` + * - `KNearestNeighborsClassifier` + * + * Other models + * - `CustomModel` + * - `TextClassifier` + * - `WordTagger` + * - `Gazetteer` + * - `WordEmbedding` + * - `VisionFeaturePrint` + * - `LinkedModel` + * - `SoundAnalysisPreprocessing` + * - `ItemSimilarityRecommender` + * - `ClassConfidenceThresholding` + * + * Feature Engineering + * - `Imputer` + * - `Scaler` + * - `Normalizer` + * - `OneHotEncoder` + * - `CategoricalMapping` + * - `FeatureVectorizer` + * - `DictVectorizer` + * - `ArrayFeatureExtractor` + * - `NonMaximumSuppression` + * + * Pipelines + * - `PipelineClassifier` + * - `PipelineRegressor` + * - `Pipeline` + * + * Simple Mathematical Functions + * - `Identity` + */ + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public 
"VisionFeaturePrint.proto"; +import public "AudioFeaturePrint.proto"; +import public "TextClassifier.proto"; +import public "WordTagger.proto"; +import public "Gazetteer.proto"; +import public "WordEmbedding.proto"; +import public "ArrayFeatureExtractor.proto"; +import public "BayesianProbitRegressor.proto"; +import public "CategoricalMapping.proto"; +import public "CustomModel.proto"; +import public "DictVectorizer.proto"; +import public "FeatureTypes.proto"; +import public "FeatureVectorizer.proto"; +import public "GLMRegressor.proto"; +import public "GLMClassifier.proto"; +import public "NearestNeighbors.proto"; +import public "Identity.proto"; +import public "Imputer.proto"; +import public "MIL.proto"; +import public "NeuralNetwork.proto"; +import public "Normalizer.proto"; +import public "OneHotEncoder.proto"; +import public "Scaler.proto"; +import public "NonMaximumSuppression.proto"; +import public "SVM.proto"; +import public "TreeEnsemble.proto"; +import public "Parameters.proto"; +import public "ItemSimilarityRecommender.proto"; +import public "SoundAnalysisPreprocessing.proto"; +import public "LinkedModel.proto"; +import public "ClassConfidenceThresholding.proto"; + +package CoreML.Specification; + +/* + * A pipeline consists of one or more models. + */ +message Pipeline { + repeated Model models = 1; + + // Optional names given for each model + // If not supplied it defaults to ["model0",..., "model"(models.size()-1)] + // These names can be used to disambiguate the scope / domain of a parameter + repeated string names = 2; +} + +/* + * A classifier pipeline. + */ +message PipelineClassifier { + Pipeline pipeline = 1; +} + +/* + * A regressor pipeline. + */ +message PipelineRegressor { + Pipeline pipeline = 1; +} + +/* + * A feature description + * consisting of a name, short description, and type. 
+ */ +message FeatureDescription { + string name = 1; + string shortDescription = 2; + FeatureType type = 3; +} + +/* + * Model metadata, + * consisting of a short description, a version string, + * an author, a license, and any other user defined + * key/value meta data. + */ +message Metadata { + string shortDescription = 1; + string versionString = 2; + string author = 3; + string license = 4; + map userDefined = 100; +} + +/* + * A description of a function. + */ +message FunctionDescription { + // The function name. + string name = 1; + + // Input feature descriptions for the function. + repeated FeatureDescription input = 2; + + // Output feature descriptions for the function. + repeated FeatureDescription output = 3; + + // State feature descriptions for the function. + // + // The `type` of each feature description must be `StateFeatureType`. + repeated FeatureDescription state = 6; + + // [Required for regressor and classifier functions]: the name + // to give to an output feature containing the prediction. + string predictedFeatureName = 4; + + // [Optional for classifier functions]: the name to give to an + // output feature containing a dictionary mapping class + // labels to their predicted probabilities. If not specified, + // the dictionary will not be returned by the model. + string predictedProbabilitiesName = 5; +} + +/* + * A description of a model, + * consisting of descriptions of its input and output features. + * Both regressor and classifier models require the name of the + * primary predicted output feature (``predictedFeatureName``). + * Classifier models can specify the output feature containing + * probabilities for the predicted classes + * (``predictedProbabilitiesName``). + */ +message ModelDescription { + // Functions in the model. + // + // Some model types (e.g. ML Program) support multiple functions. For + // example, a large language model might have "prompt" and "extend" + // functions. 
Each has a different input and output behavior, but + // they are in a same model and share resources. + // + // If the model has more than one function, use the multiple + // function configuration and declare the feature descriptions and + // associated properties at function level. + // + // If the model type doesn't support multiple functions or the + // model has just "main" function, declare the feature + // descriptions and associated properties at the model level. + // + // Note: feature descriptions and associated properties mentioned + // above include input, output, state, predictedFeatureName, + // predictedProbabilitiesName, and trainingInput fields. + repeated FunctionDescription functions = 20; + + // The default function. + // + // The default function is the one that is automatically used when + // one doesn't explicitly specify. + // + // The value must be one of the names in `functions` message + // above. If `functions` is empty, this field must not be present. + string defaultFunctionName = 21; + + // The metadata (e.g. author, licence, etc) of the model. + Metadata metadata = 100; + + // Use these fields below only when `functions` above is empty. + + repeated FeatureDescription input = 1; + repeated FeatureDescription output = 10; + + // State feature descriptions for the function. + // + // The `type` of each feature description must be `StateFeatureType`. + repeated FeatureDescription state = 13; + + // [Required for regressor and classifier models]: the name + // to give to an output feature containing the prediction. + string predictedFeatureName = 11; + + // [Optional for classifier models]: the name to give to an + // output feature containing a dictionary mapping class + // labels to their predicted probabilities. If not specified, + // the dictionary will not be returned by the model. 
+ string predictedProbabilitiesName = 12; + + repeated FeatureDescription trainingInput = 50; +} + +message SerializedModel { + // Identifier whose content describes the model type of the serialized protocol buffer message. + string identifier = 1; + + // Must be a valid serialized protocol buffer of the above specified type. + bytes model = 2; +} + +/* + * A Core ML model, + * consisting of a specification version, + * a model description, and a model type. + * + * Core ML model compatibility is indicated by + * a monotonically increasing specification version number, + * which is incremented anytime a backward-incompatible change is made + * (this is functionally equivalent to the MAJOR version number + * described by `Semantic Versioning 2.0.0 `_). + * + * Specification Versions : OS Availability (Core ML Version) + * + * 1 : iOS 11, macOS 10.13, tvOS 11, watchOS 4 (Core ML 1) + * - Feedforward & Recurrent Neural Networks + * - General Linear Models + * - Tree Ensembles + * - Support Vector Machines + * - Pipelines + * - Feature Engineering + * + * 2 : iOS 11.2, macOS 10.13.2, tvOS 11.2, watchOS 4.2 (Core ML 1.2) + * - Custom Layers for Neural Networks + * - Float 16 support for Neural Network layers + * + * 3 : iOS 12, macOS 10.14, tvOS 12, watchOS 5 (Core ML 2) + * - Flexible shapes and image sizes + * - Categorical sequences + * - Core ML Vision Feature Print, Text Classifier, Word Tagger + * - Non Max Suppression + * - Crop and Resize Bilinear NN layers + * - Custom Models + * + * 4 : iOS 13, macOS 10.15, tvOS 13, watchOS 6 (Core ML 3) + * - Updatable models + * - Exact shape / general rank mapping for neural networks + * - Large expansion of supported neural network layers + * - Generalized operations + * - Control flow + * - Dynamic layers + * - See NeuralNetwork.proto + * - Nearest Neighbor Classifier + * - Sound Analysis Prepreocessing + * - Recommender + * - Linked Model + * - NLP Gazeteer + * - NLP WordEmbedding + * + * 5 : iOS 14, macOS 11, tvOS 14, 
watchOS 7 (Core ML 4) + * - Model Deployment + * - Model Encryption + * - Unified converter API with PyTorch and Tensorflow 2 Support in coremltools 4 + * - MIL builder for neural networks and composite ops in coremltools 4 + * - New layers in neural network: + * - CumSum + * - OneHot + * - ClampedReLu + * - ArgSort + * - SliceBySize + * - Convolution3D + * - Pool3D + * - Bilinear Upsample with align corners and fractional factors + * - PixelShuffle + * - MatMul with int8 weights and int8 activations + * - Concat interleave + * - See NeuralNetwork.proto + * - Enhanced Xcode model view with interactive previews + * - Enhanced Xcode Playground support for Core ML models + * + * 6 : iOS 15, macOS 12, tvOS 15, watchOS 8 (Core ML 5) + * - Core ML Audio Feature Print + * - new type of model: mlprogram (MILSpec.Program) + * + * 7 : iOS 16, macOS 13, tvOS 16, watchOS 9 (Core ML 6) + * - FLOAT16 array data type + * - GRAYSCALE_FLOAT16 image color space. + * + * 8 : iOS 17, macOS 14, tvOS 17, watchOS 10 (Core ML 7) + * - iOS 17 ops + * - Scene print v2 + * - ClassConfidenceThresholding model + * + * 9 : iOS 18, macOS 15, tvOS 18, watchOS 11 (Core ML 8) + * - multiple functions + * + * 10 : iOS 26, macOS 26, tvOS 26, watchOS 26, visionOS 26 (Core ML 9) + * - Int8 MultiArray types for ML Programs + */ +message Model { + int32 specificationVersion = 1; + ModelDescription description = 2; + + /* + * Following model types support on-device update: + * + * - NeuralNetworkClassifier + * - NeuralNetworkRegressor + * - NeuralNetwork + * - KNearestNeighborsClassifier + */ + bool isUpdatable = 10; + + // start at 200 here + // model specific parameters: + oneof Type { + // pipeline starts at 200 + PipelineClassifier pipelineClassifier = 200; + PipelineRegressor pipelineRegressor = 201; + Pipeline pipeline = 202; + + // regressors start at 300 + GLMRegressor glmRegressor = 300; + SupportVectorRegressor supportVectorRegressor = 301; + TreeEnsembleRegressor treeEnsembleRegressor = 302; + 
NeuralNetworkRegressor neuralNetworkRegressor = 303; + BayesianProbitRegressor bayesianProbitRegressor = 304; + + // classifiers start at 400 + GLMClassifier glmClassifier = 400; + SupportVectorClassifier supportVectorClassifier = 401; + TreeEnsembleClassifier treeEnsembleClassifier = 402; + NeuralNetworkClassifier neuralNetworkClassifier = 403; + KNearestNeighborsClassifier kNearestNeighborsClassifier = 404; + + // generic models start at 500 + NeuralNetwork neuralNetwork = 500; + ItemSimilarityRecommender itemSimilarityRecommender = 501; + MILSpec.Program mlProgram = 502; + + // Custom and linked models + CustomModel customModel = 555; + LinkedModel linkedModel = 556; + + // Precision Recall Curve 'container'' + ClassConfidenceThresholding classConfidenceThresholding = 560; + + // feature engineering starts at 600 + OneHotEncoder oneHotEncoder = 600; + Imputer imputer = 601; + FeatureVectorizer featureVectorizer = 602; + DictVectorizer dictVectorizer = 603; + Scaler scaler = 604; + CategoricalMapping categoricalMapping = 606; + Normalizer normalizer = 607; + ArrayFeatureExtractor arrayFeatureExtractor = 609; + NonMaximumSuppression nonMaximumSuppression = 610; + + + // simple mathematical functions used for testing start at 900 + Identity identity = 900; + + // reserved until 1000 + + // CoreML provided models + CoreMLModels.TextClassifier textClassifier = 2000; + CoreMLModels.WordTagger wordTagger = 2001; + CoreMLModels.VisionFeaturePrint visionFeaturePrint = 2002; + CoreMLModels.SoundAnalysisPreprocessing soundAnalysisPreprocessing = 2003; + CoreMLModels.Gazetteer gazetteer = 2004; + CoreMLModels.WordEmbedding wordEmbedding = 2005; + CoreMLModels.AudioFeaturePrint audioFeaturePrint = 2006; + + // Reserved private messages start at 3000 + // These messages are subject to change with no notice or support. 
+ SerializedModel serializedModel = 3000; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/NearestNeighbors.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/NearestNeighbors.proto new file mode 100644 index 000000000..d7f2a60f9 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/NearestNeighbors.proto @@ -0,0 +1,132 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +import public "DataStructures.proto"; +import public "Parameters.proto"; + +/* + * A k-Nearest-Neighbor classifier + */ +message KNearestNeighborsClassifier { + + /* + * The "core" nearest neighbor model attributes. + */ + NearestNeighborsIndex nearestNeighborsIndex = 1; + + /* + * Number of neighbors to use for classification. + */ + Int64Parameter numberOfNeighbors = 3; + + /* + * Type of labels supported by the model. Currently supports String or Int64 + * labels. + */ + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } + + /* + * Default value of class label (useful when prediction is called on an empty kNN classifier) + */ + oneof DefaultClassLabel { + string defaultStringLabel = 110; + int64 defaultInt64Label = 111; + } + + /* + * Weighting scheme to be used when computing the majority label of a + * new data point. + */ + oneof WeightingScheme { + UniformWeighting uniformWeighting = 200; + InverseDistanceWeighting inverseDistanceWeighting = 210; + } +} + +/* + * The "core" attributes of a Nearest Neighbors model. + */ +message NearestNeighborsIndex { + + /* + * Number of dimensions of the input data. + */ + int32 numberOfDimensions = 1; + + /* + * Vector of floating point data that makes up the model. 
Each data point must have 'numberOfDimensions' + * dimensions. + */ + repeated FloatVector floatSamples = 2; + + /* + * Backing data structure for the Nearest Neighbors Index. Currently supports + * a linear index or a kd-tree index. + */ + oneof IndexType { + LinearIndex linearIndex = 100; + SingleKdTreeIndex singleKdTreeIndex = 110; + } + + /* + * Distance function to be used to find neighbors. Currently only Squared Euclidean + * Distance is supported. + */ + oneof DistanceFunction { + SquaredEuclideanDistance squaredEuclideanDistance = 200; + } + +} + +/* + * Specifies a uniform weighting scheme (i.e. each neighbor receives equal + * voting power). + */ +message UniformWeighting { +} + + +/* + * Specifies a inverse-distance weighting scheme (i.e. closest neighbors receives higher + * voting power). A nearest neighbor with highest sum of (1 / distance) is picked. + */ +message InverseDistanceWeighting { +} + + +/* + * Specifies a flat index of data points to be searched by brute force. + */ +message LinearIndex { +} + + +/* + * Specifies a kd-tree backend for the nearest neighbors model. + */ +message SingleKdTreeIndex { + + /* + * Number of data points contained within a leaf node of the kd-tree. + */ + int32 leafSize = 1; + +} + + +/* + * Specifies the Squared Euclidean Distance function. + */ +message SquaredEuclideanDistance { +} + diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/NeuralNetwork.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/NeuralNetwork.proto new file mode 100644 index 000000000..f2bdb68c0 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/NeuralNetwork.proto @@ -0,0 +1,6531 @@ +// Copyright (c) 2017-2019, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +/* + * A neural network is defined through a collection of layers + * and represents a directed acyclic graph (DAG). + * Each layer has a name, a layer type, + * a list of input names, a list of output names, + * and a collection of parameters specific to the layer type. + * + * The graph structure and connectivity of the neural network + * is inferred from the input and output names. + * A neural network starts with the layer + * whose input name is equal to the value specified in + * ``Model.description.input.name``, + * and ends with the layer + * whose output name is equal to the value specified in + * ``Model.description.output.name``. + * Layers must have unique input and output names, + * and a layer may not have input or output names that + * refer to layers that are not yet defined. + * + * For Core ML specification version <=3, + * all inputs are mapped to static rank 5 tensors, with axis notations + * [Sequence, Batch, Channel, Height, Width]. + * + * From specification version 4 onwards (iOS >= 13, macOS >= 10.15), more options are available + * (see enums ``NeuralNetworkMultiArrayShapeMapping``, ``NeuralNetworkImageShapeMapping``) + * to map inputs to generic N-Dimensional (or N rank) tensors, where N >= 1. + * + * Each layer type may have specific constraints on the ranks of its inputs and outputs. + * + * Some of the layers (such as softmax, reduce, etc) have parameters that have been described in + * terms of notational axis "Channel", "Height", "Width" or "Sequence". They can be re-interpreted easily in + * the general ND setting by using the following rule: + * "width" is same as axis = -1 (i.e. the last axis from the end) + * "height" is same as axis = -2 (i.e. the second last axis from the end) + * "channel" is same as axis = -3 (i.e. 
the third last axis from the end) + * "sequence" is same as axis = -5 (i.e. the fifth last axis from the end) + * + * Several layers are available in 3 different variations, with the names ending + * in identifiers: ``like``, ``static`` and ``dynamic``. For instance, ``FillLike``, + * ``FillStatic`` and ``FillDynamic``. The ``static`` variation generally will have + * a property corresponding to the shape of the output. For instance, if the + * output of the ``FillStatic`` layer is desired to be of shape (10, 4), the + * property ``targetShape`` will have to be set to [10, 4]. In the ``dynamic`` case, + * the shape is an input, hence it can be changed at runtime. For instance, for + * a ``FillDynamic`` layer, the input would have to be an array containing the + * values 10 and 4, if the desired output is of shape (10, 4). Whereas in the + * ``like`` case, the additional input's shape is used as the output shape, ignoring + * its values. For instance, for a ``FillLike`` layer, for an input with shape + * (10, 4), the output generated will also be of shape (10, 4), values of the + * input will be ignored. + */ + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; +import public "Parameters.proto"; + +package CoreML.Specification; + + +enum NeuralNetworkMultiArrayShapeMapping { + + /* + * Describes how the MultiArray shape for the inputs, + * provided in Features Types proto via model description, + * is mapped to construct tensors that are fed into the Neural Network layers. + */ + + /* + * Default legacy value. Only supported for Core ML Specification version <= 3. + * + * The default legacy shape mapping resolves all input shapes to a rank 5 equivalent + * with axis notation of [Seq, Batch, Channel, Height, Width]. + * + * When this enum value is selected, + * the repeated shape field in the message "ArrayFeatureType" in feature types proto, + * must be either length 1 or length 3. 
+ * + * The following rule is used to map the values in the shape field to the actual tensor shape: + * rank 1 shape is mapped to shape [1,1,C,1,1] + * rank 3 shape is mapped to shape [1,1,C,H,W] + * At runtime, the first two dimensions (Seq or Batch) can be presented as well, with non-1 values. + * + * It is invalid to use this enum value if any of the layers added + * Specification version 4 (iOS >= 13, macOS >= 10.15) onwards are used in the network. + * Validator will raise an error in that case. + */ + RANK5_ARRAY_MAPPING = 0; + + /* + * The exact shape and rank (i.e. number of dimensions in the shape) of the input, + * as specified in the message "ArrayFeatureType", is passed through to the layers. + * Supported only for Specification version >= 4 (iOS >= 13, macOS >= 10.15). + */ + EXACT_ARRAY_MAPPING = 1; + +} + +enum NeuralNetworkImageShapeMapping { + + /* + * Describes how the shape of the input tensors is constructed from image inputs. + */ + + /* + * In this case, image input is mapped to a rank 5 tensor. + * For Color images, input tensor is shaped as [1,1,3,H,W]. + * For Gray images, input tensor is shaped as [1,1,1,H,W]. + */ + RANK5_IMAGE_MAPPING = 0; + + /* + * For Color images, input tensor is shaped as [1,3,H,W]. + * For Gray images, input tensor is shaped as [1,1,H,W]. + * Supported only for Specification version >= 4 (iOS >= 13, macOS >= 10.15). + */ + RANK4_IMAGE_MAPPING = 1; + +} + +/* + A neural network. 
+ */ +message NeuralNetwork { + + repeated NeuralNetworkLayer layers = 1; + repeated NeuralNetworkPreprocessing preprocessing = 2; + + // use this enum value to determine the input tensor shapes to the neural network, for multiarray inputs + NeuralNetworkMultiArrayShapeMapping arrayInputShapeMapping = 5; + + // use this enum value to determine the input tensor shapes to the neural network, for image inputs + NeuralNetworkImageShapeMapping imageInputShapeMapping = 6; + + + NetworkUpdateParameters updateParams = 10; + +} + +// Preprocessing +// ------------- + +/* + * A neural network preprocessor that + * performs a scalar multiplication of an image + * followed by addition of scalar biases to the channels. + * + * Input: X + * An image in BGR or RGB format with shape ``[3, H, W]`` + * or in grayscale format with shape ``[1, H, W]``. + * Output: Y + * An image with format and shape corresponding to the input. + * + * If the input image is in BGR format: + * + * .. code:: + * + * Y[0, :, :] = channelScale * X[0, :, :] + blueBias + * Y[1, :, :] = channelScale * X[1, :, :] + greenBias + * Y[2, :, :] = channelScale * X[2, :, :] + redBias + * + * If the input image is in RGB format: + * + * .. code:: + * + * Y[0, :, :] = channelScale * X[0, :, :] + redBias + * Y[1, :, :] = channelScale * X[1, :, :] + greenBias + * Y[2, :, :] = channelScale * X[2, :, :] + blueBias + * + * If the input image is in grayscale format: + * + * .. code:: + * + * Y[0, :, :] = channelScale * X[0, :, :] + grayBias + */ +message NeuralNetworkImageScaler { + + float channelScale = 10; // Scalar to be multiplied. + float blueBias = 20; // Scalar blue bias to be added. + float greenBias = 21; // Scalar green bias to be added. + float redBias = 22; // Scalar red bias to be added. + float grayBias = 30; // Scalar bias to be added for grayscale images. + +} + +/* + * A neural network preprocessor that + * subtracts the provided mean image from the input image. 
+ * The mean image is subtracted from the input named + * ``NeuralNetworkPreprocessing.featureName``. + */ +message NeuralNetworkMeanImage { + + /* + * Mean image stored as a flattened array of floats, + * representing shape [Channel,Height,Width]. + */ + repeated float meanImage = 1; + +} + +// Preprocessing parameters for image inputs. +message NeuralNetworkPreprocessing { + + string featureName = 1; // must be equal to the input name to which the preprocessing is applied + oneof preprocessor { + NeuralNetworkImageScaler scaler = 10; + NeuralNetworkMeanImage meanImage = 11; + } + +} + +// Activation Functions +// -------------------- + +/* + * A rectified linear unit (ReLU) activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \text{max}(0, x) + */ +message ActivationReLU { + +} + +/* + * A leaky rectified linear unit (ReLU) activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \begin{cases} + * x & \text{if } x \geq 0 \\ + * \alpha x & \text{if } x < 0 + * \end{cases} + */ +message ActivationLeakyReLU { + + float alpha = 1; //negative slope value for leakyReLU + +} + +/* + * A hyperbolic tangent activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \dfrac{1 - e^{-2x}}{1 + e^{-2x}} + */ +message ActivationTanh { + +} + +/* + * A scaled hyperbolic tangent activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \alpha \tanh(\beta x) + */ +message ActivationScaledTanh { + + float alpha = 1; + float beta = 2; + +} + +/* + * A sigmoid activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \dfrac{1}{1 + e^{-x}} + */ +message ActivationSigmoid { + +} + +/* + * A linear activation function. + * + * This function has the following formula: + * + * .. 
math:: + * f(x) = \alpha x + \beta + */ +message ActivationLinear { + + float alpha = 1; + float beta = 2; + +} + +/* + * A hard sigmoid activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \text{min}(\text{max}(\alpha x + \beta, 0), 1) + */ +message ActivationSigmoidHard { + + float alpha = 1; + float beta = 2; + +} + +/* + * A parameterized rectified linear unit (PReLU) activation function. + * Input must be at least rank 3. Axis = -3 is denoted by "C", or channels. + * "alpha" parameter can be a vector of length C. + * + * This function has the following formula: + * + * .. math:: + * f(x_i) = \begin{cases} + * x_i & \text{if } x_i \geq 0 \\ + * \alpha_i x_i & \text{if } x_i < 0 + * \end{cases} \;,\;i=1,...,C + */ +message ActivationPReLU { + + // parameter of length C or 1. + // If length is 1, same value is used for all channels + WeightParams alpha = 1; + +} + +/* + * An exponential linear unit (ELU) activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \begin{cases} + * x & \text{if } x \geq 0 \\ + * \alpha (e^x - 1) & \text{if } x < 0 + * \end{cases} + */ +message ActivationELU { + + float alpha = 1; + +} + +/* + * A thresholded rectified linear unit (ReLU) activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \begin{cases} + * x & \text{if } x \geq \alpha \\ + * 0 & \text{if } x < \alpha + * \end{cases} + */ +message ActivationThresholdedReLU { + + float alpha = 1; + +} + +/* + * A softsign activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \dfrac{x}{1 + |x|} + */ +message ActivationSoftsign { + +} + +/* + * A softplus activation function. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \text{log}(1 + e^x) + */ +message ActivationSoftplus { + +} + +/* + * A parametric softplus activation function. + * Input must be at least rank 3. 
axis = -3 is denoted by "C", or channels. + * "alpha"/"beta" parameter can be a vector of length C. + * + * This function has the following formula: + * + * .. math:: + * f(x_i) = \alpha_i \text{log}(1 + e^{\beta_i x_i}) \;,\;i=1,...,C + */ +message ActivationParametricSoftplus { + + // If length is 1, same value is used for all channels + WeightParams alpha = 1; //parameter of length C or 1 + WeightParams beta = 2; //parameter of length C or 1 + +} + +message ActivationParams { + + oneof NonlinearityType { + ActivationLinear linear = 5; + + ActivationReLU ReLU = 10; + ActivationLeakyReLU leakyReLU = 15; + ActivationThresholdedReLU thresholdedReLU = 20; + ActivationPReLU PReLU = 25; + + ActivationTanh tanh = 30; + ActivationScaledTanh scaledTanh = 31; + + ActivationSigmoid sigmoid = 40; + ActivationSigmoidHard sigmoidHard = 41; + + ActivationELU ELU = 50; + + ActivationSoftsign softsign = 60; + ActivationSoftplus softplus = 70; + ActivationParametricSoftplus parametricSoftplus = 71; + } + +} + +/* + * Representation of the intermediate tensors + */ +message Tensor { + + // Number of dimensions in the tensor shape + uint32 rank = 1; + // actual value of the tensor shape. + // must be of length "rank". Can contain -1s for unknown dimensions. + repeated int64 dimValue = 2; + +} + +/* + * A single neural network layer. + */ +message NeuralNetworkLayer { + + string name = 1; //descriptive name of the layer + repeated string input = 2; + repeated string output = 3; + + repeated Tensor inputTensor = 4; // must be the same length as the "input" field + repeated Tensor outputTensor = 5; // must be the same length as the "output" field + + // Must be set to true to mark the layer as updatable. 
+ // If true, the weightParams in the layer's properties must also be set to updatable + // If false, the value of the isUpdatable parameter within the layer's weights are ignored + bool isUpdatable = 10; + + oneof layer { + + // Start at 100 here + ConvolutionLayerParams convolution = 100; + + PoolingLayerParams pooling = 120; + + ActivationParams activation = 130; + + InnerProductLayerParams innerProduct = 140; + EmbeddingLayerParams embedding = 150; + + // Normalization-related Layers + BatchnormLayerParams batchnorm = 160; + MeanVarianceNormalizeLayerParams mvn = 165; + L2NormalizeLayerParams l2normalize = 170; + SoftmaxLayerParams softmax = 175; + LRNLayerParams lrn = 180; + + CropLayerParams crop = 190; + PaddingLayerParams padding = 200; + UpsampleLayerParams upsample = 210; + + ResizeBilinearLayerParams resizeBilinear = 211; + CropResizeLayerParams cropResize = 212; + + UnaryFunctionLayerParams unary = 220; + + // Element-wise Operations + AddLayerParams add = 230; + MultiplyLayerParams multiply = 231; + + AverageLayerParams average = 240; + ScaleLayerParams scale = 245; + + BiasLayerParams bias = 250; + MaxLayerParams max = 260; + MinLayerParams min = 261; + + DotProductLayerParams dot = 270; + ReduceLayerParams reduce = 280; + LoadConstantLayerParams loadConstant = 290; + + // Data Reorganization + ReshapeLayerParams reshape = 300; + FlattenLayerParams flatten = 301; + PermuteLayerParams permute = 310; + ConcatLayerParams concat = 320; + SplitLayerParams split = 330; + SequenceRepeatLayerParams sequenceRepeat = 340; + + ReorganizeDataLayerParams reorganizeData = 345; + SliceLayerParams slice = 350; + + // Recurrent Layers + SimpleRecurrentLayerParams simpleRecurrent = 400; + GRULayerParams gru = 410; + UniDirectionalLSTMLayerParams uniDirectionalLSTM = 420; + BiDirectionalLSTMLayerParams biDirectionalLSTM = 430; + + // Custom (user-implemented) Layer + CustomLayerParams custom = 500; + + // Following layers are available only after Core ML Specification + 
// version >= 4 (iOS >= 13, macOS >= 10.15) + + // Control Flow related Layers + CopyLayerParams copy = 600; + BranchLayerParams branch = 605; + + LoopLayerParams loop = 615; + LoopBreakLayerParams loopBreak = 620; + LoopContinueLayerParams loopContinue = 625; + + RangeStaticLayerParams rangeStatic = 635; + RangeDynamicLayerParams rangeDynamic = 640; + + // Element-wise Unary Layers + ClipLayerParams clip = 660; + CeilLayerParams ceil = 665; + FloorLayerParams floor = 670; + + SignLayerParams sign = 680; + RoundLayerParams round = 685; + + Exp2LayerParams exp2 = 700; + + SinLayerParams sin = 710; + CosLayerParams cos = 715; + TanLayerParams tan = 720; + + AsinLayerParams asin = 730; + AcosLayerParams acos = 735; + AtanLayerParams atan = 740; + + SinhLayerParams sinh = 750; + CoshLayerParams cosh = 755; + TanhLayerParams tanh = 760; + + AsinhLayerParams asinh = 770; + AcoshLayerParams acosh = 775; + AtanhLayerParams atanh = 780; + + ErfLayerParams erf = 790; + GeluLayerParams gelu = 795; + + // Element-wise Binary with Broadcasting Support + EqualLayerParams equal = 815; + NotEqualLayerParams notEqual = 820; + LessThanLayerParams lessThan = 825; + LessEqualLayerParams lessEqual = 827; + GreaterThanLayerParams greaterThan = 830; + GreaterEqualLayerParams greaterEqual = 832; + + LogicalOrLayerParams logicalOr = 840; + LogicalXorLayerParams logicalXor = 845; + LogicalNotLayerParams logicalNot = 850; + LogicalAndLayerParams logicalAnd = 855; + + ModBroadcastableLayerParams modBroadcastable = 865; + MinBroadcastableLayerParams minBroadcastable = 870; + MaxBroadcastableLayerParams maxBroadcastable = 875; + AddBroadcastableLayerParams addBroadcastable = 880; + PowBroadcastableLayerParams powBroadcastable = 885; + DivideBroadcastableLayerParams divideBroadcastable = 890; + FloorDivBroadcastableLayerParams floorDivBroadcastable = 895; + MultiplyBroadcastableLayerParams multiplyBroadcastable = 900; + SubtractBroadcastableLayerParams subtractBroadcastable = 905; + + // Tensor 
Manipulations + TileLayerParams tile = 920; + StackLayerParams stack = 925; + GatherLayerParams gather = 930; + ScatterLayerParams scatter = 935; + GatherNDLayerParams gatherND = 940; + ScatterNDLayerParams scatterND = 945; + SoftmaxNDLayerParams softmaxND = 950; + GatherAlongAxisLayerParams gatherAlongAxis = 952; + ScatterAlongAxisLayerParams scatterAlongAxis = 954; + + ReverseLayerParams reverse = 960; + ReverseSeqLayerParams reverseSeq = 965; + + SplitNDLayerParams splitND = 975; + ConcatNDLayerParams concatND = 980; + TransposeLayerParams transpose = 985; + + SliceStaticLayerParams sliceStatic = 995; + SliceDynamicLayerParams sliceDynamic = 1000; + SlidingWindowsLayerParams slidingWindows = 1005; + + TopKLayerParams topK = 1015; + ArgMinLayerParams argMin = 1020; + ArgMaxLayerParams argMax = 1025; + + EmbeddingNDLayerParams embeddingND = 1040; + BatchedMatMulLayerParams batchedMatmul = 1045; + + // Tensor Allocation / Reshape-related Operations + GetShapeLayerParams getShape = 1065; + LoadConstantNDLayerParams loadConstantND = 1070; + + FillLikeLayerParams fillLike = 1080; + FillStaticLayerParams fillStatic = 1085; + FillDynamicLayerParams fillDynamic = 1090; + + BroadcastToLikeLayerParams broadcastToLike = 1100; + BroadcastToStaticLayerParams broadcastToStatic = 1105; + BroadcastToDynamicLayerParams broadcastToDynamic = 1110; + + SqueezeLayerParams squeeze = 1120; + ExpandDimsLayerParams expandDims = 1125; + FlattenTo2DLayerParams flattenTo2D = 1130; + ReshapeLikeLayerParams reshapeLike = 1135; + ReshapeStaticLayerParams reshapeStatic = 1140; + ReshapeDynamicLayerParams reshapeDynamic = 1145; + RankPreservingReshapeLayerParams rankPreservingReshape = 1150; + + ConstantPaddingLayerParams constantPad = 1155; + + // Random Distributions + RandomNormalLikeLayerParams randomNormalLike = 1170; + RandomNormalStaticLayerParams randomNormalStatic = 1175; + RandomNormalDynamicLayerParams randomNormalDynamic = 1180; + + RandomUniformLikeLayerParams randomUniformLike = 
1190; + RandomUniformStaticLayerParams randomUniformStatic = 1195; + RandomUniformDynamicLayerParams randomUniformDynamic = 1200; + + RandomBernoulliLikeLayerParams randomBernoulliLike = 1210; + RandomBernoulliStaticLayerParams randomBernoulliStatic = 1215; + RandomBernoulliDynamicLayerParams randomBernoulliDynamic = 1220; + + CategoricalDistributionLayerParams categoricalDistribution = 1230; + + // Reduction-related Layers: + ReduceL1LayerParams reduceL1 = 1250; + ReduceL2LayerParams reduceL2 = 1255; + ReduceMaxLayerParams reduceMax = 1260; + ReduceMinLayerParams reduceMin = 1265; + ReduceSumLayerParams reduceSum = 1270; + ReduceProdLayerParams reduceProd = 1275; + ReduceMeanLayerParams reduceMean = 1280; + ReduceLogSumLayerParams reduceLogSum = 1285; + ReduceSumSquareLayerParams reduceSumSquare = 1290; + ReduceLogSumExpLayerParams reduceLogSumExp = 1295; + + // Masking / Selection Layers + WhereNonZeroLayerParams whereNonZero = 1313; + MatrixBandPartLayerParams matrixBandPart = 1315; + LowerTriangularLayerParams lowerTriangular = 1320; + UpperTriangularLayerParams upperTriangular = 1325; + WhereBroadcastableLayerParams whereBroadcastable = 1330; + + // Normalization Layers + LayerNormalizationLayerParams layerNormalization = 1350; + + NonMaximumSuppressionLayerParams NonMaximumSuppression = 1400; + + // Following layers are available only after Core ML Specification + // version >= 5 (iOS >= 14, macOS >= 11.0) + OneHotLayerParams oneHot = 1450; + CumSumLayerParams cumSum = 1455; + ClampedReLULayerParams clampedReLU = 1460; + ArgSortLayerParams argSort = 1461; + Pooling3DLayerParams pooling3d = 1465; + GlobalPooling3DLayerParams globalPooling3d = 1466; + SliceBySizeLayerParams sliceBySize = 1470; + Convolution3DLayerParams convolution3d = 1471; + + } + +} + +/* + * Branching Layer + * + * A layer that provides the functionality of branching or an If-Else block. + * + * Must have 1 input. 
There are no outputs as the execution is transferred to either the + * if or the else branch based on the value of the input. + * + * Input is the condition predicate. Must be a scalar (length 1 tensor). + * + */ +message BranchLayerParams { + + /* + * execute this graph if the absolute value of the input Tensor is greater than 1e-6 + * This must be present. + */ + NeuralNetwork ifBranch = 1; + /* + * execute this graph if the absolute value of the input Tensor is less than 1e-6 + * This is optional. + */ + NeuralNetwork elseBranch = 2; + +} + +/* + * Loop Layer + * + * A layer that provides the functionality of a "for" loop or a "while" loop. + * + * There are either no inputs or 1 input. When an input is present, it corresponds to the maximum loop count, + * in that case the value of the "maxLoopIterations" field is ignored. Input must be a scalar. + * (For description below, maxLoopIterations is assumed to be the value of the input, when its present) + * + * No outputs are produced. Blobs produced by the condition or the body network are visible in the scope of the overall network. + * + * "conditionNetwork" must produce a tensor with the name specified in the "conditionVar" field. + * + * There are 3 possible cases for determining the termination condition: + * + * Case 1: + * + * If there is no "conditionNetwork", in this case the layer corresponds to a pure for loop, which is run "maxLoopIterations" number of times. + * Equivalent pseudo-code: + * + * for loopIterator = 0 : maxLoopIterations + * bodyNetwork() + * + * + * Case 2: + * + * "conditionNetwork" is present, and "maxLoopIterations" is 0 and there is no input, + * in this case the layer corresponds to a while loop. 
Equivalent pseudo-code: + * + * conditionVar = conditionNetwork() + * while conditionVar: + * bodyNetwork() + * conditionVar = conditionNetwork() + * + * + * Case 3: + * + * "conditionNetwork" is provided, and "maxLoopIterations" is positive or there is an input, + * in this case the layer corresponds to a while loop with a joint condition. Equivalent pseudo-code: + * + * loopIterator = 0 + * conditionVar = conditionNetwork() + * while (conditionVar and loopIterator < maxLoopIterations): + * bodyNetwork() + * loopIterator = loopIterator + 1 + * conditionVar = conditionNetwork() + * + */ +message LoopLayerParams { + + /* + * maximum number of iterations. Ignored if input is present. + */ + uint64 maxLoopIterations = 1; + /* + * This field provides the name of the tensor which is produced by the conditionNetwork + * and whose value is checked to start/continue/terminate the loop. Value close to 0.0f is treated as False. + * This field is optional. + * Must be a non empty string if and only if "conditionNetwork" is present. + */ + string conditionVar = 2; + /* + * Must generate a tensor with the name provided in the "conditionVar" field. + * This field is optional. + * Must be present if and only if "conditionVar" field is a non empty string. + */ + NeuralNetwork conditionNetwork = 3; + /* + * Body of the loop. + * This field must be present. + */ + NeuralNetwork bodyNetwork = 4; + +} + +/* + * Loop break Layer + * + * Terminate the loop that has this layer. + * If present, it should always reside in the "bodyNetwork" of the loop layer + * + * No inputs/outputs + * + */ +message LoopBreakLayerParams { + +} + +/* + * Loop Continue Layer + * + * Stop the current loop iteration and continue on the next iteration. + * If present, it should always reside in the "bodyNetwork" of the loop layer + * + * No inputs/outputs + * + */ +message LoopContinueLayerParams { + +} + +/* + * Copy Layer + * + * A layer that copies its input tensor to the output tensor. 
+ * Must have 1 input and 1 output, with distinct names. + * This is the only layer that is allowed to re-generate an output that is already present in the neural network prior to this layer, + * in which case it will overwrite the output tensor. + * + */ +message CopyLayerParams { + +} + +/* + * GreaterThan Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise greater than operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = x1 > x2 + * or + * y = x1 > alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message GreaterThanLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 2; + +} + +/* + * GreaterEqual Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise greater equal operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = x1 >= x2 + * or + * y = x1 >= alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message GreaterEqualLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 2; + +} + +/* + * LessThan Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise less than operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = x1 < x2 + * or + * y = x1 < alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message LessThanLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 2; + +} + +/* + * LessEqual Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise less equal operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. 
code:: + * + * y = x1 <= x2 + * or + * y = x1 <= alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message LessEqualLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 2; + +} + +/* + * Equal Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise equal operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = x1 == x2 + * or + * y = x1 == alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message EqualLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 1; + +} + +/* + * NotEqual Layer + * + * Either 1 or 2 inputs. + * Produces 1 output. + * Perform elementwise not equal operation. + * + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = x1 != x2 + * or + * y = x1 != alpha, if only one input is provided + * + * Broadcasting is supported. + * + */ +message NotEqualLayerParams { + + /* + * Compare to the scalar value provided here if there is 1 input + */ + float alpha = 1; + +} + +/* + * LogicalAnd Layer + * + * Must have 2 inputs, produces 1 output. + * Perform elementwise logical AND operation. + * + * Input is considered False if equal to 0.0f otherwise True. + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = AND(x1, x2) + * + * Broadcasting is supported. + * + */ +message LogicalAndLayerParams { + +} + +/* + * LogicalOr Layer + * + * Must have 2 inputs, produces 1 output. + * Perform elementwise logical OR operation. + * + * Input is considered False if equal to 0.0f otherwise True. + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = OR(x1, x2) + * + * Broadcasting is supported. + * + */ +message LogicalOrLayerParams { + +} + +/* + * LogicalXor Layer + * + * Must have 2 inputs, produces 1 output. 
+ * Perform elementwise logical XOR operation. + * + * Input is considered False if equal to 0.0f otherwise True. + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = XOR(x1, x2) + * + * Broadcasting is supported. + * + */ +message LogicalXorLayerParams { + +} + +/* + * LogicalNot Layer + * + * Must have 1 input, produces 1 output. + * Perform elementwise logical NOT operation. + * + * Input is considered False if equal to 0.0f otherwise True. + * Output is 1.0f if the condition is true otherwise 0.0f. + * + * .. code:: + * + * y = NOT(x) + * + * + */ +message LogicalNotLayerParams { + +} + +// Border Amounts +// -------------- + +/* + * Specifies the amount of spatial border to be either padded or cropped. + * + * For padding: + * + * .. code:: + * + * H_out = borderAmounts[0].startEdgeSize + H_in + borderAmounts[0].endEdgeSize + * W_out = borderAmounts[1].startEdgeSize + W_in + borderAmounts[1].endEdgeSize + * + * topPaddingAmount == Height startEdgeSize + * bottomPaddingAmount == Height endEdgeSize + * leftPaddingAmount == Width startEdgeSize + * rightPaddingAmount == Width endEdgeSize + * + * For cropping: + * + * .. code:: + * + * H_out = (-borderAmounts[0].startEdgeSize) + H_in + (-borderAmounts[0].endEdgeSize) + * W_out = (-borderAmounts[1].startEdgeSize) + W_in + (-borderAmounts[1].endEdgeSize) + * + * topCropAmount == Height startEdgeSize + * bottomCropAmount == Height endEdgeSize + * leftCropAmount == Width startEdgeSize + * rightCropAmount == Width endEdgeSize + */ +message BorderAmounts { + + message EdgeSizes { + /* + * The amount to be padded or cropped from the beginning. + */ + uint64 startEdgeSize = 1; + + /* + * The amount to be padded or cropped from the end. + */ + uint64 endEdgeSize = 2; + } + + /* + * The border amounts. + * This must be length 2 in the order ``[H, W]``. 
+ */ + repeated EdgeSizes borderAmounts = 10; + +} + +/* + * Specifies the type of padding to be used with Convolution/Deconvolution and Pooling layers. + * After padding, input spatial shape: ``[H_in, W_in]``, gets modified to the + * output spatial shape ``[H_out, W_out]``. + * + * .. code:: + * + * topPaddingAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize + * bottomPaddingAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize + * leftPaddingAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize + * rightPaddingAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize + * + * With Convolution or Pooling: + * + * .. code:: + * + * H_out = int_division_round_down((H_in + topPaddingAmount + bottomPaddingAmount - KernelSize[0]),stride[0]) + 1 + * + * which is same as: + * + * .. code:: + * + * H_out = int_division_round_up((H_in + topPaddingAmount + bottomPaddingAmount - KernelSize[0] + 1),stride[0]) + * + * With Deconvolution: + * + * .. code:: + * + * H_out = (H_in-1) * stride[0] + kernelSize[0] - (topPaddingAmount + bottomPaddingAmount) + * + * + * The equivalent expressions hold true for ``W_out`` as well. + * + * + * By default, the values of ``paddingAmounts`` are set to ``0``, + * which results in a "true" valid padding. + * If non-zero values are provided for ``paddingAmounts``, + * "valid" convolution/pooling is performed within the spatially expanded input. + * + */ +message ValidPadding { + + BorderAmounts paddingAmounts = 1; + +} + +/* + * Specifies the type of padding to be used with Convolution/Deconvolution and pooling layers. + * After padding, input spatial shape: ``[H_in, W_in]``, gets modified to the + * output spatial shape ``[H_out, W_out]``. + * With Convolution or pooling: + * + * .. code:: + * + * H_out = int_division_round_up(H_in,stride[0]) + * W_out = int_division_round_up(W_in,stride[1]) + * + * This is achieved by using the following padding amounts: + * + * .. 
code:: + * + * totalPaddingHeight = max(0,(H_out-1) * stride[0] + KernelSize[0] - Hin) + * totalPaddingWidth = max(0,(W_out-1) * stride[1] + KernelSize[1] - Win) + * + * There are two modes of asymmetry: + * ``BOTTOM_RIGHT_HEAVY``, and ``TOP_LEFT_HEAVY``. + * + * If the mode is ``BOTTOM_RIGHT_HEAVY``: + * + * .. code:: + * + * topPaddingAmount = floor(totalPaddingHeight / 2) + * bottomPaddingAmount = totalPaddingHeight - topPaddingAmount + * leftPaddingAmount = floor(totalPaddingWidth / 2) + * rightPaddingAmount = totalPaddingWidth - leftPaddingAmount + * + * If the mode is ``TOP_LEFT_HEAVY``: + * + * .. code:: + * + * bottomPaddingAmount = floor(totalPaddingHeight / 2) + * topPaddingAmount = totalPaddingHeight - bottomPaddingAmount + * rightPaddingAmount = floor(totalPaddingWidth / 2) + * leftPaddingAmount = totalPaddingWidth - rightPaddingAmount + * + * + * With Deconvolution: + * + * .. code:: + * + * H_out = H_in * stride[0] + * W_out = W_in * stride[1] + */ +message SamePadding { + + enum SamePaddingMode { + + BOTTOM_RIGHT_HEAVY = 0; + TOP_LEFT_HEAVY = 1; + + } + SamePaddingMode asymmetryMode = 1; + +} + +/* + * Specifies how grid points are sampled from an interval. + * Without the loss of generality, assume the interval to be [0, X-1] from which N points are to be sampled. + * Here X may correspond to an input image's height or width. + * All the methods can be expressed in terms of numpy's linspace function, along with the constraint that grid points have to lie in the interval [0, X-1]. + * Note: numpy.linspace(start = start, end = end, num = N, endpoint = True) corresponds to sampling + * N points uniformly from the interval [start, end], endpoints included. + * The methods vary in how the ``start`` and ``end`` values are computed. 
+ */ +message SamplingMode { + + enum Method { + + /* + * start = 0, end = X-1 + * grid points = numpy.linspace(start, end) + */ + STRICT_ALIGN_ENDPOINTS_MODE = 0; + + /* + * if N == 1: start = end = (X-1)/2 + * otherwise, start = 0, end = X-1 + * grid points = numpy.linspace(start, end) + */ + ALIGN_ENDPOINTS_MODE = 1; + + /* + * start = 0, end = X - X/N + * grid points = min(X-1, numpy.linspace(start, end)) + * This is same as the mode used in the upsample layer in this specification, when used with bilinear interpolation. In that case N/X = upsample ratio. + */ + UPSAMPLE_MODE = 2; + + /* + * spacing = max(1, X-1)/N + * start = 0.5 * spacing + * end = start + (N-1) * spacing + * grid points = min(X-1, numpy.linspace(start, end)) + */ + ROI_ALIGN_MODE = 3; + + } + + Method samplingMethod = 1; + +} + +/* + * Specifies the convention used to specify four bounding box coordinates for an image of size (Height, Width). + * The (0,0) coordinate corresponds to the top-left corner of the image. + */ +message BoxCoordinatesMode { + + enum Coordinates { + + /* + * [h_start, w_start, h_end, w_end] + */ + CORNERS_HEIGHT_FIRST = 0; + + /* + * [w_start, h_start, w_end, h_end] + */ + CORNERS_WIDTH_FIRST = 1; + + /* + * [h_center, w_center, box_height, box_width] + */ + CENTER_SIZE_HEIGHT_FIRST = 2; + + /* + * [w_center, h_center, box_width, box_height] + */ + CENTER_SIZE_WIDTH_FIRST = 3; + + } + + Coordinates boxMode = 1; + +} + +/* + * Weights for layer parameters. + * Weights are stored as repeated floating point numbers + * using row-major ordering + * and can represent 1-, 2-, 3-, or 4-dimensional data. + */ +message WeightParams { + + /* + * Values specified in single / float / FP32 precision. + */ + repeated float floatValue = 1; + + /* + * Values in 16-bit half precision floating point. + */ + bytes float16Value = 2; + + /* + * Raw value specification for quantized lower precisions. 
+ * + * This field is interpreted as uintN, where N is the number of bits in quantization. + * E.g. if n=8, the field is interpreted as an array of UINT8. + * Use this field for quantized parameters unless specifically noted to use + * int8RawValue. + */ + bytes rawValue = 30; + + /* + * Field to be used if int8DynamicQuantize is set in the parent layer. + * Cannot be set if rawValue is also set. + * The values in this field are interpreted as INT8. + * + * If this field is set, following conditions must hold true: + * * QuantizationType == LinearQuantizationParams, such that + * * size of the "scale" field is 1 and "bias" field is empty in "LinearQuantizationParams" + */ + bytes int8RawValue = 31; + + /* + * Quantization related parameters. + */ + QuantizationParams quantization = 40; + + bool isUpdatable = 50; + +} + +/* + * Quantization parameters. + */ +message QuantizationParams { + + uint64 numberOfBits = 1; + oneof QuantizationType { + LinearQuantizationParams linearQuantization = 101; + LookUpTableQuantizationParams lookupTableQuantization = 102; + } + +} + +message LinearQuantizationParams { + + /* + * Stores scale and bias values corresponding to the quantized weights. + * Must be an array of 1 element, or an array of C elements, where C + * is number of output channels. For recurrent layers it is equal to + * the output vector size. + * + * Relationship between quantized weights, unquantized weights, scale and bias: + * + * W_unquantized = W_quantized * scale + bias + * + */ + repeated float scale = 1; + repeated float bias = 2; + +} + +message LookUpTableQuantizationParams { + + /* Stores look-up table quantization values. Must be an array of + (2^numberOfBits) Elements. + */ + repeated float floatValue = 1; + +} + +// Layers +// ------ + +/* + * A layer that performs spatial convolution or deconvolution. + * + * .. code:: + * + * y = ConvolutionLayer(x) + * + * Requires 1 or 2 inputs and produces 1 output. 
+ * + * Input + * First Input: + * A blob with rank greater than or equal to 4. + * Rank 4 blob represents [Batch, channels, height, width]. + * For ranks greater than 4, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * From Core ML specification version 4 onwards (iOS >= 13, macOS >= 10.15). + * convolution layer can have 2 inputs, in which case the second input is + * the blob representing the weights. This is allowed when "isDeconvolution" = False. + * The weight blob should have shape + * ``[outputChannels, kernelChannels, kernelHeight, kernelWidth]``, + * where kernelChannels == inputChannels / nGroups. + * + * Output + * Rank is same as the input. e.g.: for rank 4 input, output shape is [B, C_out, H_out, W_out] + * + * + * If ``dilationFactor`` is not 1, effective kernel size is + * modified as follows: + * + * .. code:: + * + * KernelSize[0] <-- (kernelSize[0]-1) * dilationFactor[0] + 1 + * KernelSize[1] <-- (kernelSize[1]-1) * dilationFactor[1] + 1 + * + * Type of padding can be ``valid`` or ``same``. Output spatial dimensions depend on the + * the type of padding. For details, refer to the descriptions of the messages "ValidPadding" + * and "SamePadding". Padded values are all zeros. + * + * For Deconvolution, ``ConvolutionPaddingType`` (``valid`` or ``same``) is ignored when ``outputShape`` is set. + * + * + */ +message ConvolutionLayerParams { + + /* + * The number of kernels. + * Same as ``C_out`` used in the layer description. + */ + uint64 outputChannels = 1; + + /* + * Channel dimension of the kernels. + * Must be equal to ``inputChannels / nGroups``, if isDeconvolution == False + * Must be equal to ``inputChannels``, if isDeconvolution == True + */ + uint64 kernelChannels = 2; + + /* + * Group convolution, i.e. weight reuse along channel axis. + * Input and kernels are divided into g groups + * and convolution / deconvolution is applied within the groups independently. 
+ * If not set or 0, it is set to the default value 1. + */ + uint64 nGroups = 10; + + /* + * Must be length 2 in the order ``[H, W]``. + * If not set, default value ``[3, 3]`` is used. + */ + repeated uint64 kernelSize = 20; + + /* + * Must be length 2 in the order ``[H, W]``. + * If not set, default value ``[1, 1]`` is used. + */ + repeated uint64 stride = 30; + + /* + * Must be length 2 in order ``[H, W]``. + * If not set, default value ``[1, 1]`` is used. + * It is ignored if ``isDeconvolution == true``. + */ + repeated uint64 dilationFactor = 40; + + /* + * The type of padding. + */ + oneof ConvolutionPaddingType { + ValidPadding valid = 50; + SamePadding same = 51; + } + + /* + * Flag to specify whether it is a deconvolution layer. + */ + bool isDeconvolution = 60; + + /* + * Flag to specify whether a bias is to be added or not. + */ + bool hasBias = 70; + + /* + * Weights associated with this layer. + * If convolution (``isDeconvolution == false``), weights have the shape + * ``[outputChannels, kernelChannels, kernelHeight, kernelWidth]``, where kernelChannels == inputChannels / nGroups + * If deconvolution (``isDeconvolution == true``) weights have the shape + * ``[kernelChannels, outputChannels / nGroups, kernelHeight, kernelWidth]``, where kernelChannels == inputChannels + */ + WeightParams weights = 90; + WeightParams bias = 91; // Must be of size [outputChannels]. + + /* + * The output shape, which has length 2 ``[H_out, W_out]``. + * This is used only for deconvolution (``isDeconvolution == true``). + * If not set, the deconvolution output shape is calculated + * based on ``ConvolutionPaddingType``. + */ + repeated uint64 outputShape = 100; + +} + +/* + * A layer that performs a 3-dimensional convolution. + * + * .. code:: + * + * y = Convolution3DLayer(x) + * + * Input + * A blob of rank 5. + * The input blob's shape should be ``[batch, channels, depth, height, width]``. 
+ * + * Fields + * The bias field, if set, should have shape of ``[channelsOut]``. + * + * Output + * A blob of rank 5. + * The output blob's shape is ``[batch, channelsOut, depthOut, heightOut, widthOut]``. + * + * Type of padding can be ``custom``, ``valid``, or ``same``. Padded values are all zeros. + * Output spatial dimensions depend on the the type of padding. For details, refer to the + * descriptions of the ``PaddingType`` field of this ``Convolution3DLayerParams`` message. + * + * Example + * For example, given an input of size ``[1, 3, 3, 8, 8]``, a stride of 2 in each dimension, + * a kernel of 3 in each dimension, 2 output channels, and ``same`` padding, this layer will + * compute the total padding applied in the depth, height, and width dimensions to be 2, 1, and 1, + * respectively. The depth padding is even and will be applied equally to both sides of the depth + * dimension. Since the height and width padding values are odd, they'll be applied to the + * bottom/right of the height/width dimensions. Thus, the padding applied to the input will be + * ``[1, 1, 0, 1, 0, 1]`` (front, back, top, bottom, left, right). Finally, the output produced + * will have size ``[1, 2, 2, 4, 4]``. + * + */ +message Convolution3DLayerParams { + + /* + * The number of channels in the output (channelsOut). Must be a positive integer. + */ + int32 outputChannels = 1; + + /* + * The number of channels in the input (channels). Must be a positive integer. + */ + int32 inputChannels = 2; + + /* + * Group convolution, i.e., weight reuse along the channel axis. + * It must evenly divide both the number of input and output channels and be at most the number + * of input channels (a depthwise convolution). + * Input and kernels are divided into g groups and convolution is applied within the groups + * independently. + */ + int32 nGroups = 10; + + /* Depth of the convolution kernel. Must be a positive integer. 
+ */ + int32 kernelDepth = 20; + + /* Height of the convolution kernel. Must be a positive integer. + */ + int32 kernelHeight = 21; + + /* Width of the convolution kernel. Must be a positive integer. + */ + int32 kernelWidth = 22; + + /* Stride along the depth direction. Must be a positive integer. + */ + int32 strideDepth = 31; + + /* Stride along the height direction. Must be a positive integer. + */ + int32 strideHeight = 32; + + /* Stride along the width direction. Must be a positive integer. + */ + int32 strideWidth = 33; + + /* Dilation along the depth direction. Must be a positive integer. + */ + int32 dilationDepth = 40; + + /* Dilation along the height direction. Must be a positive integer. + */ + int32 dilationHeight = 41; + + /* Dilation along the width direction. Must be a positive integer. + */ + int32 dilationWidth = 42; + + /* + * Flag to specify whether a bias is to be added or not. + * If false, then no bias is added. + */ + bool hasBias = 50; + + /* + * Weights associated with this layer. + * Weights have the shape + * if deconvolution == False + * ``[outputChannels, kernelChannels, kernelDepth, kernelHeight, kernelWidth]``, where + * kernelChannels == inputChannels / nGroups + * else if deconvolution == True + * ``[outputChannels / nGroups, kernelChannels, kernelDepth, kernelHeight, kernelWidth]``, where + */ + WeightParams weights = 60; + + /* + * Must be of size ``[outputChannels]``. + */ + WeightParams bias = 61; + + + /* + * The type of padding. + * All padding types pad the input shape with zeros. + * CUSTOM padding will add the custom padding values specified below to their respective + * dimensions, e.g., `customPaddingFront` number of zeros will be added to one side of the + * input's depth dimension and `customPaddingBack` number of zeros will be added to the other + * side of the input's depth dimension. + * VALID padding adds no padding to any dimension. 
In this case, the last convolution along + * each dimension will be dropped if the input dimension and the kernel size, stride, and + * dilation do not match. + * SAME padding adds enough padding to each dimension such that the output of the convolution + * has size ``Ceiling(inputShape / stride)``. Padding is added evenly to both sides of each + * dimension unless the total padding to add is odd, in which case it is added to the + * back/bottom/right side of the respective dimension. For example, if the total padding needed + * in the depth dimension is 3, 1 zero will be added to the front side of the depth dimension + * and 2 zeros will be added to the back side. + */ + enum PaddingType { + CUSTOM = 0; + VALID = 1; + SAME = 2; + } + PaddingType paddingType = 70; + + /* Padding before the input in the depth direction. Must be zero or a positive integer. + * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingFront = 80; + + /* Padding after the input in the depth direction. Must be zero or a positive integer. + * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingBack = 81; + + /* Padding before the input in the height direction. Must be zero or a positive integer. + * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingTop = 82; + + /* Padding after the input in the height direction. Must be zero or a positive integer. + * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingBottom = 83; + + /* Padding before the input in the width direction. Must be zero or a positive integer. + * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingLeft = 84; + + /* Padding after the input in the width direction. Must be zero or a positive integer. 
+ * Used when the `PaddingType` is `CustomPadding`, otherwise ignored by other padding types. + */ + int32 customPaddingRight = 85; + + /* Flag to specify if this is Convolution Transpose or not. + */ + bool isDeconvolution = 86; + + /* + * The output shape, which has length 3 ``[D_out, H_out, W_out]``. + * This is used only for deconvolution (``isDeconvolution == true``). + * If not set, the deconvolution output shape is calculated + * based on ``PaddingType``. + */ + repeated uint64 outputShape = 87; + +} + +/* + * A layer that performs a matrix-vector or matrix-matrix product. + * This is equivalent to a fully-connected, or dense layer. + * The weight parameters correspond to a matrix of dimensions (inputChannels, outputChannels) i.e. (C_in, C_out) + * + * .. code:: + * + * y = InnerProductLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * Input can have rank 1 to rank 5. This is how it is reshaped in to the matrix (for rank > 1): + * rank 1 (x1) : in this case, the layer corresponds to a matrix-vector product. x1 must be equal to C_in + * rank 2 (x1, x2): x2 must be equal to C_in + * rank 3 (x1, x2, x3) --> (x1 * x2, x3). x3 must be equal to C_in + * rank 4 (x1, x2, x3, x4) ---> (x1, x2 * x3 * x4). x2 * x3 * x4 must be equal to C_in + * rank 5 (x1, x2, x3, x4, x5) ---> (x1 * x2, x3 * x4 * x5). x3 * x4 * x5 must be equal to C_in + * + * Output + * Output rank is same as the input rank + * rank 1: (C_out) + * rank 2: (x1, C_out) + * rank 3: (x1, x2, C_out) + * rank 4: (x1, C_out, 1, 1) + * rank 5: (x1, x2, C_out, 1, 1) + * + */ +message InnerProductLayerParams { + + uint64 inputChannels = 1; // Input size: C_in. + uint64 outputChannels = 2; // Output size: C_out. + + bool hasBias = 10; // Whether a bias is added or not. + + WeightParams weights = 20; // Weight matrix [C_out, C_in]. + WeightParams bias = 21; // Bias vector [C_out]. 
+ + /* + * If set, this layer, at runtime, quantizes the floating point input blob to int8 before applying an + * inner product using INT8 weight matrix parameters, as provided in weights->int8RawValue. The + * result is then dequantized. + * Requires: + * * hasBias == false + * * QuantizationType == LinearQuantizationParams, such that + * * size of the "scale" field is 1 and "bias" field is empty in "LinearQuantizationParams" + * * numberOfBits == 8 + * * weights->rawValue_size to be empty + */ + bool int8DynamicQuantize = 22; + +} + +/* + * A layer that performs a matrix lookup and optionally adds a bias. + * The weights matrix is stored with dimensions [outputChannels, inputDim]. + * + * .. code:: + * + * y = EmbeddingLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * Input values must be in the range ``[0, inputDim - 1]``. + * + * Input must have rank equal to 4 or 5, such that the last 3 dimensions are all 1. + * rank 4: shape (x1, 1, 1, 1). x1 is effectively the batch/sequence length. + * rank 5: shape (x1, x2 , 1, 1, 1). x1 * x2 is effectively the combined batch/sequence length. + * + * Output + * Output rank is same as the input rank. Please see input description above. + * rank 4: shape (x1, outputChannels, 1, 1) + * rank 5: shape (x1, x2, outputChannels, 1, 1) + * + */ +message EmbeddingLayerParams { + + uint64 inputDim = 1; // Size of the input dictionary. + uint64 outputChannels = 2; // Size of the output vectors. + + bool hasBias = 10; // Whether a bias is added or not. + + WeightParams weights = 20; // 2-D weights of dimensions [outputChannels, inputDim]. + WeightParams bias = 21; // Bias of size [outputChannels]. + +} + +/* + * A layer that performs a matrix lookup and optionally adds a bias. + * The weights matrix is stored with dimensions [embeddingSize, vocabSize]. + * + * .. code:: + * + * y = EmbeddingNDLayer(x) + * + * Requires 1 input and produces 1 output. 
+ * + * Input + * Input values must be in the range ``[0, vocabSize - 1]``. + * Input must have rank at least 2. The last dimension must always be 1. + * rank 2: shape (x1, 1). x1 is the batch/sequence length. + * rank 3: shape (x1, x2, 1). x1 * x2 is effectively the combined batch/sequence length. + * rank 4: shape (x1, x2, x3, 1). x1 * x2 * x2 is effectively the combined batch/sequence length. + * rank 5: shape (x1, x2 , x3, x4, 1). x1 * x2 * x3 * x4 is effectively the combined batch/sequence length. + * + * Output + * Output rank is same as the input rank. Please see input description above. + * rank 2: shape (x1, embeddingSize) + * rank 3: shape (x1, x2, embeddingSize) + * rank 4: shape (x1, x2, x3, embeddingSize) + * rank 5: shape (x1, x2, x3, x4, embeddingSize) + * + */ +message EmbeddingNDLayerParams { + + uint64 vocabSize = 1; // Size of the input dictionary. + uint64 embeddingSize = 2; // Size of the output vectors. + bool hasBias = 3; // Whether a bias is added or not. + WeightParams weights = 20; // 2-D weights of dimensions [embeddingSize, vocabSize]. + WeightParams bias = 21; // Bias of size [embeddingSize]. + +} + +/* + * A layer that performs batch normalization, + * which is performed along axis = -3, + * and repeated along the other axes, if present. + * + * .. code:: + * + * y = BatchnormLayer(x) + * + * Requires 1 input and produces 1 output. + * + * This operation is described by the following formula: + * + * .. math:: + * y_i = \gamma_i \dfrac{ (x_i - \mu_i)}{\sqrt{\sigma_i^2 + \epsilon}} + \beta_i \;,\;i=1,....,C + * + * Input + * A blob with rank greater than equal to 3. + * Example: Rank 4 blob represents [Batch, channels, height, width] + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * A blob with the same shape as the input. + */ +message BatchnormLayerParams { + + uint64 channels = 1; // Size of the channel dimension in the input. 
+ + /* + * If ``computeMeanVar == true``, + * the mean and variance are calculated from either + * the single input instance, if ``instanceNormalization == true``, + * or the whole batch, if ``instanceNormalization = false``. + * and the values provided in parameters "mean" and "variance" are ignored. + */ + bool computeMeanVar = 5; + bool instanceNormalization = 6; + + /* + * A small constant to avoid division by 0 while normalizing by variance. + * Defaults to ``1e-5`` if not set or set to ``0``. + */ + float epsilon = 10; + + WeightParams gamma = 15; // Parameter of length [channels] + WeightParams beta = 16; // Parameter of length [channels] + WeightParams mean = 17; // Parameter of length [channels] + WeightParams variance = 18; // Parameter of length [channels] + +} + +/* + * A spatial pooling layer. + * + * .. code:: + * + * y = PoolingLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank greater than equal to 4. + * Rank 4 blob represents [Batch, channels, height, width] + * For ranks greater than 4, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * Rank is same as the input. e.g.: for rank 4 input, output shape is [B, C, H_out, W_out] + * + * Padding options are similar to ``ConvolutionLayerParams`` + * with the additional option of ``ValidCompletePadding`` (``includeLastPixel``), + * which ensures that the last application of the kernel + * always includes the last pixel of the input image, if there is padding. + * + * .. code:: + * + * H_out = ceil(float(H_in + 2 * paddingAmounts[0] - kernelSize[0])/float(Stride[0])) + 1 + * if (paddingAmounts[0] > 0 or paddingAmounts[1] > 0) + * if ((H_out - 1) * Stride >= H_in + paddingAmounts[0]) { + * H_out = H_out - 1 + * } + * } + * + * The equivalent expressions hold true for ``W_out`` as well. + * Only symmetric padding is supported with this option. 
+ */ +message PoolingLayerParams { + + enum PoolingType { + + MAX = 0; + AVERAGE = 1; + L2 = 2; + + } + PoolingType type = 1; // Type of pooling operation. + + /* + * Must be length 2 in the order ``[H, W]``. + * If not set, default value ``[3, 3]`` is used. + */ + repeated uint64 kernelSize = 10; + + /* + * Must be length 2 in the order ``[H, W]``. + * If not set, default value ``[1, 1]`` is used. + */ + repeated uint64 stride = 20; + + message ValidCompletePadding { + + /* + * Must be length 2 in order ``[H, W]``. + * If not set, value ``[0, 0]`` is used. + */ + repeated uint64 paddingAmounts = 10; + + } + + oneof PoolingPaddingType { + ValidPadding valid = 30; + SamePadding same = 31; + ValidCompletePadding includeLastPixel = 32; + } + + /* + * If true, padded values are excluded from the count (denominator) + * when computing average pooling. + */ + bool avgPoolExcludePadding = 50; + + /* + * If true, global pooling is performed. + * Kernel size is inferred from the input data spatial dimensions. + */ + bool globalPooling = 60; + +} + +/* + * A layer to pool three spatial dimensions + * + * Input + * A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. + * + * Output + * Rank is same as the input: A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. + * + * Requires 1 input and produces 1 output. 
+ * + * For example, given an input of shape (1,1,2,3,3): + * +----+----+----+ + * / | 10 | 11 | 12 | + * / +----+----+----+ + * / | 13 | 14 | 15 | + * / +----+----+----+ + * / | 16 | 17 | 18 | + * / +----+----+----+ + * +----+----+----+ / + * | 1 | 2 | 3 | / + * +----+----+----+ / + * | 4 | 5 | 6 | / + * +----+----+----+ / + * | 7 | 8 | 9 | / + * +----+----+----+ + * + * And applying MAX pooling using: + * Kernel: 2x2x2 + * Stride: 1x1x1 + * Valid Padding + * We expect to get an output with shape: (1,1,1,2,2) and value: + * +----+----+ + * | 14 | 15 | + * +----+----+ + * | 17 | 18 | + * +----+----+ + */ +message Pooling3DLayerParams { + + enum PoolingType3D { + MAX = 0; + AVERAGE = 1; + } + + // Whether to use Max or Average + PoolingType3D type = 1; + + // Depth of the pooling region. + int32 kernelDepth = 2; + + // Height of the pooling region. + int32 kernelHeight = 3; + + // Width of the pooling region. + int32 kernelWidth = 4; + + // Stride along the depth direction + int32 strideDepth = 5; + + // Stride along the height direction + int32 strideHeight = 6; + + // Stride along the width direction + int32 strideWidth = 7; + + /* + * The type of padding. + * All padding types pad the input shape with zeros. + * CUSTOM padding will add the custom padding values specified below to their respective + * dimensions, e.g., `customPaddingFront` number of zeros will be added to one side of the + * input's depth dimension and `customPaddingBack` number of zeros will be added to the other + * side of the input's depth dimension. + * VALID padding adds no padding to any dimension. In this case, the last pool along + * each dimension will be dropped if the input dimension and the kernel size, and stride do not match. + * SAME padding adds enough padding to each dimension such that the output + * has the same spatial dimensions as the input. 
Padding is added evenly to both + * sides of each dimension unless the total padding to add is odd, in which case the extra padding + * is added to the back/bottom/right side of the respective dimension. For example, if the the + * total horizontal padding is 3, then there will be 1 padding on the left, and 2 padding on the right. + */ + enum Pooling3DPaddingType { + CUSTOM = 0; + VALID = 1; + SAME = 2; + } + Pooling3DPaddingType paddingType = 15; + + // Padding before the input in the depth direction. + int32 customPaddingFront = 8; + + // Padding after the input in the depth direction. + int32 customPaddingBack = 9; + + // Padding before the input in the height direction. + int32 customPaddingTop = 10; + + // Padding after the input in the height direction. + int32 customPaddingBottom = 11; + + // Padding before the input in the width direction. + int32 customPaddingLeft = 12; + + // Padding after the input in the width direction. + int32 customPaddingRight = 13; + + // If true, exclude zeros from padding in Average pooling. Meaningless in Max Pooling. + bool countExcludePadding = 14; +} + +/* + * A layer to pool three spatial dimensions down to one value. + * This behaves like a special case of Pooling3DLayerParams in which + * the Kernel is the size of the input and there is no padding. + * + * Input + * A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. + * + * Output + * Rank is same as the input: A blob with rank equal to 5, representing [Batch, channels, depth, height, width]. + * Depth, height, and width of the output will always be 1. + * + * Requires 1 input and produces 1 output. 
+ * + * For example, given an input of shape (1,1,2,3,3): + * +----+----+----+ + * / | 10 | 11 | 12 | + * / +----+----+----+ + * / | 13 | 14 | 15 | + * / +----+----+----+ + * / | 16 | 17 | 18 | + * / +----+----+----+ + * +----+----+----+ / + * | 1 | 2 | 3 | / + * +----+----+----+ / + * | 4 | 5 | 6 | / + * +----+----+----+ / + * | 7 | 8 | 9 | / + * +----+----+----+ + * + * And applying MAX global 3d pooling, we expect to get an output with shape: (1,1,1,1,1) and value: + * +----+ + * | 18 | + * +----+ + */ +message GlobalPooling3DLayerParams { + + enum GlobalPoolingType3D { + MAX = 0; + AVERAGE = 1; + } + + // Whether to use Max or Average + GlobalPoolingType3D type = 1; +} + +/* + * A layer that performs padding along spatial dimensions. + * + * .. code:: + * + * y = PaddingLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 2. + * e.g.: blob with shape ``[H_in, W_in]``. + * For ranks greater than 2, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch + * i.e. Padding is applied on last two dimensions. + * + * Output + * Same rank as the input. + * e.g.: blob with shape ``[H_out, W_out]``. + * + * Output dimensions are calculated as follows: + * + * .. code:: + * + * H_out = H_in + topPaddingAmount + bottomPaddingAmount + * W_out = W_in + leftPaddingAmount + rightPaddingAmount + * + * topPaddingAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize + * bottomPaddingAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize + * leftPaddingAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize + * rightPaddingAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize + * + * There are three types of padding: + * + * - ``PaddingConstant``, which fills a constant value at the border. + * - ``PaddingReflection``, which reflects the values at the border. + * - ``PaddingReplication``, which replicates the values at the border. 
+ * + * Given the following input: + * + * .. code:: + * + * [1, 3, 4] : 1 2 3 4 + * 5 6 7 8 + * 9 10 11 12 + * + * Here is the output of applying the padding + * ``(top=2, left=2, bottom=0, right=0)`` + * with each of the supported types: + * + * - ``PaddingConstant`` (``value = 0``): + * .. code:: + * + * [1, 5, 6] : 0 0 0 0 0 0 + * 0 0 0 0 0 0 + * 0 0 1 2 3 4 + * 0 0 5 6 7 8 + * 0 0 9 10 11 12 + * + * - ``PaddingReflection``: + * .. code:: + * + * [1, 5, 6] : 11 10 9 10 11 12 + * 7 6 5 6 7 8 + * 3 2 1 2 3 4 + * 7 6 5 6 7 8 + * 11 10 9 10 11 12 + * + * - ``PaddingReplication``: + * .. code:: + * + * [1, 5, 6] : 1 1 1 2 3 4 + * 1 1 1 2 3 4 + * 1 1 1 2 3 4 + * 5 5 5 6 7 8 + * 9 9 9 10 11 12 + */ +message PaddingLayerParams { + + /* + * Fill a constant value in the padded region. + */ + message PaddingConstant { + float value = 1; + } + + /* + * Reflect the values at the border for padding. + */ + message PaddingReflection { + } + + /* + * Replicate the values at the border for padding. + */ + message PaddingReplication { + } + + oneof PaddingType { + PaddingConstant constant = 1; + PaddingReflection reflection = 2; + PaddingReplication replication = 3; + } + + BorderAmounts paddingAmounts = 10; // Amounts to be padded to the input. + +} + +/* + * A layer that concatenates along the axis = -3 or -5. + * For general concatenation along any axis, see ConcatNDLayer. + * + * .. code:: + * + * y = ConcatLayer(x1,x2,....) + * + * Requires more than 1 input and produces 1 output. + * + * Input + * All input blobs must have same rank. + * If "sequenceConcat" = False, rank must be greater than equal to 3. In this case concatenation is along axis = -3 + * If "sequenceConcat" = True, rank must be greater than equal to 5. In this case concatenation is along axis = -5 + * + * Output + * Same rank as the input. + * + */ +message ConcatLayerParams { + + /* + * If true, concatenate along the axis = -5 instead of axis = -3. 
+ */ + bool sequenceConcat = 100; + +} + +/* + * A layer that performs local response normalization (LRN). + * + * .. code:: + * + * y = LRNLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank greater than equal to 3. + * Example: Rank 4 blob represents [Batch, channels, height, width] + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * Output + * A blob with the same shape as the input. + * + * This layer is described by the following formula: + * + * .. math:: + * x_i \leftarrow \dfrac{x_i}{\left ( k + \dfrac{\alpha}{\text{localSize}} \sum_j x_j^2 \right )^\beta} + * + * where the summation is done over a ``(localSize, 1, 1)`` neighborhood --- + * that is, over a window "across" channels in 1x1 spatial neighborhoods. + */ +message LRNLayerParams { + + float alpha = 1; + float beta = 2; + uint64 localSize = 3; // Number of channels in the normalization window. + float k = 4; // Defaults to 1 if not set or 0. Must be strictly positive. + +} + +/* + * Softmax Normalization Layer + * + * A layer that performs softmax normalization. + * Normalization is applied along axis = -3 or N-3 (where N is the rank of the input) + * For softmax layer that can operate on any axis, see SoftmaxNDLayer. + * + * + * .. code:: + * + * y = SoftmaxLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * Must be a blob with rank >= 3. + * Output + * A blob with the same shape as the input. + * + * This layer is described by the following formula: + * + * .. math:: + * x_i \leftarrow \dfrac{e^{x_i}}{\sum_i{e^{x_i}}} + */ +message SoftmaxLayerParams { + +} + +/* + * A layer that uniformly splits across axis = -3 to produce a specified number of outputs. + * For general split operation along any axis, see SplitNDLayer. + * + * .. code:: + * + * (y1,y2,...yN) = SplitLayer(x), where N = nOutputs + * + * Requires 1 input and produces multiple outputs. 
+ * + * Input + * A blob with rank at least 3. + * e.g.: blob with shape ``[C, H, W]`` + * Output + * ``nOutputs`` blobs each with same rank as the input. + * e.g.: For input that is of shape ``[C, H, W]``, output shapes will be ``[C/nOutputs, H, W]`` + */ +message SplitLayerParams { + + uint64 nOutputs = 1; // The number of outputs. + +} + +/* + * A layer that performs elementwise addition. + * This layer has limited broadcasting support. For general broadcasting see AddBroadcastableLayer. + * + * .. code:: + * + * y = AddLayer(x1,x2,...) + * + * Requires 1 or more than 1 input and produces 1 output. + * + * Input + * In general, there are no rank constraints. + * However, only certain set of shapes are broadcastable. For example: + * [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W] + * Output + * A blob with shape equal to the input blob. + * + * If only one input is provided, scalar addition is performed: + * + * .. math:: + * y = x + \alpha + * + */ +message AddLayerParams { + + /* + * Scalar to be added to the input. + * Only used if there is a single input. + */ + float alpha = 1; + +} + +/* + * A layer that performs elementwise multiplication. + * This layer has limited broadcasting support. For general broadcasting see MultiplyBroadcastableLayer. + * + * .. code:: + * + * y = MultiplyLayer(x1,x2,...) + * + * Requires 1 or more than 1 input and produces 1 output. + * + * Input + * In general, there are no rank constraints. + * However, only certain set of shapes are broadcastable. For example: + * [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W] + * Output + * A blob with shape equal to the first input blob. + * + * If only one input is provided, scalar multiplication is performed: + * + * .. math:: + * y = \alpha x + * + */ +message MultiplyLayerParams { + + /* + * Scalar to be multiplied with the input. + * Only used if there is a single input. + */ + float alpha = 1; + +} + +/* + * A layer that applies a unary function. + * + * .. 
code:: + * + * y = UnaryFunctionLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with no rank constraints. + * Output + * A blob with the same shape as the input. + * + * The input is first modified by shifting and scaling: + * + * .. math:: + * x \leftarrow \text{scale} \cdot x + \text{shift} + */ +message UnaryFunctionLayerParams { + + /* + * A unary operator. + * + * The following functions are supported: + * + * ``SQRT`` + * .. math:: f(x) = \sqrt{x} + * + * ``RSQRT`` + * .. math:: f(x) = \dfrac{1}{\sqrt{x + \epsilon}} + * + * ``INVERSE`` + * .. math:: f(x) = \dfrac{1}{x + \epsilon} + * + * ``POWER`` + * .. math:: f(x) = x^\alpha + * + * ``EXP`` + * .. math:: f(x) = e^x + * + * ``LOG`` + * .. math:: f(x) = \log x + * + * ``ABS`` + * .. math:: f(x) = |x| + * + * ``THRESHOLD`` + * .. math:: f(x) = \text{max}(\alpha, x) + */ + enum Operation { + SQRT = 0; + RSQRT = 1; + INVERSE = 2; + POWER = 3; + EXP = 4; + LOG = 5; + ABS = 6; + THRESHOLD = 7; + } + Operation type = 1; // The type of unary function. + + /* + * A constant used in ``POWER`` and ``THRESHOLD`` functions. + */ + float alpha = 2; + + /* + * A small constant to avoid division by 0 while normalizing variance. + * Defaults to ``1e-6`` if not set or set to ``0``. + */ + float epsilon = 3; + + /* + * Input is shifted by this amount + * before the unary function is applied. + * Defaults to ``0.0`` if not set. + */ + float shift = 4; + + /* + * Input is scaled by this amount + * before the unary function is applied. + * Defaults to ``1.0`` if not set or set to ``0``. + */ + float scale = 5; + +} + +/* + * A layer that scales up spatial dimensions. + * It supports two modes: nearest neighbour (default) and bilinear. + * + * .. code:: + * + * y = UpsampleLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 3. + * e.g.: blob with shape ``[C, H, W]``. 
+ * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * Same rank as the input. + * e.g.: blob with shape ``[C, scalingFactor[0] * H, scalingFactor[1] * W]`` + */ +message UpsampleLayerParams { + + /* + * Scaling Factor. Mutually exclusive with fractionalScalingFactor. + * Must be length 2 in order ``[H, W]``. + * If not set, default value ``[1, 1]`` is used. + */ + repeated uint64 scalingFactor = 1; + + /* + * Fractional scaling factor. Mutually exclusive with scalingFactor. + * Must be length 2 in order ``[H, W]``. + * If not set, default value ``[1.0, 1.0]`` is used. + */ + repeated float fractionalScalingFactor = 7; + + /* + * Overall mode for interpolating new elements when upsampling. + * NN - Nearest Neighbors - simply pick the nearest true value for interpolated values. + * BILINEAR - Use bilinear interpolation. See LinearUpsamplingMode for behavior. + */ + enum InterpolationMode { + + NN = 0; // Nearest Neighbour + BILINEAR = 1; // Bilinear + + } + + InterpolationMode mode = 5; + + /* + * LinearUpsampleMode specifies the behavior for linear upsampling. Only valid when Interpolation Mode is BILINEAR. 
+ * If input grid is [0, Xin-1] (corresponding to an input size of Xin), and if the output size is Xout, + * then the grid points are sampled in the following manner: + * DEFAULT: + * spacing = (Xin-Xin/Xout) / (Xout-1) + * grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,….,Xout-1 + * ALIGN_CORNERS_TRUE: + * spacing = (Xin-1) / (Xout-1) + * grid_point[i] = min(Xin-1, max(0, i * spacing)), for i = 0,1,2,….,Xout-1 + * ALIGN_CORNERS_FALSE: + * spacing = Xin / Xout + * grid_point[i] = min(Xin-1, max(0, i * spacing + 0.5 * spacing - 0.5)), for i = 0,1,2,….,Xout-1 + */ + enum LinearUpsampleMode { + + DEFAULT = 0; + ALIGN_CORNERS_TRUE = 1; + ALIGN_CORNERS_FALSE = 2; + + } + + LinearUpsampleMode linearUpsampleMode = 6; + +} + +/* +* A layer that resizes the input to a pre-specified spatial size using bilinear interpolation. +* +* .. code:: +* +* y = ResizeBilinearLayer(x) +* +* Requires 1 input and produces 1 output. +* +* Input +* A blob with rank at least 3. +* e.g.: blob with shape ``[C, H_in, W_in]``. +* For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. +* +* Output +* Same rank as the input. +* e.g.: blob with shape ``[C, H_out, W_out]``. +* +*/ +message ResizeBilinearLayerParams { + + /* + * Target Spatial Size. + * Must be length 2 in order ``[Height, Width]``, i.e. ``[H_out, W_out]``. + * If not set, default value ``[1, 1]`` is used. + */ + repeated uint64 targetSize = 1; + + /* + * Mode used to compute the grid on which the spatial output values are evaluated. + * Same mode is applied to both the height and width axes. + */ + SamplingMode mode = 2; + +} + +/* +* A layer that extracts cropped spatial patches or RoIs (regions of interest) from the input and resizes them to a pre-specified size using +* bilinear interpolation. +* Note that RoI Align layer can be implemented with this layer followed by a pooling layer. +* +* .. 
code:: +* +* y = CropResizeLayer(x) +* +* Requires 2 inputs and produces 1 output. +* +* Input +* There are two inputs. +* First input represents an image feature map. +* Second input represents the bounding box coordinates for N patches or RoIs (region of interest). +* +* First input is rank 5: [1, Batch, C, H_in, W_in]. +* Second input is rank 5. Its shape can be either [N, 1, 4, 1, 1] or [N, 1, 5, 1, 1]. +* +* N: number of patches/RoIs to be extracted +* +* If RoI shape = ``[N, 1, 4, 1, 1]`` +* The axis=-3 corresponds to the four coordinates specifying the bounding box. +* All the N RoIs are extracted from all the batches of the input. +* +* If RoI shape = ``[N, 1, 5, 1, 1]`` +* The first element of the axis=-3 specifies the input batch id from which to extract the RoI and +* must be in the interval ``[0, Batch - 1]``. That is, n-th RoI is extracted from the RoI[n,0,0,0,0]-th +* input batch id. The last four elements of the axis=-3 specify the bounding box coordinates. +* +* Output +* A blob with rank 5. +* - Shape is [N, Batch, C, H_out, W_out] if input RoI shape is [N, 1, 4, 1, 1] +* - Shape is [N, 1, C, H_out, W_out] if input RoI shape is [N, 1, 5, 1, 1] +* +*/ +message CropResizeLayerParams { + + /* + * Target Spatial Size. + * Must be length 2 in order ``[Height, Width]``, i.e. ``[H_out, W_out]``. + * If not set, default value ``[1, 1]`` is used. + */ + repeated uint64 targetSize = 1; + + /* + * If true the bounding box coordinates must be in the interval [0, 1]. + * They are scaled by (H_in - 1), (W_in - 1), i.e. based on the input spatial dimensions. + * If false the bounding box coordinates must be in the interval + * [0, H_in -1] and [0, W_in - 1], respectively for height and width dimensions. + */ + bool normalizedCoordinates = 2; + + /* + * Mode used to compute the grid on which the spatial output values are evaluated. + * Same mode is applied to both the height and width axes. 
+ */ + SamplingMode mode = 3; + + /* + * Representation used to express the bounding box coordinates. + * It determines how the values of the second input are interpreted. + */ + BoxCoordinatesMode boxIndicesMode = 4; + + /* + * Additional spatial scale that multiplies the bounding box coordinates. + * Generally used while implementing the RoI Align layer, + * which uses unnormalized RoI coordinates along with a spatial scale less than or equal to 1. + */ + float spatialScale = 5; + +} + +/* + * A layer that performs elementwise addition of a bias, + * which is broadcasted to match the input shape. + * + * .. code:: + * + * y = BiasLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 3. + * e.g.: blob with shape ``[C, H, W]``. + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * Output + * A blob with the same shape as the input. + */ +message BiasLayerParams { + + /* + * The shape of the bias. + * Must be one of the following: + * ``[1]``, ``[C]``, ``[1, H, W]`` or ``[C, H, W]``. + */ + repeated uint64 shape = 1; + + /* + * The bias values. + * The size must be equal to the product of the ``shape`` dimensions. + */ + WeightParams bias = 2; + +} + +/* + * A layer that performs elmentwise multiplication by a scale factor + * and optionally adds a bias; + * both the scale and bias are broadcasted to match the input shape. + * + * .. code:: + * + * y = ScaleLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 3. + * e.g.: blob with shape ``[C, H, W]``. + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * Output + * A blob with the same shape as the input. + */ +message ScaleLayerParams { + + /* + * The shape of the scale. + * Must be one of the following: + * ``[1]``, ``[C]``, ``[1, H, W]`` or ``[C, H, W]``. 
+ */ + repeated uint64 shapeScale = 1; + + /* + * The scale values. + * The size must be equal to the product of the ``shape`` dimensions. + */ + WeightParams scale = 2; // Scale values. Size must be equal to the product of dimensions specified in shapeScale. + + bool hasBias = 3; // If true, a bias is added after scaling. + + /* + * The shape of the bias. + * Must be one of the following: + * ``[1]``, ``[C]``, ``[1, H, W]`` or ``[C, H, W]``. + */ + repeated uint64 shapeBias = 4; + + /* + * The bias values. + * The size must be equal to the product of the ``shape`` dimensions. + */ + WeightParams bias = 5; + +} + +/* + * A layer that loads data as a parameter and provides it as an output. + * The output is rank 5. For general rank, see LoadConstantNDLayer. + * + * .. code:: + * + * y = LoadConstantLayer() + * + * Requires no input and produces 1 output. + * + * Output: + * A blob with rank 5 and shape ``[1, 1, C, H, W]`` + */ +message LoadConstantLayerParams { + + /* + * The shape of the constant to be loaded, + * which must be``[C, H, W]``, that is length 3. + */ + repeated uint64 shape = 1; + + /* + * The data values, + * of size ``C * H * W``. + */ + WeightParams data = 2; + +} + +/* + * A layer that performs L2 normalization, i.e. divides by the + * the square root of the sum of squares of all elements of input. + * + * .. code:: + * + * y = L2NormalizeLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank greater than equal to 3. + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * Output + * A blob with the same shape as the input. + * + * This layer is described by the following formula: + * + * .. math:: + * x_i \leftarrow \dfrac{x_i}{\sqrt{\sum{x_i^2} + \epsilon}} + */ +message L2NormalizeLayerParams { + + /* + * A small constant to avoid division by 0 while normalizing variance. + * Defaults to ``1e-6`` if not set or set to ``0``. 
+ */ + float epsilon = 1; + +} + +// Data Reorganization Layers +// -------------------------- + +/* + * A layer that flattens the input. + * + * .. code:: + * + * y = FlattenLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank greater than equal to 3. + * e.g.: Rank 4 blob represents [Batch, C, H, W] + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * Output + * Same rank as the input, such that last two dimensions are both 1. + * e.g.: For rank 4 input, output shape is ``[Batch, C * H * W, 1, 1]`` + * + * There are two X orders: ``CHANNEL_FIRST`` and ``CHANNEL_LAST``. + * ``CHANNEL_FIRST`` does not require data to be rearranged, + * because row major ordering is used by internal storage. + * ``CHANNEL_LAST`` requires data to be rearranged. + */ +message FlattenLayerParams { + + enum FlattenOrder { + + CHANNEL_FIRST = 0; + CHANNEL_LAST = 1; + + } + FlattenOrder mode = 1; + +} + +/* + * A layer that recasts the input into a new shape. + * + * .. code:: + * + * y = ReshapeLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank 5. + * e.g.: ``[1, 1, C, H, W]`` or ``[Seq, 1, C, H, W]``. + * Output + * A blob with rank 5. + * e.g.: ``[1, 1, C_out, H_out, W_out]`` or ``[Seq_out, 1, C_out, H_out, W_out]``. + * + * There are two reshape orders: ``CHANNEL_FIRST`` and ``CHANNEL_LAST``. + * ``CHANNEL_FIRST`` is equivalent to + * flattening the input to ``[Seq, 1, C * H * W, 1, 1]`` in channel first order + * and then reshaping it to the target shape; + * no data rearrangement is required. + * ``CHANNEL_LAST`` is equivalent to + * flattening the input to ``[Seq, 1, H * W * C, 1, 1]`` in channel last order, + * reshaping it to ``[Seq_out, 1, H_out, W_out, C_out]`` (it is now in "H_out-major"" order), + * and then permuting it to ``[C_out, H_out, W_out]``; + * both the flattening and permuting requires the data to be rearranged. 
+ */ +message ReshapeLayerParams { + + /* + * The shape of the output. + * Must be of length 3 or 4. + * If set to 3, ``targetShape`` is interpreted as + * ``[1, 1, C_out, H_out, W_out]``, and sequence length of the input is preserved. + * If set to 4, ``targetShape`` is interpreted as + * ``[Seq_out, 1, C_out, H_out, W_out]``, + * where ``Seq_out`` is the new sequence length. + */ + repeated int64 targetShape = 1; + + enum ReshapeOrder { + + CHANNEL_FIRST = 0; + CHANNEL_LAST = 1; + + } + ReshapeOrder mode = 2; + +} + +/* + * A layer that rearranges the dimensions and data of an input. + * For generic transpose/permute operation see TransposeLayer. + * + * .. code:: + * + * y = PermuteLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * Must be a rank 5 blob. + * e.g.: shape ``[Seq, B, C, H, W]``. + * Output + * Rank 5 blob. Transposed version of the input, such that dimensions at axis=1 or axis=-4 is unchanged. + * + * + * Examples: + * + * Assume input shape is [Seq, B, C, H, W] + * + * - If ``axis`` is set to ``[0, 3, 1, 2]``, + * then the output has shape ``[Seq, B, W, C, H]`` + * + * - If ``axis`` is set to ``[3, 1, 2, 0]``, + * then the output has shape ``[W, B, C, H, Seq]`` + * + * - If ``axis`` is set to ``[0, 3, 2, 1]``, + * then the output has shape ``[Seq, B, W, H, C]`` + * + * - If ``axis`` is not set, or is set to ``[0, 1, 2, 3]``, + * the output is the same as the input. + */ +message PermuteLayerParams { + + /* + * The order in which to permute the dimensions. + * Must have length 4 and a permutation of ``[0, 1, 2, 3]``. + */ + repeated uint64 axis = 1; + +} + +/* + * A layer that reorganizes data in the input in specific ways. + * + * .. code:: + * + * y = ReorganizeDataLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 3. + * e.g.: blob with shape ``[C, H, W]``. + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. 
+ * Output + * Same rank as the input. + * e.g.: blob with shape ``[C_out, H_out, W_out]``. + * + * mode == SPACE_TO_DEPTH + * ``[C_out, H_out, W_out]`` : ``[C * blockSize * blockSize, H/blockSize, W/blockSize]``. + * blockSize must divide H and W. + * Data is moved from the spatial dimensions to the channel dimension. Input is spatially divided into + * non-overlapping blocks of size blockSize X blockSize and data from each block is moved into the + * channel dimension. + * + * mode == DEPTH_TO_SPACE + * ``[C_out, H_out, W_out]`` : ``[C/(blockSize * blockSize), H * blockSize, W * blockSize]``. + * Square of blockSize must divide C. + * Reverse of SPACE_TO_DEPTH. Data is moved from the channel dimension to the spatial dimensions. + * + * mode == PIXEL_SHUFFLE + * ``[C_out, H_out, W_out]`` : ``[C/(blockSize * blockSize), H * blockSize, W * blockSize]``. + * Square of blockSize must divide C. + * Similar to DEPTH_TO_SPACE, but using the pixel-shuffle semantics for channel order in the output space. + * In both modes, elements along the channel dimension are collapsed into + * blocks in the spatial dimensions. The difference is in the arrangement of + * the input-channels' data in the output space. See below example for more + * detail. + * (Only available in Core ML Specification >= 5 (iOS >= 14, macOS >= 11.0) + * + * + * Examples: + * + * Assume input is the following [C = 8, H = 1, W = 2] tensor: + * + * .. code:: + * + * [[[1 2]] [[3 4]] [[5 6]] [[7 8]] [[9 10]] [[11 12]] [[13 14]] [[15 16]]] + * + * If block_size == 2 and mode == DEPTH_TO_SPACE, output will be the following + * [C = 2, H = 2, W = 4] tensor: + * + * .. code:: + * + * [[[ 1 5 2 6] + * [ 9 13 10 14]] + * + * [[ 3 7 4 8] + * [11 15 12 16]]] + * + * For mode == SPACE_TO_DEPTH, the behavior is the same as mode == + * DEPTH_TO_SPACE, but with the input and output swapped. + * + * If block_size == 2 and mode == PIXEL_SHUFFLE, output will be the following + * [C = 2, H = 2, W = 4] tensor: + * + * .. 
code:: + * + * [[[ 1 3 2 4] + * [ 5 7 6 8]] + * + * [[ 9 11 10 12] + * [13 15 14 16]]] + * + */ +message ReorganizeDataLayerParams { + + enum ReorganizationType { + + SPACE_TO_DEPTH = 0; + DEPTH_TO_SPACE = 1; + PIXEL_SHUFFLE = 2; + + } + ReorganizationType mode = 1; + uint64 blockSize = 2; // must be greater than 1 + +} + +/* + * A layer that slices the input data along axis = -1 or -2 or -3. + * For general slice along any axis, please see SliceStaticLayer/SliceDynamicLayer. + * + * .. code:: + * + * y = SliceLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob that can, in general, have any rank. However, depending on the value of "axis" , + * there may be additional rank constraints. + * Output + * A blob with the same rank as the input. + * + * Sliced section is taken from the interval ``[startIndex, endIndex)``, i.e. + * startIndex is inclusive while endIndex is exclusive. + * stride must be positive and represents the step size for slicing. + * Negative indexing is supported for startIndex and endIndex. + * -1 denotes N-1, -2 denotes N-2 and so on, where N is the length of the dimension to be sliced. + * + */ +message SliceLayerParams { + + int64 startIndex = 1; // start of the sliced section. Inclusive. + int64 endIndex = 2; // end of sliced section. Exclusive. + uint64 stride = 3; // The step size. Must be positive. + + enum SliceAxis { + + CHANNEL_AXIS = 0; + HEIGHT_AXIS = 1; + WIDTH_AXIS = 2; + + } + // The following mapping is used for interpreting this parameter: + // CHANNEL_AXIS => axis = -3, input must have rank at least 3. + // HEIGHT_AXIS => axis = -2, input must have rank at least 2. + // WIDTH_AXIS => axis = -1 + SliceAxis axis = 4; + +} + +/* + * A layer that reduces the input using a specified operation. + * + * .. code:: + * + * y = ReduceLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob that can, in general, have any rank. 
However, depending on the value of "axis" , + * there may be additional rank constraints. + * Output + * A blob with the same rank as the input, which has 1s on the dimensions specified in the parameter "axis" + * + * Values supported for axis are [-1], [-2], [-3], [-2,-1], [-3,-2,-1] + * and the equivalent positive values (depending on the rank of the input) + * For mode == 'ArgMax', axis must be [-1] or [-2] or [-3]. + */ +message ReduceLayerParams { + + /* + * The following reduction operations are supported + * and are applied on the specified axis of the input array: + * + * ``SUM`` + * Sum of all elements + * + * .. math:: \sum{x_i} + * + * ``AVG`` + * Sum of all elements divided by the number of elements + * + * .. math:: \dfrac{\sum^n{x_i}}{n} + * + * ``PROD`` + * Product of all elements + * + * .. math:: \prod{x_i} + * + * ``LOGSUM`` + * Sum of the natural logarithm of all elements + * + * .. math:: \sum{\ln{(x_i + \epsilon)}} + * + * ``SUMSQUARE`` + * Sum of squares of all elements + * + * .. math:: \sum{x^2} + * + * ``L1`` + * L1 normalization of all elements + * + * .. math:: ||x||_1 = \sum{|x_i|} + * + * ``L2`` + * L2 normalization of all elements + * + * .. math:: ||x||_2 = \sqrt{\sum{x_i^2}} + * + * ``MAX`` + * Maximum of all elements + * + * .. math:: \text{max}(x_i) + * + * ``MIN`` + * Minimum of all elements + * + * .. math:: \text{min}(x_i) + * + * ``ARGMAX`` + * Argument of the maximum of all elements + * + * .. math:: \text{argmax}(x_i) + * + */ + enum ReduceOperation { + + SUM = 0; + AVG = 1; + PROD = 2; + LOGSUM = 3; + SUMSQUARE = 4; + L1 = 5; + L2 = 6; + MAX = 7; + MIN = 8; + ARGMAX = 9; // only supported with axis = C, H or W. + + } + ReduceOperation mode = 1; // Specifies function used to reduce. + + /* + * Used if mode is ``LOGSUM``. + * Defaults to ``1e-6`` if not set or is set to ``0``. 
+ */ + float epsilon = 2; + + enum ReduceAxis { + + CHW = 0; + HW = 1; + C = 2; + H = 3; + W = 4; + + } + + // The following mapping is used for interpreting this parameter: + // CHW = axis [-3, -2, -1], input must have rank at least 3. + // HW = axis [-2, -1], input must have rank at least 2. + // C = axis [-3] + // H = axis [-2] + // W = axis [-1] + ReduceAxis axis = 3; + +} + +/* + * A layer that crops the spatial dimensions of an input. + * If two inputs are provided, the shape of the second input is used as the reference shape. + * + * .. code:: + * + * y = CropLayer(x1) or y = CropLayer(x1,x2) + * + * Requires 1 or 2 inputs and produces 1 output. + * + * Input + * 1 or 2 tensors, each with rank at least 3, both inputs must have equal rank. + * Example: + * - 1 input case: A blob with shape ``[C, H_in, W_in]``. + * - 2 input case: 1st blob with shape ``[C, H_in, W_in]``, 2nd blob with shape ``[C, H_out, W_out]``. + * + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * Same rank as the inputs. + * e.g.: A blob with shape ``[C, H_out, W_out]``. + * + * If one input is used, output is computed as follows: + * + * .. code:: + * + * y = x1[:, topCropAmount:H_in - bottomCropAmount, leftCropAmount:W_in - rightCropAmount] + * + * topCropAmount == Height startEdgeSize == borderAmounts[0].startEdgeSize + * bottomCropAmount == Height endEdgeSize == borderAmounts[0].endEdgeSize + * leftCropAmount == Width startEdgeSize == borderAmounts[1].startEdgeSize + * rightCropAmount == Width endEdgeSize == borderAmounts[1].endEdgeSize + * + * H_out = H_in - topCropAmount - bottomCropAmount + * W_out = W_in - leftCropAmount - rightCropAmount + * + * If two inputs are used, output is computed as follows: + * + * .. code:: + * + * y = x1[:, offset[0]:offset[0] + H_out, offset[1]:offset[1] + W_out] + */ +message CropLayerParams { + + /* + * The amounts to be cropped from the input. 
+ * Used only if a single input is provided. + */ + BorderAmounts cropAmounts = 1; + + /* + * The offset amounts. + * Used only if two inputs are provided. + * Must be of length 2, in order ``[H, W]``. + */ + repeated uint64 offset = 5; + +} + +/* + * A layer that computes the elementwise average of the inputs. + * This layer has limited broadcasting support. For general broadcasting see AddBroadcastableLayer. + * + * .. code:: + * + * y = AverageLayer(x1,x2,...) + * + * Requires multiple inputs and produces 1 output. + * + * Input + * In general, there are no rank constraints. + * However, only certain set of shapes are broadcastable. For example: + * [B, 1, 1, 1], [B, C, 1, 1], [B, 1, H, W], [B, C, H, W] + * Output + * A blob with the same shape as each input. + */ +message AverageLayerParams { + +} + +/* + * A layer that computes the elementwise maximum over the inputs. + * + * .. code:: + * + * y = MaxLayer(x1,x2,...) + * + * Requires multiple inputs and produces 1 output. + * + * Input + * In general, there are no rank constraints. + * However, only certain set of shapes are broadcastable. For example: + * [B, C, 1, 1], [B, C, H, W] + * Output + * A blob with the same shape as each input. + */ +message MaxLayerParams { + +} + +/* + * A layer that computes the elementwise minimum over the inputs. + * + * .. code:: + * + * y = MinLayer(x1,x2,...) + * + * Requires multiple inputs and produces 1 output. + * + * Input + * In general, there are no rank constraints. + * However, only certain set of shapes are broadcastable. For example: + * [B, C, 1, 1], [B, C, H, W] + * Output + * A blob with the same shape as each input. + */ +message MinLayerParams { + +} + +/* + * A layer that computes the dot product of two vectors. + * + * .. code:: + * + * y = DotProductLayer(x1,x2) + * + * Requires 2 inputs and produces 1 output. + * + * Input + * Two blobs with rank at least 3, such that the last two dimensions must be 1. + * e.g.: blobs with shape ``[B, C, 1, 1]``. 
+ * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * Same rank as the input. + * e.g. for rank 4 inputs, output shape: [B, 1, 1, 1] + */ +message DotProductLayerParams { + + /* + * If true, inputs are normalized first, + * thereby computing the cosine similarity. + */ + bool cosineSimilarity = 1; + +} + +/* + * A layer that performs mean variance normalization, along axis = -3. + * + * .. code:: + * + * y = MeanVarianceNormalizeLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank greater than equal to 3. + * Example: Rank 4 blob represents [Batch, channels, height, width] + * For ranks greater than 3, the leading dimensions, starting from 0 to -4 (inclusive), are all treated as batch. + * + * Output + * A blob with the same shape as the input. + * + * If ``acrossChannels == true`` + * normalization is performed on flattened input, i.e. the input is reshaped to (Batch,C), where "Batch" contains + * all dimensions from 0 to -4 (inclusive), and C contains dimensions -1, -2, -3. + * + * If ``acrossChannels == false`` + * normalization is performed within a channel, + * across spatial dimensions (i.e. last two dimensions). + */ +message MeanVarianceNormalizeLayerParams { + + /* + * If true, mean and variance are computed across channels. + */ + bool acrossChannels = 1; + + /* + * If false, only mean is subtracted. + */ + bool normalizeVariance = 2; + + /* + * A small constant to avoid division by 0 while normalizing variance. + * Defaults to ``1e-6`` if not set or set to ``0``. + */ + float epsilon = 3; + +} + +/* + * A layer that repeats a sequence or the dimension sitting at axis = -5 + * + * .. code:: + * + * y = SequenceRepeatLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A blob with rank at least 5. + * e.g: shape ``[Seq, B, C, H, W]`` + * Output + * A blob with the same rank as the input. 
+ * e.g.: for input shape ``[Seq, B, C, H, W]``, output shape is ``[nRepetitions * Seq, B, C, H, W]``. + */ +message SequenceRepeatLayerParams { + + /* + * Number of repetitions. + * Defaults to ``1`` if not set or set to ``0``. + */ + uint64 nRepetitions = 1; + +} + +// Recurrent Layers +// ---------------- + +/* + * The following activations are supported with recurrent layers: + * - Linear + * - Sigmoid + * - Tanh + * - ReLU + * - Scaled Hyperbolic Tangent: alpha * tanh(beta * x), currently only supported for alpha = 1.7159, beta = 2/3 + * - Hard Sigmoid: min(max(alpha * x + beta, 0), 1), currently only supported for alpha = 0.2, beta = 0.5 + */ + +/* + * A simple recurrent layer. + * + * .. code:: + * + * y_t = SimpleRecurrentLayer(x_t, y_{t-1}) + * + * Input + * A blob of rank 5, with shape `[Seq, Batch, inputVectorSize, 1, 1]``. + * This represents a sequence of vectors of size ``inputVectorSize``. + * Output + * Same rank as the input. + * Represents a vector of size ``outputVectorSize``. It is either the final output or a sequence of outputs at all time steps. + * + * - Output Shape: ``[1, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == false`` + * - Output Shape: ``[Seq, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == true`` + * + * This layer is described by the following equation: + * + * .. math:: + * \boldsymbol{y_t} = f(\mathrm{clip}(W \boldsymbol{x_t} + \ + * R \boldsymbol{y_{t-1}} + b)) + * + * - ``W`` is a 2-dimensional weight matrix + * (``[outputVectorSize, inputVectorSize]``, row-major) + * - ``R`` is a 2-dimensional recursion matrix + * (``[outputVectorSize, outputVectorSize]``, row-major) + * - ``b`` is a 1-dimensional bias vector (``[outputVectorSize]``) + * - ``f()`` is an activation + * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]`` + */ +message SimpleRecurrentLayerParams { + + uint64 inputVectorSize = 1; // The size of the input vectors. 
+ uint64 outputVectorSize = 2; // The size of the output vectors. + + /* + * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5) + */ + ActivationParams activation = 10; // The activation function. + + /* + If false output is just the result after final state update. + If true, output is a sequence, containing outputs at all time steps. + */ + bool sequenceOutput = 15; + + bool hasBiasVector = 20; // If false, no bias is added. + + WeightParams weightMatrix = 30; // Weight matrix W. + WeightParams recursionMatrix = 31; // Recursion Weight matrix R. + WeightParams biasVector = 32; // Bias vector b. + + bool reverseInput = 100; + // If true, then the node processes the input sequence from right to left + +} + +/* + * Gated-Recurrent Unit (GRU) Layer + * + * .. code:: + * + * y_t = GRULayer(x_t, y_{t-1}) + * + * Input + * A blob of rank 5, with shape `[Seq, Batch, inputVectorSize, 1, 1]``. + * This represents a sequence of vectors of size ``inputVectorSize``. + * Output + * Same rank as the input. + * Represents a vector of size ``outputVectorSize``. It is either the final output or a sequence of outputs at all time steps. + * + * - Output Shape: ``[1, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == false`` + * - Output Shape: ``[Seq, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == true`` + * + * This layer is described by the following equations: + * + * Update Gate + * .. math:: + * \boldsymbol{z_t} = \ + * f(\mathrm{clip}(W_z \boldsymbol{x_t} + \ + * R_z \boldsymbol{y_{t-1}} + b_z)) + * + * Reset Gate + * .. math:: + * \boldsymbol{r_t} = \ + * f(\mathrm{clip}(W_r \boldsymbol{x_t} + \ + * R_r \boldsymbol{y_{t-1}} + b_r)) + * + * Cell Memory State + * .. math:: + * \boldsymbol{c_t} = \ + * \boldsymbol{y_{t-1}} \odot \boldsymbol{r_t} + * + * Output Gate + * ..
math:: + * \boldsymbol{o_t} = \ + * g(\mathrm{clip}(W_o \boldsymbol{x_t} + \ + * R_o \boldsymbol{c_t} + b_o)) + * + * Output + * .. math:: + * \boldsymbol{y_t} = \ + * (1 - \boldsymbol{z_t}) \odot \boldsymbol{o_t} + \ + * \boldsymbol{z_t} \odot \boldsymbol{y_{t-1}} + * + * - ``W_z``, ``W_r``, ``W_o`` are 2-dimensional input weight matrices + * (``[outputVectorSize, inputVectorSize]``, row-major) + * - ``R_z``, ``R_r``, ``R_o`` are 2-dimensional recursion matrices + * (``[outputVectorSize, outputVectorSize]``, row-major) + * - ``b_z``, ``b_r``, ``b_o`` are 1-dimensional bias vectors + * (``[outputVectorSize]``) + * - ``f()``, ``g()`` are activations + * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]`` + * - ``⊙`` denotes the elementwise product of matrices + */ +message GRULayerParams { + + uint64 inputVectorSize = 1; // Size of the input vectors. + uint64 outputVectorSize = 2; // Size of the output vectors. + + /* + * 2 element array representing activations [f(), g()] in that order. + * Typical values used = [sigmoid, tanh]. + * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5) + */ + repeated ActivationParams activations = 10; + + /* + * If false output is just the result after final state update. + * If true, output is a sequence, containing outputs at all time steps. + */ + bool sequenceOutput = 15; + + /* + * If false, no biases (``b_z``, ``b_r``, ``b_o``) are added. + */ + bool hasBiasVectors = 20; + + WeightParams updateGateWeightMatrix = 30; // Weight Matrix W_z. + WeightParams resetGateWeightMatrix = 31; // Weight Matrix W_r. + WeightParams outputGateWeightMatrix = 32; // Weight Matrix W_o. + + WeightParams updateGateRecursionMatrix = 50; // Recursion Weight Matrix R_z. + WeightParams resetGateRecursionMatrix = 51; // Recursion Weight Matrix R_r. + WeightParams outputGateRecursionMatrix = 52; // Recursion Weight Matrix R_o. 
+ + WeightParams updateGateBiasVector = 70; // Bias vector b_z. + WeightParams resetGateBiasVector = 71; // Bias vector b_r. + WeightParams outputGateBiasVector = 72; // Bias vector b_o. + + // If true, then the node processes the input sequence from right to left + bool reverseInput = 100; + +} + +/* + * Long short-term memory (LSTM) parameters. + * + * This is described by the following equations: + * + * Input Gate + * .. math:: + * \boldsymbol{i_t} = \ + * f(\mathrm{clip}(W_i \boldsymbol{x_t} + \ + * R_i \boldsymbol{y_{t-1}} + \ + * p_i \odot c_{t-1} + b_i)) + * + * Forget Gate + * .. math:: + * \boldsymbol{f_t} = \ + * f(\mathrm{clip}(W_f \boldsymbol{x_t} + \ + * R_f \boldsymbol{y_{t-1}} + \ + * p_f \odot c_{t-1} + b_f)) + * + * Block Input + * .. math:: + * \boldsymbol{z_t} = \ + * g(\mathrm{clip}(W_z \boldsymbol{x_t} + \ + * R_z \boldsymbol{y_{t-1}} + b_z)) + * + * Cell Memory State + * .. math:: + * \boldsymbol{c_t} = \ + * \boldsymbol{c_{t-1}} \odot \boldsymbol{f_t} + \ + * \boldsymbol{i_t} \odot \boldsymbol{z_t} + * + * Output Gate + * .. math:: + * \boldsymbol{o_t} = \ + * f(\mathrm{clip}(W_o \boldsymbol{x_t} + \ + * R_o \boldsymbol{y_{t-1}} + \ + * p_o \odot c_t + b_o)) + * + * Output + * .. 
math:: + * \boldsymbol{y_t} = \ + * h(\boldsymbol{c_t}) \odot \boldsymbol{o_t} + * + * - ``W_i``, ``W_f``, ``W_z``, ``W_o`` are 2-dimensional input weight matrices + * (``[outputVectorSize, inputVectorSize]``, row-major) + * - ``R_i``, ``R_f``, ``R_z``, ``R_o`` are 2-dimensional recursion matrices + * (``[outputVectorSize, outputVectorSize]``, row-major) + * - ``b_i``, ``b_f``, ``b_z``, ``b_o`` are 1-dimensional bias vectors + * (``[outputVectorSize]``) + * - ``p_i``, ``p_f``, ``p_o`` are 1-dimensional peephole vectors + * (``[outputVectorSize]``) + * - ``f()``, ``g()``, ``h()`` are activations + * - ``clip()`` is a function that constrains values between ``[-50.0, 50.0]`` + * - ``⊙`` denotes the elementwise product of matrices + */ +message LSTMParams { + + /* + * If true, output is a sequence, containing outputs at all time steps. + * If false, output is just the result after final state update. + */ + bool sequenceOutput = 10; + + /* + * If false, no biases (``b_i``, ``b_f``, ``b_z``, ``b_o``) are added. + */ + bool hasBiasVectors = 20; + + /* + * If true, a vector of ``1`` values is added to ``b_f``. + */ + bool forgetBias = 30; + + /* + * If true, peephole vectors are included. + */ + bool hasPeepholeVectors = 40; + + /* + * If the coupled Input and Forget flag is on, the behaviour of + * ``c_t`` is changed to the following (i.e. forget gate is not used): + * + * .. math:: + * \boldsymbol{c_t} = \ + * \boldsymbol{c_{t-1}} \odot (1 - \boldsymbol{i_t}) + \ + * \boldsymbol{i_t} \odot \boldsymbol{z_t} + * + */ + bool coupledInputAndForgetGate = 50; + + /* + * Places a limit on the maximum and minimum values of ``c_t``. + * c_t = min(c_t, cellClipThreshold) + * c_t = max(c_t, -cellClipThreshold) + * If 0, it is set to its default value = 50.0. + */ + float cellClipThreshold = 60; + +} + +/* + * Weights for long short-term memory (LSTM) layers + */ +message LSTMWeightParams { + + WeightParams inputGateWeightMatrix = 1; // Weight Matrix W_i.
+ WeightParams forgetGateWeightMatrix = 2; // Weight Matrix W_f. + WeightParams blockInputWeightMatrix = 3; // Weight Matrix W_z. + WeightParams outputGateWeightMatrix = 4; // Weight Matrix W_o. + + WeightParams inputGateRecursionMatrix = 20; // Recursion Weight Matrix R_i. + WeightParams forgetGateRecursionMatrix = 21; // Recursion Weight Matrix R_f. + WeightParams blockInputRecursionMatrix = 22; // Recursion Weight Matrix R_z. + WeightParams outputGateRecursionMatrix = 23; // Recursion Weight Matrix R_o. + + //biases: + WeightParams inputGateBiasVector = 40; // Bias vector b_i. + WeightParams forgetGateBiasVector = 41; // Bias vector b_f. + WeightParams blockInputBiasVector = 42; // Bias vector b_z. + WeightParams outputGateBiasVector = 43; // Bias vector b_o. + + //peepholes: + WeightParams inputGatePeepholeVector = 60; // Peephole vector p_i. + WeightParams forgetGatePeepholeVector = 61; // Peephole vector p_f. + WeightParams outputGatePeepholeVector = 62; // Peephole vector p_o. + +} + +/* + * A unidirectional long short-term memory (LSTM) layer. + * + * .. code:: + * + * (y_t, c_t) = UniDirectionalLSTMLayer(x_t, y_{t-1}, c_{t-1}) + * + * Input + * A blob of rank 5, with shape `[Seq, Batch, inputVectorSize, 1, 1]``. + * This represents a sequence of vectors of size ``inputVectorSize``. + * Output + * Same rank as the input. + * Represents a vector of size ``outputVectorSize``. It is either the final output or a sequence of outputs at all time steps. + * + * - Output Shape: ``[1, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == false`` + * - Output Shape: ``[Seq, Batch, outputVectorSize, 1, 1]`` , if ``sequenceOutput == true`` + * + */ +message UniDirectionalLSTMLayerParams { + + uint64 inputVectorSize = 1; // Size of the input vectors. + uint64 outputVectorSize = 2; // Size of the output vectors. + + /* + * 3 element array representing activations [f(),g(),h()] in that order. + * Typical values used = [sigmoid, tanh, tanh]. 
+ * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5) + */ + repeated ActivationParams activations = 10; + + LSTMParams params = 15; + + LSTMWeightParams weightParams = 20; // Weights, biases and peepholes. + + // If true, then the node processes the input sequence from right to left + bool reverseInput = 100; + +} + +/* + * Bidirectional long short-term memory (LSTM) layer + * + * .. code:: + * + * (y_t, c_t, y_t_reverse, c_t_reverse) = BiDirectionalLSTMLayer(x_t, y_{t-1}, c_{t-1}, y_{t-1}_reverse, c_{t-1}_reverse) + * + * Input + * A blob of rank 5, with shape `[Seq, Batch, inputVectorSize, 1, 1]``. + * This represents a sequence of vectors of size ``inputVectorSize``. + * Output + * Same rank as the input. + * Represents a vector of size ``2 * outputVectorSize``. It is either the final output or a sequence of outputs at all time steps. + * + * - Output Shape: ``[1, Batch, 2 * outputVectorSize, 1, 1]`` , if ``sequenceOutput == false`` + * - Output Shape: ``[Seq, Batch, 2 * outputVectorSize, 1, 1]`` , if ``sequenceOutput == true`` + * + * + * The first LSTM operates on the input sequence in the forward direction. + * The second LSTM operates on the input sequence in the reverse direction. 
+ * + * Example: given the input sequence ``[x_1, x_2, x_3]``, + * where ``x_i`` are vectors at time index ``i``: + * + * The forward LSTM output is ``[yf_1, yf_2, yf_3]``, + * + * where ``yf_i`` are vectors of size ``outputVectorSize``: + * + * - ``yf_1`` is the output at the end of sequence {``x_1``} + * - ``yf_2`` is the output at the end of sequence {``x_1``, ``x_2``} + * - ``yf_3`` is the output at the end of sequence {``x_1``, ``x_2``, ``x_3``} + * + * The backward LSTM output: ``[yb_1, yb_2, yb_3]``, + * + * where ``yb_i`` are vectors of size ``outputVectorSize``: + * + * - ``yb_1`` is the output at the end of sequence {``x_3``} + * - ``yb_2`` is the output at the end of sequence {``x_3``, ``x_2``} + * - ``yb_3`` is the output at the end of sequence {``x_3``, ``x_2``, ``x_1``} + * + * Output of the bi-dir layer: + * + * - if ``sequenceOutput = True`` : { ``[yf_1, yb_3]``, ``[yf_2, yb_2]``, ``[yf_3, yb_1]`` } + * - if ``sequenceOutput = False`` : { ``[yf_3, yb_3]`` } + */ +message BiDirectionalLSTMLayerParams { + + /* + * Size of the input vectors. + */ + uint64 inputVectorSize = 1; + /* + * Size of the outputs vectors. + * It is same for both forward and backward LSTMs. + */ + uint64 outputVectorSize = 2; + + /* + * 3 element array representing activations [f(),g(),h()] in that order. + * Typical values used = [sigmoid, tanh, tanh]. + * Activations supported are Linear, Sigmoid, Tanh, ReLU, Scaled Tanh (alpha = 1.71, beta = 2/3), Hard sigmoid (alpha = 0.2, beta = 0.5) + */ + repeated ActivationParams activationsForwardLSTM = 10; + /* + * Currently, backward LSTM activations + * must be same as the ones for the forward LSTM. + */ + repeated ActivationParams activationsBackwardLSTM = 11; + + /* + * Common parameters shared by the forward and backward LSTMs. + */ + LSTMParams params = 15; + + /* + * Weights and biases. + * Must be a length 2 message, + * for the forward and backward LSTM respectively. 
+ */ + repeated LSTMWeightParams weightParams = 20; + +} + +message CustomLayerParams { + + message CustomLayerParamValue { + oneof value { + double doubleValue = 10; + string stringValue = 20; + int32 intValue = 30; + int64 longValue = 40; + bool boolValue = 50; + } + } + + string className = 10; // The name of the class (conforming to MLCustomLayer) corresponding to this layer + repeated WeightParams weights = 20; // Any weights -- these are serialized in binary format and memmapped at runtime + map<string, CustomLayerParamValue> parameters = 30; // these may be handled as strings, so this should not be large + string description = 40; // An (optional) description of the layer provided by the model creator. This information is displayed when viewing the model, but does not affect the model's execution on device. + +} + +/* + * A layer that rearranges the dimensions and data of an input. + * + * .. code:: + * + * y = TransposeLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * A N-Dimensional tensor. + * Output + * A N-Dimensional tensor of the same rank but with dimensions and data permuted according to axes. + * Shape: ``[InputShape[axis[0]], InputShape[axis[1]], ... , InputShape[axis[N-1]]]`` + * + * Examples: + * + * - If ``axes`` is set to ``[3, 1, 2, 0]`` and the input shape is ``[6,7,8,9]``, + * then the output has shape ``[9,7,8,6]`` + */ + +message TransposeLayerParams { + + /* + * Length of "axes" should match the rank of input & output tensor + * "axes" should be a permutation of "[0,1,2,...,N-1]" where N is the rank. + */ + repeated uint64 axes = 1; // + +} + +/* + * A layer that computes the matrix multiplication of two tensors with numpy-like broadcasting + * where the matrices reside in the last two indices of the tensor. + * + * .. code:: + * + * y = BatchedMatMul(a,b) + * + * Requires 1 or 2 inputs and produces 1 output. + * + * The first tensor, "a", must be provided as an input.
The second tensor can either be an input or provided as a weight matrix parameter. + * + * Input + * - a: First N-Dimensional tensor + * - b: Second N-Dimensional tensor (either a rank-N input or a matrix, i.e. N=2, provided as a layer parameter) + * + * Output + * A tensor containing the matrix product of two tensors. + * When there are two inputs: rank is max(2, rank(a), rank(b)) + * When there is one input: rank is same as that of the input. + * + * This operation behaves as following: + * + * When there are two inputs: + * - If N >= 2 for both tensors, it is treated as a batch of matrices residing in the last two indices. + * All the indices, except for the last two, are broadcasted using conventional rules. + * - If the first tensor is 1-D, it is converted to a 2-D tensor by prepending a 1 to its shape. Eg. (D) -> (1,D) + * - If the second tensor is 1-D, it is converted to a 2-D tensor by appending a 1 to its shape. Eg. (D) -> (D,1) + * + * When there is one input: + * - The weight matrix corresponds to a matrix, of shape (X1, X2). Values of X1, X2 must be provided as layer parameters. + * - The input, "a", is reshaped into a matrix by combining all the leading dimensions, except the last, into a batch dimension. eg: + * - if "a" is rank 1 (X1,) --> (1, X1). Output shape will be (X2,) + * - if "a" is rank 2 (B1, X1) --> no need to reshape. Output shape will be (B1, X2) + * - if "a" is rank 3 (B1, B2, X1) --> (B1 * B2, X1). Output shape will be (B1, B2, X2) + * - etc + */ +message BatchedMatMulLayerParams { + + /* + * If transposeA is true, it transposes the left matrix on the fly before matrix multiplication. + * (is ignored when there is one input) + */ + bool transposeA = 1; + /* + * If transposeB is true, it transposes the right matrix on the fly before matrix multiplication. + * (is ignored when there is one input) + */ + bool transposeB = 2; + + /* + * Following parameters are ignored when there are two inputs. 
+ */ + + uint64 weightMatrixFirstDimension = 5; // X1: same as the last dimension of the input tensor + uint64 weightMatrixSecondDimension = 6; // X2: same as the last dimension of the output tensor + + bool hasBias = 7; // Whether a bias is added or not. Supported only when there is one input. + + /* + * Weight matrix representing shape [X1, X2]. + * Values are however stored in column major order, + * in the "repeated float" or "bytes" fields of the message "WeightParams" + */ + WeightParams weights = 8; + WeightParams bias = 9; // Bias vector [X2]. Supported only when there is one input. + + /* + * If set, this layer, at runtime, quantizes the floating point input blob to int8 before applying the + * matrix multiplication using the INT8 weight parameters provided in weights->int8RawValue. The + * result is then dequantized. + * Requires: + * * number of inputs to be 1 + * * hasBias == false + * * QuantizationType == LinearQuantizationParams, such that + * * size of the "scale" field is 1 and "bias" field is empty in "LinearQuantizationParams" + * * numberOfBits == 8 + * * weights->rawValue_size to be empty + */ + bool int8DynamicQuantize = 10; + +} + +/* + * A layer that concatenates a list of tensors along a specified axis. + * + * .. code:: + * + * y = ConcatNDLayer(x1,x2,....) + * + * Requires at least 2 input and produces 1 output. + * + * Input + * The rank of the input tensors must match and all dimensions also must match, except for the dimension 'axis'. + * + * + * Output + * Same rank as the input. The dimension along "axis", is the sum of the dimensions of the inputs. 
+ * + * example: + * + * in1 : shape (3, 2), value = [[1, 2], [3, 4], [5, 6]] + * in2 : shape (3, 2), value = [[7, 8], [9, 10], [11, 12]] + * axis = 0 + * + * if interleave = False (default) + * output : shape (6, 2) + * output[0:3, :] = in1 + * output[3:6, :] = in2 + * value = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]] + * + * if interleave = True + * output : shape (6, 2) + * output[0::2, :] = in1 + * output[1::2, :] = in2 + * value = [[1, 2], [7, 8], [3, 4], [9, 10], [5, 6], [11, 12]] + * + */ +message ConcatNDLayerParams { + + /* + * Dimension along which to concatenate. Supports negative values of the parameter 'axis'. + */ + int64 axis = 1; + + /* + * (Only available in Core ML Specification >= 5 (iOS >= 14, macOS >= 11.0) + * Interleave option. If True, concatenation is done via interleaving the inputs. + * This requires all inputs to have the exact same shape. + */ + bool interleave = 2; + + +} + +/* + * A layer that performs softmax normalization along a specified axis. + * + * .. code:: + * + * y = SoftmaxNDLayer(x) + * + * Requires 1 input and produces 1 output. + * + * Output shape is same as the input. + */ +message SoftmaxNDLayerParams { + + /* + * Dimension on which the softmax would be performed. Supports negative values of the parameter 'axis'. + */ + int64 axis = 1; + +} + +/* + * A layer that reverses specific dimensions of the input tensor. + * It is similar in functionality to the numpy.flip method. + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + */ +message ReverseLayerParams { + + /* + * Reverses each dimension of the input tensor for which corresponding reverseDim is set to True. + * Requires len(reverseDim) == rank(inputTensor) + */ + repeated bool reverseDim = 1; + +} + +/* + * A layer that reverses variable length slices. + * + * Requires 2 inputs and produces 1 output. + * + * 2 inputs, in order are denoted by "data", "seq_lengths". + * "seq_lenghts" must be a rank 1 tensor, i.e. 
seq_lengths.shape = (B,) + * which contains the lengths of the amount of sequence to be reversed, for each element of the batch. + * Dimension "batchAxis" in "data" must be equal to B, i.e, + * data.shape[batchAxis] = B. + * + * According to the batch axis, input "data" is first divided into a batch of B inputs, + * each of which is flipped along the dimension "sequenceAxis", by the amount specified in + * "seq_lengths", the second input. + * + * e.g.: + * + * data [shape = (2,4)]: + * [0 1 2 3] + * [4 5 6 7] + * seq_lengths [shape = (2,)]: + * [3, 0] + * batchAxis = 0 + * sequenceAxis = 1 + * + * output [shape = (2,4)]: + * [2 1 0 3] + * [4 5 6 7] + * + * + * data [shape = (2,3,2)]: + * [0 1] + * [2 3] + * [4 5] (slice = 0) + * [6 7] + * [8 9] + * [10 11] (slice = 1) + * seq_lengths [shape = (2,)]: + * [2, 3] + * batchAxis = 0 + * sequenceAxis = 1 + * + * output [shape = (2,3,2)]: + * [2 3] + * [0 1] + * [4 5] (slice = 0) + * [10 11] + * [8 9] + * [6 7] (slice = 1) + * + * Output shape is same as the input. + */ +message ReverseSeqLayerParams { + + int64 batchAxis = 1; // batch axis has to be strictly less than seq_axis + int64 sequenceAxis = 2; + +} + +/* + * A layer that loads data as a parameter and provides it as an output. + * + * .. code:: + * + * y = LoadConstantNDLayer() + * + * Requires no input and produces 1 output. + * + * Output: A tensor with shape as provided in the parameter "shape" + */ +message LoadConstantNDLayerParams { + + /* + * The shape of the constant to be loaded. + */ + repeated uint64 shape = 1; + WeightParams data = 2; + +} + +/* + * A layer that generates an output tensor with a constant value. + * Input is only used to determine the shape of the output. + * This layer is used to allocate a tensor with a dynamic shape (that of the input) and constant value. + * + * Requires 1 input and produces 1 output. + * + * .. code:: + * + * y = FillLikeLayer(x) + * + * Input + * A N-Dimensional tensor, whose values are ignored. 
Only the shape is used to + * infer the shape of the output. + * + * Output + * A N-Dimensional tensor with the same shape as the input tensor. + * + */ +message FillLikeLayerParams { + + float value = 1; + +} + +/* + * A layer that generates an output tensor with a constant value. + * This layer is used to allocate a tensor with a static shape and constant value. + * + * Requires no input and produces 1 output. + * + * .. code:: + * + * y = FillStaticLayer(x) + * + * Output + * A N-Dimensional tensor of shape "targetShape". + * + */ +message FillStaticLayerParams { + + float value = 1; + repeated uint64 targetShape = 2; + +} + +/* + * A layer that generates an output tensor with a constant value. + * This layer is used to allocate a tensor with a dynamic shape (as specified by the input) and constant value. + * + * Requires 1 input and produces 1 output. + * + * .. code:: + * + * y = FillDynamicLayer(x) + * + * Input + * A rank 1 tensor specifying the shape of the output + * + * Output + * An N-Dimensional tensor with the shape specified by the values in the input tensor. + * + */ +message FillDynamicLayerParams { + + float value = 1; + +} + +/* + * A layer that returns the elements either from tensor x or tensor y, + * depending on the value in the condition tensor. + * It is similar in functionality to the numpy.where method with 3 inputs. + * + * Requires 3 inputs and produces 1 output. + * Inputs, in order, are the condition tensor, x and y. + * + * for each vector index (i,...,j): + * output[i,...,j] = x[i,...,j] if condition[i,...,j] = True + * y[i,...,j] if condition[i,...,j] = False + * + * All the 3 inputs are first broadcasted to a common shape. + * (the shapes must be broadcastable) + * + * output.rank = max(input[0].rank, input[1].rank, input[2].rank) + * + */ +message WhereBroadcastableLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric sine function. + * + * + * .. 
code:: + * + * y = SinLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message SinLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric cosine function. + * + * + * .. code:: + * + * y = CosLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message CosLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric tangent function. + * + * + * .. code:: + * + * y = TanLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message TanLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric arcsine function. + * + * + * .. code:: + * + * y = AsinLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AsinLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric arccosine function. + * + * + * .. code:: + * + * y = AcosLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AcosLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric arctangent function. + * + * + * .. code:: + * + * y = AtanLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AtanLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic sine function. + * + * + * .. code:: + * + * y = SinhLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message SinhLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic cosine function. + * + * + * .. code:: + * + * y = CoshLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. 
+ * + */ +message CoshLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic tangent function. + * + * + * .. code:: + * + * y = TanhLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message TanhLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic arcsine function. + * + * + * .. code:: + * + * y = AsinhLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AsinhLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic arccosine function. + * + * + * .. code:: + * + * y = AcoshLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AcoshLayerParams { + +} + +/* + * A layer that computes elementwise trigonometric hyperbolic arctangent function. + * + * + * .. code:: + * + * y = AtanhLayer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message AtanhLayerParams { + +} +/* + * A layer that raises each element in first tensor to the power of + * corresponding element in the second tensor. + * Supports conventional numpy-like broadcasting. + * + * .. code:: + * + * y = PowBroadcastableLayer(x) + * + * Requires 2 inputs and produces 1 output. + * + * Input + * - First N-Dimensional tensor + * - Second N-Dimensional tensor + * + * Output + * An N-Dimensional tensor with the broadcast shape. + * + */ +message PowBroadcastableLayerParams { + +} + +/* + * A layer that computes the exponential of all elements in the input tensor, with the base 2. + * + * + * .. code:: + * + * y = Exp2Layer(x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message Exp2LayerParams { + +} + +/* + * A layer that returns a tensor containing the indices of all non-zero + * elements of input tensor. 
+ * It is similar in functionality to the numpy.where method with 1 input. + * + * Requires 1 input and produces 1 output. + * Output is of rank 2, of shape (N,R), + * where N is the number of non-zero elements in the input and R is the rank of the input. + * + * Output contains indices represented in the multi-index form + * + * e.g.: + * input {shape = (4,)}: + * [0 1 0 2] + * output {shape = (2,1)}: + * [1] + * [3] + * + * + * input {shape = (3, 3)}: + * [1 2 1] + * [0 2 2] + * [2 1 0] + * output {shape = (7,2)}: + * [0. 0.] + * [0. 1.] + * [0. 2.] + * [1. 1.] + * [1. 2.] + * [2. 0.] + * [2. 1.] + * + */ +message WhereNonZeroLayerParams { + +} + +/* + * A layer that copies a tensor setting everything outside a central band in + * each inner-most matrix to zero. + * + * Requires 1 input and produces 1 output. + * + * Parameters for matrix_band_part layer + * band(m, n) = (num_lower < 0 || (m-n) <= num_lower) && (num_upper < 0 || (n-m) <= num_upper). + * output[i, j, k, ..., m, n] = band(m, n) * input[i, j, k, ..., m, n] + * + * + * Output shape is same as the input shape. + * Rank of the input must be at least 2. + * For rank higher than 2, the last 2 dimensions are treated as the matrix, while the rest are treated as batch. + */ +message MatrixBandPartLayerParams { + + int64 numLower = 1; + int64 numUpper = 2; + +} + +/* + * A layer that copies a tensor setting everything outside upper triangular to zero. + * + * Requires 1 input and produces 1 output. + * + * Output shape is same as the input shape. + * Rank of the input must be at least 2. + * For rank higher than 2, the last 2 dimensions are treated as the matrix, while the rest are treated as batch. + */ +message UpperTriangularLayerParams { + + int64 k = 1; // Diagonal below which to zero elements. k = 0 (the default) is the main diagonal, k < 0 is below it and k > 0 is above + +} + +/* + * A layer that copies a tensor setting everything outside lower triangular to zero.
+ * + * Requires 1 input and produces 1 output. + * + * Output shape is same as the input shape. + * Rank of the input must be at least 2. + * For rank higher than 2, the last 2 dimensions are treated as the matrix, while the rest are treated as batch. + */ +message LowerTriangularLayerParams { + + int64 k = 1; // Diagonal above which to zero elements. k = 0 (the default) is the main diagonal, k < 0 is below it and k > 0 is above + +} + +/* + * + * A layer that broadcasts a tensor to a new shape. + * + * Requires 2 inputs and produces 1 output. + * + * First input is broadcast to produce the output, while the second input is only + * used to determine the shape of the output. Values of second input are not used. + * + * Output is a tensor with the same shape as the second input. + * + */ +message BroadcastToLikeLayerParams { + +} + +/* + * + * A layer that broadcasts a tensor to a new shape. + * + * Requires 1 input and produces 1 output. + * + * Output tensor is the broadcasted version of the input and has shape as specified in the + * parameter "targetShape". + */ +message BroadcastToStaticLayerParams { + + repeated uint64 targetShape = 1; + +} + +/* + * + * A layer that broadcasts a tensor to a new shape. + * + * Requires 2 inputs and produces 1 output. + * + * First input is the one that is broadcasted to produce the output. + * Second input is a rank 1 tensor specifying the shape of the output. + * Output tensor has shape as specified by the values in the 2nd input tensor. + */ +message BroadcastToDynamicLayerParams { + +} + +/* + * A layer that performs element-wise addition operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message AddBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise maximum operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. 
+ */ +message MaxBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise minimum operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message MinBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise modular operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message ModBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise floor division operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message FloorDivBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise subtract operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message SubtractBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise multiply operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message MultiplyBroadcastableLayerParams { + +} + +/* + * A layer that performs element-wise division operation with broadcast support. + * + * Requires 2 inputs and produces 1 output. + */ +message DivideBroadcastableLayerParams { + +} + +/* + * Gather layer that gathers elements from the first input, along a specified axis, + * at indices specified in the second input. + * It is similar in functionality to the numpy.take method. + * + * Requires 2 inputs and produces 1 output. + * + * Given two inputs, 'data' and 'indices', gather the slices of 'data' + * and store into output. + * e.g. + * for i in [0, length(indices) - 1] + * output[i] = data[indices[i]] (1-D case, axis=0) + * + * if axis = 0: + * for each vector index (i,...,j) + * output[i,...,j,:,..,:] = data[indices[i,...,j],:,..,:] + * + * output.rank = (data.rank - 1) + indices.rank + * + * Negative indices and negative axis are supported. 
+ * + * e.g: + * + * data shape = (2, 3) + * indices shape = (6, 8) + * axis = 0 + * output shape = (6, 8) + (3,) = (6, 8, 3) + * + * data shape = (2, 3, 5) + * indices shape = (6, 8) + * axis = 1 + * output shape = (2,) + (6, 8) + (5,) = (2, 6, 8, 5) + * + */ +message GatherLayerParams { + + int64 axis = 1; + +} + +/* + * Scatter accumulation mode. + */ +enum ScatterMode { + + SCATTER_UPDATE = 0; + SCATTER_ADD = 1; // add + SCATTER_SUB = 2; // subtract + SCATTER_MUL = 3; // multiply + SCATTER_DIV = 4; // divide + SCATTER_MAX = 5; // maximum + SCATTER_MIN = 6; // minimum + +} + +/* + * A layer that scatters data into a new tensor according to indices from the input. + * This is the inverse operation of Gather. + * + * Requires 3 inputs and produces 1 output. + * + * Output is initialized with the first input. + * Then updated with the values in the third input, at indices specified by the second input. + * + * An example when axis=0: + * Given three inputs, in order, "container", "indices", "updates", where + * + * - "container" is a rank R+1 tensor of shape [D_0, D_1, ..., D_R], which + * contains D_0 number of tensors, each with shape [D_1, ..., D_R]. + * + * - "indices" is a rank 1 tensor with shape [N], where N is the number of updates. + * The values in this tensor must be in the range [0, D_0 - 1]. (negative indexing is supported) + * + * - "updates" is a rank R+1 tensor with shape [N, D_1, ..., D_R], which represents + * a total number of N tensors, each of shape [D_1, ..., D_R]. 
+ *
+ * The effect of this operation is as follows:
+ *
+ * output = container;
+ * For each i in 0, ..., N - 1
+ * output[indices[i], :, ..., :] = updates[i, :, ..., :] // if mode == "SCATTER_UPDATE"
+ *
+ * or
+ * For each i in 0, ..., N - 1
+ * output[indices[i], :, ..., :] += updates[i, :, ..., :] // if mode == "SCATTER_ADD"
+ *
+ * etc
+ *
+ * When "indices" is a tensor of rank greater than 1, the equation becomes (for axis=0):
+ * For each vector index (i,...,j)
+ * output[indices[i,...,j],...] -= updates[i,...,j,...] // if mode == "SCATTER_SUB"
+ *
+ *
+ * The output has the same shape as the first input.
+ * "indices" input must have rank less than or equal to the "updates" input and its shape
+ * must be a subset of the shape of the "updates" input.
+ *
+ * e.g:
+ *
+ * container shape = (4, 3)
+ * indices shape = (5, 2, 3)
+ * updates shape = (4, 5, 2, 3)
+ * axis = 1
+ * output shape = (4, 3)
+ *
+ * container shape = (4, 4, 3)
+ * indices shape = (6,)
+ * updates shape = (4, 6, 3)
+ * axis = -2
+ * output shape = (4, 4, 3)
+ *
+ * container shape = (5,)
+ * indices shape = (5, 7, 5, 6)
+ * updates shape = (5, 7, 5, 6)
+ * axis = -1
+ * output shape = (5,)
+ */
+
+message ScatterLayerParams {
+
+ int64 axis = 1;
+ ScatterMode mode = 2; // mode of accumulation.
+
+}
+
+/*
+ * A layer that gathers elements from the first input, 'params', at the multi-indices specified
+ * by the second input, 'indices'.
+ *
+ * Requires 2 inputs and produces 1 output.
+ *
+ * 'params' = input[0], 'indices' = input[1]
+ *
+ * 'indices' is a rank K+1 tensor of shape [I_0, I_1, .., I_(K-1), I_K] which is viewed as a collection of
+ * indices of (I_0 * I_1 * ... * I_(K-1)) points in the I_K dimensional space. For instance, the multi-index of the first point
+ * is indices[0,0,...,0,:].
+ *
+ * Here is how the output is constructed:
+ *
+ * for i = 0,1,...,(I_0-1)
+ * ...
+ * for j = 0,1,....,(I_(K-1)-1) + * output[i,....,j,:,:,..,:] = params[indices[i,...,j,:], :,:,..,:] + * + * Hence, output shape is [I_0, I_1,...,I(K-1)] + params.shape[I_K:] + * + * output.rank = indices.rank - 1 + params.rank - indices.shape[-1] + * + * e.g: + * + * input[0] shape = (4, 2, 3, 4) + * input[1] shape = (6, 2) + * output shape = (6,) + (3, 4) = (6, 3, 4) + * + * input[0] shape = (3, 3, 3, 4, 7) + * input[1] shape = (3, 5) + * output shape = (3,) + () = (3,) + * + * input[0] shape = (5, 3, 2, 5) + * input[1] shape = (2, 7, 3, 2) + * output shape = (2, 7, 3) + (2, 5) = (2, 7, 3, 2, 5) + * + */ +message GatherNDLayerParams { + +} + +/* + * A layer that scatters data into a new tensor according to multi-indices from the input. + * This is the inverse operation of GatherND. + * + * Requires 3 inputs and produces 1 output. + * 3 inputs, in order are denoted as "container", "indices", "updates". + * + * 'indices' is a rank K+1 tensor of shape [I_0, I_1, .., I_(K-1), I_K] which is viewed as a collection of + * indices of (I_0 * I_1 * ... * I_(K-1)) points in the I_K dimensional space. For instance, the multi-index of the first point + * is indices[0,0,...,0,:]. + * + * container.rank >= I_K + * updates.rank = K + (container.rank - I_K) + * shape of 'updates' = [I_0, I_1,...,I(K-1)] + container.shape[I_K:] + * + * output = container + * For each vector index (i,...,j) s.t. 0<=i shape: (3,) + * reps = N/A [Ignored] + * output shape = (2, 8, 12) + * + */ +message TileLayerParams { + + repeated uint64 reps = 1; + +} + +/* + * A layer that returns the shape of an input tensor. + * + * Requires 1 input and produces 1 output. + * + * Input: a tensor. + * Output: a vector of length R, where R is the rank of the input tensor + * Output is always a rank 1 tensor. + */ +message GetShapeLayerParams { + +} + +/* + * A layer that computes the Gauss error function, + * which is defined as: + * + * .. 
math:: + * f(x) = \dfrac{1}{\sqrt{\pi}}\int_{-x}^{x}{e^{-t^2}dt} + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + */ +message ErfLayerParams { + +} + +/* + * A layer that evaluates the Gaussian Error Linear Unit (GELU) activation. + * Following equations are used to compute the activation based on the value of the "mode" parameter: + * + * mode == 'EXACT': + * .. math:: + * f(x) = 0.5x\left ( 1+\rm{erf}\left ( \frac{x}{\sqrt{2}} \right ) \right ) + * + * mode == 'TANH_APPROXIMATION': + * .. math:: + * f(x) = 0.5x\left ( 1+\rm{tanh}\left ( \sqrt{2/\pi}\left ( x + 0.044715x^3 \right ) \right ) \right ) + * + * mode == 'SIGMOID_APPROXIMATION': + * .. math:: + * f(x) = x*\rm{sigmoid}(1.702x) + * + * Requires 1 input and produces 1 output. + * Output shape is same as the input. + * + */ +message GeluLayerParams { + + enum GeluMode { + + EXACT = 0; + TANH_APPROXIMATION = 1; + SIGMOID_APPROXIMATION = 2; + + } + + GeluMode mode = 1; // mode of GELU operation. + +} + +/* + * RangeStatic layer that returns a tensor that contains evenly spaced values. + * It is similar in functionality to the numpy.arange method. + * + * Requires no input and produces 1 output. + * Output is a rank 1 tensor. + */ +message RangeStaticLayerParams { + + float endValue = 1; + float startValue = 2; + float stepSizeValue = 3; + +} + +/* + * A layer that returns a tensor that contains evenly spaced values. + * Its functionality is similar to the numpy.arange method. + * + * Requires at least 1 input, up to a maximum of 3 inputs. + * Produces 1 output, which is a rank 1 tensor. + * + * Each input must be a scalar, or rank 1 and shape (1,). + * + * The first input represents the "endValue". + * The second input, if present, corresponds to "startValue". In this case the value of the "startValue" parameter is ignored. + * The third input, if present, corresponds to "stepSizeValue". In this case the value of the "stepSizeValue" parameter is ignored. 
+ * + */ +message RangeDynamicLayerParams { + + float startValue = 2; + float stepSizeValue = 3; + +} + +/* + * A layer that returns a tensor containing all windows of size ``windowSize`` + * separated by ``step`` along the dimension ``axis``. + * + * .. code:: + * + * y = SlidingWindows(x) + * + * Requires 1 input and produces 1 output. + * + * Input + * An N-Dimensional tensor. + * + * Output + * An (N+1)-Dimensional tensor. + * + * This operation behaves as following: + * - if axis = 0 & input is rank 1 (L,). Output shape will be (M, W). + * - if axis = 1 & input is rank 3 (B1, L, C1). Output shape will be (B1, M, W, C1) + * - if axis = 2 & input is rank 5 (B1, B2, L, C1, C2) --> (B1 * B2, L, C1 * C2) --> (B1 * B2, M, W, C1 * C2). Output shape will be (B1, B2, M, W, C1, C2) + * - etc. + * where + * - L, C, B refer to input length, feature dimension length & batch size respectively + * - W is the window size. + * - M is the number of windows/slices calculated as M = (L - W) / step + 1 + */ +message SlidingWindowsLayerParams { + + int64 axis = 1; + uint64 windowSize = 2; + uint64 step = 3; + +} + +/* + * A layer that applies layer normalization over the input tensor. + * + * Requires 1 input and produces 1 output. + * + * output = gamma * (input - computed_mean) / (sqrt(computed_variance + eps)) + beta + * + * Parameters + * normalizedShape: subset of the input shape, along with layer norm is performed, rest of the input shape is treated as the batch dimension. The mean and variance are computed for the input, over the last few dimensions as specified by the normalizedShape parameter. + * gamma: must have shape = "normalizedShape" + * beta: must have shape = "normalizedShape" + * eps: small constant to avoid division by 0 + * + * Output shape is same as the input. 
+ * + * e.g.: + * input shape = (10,5) + * normalized shape = (5,) or (10,5) + * + * input shape = (10,5,6,7) + * normalized shape = (7,) or (6,7) or (5,6,7) or (10,5,6,7) + */ +message LayerNormalizationLayerParams { + + repeated int64 normalizedShape = 1; + float eps = 2; + WeightParams gamma = 3; + WeightParams beta = 4; + +} + +/* + * Non maximum suppression (NMS) layer. + * Applies the non maximum suppression algorithm to input bounding box coordinates. + * The effect of this layer is similar to the functionality of the "NonMaximumSuppression" + * model type (for details please see NonMaximumSuppression.proto) with a couple of differences. + * One, this is a layer in a neural network model, whereas that is a different model type. Second, + * this layer supports a batch of bounding boxes. + * + * The NMS layer requires at least 2 inputs, and up to a maximum of 5 inputs. It produces 4 outputs. + * Following is the description of inputs and outputs: + * + * input 1, shape (B,N,4): coordinates of N boxes, for a batch size B. + * input 2, shape (B,N,C): class scores for each box. C can be 1 when there is only 1 score per box, i.e., no class specific score. + * + * input 3, optional, shape (1,): IoU threshold. When present, it overwrites the value provided in layer parameter "iouThreshold". + * input 4, optional, shape (1,): Score threshold. When present, it overwrites the value provided in layer parameter "scoreThreshold". + * input 5, optional, shape (1,): Maximum number of boxes. When present, it overwrites the value provided in layer parameter "maxBoxes". + * + * output 1, shape (B,maxBoxes,4): box coordinates, corresponding to the surviving boxes. + * output 2, shape (B,maxBoxes,C): box scores, corresponding to the surviving boxes. + * output 3, shape (B,maxBoxes): indices of the surviving boxes. Hence it will have values in the range [0,N-1], except for padding. + * output 4, shape (B,): number of boxes selected after the NMS algorithm, for each batch. 
+ * + * When surviving boxes are less than "maxBoxes", the first 3 outputs are padded. + * For the first two outputs, the padding is done using values 0, whereas for the third output the + * padding value used is -1, since the output values represent indices. + * + * If no box survives, that is, all the scores are below the "scoreThreshold", + * then for that batch, number of boxes (value of the fourth output) will be 1. The first 3 outputs will + * correspond to the box with the highest score. This is to avoid generating an "empty" output. + * + * The four values that describe the box dimensions are (in order): + * + * - x (center location of the box along the horizontal axis) + * - y (center location of the box along the vertical axis) + * - width (size of box along the horizontal axis) + * - height (size of box on along the vertical axis) + * + * In each batch, + * the N scores for N boxes, used for suppression, are generated by taking the max of the matrix (N,C) + * along the columns. + * If "perClassSuppression" flag is false, suppression happens across all classes. + * If "perClassSuppression" flag is true, each box is assigned to the class with the highest + * score and then the suppression happens separately for boxes within the same class. + * + * Note that the 4th output can be used to dynamically slice the first 3 outputs, in case + * the padded outputs are not required. + * + */ +message NonMaximumSuppressionLayerParams { + /* + * The intersection over union (IoU) threshold over which boxes are suppressed. + */ + float iouThreshold = 1; + + /* + * Before IoU suppression is performed, boxes with class scores below this threshold are rejected. + */ + float scoreThreshold = 2; + + /* + * The maximum number of boxes to be given out as output. + * If the number of surviving boxes are less, output is padded up to this number. + */ + uint64 maxBoxes = 3; + + /* + * If true, suppression is performed independently within boxes of each class. 
+ */ + bool perClassSuppression = 4; +} + +/* + * A layer that performs element-wise clamped ReLU operation. + * + * Requires 1 input and produces 1 output. + * + * This function has the following formula: + * + * .. math:: + * f(x) = \begin{cases} + * \text{min}(\text{beta},x) \;\; \text{if} \;\; x \geq 0\\ + * \text{min}(\text{beta} ,\text{alpha}\cdot x) \;\; \text{if} \;\; x<0 + * \end{cases} + * + * Output shape is same as the input. + * + * Available (iOS >= 14, macOS >= 11.0, watchOS >= 7) + */ +message ClampedReLULayerParams { + + float alpha = 1; + float beta = 2; + +} + +/* +* A layer that returns the indices that would sort the input tensor, along a specified axis. +* +* Requires 1 input and produces 1 output. +* +* Output has the same rank and shape as the input. +* +* Value of "axis" must be positive and less than the rank of the input. +* +* e.g.: +* +* input shape = (5,) +* axis = 0 +* input values = [3.1, 5.4, 32.9, 3.2, 77.0] +* output shape = (5,) +* output values = [0, 3, 1, 2, 4], descending = False +* output values = [4, 2, 1, 3, 0], descending = True +* +* input shape = (2,3) +* axis = 1 +* input values = [[3, 5, 32], [3, 77, 6]] +* output shape = (2,3) +* output values = [[0, 1, 2], [0, 2, 1]], descending = False +* output values = [[2, 1, 0], [1, 2, 0]], descending = True +* +*/ +message ArgSortLayerParams { + + int64 axis = 1; // must be between [0, input_rank - 1] + bool descending = 2; + +} + +/* + * A layer that does slice operation by providing size to be extracted + * from the given input tensor. + * + * Requires 2 inputs and produces 1 output. + * Rank of the output is same as the rank of the first input. + * + * The 1st input represents the tensor to be sliced. + * The 2nd input represents the beginning index to be sliced from. 
+ * + * Example: + * Input 1: x (x.shape = (2, 3, 4)) + * Input 2: begin + * size: 2 + * axis: 1 + * + * Output: x[:, begin:begin+2, :] + * + */ +message SliceBySizeLayerParams { + + int64 size = 2; + int64 axis = 3; + +} + + +// Neural Network Specializations +// ------------------------------ + +/* + * A neural network specialized as a classifier. + */ +message NeuralNetworkClassifier { + + repeated NeuralNetworkLayer layers = 1; + repeated NeuralNetworkPreprocessing preprocessing = 2; + + // use this enum value to determine the input tensor shapes to the neural network, for multiarray inputs + NeuralNetworkMultiArrayShapeMapping arrayInputShapeMapping = 5; + + // use this enum value to determine the input tensor shapes to the neural network, for image inputs + NeuralNetworkImageShapeMapping imageInputShapeMapping = 6; + + NetworkUpdateParameters updateParams = 10; + + // The set of labels for every possible class. + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } + + // The name of the output blob containing the probability of each class. + // In other words, the score vector. Must be a 1-D tensor with the same + // number and order of elements as ClassLabels. + string labelProbabilityLayerName = 200; +} + + +/* + * A layer that computes the one hot representation of the input. + * + * Requires 1 or 2 inputs and produces 1 output. + * Rank of the output is one more than the first input. + * If the second input is present, it is used to determine the value of "oneHotVectorSize" and the parameter "oneHotVectorSize" is ignored. + * + * Input values correspond to indices and should typically be in the range [0,"oneHotVectorSize" -1]. If it is outside this range, a vector of all "offValue" will be chosen. + * + * Typically one hot vectors contain 0s everywhere, except 1 at the index that the input corresponds to. + * However, instead of 0, any float value could be generated by using the "offValue" parameter. 
+ * Similarly, instead of 1, any other value can be used by employing the "onValue" parameter. + * + * e.g.: + * input shape: (10,), "oneHotVectorSize" : 32, axis=-1, then output shape will be (10,32) + * input shape: (10,23), "oneHotVectorSize" : 32, axis=1, then output shape will be (10,32,23) + * input shape: (10,), "oneHotVectorSize" : 32, axis=0, then output shape will be (32,10) + * + * input shape: (2,), "oneHotVectorSize" : 4, axis=-1, then output shape will be (2,4) + * say input values = [2, 0], and "onValue" = 5, and "offValue" = -1, then output will be: + * [-1, -1, 5, -1 + * 5, -1, -1, -1] + * + * say input values = [2, -1], and "onValue" = 5, and "offValue" = -1, then output will be: + * [-1, -1, 5, -1 + * -1, -1, -1, -1] + * + * Available (iOS >= 14, macOS >= 11.0, watchOS >= 7) + */ + +message OneHotLayerParams { + + uint64 oneHotVectorSize = 1; // size of the one hot vector + int64 axis = 2; // negative indexing is supported. It refers to the axis in the output tensor. + float onValue = 3; + float offValue = 4; +} + + +/* + * A layer that computes the cumsum values of the input along a given axis. + * + * Requires 1 or 2 inputs and produces 1 output. + * + * Output shape and rank is same as the first input. + * If the second input is present, it is used to determine the value of "axis" and the parameter "axis" is ignored. 
+ * + * e.g.: + * Input shape = (3,), values it has: [4, 6, 7] + * + * Then output values will be: + * + * if "excludeFinalSum" = False and "reverse" = False: + * output values : [4, 10, 17] + * + * if "excludeFinalSum" = True and "reverse" = False: + * output values : [0, 4, 10] + * + * if "excludeFinalSum" = False and "reverse" = True: + * output values : [17, 13, 7] + * + * if "excludeFinalSum" = True and "reverse" = True: + * output values : [13, 7, 0] + * + * + * Available (iOS >= 14, macOS >= 11.0, watchOS >= 7) + */ + + +message CumSumLayerParams { + + int64 axis = 1; // negative indexing is supported + + // if true, the first element of the output is 0, and the last element contains the sum of the input up to the penultimate value + // if false, the first element of the output is same as the input and the last element is the sum of all the input values + // (this behavior is reversed when "reverse" flag is True) + bool excludeFinalSum = 2; + + bool reverse = 3; // if true, cumsum is performed in the opposite direction +} + + +/* + * A neural network specialized as a regressor. 
+ */ +message NeuralNetworkRegressor { + + repeated NeuralNetworkLayer layers = 1; + repeated NeuralNetworkPreprocessing preprocessing = 2; + + // use this enum value to determine the input tensor shapes to the neural network, for multiarray inputs + NeuralNetworkMultiArrayShapeMapping arrayInputShapeMapping = 5; + + // use this enum value to determine the input tensor shapes to the neural network, for image inputs + NeuralNetworkImageShapeMapping imageInputShapeMapping = 6; + + NetworkUpdateParameters updateParams = 10; + +} + +// --------------------------------------------------------- +// On-device Training related messages +// --------------------------------------------------------- + +/* + * Details on how the network will be updated + */ +message NetworkUpdateParameters { + + repeated LossLayer lossLayers = 1; + Optimizer optimizer = 2; + Int64Parameter epochs = 3; + + /* + * Describes whether to shuffle the batch of data between epochs. + */ + BoolParameter shuffle = 10; + + /* + * The seed to be used in an associated random number generator. + */ + Int64Parameter seed = 20; +} + +/* + * Loss layer - categorical cross entropy and mean squared error are the only supported loss functions currently + */ +message LossLayer { + + string name = 1; + oneof LossLayerType { + + CategoricalCrossEntropyLossLayer categoricalCrossEntropyLossLayer = 10; + MeanSquaredErrorLossLayer meanSquaredErrorLossLayer = 11; + + } + +} + +/* + * Categorical cross entropy loss layer + * Categorical cross entropy is used for single label categorization (only one category is applicable for each data point). + * + * The input is a vector of length N representing the distribution over N categories. It must be the output of a softmax. + * + * The target is a single value representing the true category or class label. If the target is the predictedFeatureName of a neural network classifier it will be inverse mapped to the corresponding categorical index for you. 
+ * + * math: + * Loss_{CCE}(input, target) = -\sum_{i=1}^{N} (target == i) log( input[i] ) = - log (input[target]) + */ +message CategoricalCrossEntropyLossLayer { + + string input = 1; + string target = 2; + +} + +/* + * Mean squared error loss layer, + * specifying input and target + */ +message MeanSquaredErrorLossLayer { + + string input = 1; + string target = 2; + +} + +/* + * Optimizer - stochastic gradient descent and adam are the only supported optimizers currently + */ +message Optimizer { + + oneof OptimizerType { + + SGDOptimizer sgdOptimizer = 10; + AdamOptimizer adamOptimizer = 11; + + } + +} + +/* + * Stochastic gradient descent optimizer, + * specifying configurable learning rate, mini batch size, and momentum + */ +message SGDOptimizer { + + DoubleParameter learningRate = 1; + Int64Parameter miniBatchSize = 2; + DoubleParameter momentum = 3; + +} + +/* + * Adam optimizer, + * specifying configurable learning rate, mini batch size, betas, and eps + */ +message AdamOptimizer { + + DoubleParameter learningRate = 1; + Int64Parameter miniBatchSize = 2; + DoubleParameter beta1 = 3; + DoubleParameter beta2 = 4; + DoubleParameter eps = 5; + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/NonMaximumSuppression.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/NonMaximumSuppression.proto new file mode 100644 index 000000000..047f74bdb --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/NonMaximumSuppression.proto @@ -0,0 +1,187 @@ +// Copyright (c) 2018, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* +* Non-maximum suppression of axis-aligned bounding boxes. 
+* +* This is used primarily for object detectors that tend to produce multiple +* boxes around a single object. This is a byproduct of the detector's +* robustness to spatial translation. If there are two or more bounding boxes +* that are very similar to one another, the algorithm should return only a +* single representative. +* +* Similarity between two bounding boxes is measured by intersection-over-union +* (IOU), the fraction between the area of intersection and area of the union. +* Here is an example where the areas can be calculated by hand by counting glyphs:: +* +* +-------+ +-------+ +* | | | | +* | +------+ +--+ | +---+ +* | | | | | | | | +* +-------+ | +--+ +----+ | +* | | | | +* +------+ +------+ +* Intersection Union +* IOU: 0.16 = 12 / 73 +* +* All IOU scores are fractions between 0.0 (fully disjoint) and 1.0 (perfect +* overlap). The standard algorithm (PickTop) is defined as follows: +* +* 1. Sort boxes by descending order of confidence +* 2. Take the top one and mark it as keep +* 3. Suppress (mark it as discard) all boxes within a fixed IOU radius of the +* keep box +* 4. Go to 2 and repeat on the subset of boxes not already kept or discarded +* 5. When all boxes are processed, output only the ones marked as keep +* +* Before the algorithm, boxes that fall below the confidence threshold are +* discarded. +*/ +message NonMaximumSuppression { + // Suppression methods: + /* + * Pick the bounding box of the top confidence, suppress all within a radius. + */ + message PickTop { + /* + * Suppression is only done among predictions with the same label + * (argmax of the confidence). + */ + bool perClass = 1; + } + + /* + * Choose which underlying suppression method to use + */ + oneof SuppressionMethod { + PickTop pickTop = 1; + } + + /* + * Optional class label mapping. + */ + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } + + /* + * This defines the radius of suppression. 
A box is considered to be within + * the radius of another box if their IOU score is less than this value. + */ + double iouThreshold = 110; + + /* + * Remove bounding boxes below this threshold. The algorithm run-time is + * proportional to the square of the number of incoming bounding boxes + * (O(N^2)). This threshold is a way to reduce N to make the algorithm + * faster. The confidence threshold can be any non-negative value. Negative + * confidences are not allowed, since if the output shape is specified to be + * larger than boxes after suppression, the unused boxes are filled with + * zero confidence. If the prediction is handled by Core Vision, it is also + * important that confidences are defined with the following semantics: + * + * 1. Confidences should be between 0 and 1 + * 2. The sum of the confidences for a prediction should not exceed 1, but is + * allowed to be less than 1 + * 3. The sum of the confidences will be interpreted as the confidence of + * any object (e.g. if the confidences for two classes are 0.2 and 0.4, + it means there is a 60% (0.2 + 0.4) confidence that an object is + present) + */ + double confidenceThreshold = 111; + + /* + * Set the name of the confidence input. + * + * The input should be a multi-array of type double and shape N x C. N is + * the number of boxes and C the number of classes. Each row describes the + * confidences of each object category being present at that particular + * location. Confidences should be nonnegative, where 0.0 means the highest + * certainty the object is not present. + * + * Specifying shape is optional. + */ + string confidenceInputFeatureName = 200; + + /* + * Set the name of the coordinates input. + * + * The input should be a multi-array of type double and shape N x 4. The + * rows correspond to the rows of the confidence matrix. 
The four values + * describe (in order): + * + * - x (center location of the box along the horizontal axis) + * - y (center location of the box along the vertical axis) + * - width (size of box along the horizontal axis) + * - height (size of box on along the vertical axis) + * + * Specifying shape is optional. + */ + string coordinatesInputFeatureName = 201; + + /* + * The iouThreshold can be optionally overridden by specifying this string + * and providing a corresponding input of type double. This allows changing + * the value of the parameter during run-time. + * + * The input should be a scalar double between 0.0 and 1.0. Setting it to 1.0 + * means there will be no suppression based on IOU. + */ + string iouThresholdInputFeatureName = 202; + + /* + * The confidenceThreshold can be optionally overridden by specifying this + * string and providing a corresponding input. This allows changing the + * value of the parameter during run-time, which can aid setting it just + * right for a particular use case. + * + * The input should be a scalar double with nonnegative value. + */ + string confidenceThresholdInputFeatureName = 203; + + /* + * Set the name of the confidence output. The output will be the same type + * and shape as the corresponding input. The only difference is that the + * number of rows may have been reduced. + * + * Specifying shape is optional. One reason to specify shape is to limit + * the number of output boxes. This can be done is several ways: + * + * Fixed shape: + * The output can be pinned to a fixed set of boxes. If this number is larger + * than the number of boxes that would have been returned, the output is padded + * with zeros for both confidence and coordinates. Specifying a fixed shape + * can be done by setting either shape (deprecated) or allowedShapes set to + * fixedsize. + * + * Min/max: + * It is also possible to set both a minimum and a maximum. The same zero-padding + * as for fixed shape is applied when necessary. 
Setting min/max is done by defining + * two allowedShapes, where the first dimension uses a rangeofsizes defining lowerbound + * and upperbound. + */ + string confidenceOutputFeatureName = 210; + + /* + * Set the name of the coordinates output. The output will be the same type + * and shape as the corresponding input. The only difference is that the + * number of rows may have been reduced. + * + * Specifying shape is optional. See confidence output for a more detailed + * description. Note that to achieve either fixed shape output or a + * constraint range of boxes, only one of confidence or coordinates need to + * set a shape. Both shapes are allowed to be defined, but in such case they + * have to be consistent along dimension 0. + */ + string coordinatesOutputFeatureName = 211; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Normalizer.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Normalizer.proto new file mode 100644 index 000000000..0967bbf0a --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Normalizer.proto @@ -0,0 +1,38 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A normalization preprocessor. + */ +message Normalizer { + /* + * There are three normalization modes, + * which have the corresponding formulas: + * + * Max + * .. math:: + * max(x_i) + * + * L1 + * .. math:: + * z = ||x||_1 = \sum_{i=1}^{n} |x_i| + * + * L2 + * .. 
math:: + * z = ||x||_2 = \sqrt{\sum_{i=1}^{n} x_i^2} + */ + enum NormType { + LMax = 0; + L1 = 1; + L2 = 2; + } + + NormType normType = 1; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/OneHotEncoder.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/OneHotEncoder.proto new file mode 100644 index 000000000..417639908 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/OneHotEncoder.proto @@ -0,0 +1,41 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * Transforms a categorical feature into an array. The array will be all + * zeros expect a single entry of one. + * + * Each categorical value will map to an index, this mapping is given by + * either the ``stringCategories`` parameter or the ``int64Categories`` + * parameter. + */ +message OneHotEncoder { + enum HandleUnknown { + ErrorOnUnknown = 0; + IgnoreUnknown = 1; // Output will be all zeros for unknown values. + } + + /* + * Mapping to be used for the encoding. The position of the category in + * the below vector determines where the single one entry will be in the + * output. + */ + oneof CategoryType { + StringVector stringCategories = 1; + Int64Vector int64Categories = 2; + } + + // Output can be a dictionary with only one entry, instead of an array. + bool outputSparse = 10; + + HandleUnknown handleUnknown = 11; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Parameters.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Parameters.proto new file mode 100644 index 000000000..044b2a95a --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Parameters.proto @@ -0,0 +1,52 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * Int64 parameter, + * consisting of a default int64 value, and allowed range or set of values + * value is unbounded if AllowedValues is not set. + */ +message Int64Parameter { + int64 defaultValue = 1; + oneof AllowedValues { + Int64Range range = 10; + Int64Set set = 11; + } +} + +/* + * Double parameter, + * consisting of a default double value, and allowed range of values + * value is unbounded if AllowedValues is not set. + */ +message DoubleParameter { + double defaultValue = 1; + oneof AllowedValues { + DoubleRange range = 10; + } +} + +/* + * String parameter, + * A default string value must be provided + */ +message StringParameter { + string defaultValue = 1; +} + +/* + * String parameter, + * A default bool value must be provided + */ +message BoolParameter { + bool defaultValue = 1; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/SVM.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/SVM.proto new file mode 100644 index 000000000..d900e9aca --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/SVM.proto @@ -0,0 +1,195 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +// Kernel Definitions +// ------------------ + +/* + * A linear kernel. + * + * This function has the following formula: + * + * .. 
math:: + * K(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{x}^T \boldsymbol{x'} + */ +message LinearKernel { +} + +/* + * A Gaussian radial basis function (RBF) kernel. + * + * This function has the following formula: + * + * .. math:: + * K(\boldsymbol{x}, \boldsymbol{x'}) = \ + * \exp(-\gamma || \boldsymbol{x} - \boldsymbol{x'} ||^2 ) + * + */ +message RBFKernel { + double gamma = 1; +} + +/* + * A polynomial kernel. + * + * This function has the following formula: + * + * .. math:: + * K(\boldsymbol{x}, \boldsymbol{x'}) = \ + * (\gamma \boldsymbol{x}^T \boldsymbol{x'} + c)^{degree} + */ +message PolyKernel { + int32 degree = 1; + double c = 2; + double gamma = 3; +} + +/* + * A sigmoid kernel. + * + * This function has the following formula: + * + * .. math:: + * K(\boldsymbol{x}, \boldsymbol{x'}) = \ + * \tanh(\gamma \boldsymbol{x}^T \boldsymbol{x'} + c) + */ +message SigmoidKernel { + double gamma = 1; + double c = 2; +} + +/* + * A kernel. + */ +message Kernel { + oneof kernel { + LinearKernel linearKernel = 1; + RBFKernel rbfKernel = 2; + PolyKernel polyKernel = 3; + SigmoidKernel sigmoidKernel = 4; + } +} + + +// Support Vector Definitions +// -------------------------- + +/* + * A sparse node. + */ +message SparseNode { + int32 index = 1; // 1-based indexes, like libsvm + double value = 2; +} + +/* + * A sparse vector. + */ +message SparseVector { + repeated SparseNode nodes = 1; +} + +/* + * One or more sparse support vectors. + */ +message SparseSupportVectors { + repeated SparseVector vectors = 1; +} + +/* + * A dense vector. + */ +message DenseVector { + repeated double values = 1; +} + +/* + * One or more dense support vectors. + */ +message DenseSupportVectors { + repeated DenseVector vectors = 1; +} + +/* + * One or more coefficients. + */ +message Coefficients { + repeated double alpha = 1; +} + +/* + * A support vector regressor. 
+ */ +message SupportVectorRegressor { + Kernel kernel = 1; + + // Support vectors, either sparse or dense format + oneof supportVectors { + SparseSupportVectors sparseSupportVectors = 2; + DenseSupportVectors denseSupportVectors = 3; + } + + // Coefficients, one for each support vector + Coefficients coefficients = 4; + + double rho = 5; +} + +/* + * A support vector classifier + */ +message SupportVectorClassifier { + Kernel kernel = 1; + + /* + * The number of support vectors for each class. + */ + repeated int32 numberOfSupportVectorsPerClass = 2; + + /* + * The support vectors, in either sparse or dense format. + */ + oneof supportVectors { + SparseSupportVectors sparseSupportVectors = 3; + DenseSupportVectors denseSupportVectors = 4; + } + + /* + * The coefficients, essentially a two dimensional array of + * size: (numberOfClasses-1) by (total number of support vectors) + */ + repeated Coefficients coefficients = 5; + + /* + * Constants for decision function, + * with K*(K-1) / 2 elements, + * where K is the number of classes. + */ + repeated double rho = 6; + + /* + * Pairwise probability information for A vs B classifier. + * Total of K*(K-1)/2 elements where K is the number of classes. + * These fields are optional, + * and only required if you want probabilities or multi class predictions. + */ + repeated double probA = 7; + repeated double probB = 8; + + /* + * Class label mapping. + */ + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/Scaler.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/Scaler.proto new file mode 100644 index 000000000..2b389d29a --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/Scaler.proto @@ -0,0 +1,34 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification; + +/* + * A scaling operation. + * + * This function has the following formula: + * + * .. math:: + * f(x) = scaleValue \cdot (x + shiftValue) + * + * If the ``scaleValue`` is not given, the default value 1 is used. + * If the ``shiftValue`` is not given, the default value 0 is used. + * + * If ``scaleValue`` and ``shiftValue`` are each a single value + * and the input is an array, then the scale and shift are applied + * to each element of the array. + * + * If the input is an integer, then it is converted to a double to + * perform the scaling operation. If the output type is an integer, + * then it is cast to an integer. If that cast is lossy, then an + * error is generated. + */ +message Scaler { + repeated double shiftValue = 1; + repeated double scaleValue = 2; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/SoundAnalysisPreprocessing.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/SoundAnalysisPreprocessing.proto new file mode 100644 index 000000000..b08957e97 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/SoundAnalysisPreprocessing.proto @@ -0,0 +1,60 @@ +// Copyright (c) 2019, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which takes audio signal samples as input and outputs an array of + * preprocessed samples according to the specified preprocessing types + */ +message SoundAnalysisPreprocessing { + + // Specific preprocessing types for sound analysis + + /* Vggish preprocesses input audio samples and makes them ready to + be fed to Vggish feature extractor. + c.f. https://arxiv.org/pdf/1609.09430.pdf + + The preprocessing takes input a single channel (monophonic) audio samples + 975 milliseconds long, sampled at 16KHz, i.e., 15600 samples 1D multiarray + and produces preprocessed samples in multiarray of shape [1, 96, 64] + + (1) Splits the input audio samples into overlapping frames, where each + frame is 25 milliseconds long and hops forward by 10 milliseconds. + Any partial frames at the end are dropped. + + (2) Hann window: apply a periodic Hann with a window_length of + 25 milliseconds, which translates to 400 samples in 16KHz sampling rate + + w(n) = 0.5 - 0.5 * cos(2*pi*n/window_length_sample), + where 0 <= n <= window_lenth_samples - 1 and window_lenth_samples = 400 + + Then, the Hann window is applied to each frame as below + + windowed_frame(n) = frame(n) * w(n) + where 0 <= n <= window_lenth_samples - 1 and window_lenth_samples = 400 + + (3) Power spectrum: calculate short-time Fourier transfor magnitude, with + an FFT length of 512 + + (4) Log Mel filter bank: calculates a log magnitude mel-frequency + spectrogram minimum frequency of 125Hz and maximum frequency of 7500Hz, + number of mel bins is 64, log_offset is 0.01, number of spectrum bins + is 64. 
+ */ + + message Vggish { + // no specific parameter + } + + // Vision feature print type + oneof SoundAnalysisPreprocessingType { + Vggish vggish = 20; + } + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/TextClassifier.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/TextClassifier.proto new file mode 100644 index 000000000..d31113fda --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/TextClassifier.proto @@ -0,0 +1,43 @@ +// Copyright (c) 2018, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which takes a single input string and outputs a + * label for the input. + */ +message TextClassifier { + + /* + * Stores the resivion number for the model, revision 1 is available on + * iOS, tvOS 12.0+, macoOS 10.14+ + */ + uint32 revision = 1; + + /* + * Stores the language of the model, as specified in BCP-47 format, + * e.g. "en-US". See https://tools.ietf.org/html/bcp47 + */ + string language = 10; + + /* + * Stores the byte representation of learned model parameters + */ + bytes modelParameterData = 100; + + /* + * Stores the set of output class labels + */ + oneof ClassLabels { + StringVector stringClassLabels = 200; + } + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/TreeEnsemble.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/TreeEnsemble.proto new file mode 100644 index 000000000..6428dc730 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/TreeEnsemble.proto @@ -0,0 +1,161 @@ +// Copyright (c) 2017, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +/* + * Each tree is a collection of nodes, + * each of which is identified by a unique identifier. + * + * Each node is either a branch or a leaf node. + * A branch node evaluates a value according to a behavior; + * if true, the node identified by ``true_child_node_id`` is evaluated next, + * if false, the node identified by ``false_child_node_id`` is evaluated next. + * A leaf node adds the evaluation value to the base prediction value + * to get the final prediction. + * + * A tree must have exactly one root node, + * which has no parent node. + * A tree must not terminate on a branch node. + * All leaf nodes must be accessible + * by evaluating one or more branch nodes in sequence, + * starting from the root node. + */ + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification; + +/* + * A tree ensemble post-evaluation transform. + */ +enum TreeEnsemblePostEvaluationTransform { + NoTransform = 0; + Classification_SoftMax = 1; + Regression_Logistic = 2; + Classification_SoftMaxWithZeroClassReference = 3; +} + +/* + * Tree ensemble parameters. + */ +message TreeEnsembleParameters { + message TreeNode { + uint64 treeId = 1; + uint64 nodeId = 2; + + enum TreeNodeBehavior { + BranchOnValueLessThanEqual = 0; + BranchOnValueLessThan = 1; + BranchOnValueGreaterThanEqual = 2; + BranchOnValueGreaterThan = 3; + BranchOnValueEqual = 4; + BranchOnValueNotEqual = 5; + LeafNode = 6; + } + + /* + * The branch mode parameters. + * + * If branch is false, + * then the parameters in this section must be filled in + * to determine how the branching functions. + */ + TreeNodeBehavior nodeBehavior = 3; + + /* + * If the node behavior mode is a branch mode, + * then these values must be filled in. 
+ */ + uint64 branchFeatureIndex = 10; + double branchFeatureValue = 11; + uint64 trueChildNodeId = 12; + uint64 falseChildNodeId = 13; + bool missingValueTracksTrueChild = 14; + + /* + * The leaf mode. + * + * If ``nodeBahavior`` == ``LeafNode``, + * then the evaluationValue is added to the base prediction value + * in order to get the final prediction. + * To support multiclass classification + * as well as regression and binary classification, + * the evaluation value is encoded here as a sparse vector, + * with evaluationIndex being the index of the base vector + * that evaluation value is added to. + * In the single class case, + * it is expected that evaluationIndex is exactly 0. + */ + message EvaluationInfo { + uint64 evaluationIndex = 1; + double evaluationValue = 2; + } + + repeated EvaluationInfo evaluationInfo = 20; + + /* + * The relative hit rate of a node for optimization purposes. + * + * This value has no effect on the accuracy of the result; + * it allows the tree to optimize for frequent branches. + * The value is relative, + * compared to the hit rates of other branch nodes. + * + * You typically use a proportion of training samples + * that reached this node + * or some similar metric to derive this value. + */ + double relativeHitRate = 30; + } + + repeated TreeNode nodes = 1; + + /* + * The number of prediction dimensions or classes in the model. + * + * All instances of ``evaluationIndex`` in a leaf node + * must be less than this value, + * and the number of values in the ``basePredictionValue`` field + * must be equal to this value. + * + * For regression, + * this is the dimension of the prediction. + * For classification, + * this is the number of classes. + */ + uint64 numPredictionDimensions = 2; + + /* + * The base prediction value. + * + * The number of values in this must match + * the default values of the tree model. + */ + repeated double basePredictionValue = 3; +} + +/* + * A tree ensemble classifier. 
+ */ +message TreeEnsembleClassifier { + TreeEnsembleParameters treeEnsemble = 1; + TreeEnsemblePostEvaluationTransform postEvaluationTransform = 2; + + // Required class label mapping + oneof ClassLabels { + StringVector stringClassLabels = 100; + Int64Vector int64ClassLabels = 101; + } +} + +/* + * A tree ensemble regressor. + */ +message TreeEnsembleRegressor { + TreeEnsembleParameters treeEnsemble = 1; + TreeEnsemblePostEvaluationTransform postEvaluationTransform = 2; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/VisionFeaturePrint.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/VisionFeaturePrint.proto new file mode 100644 index 000000000..a87fdd40f --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/VisionFeaturePrint.proto @@ -0,0 +1,67 @@ +// Copyright (c) 2018, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which takes an input image and outputs array(s) of features + * according to the specified feature types + */ +message VisionFeaturePrint { + + // Specific vision feature print types + + // Scene extracts features useful for identifying contents of natural images + // in both indoor and outdoor environments + message Scene { + enum SceneVersion { + SCENE_VERSION_INVALID = 0; + // VERSION_1 is available on iOS,tvOS 12.0+, macOS 10.14+ + // It uses a 299x299 input image and yields a 2048 float feature vector + SCENE_VERSION_1 = 1; + + // VERSION_2 is available on iOS,tvOS 17.0+, macOS 14.0+ + // It uses a 360x360 input image and yields a 768 float feature vector + SCENE_VERSION_2 = 2; + } + + SceneVersion version = 1; + } + + // Objects extracts features useful for identifying and localizing + // objects in natural images + message Objects { + enum 
ObjectsVersion { + OBJECTS_VERSION_INVALID = 0; + // VERSION_1 is available on iOS,tvOS 14.0+, macOS 11.0+ + // It uses a 299x299 input image and yields two multiarray + // features: one at high resolution of shape (288, 35, 35) + // the other at low resolution of shape (768, 17, 17) + OBJECTS_VERSION_1 = 1; + } + + ObjectsVersion version = 1; + + /* + * Stores the names of the output features according to the + * order of them being computed from the neural network, i.e., + * the first element in the output is the earliest being + * computed, while the last is the latest being computed. In + * general, the order reflects the resolution of the feature. + * The earlier it is computed, the higher the feature resolution. + */ + repeated string output = 100; + } + + // Vision feature print type + oneof VisionFeaturePrintType { + Scene scene = 20; + Objects objects = 21; + } + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/WordEmbedding.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/WordEmbedding.proto new file mode 100644 index 000000000..349a068d9 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/WordEmbedding.proto @@ -0,0 +1,35 @@ +// Copyright (c) 2019, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which maps a set of strings into a finite-dimensional real vector space. + */ +message WordEmbedding { + + /* + * Stores the revision number for the model, revision 2 is available on + * iOS, tvOS 13.0+, macOS 10.15+ + */ + uint32 revision = 1; + + /* + * Stores the language of the model, as specified in BCP-47 format, + * e.g. "en-US". 
See https://tools.ietf.org/html/bcp47 + */ + string language = 10; + + /* + * Stores efficient representation of emebedding as encoded by the Natural Language Framework + */ + bytes modelParameterData = 100; + +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/format/WordTagger.proto b/cpp/external/katagocoreml/vendor/mlmodel/format/WordTagger.proto new file mode 100644 index 000000000..c8452a4ec --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/format/WordTagger.proto @@ -0,0 +1,75 @@ +// Copyright (c) 2018, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause + +syntax = "proto3"; +option optimize_for = LITE_RUNTIME; + +import public "DataStructures.proto"; + +package CoreML.Specification.CoreMLModels; + +/* + * A model which takes a single input string and outputs a + * sequence of tokens, tags for tokens, along with their + * locations and lengths, in the original string. + */ +message WordTagger { + + /* + * Stores the resivion number for the model, revision 1 is available on + * iOS, tvOS 12.0+, macoOS 10.14+ + */ + uint32 revision = 1; + + /* + * Stores the language of the model, as specified in BCP-47 format, + * e.g. "en-US". See https://tools.ietf.org/html/bcp47 + */ + string language = 10; + + /* + * Stores the name of tokens output. The output will be + * a sequence of strings that contains the tokens in the + * input string + */ + string tokensOutputFeatureName = 20; + + /* + * Stores the name of token tags output. The output will be + * a sequence of strings that contains the tags for each + * token in the input string + */ + string tokenTagsOutputFeatureName = 21; + + /* + * Stores the name of token locations output. 
The output will be + * a sequence of integers that contains the locations (indices) + * for each token in the input string, location starts from 0 + */ + string tokenLocationsOutputFeatureName = 22; + + /* + * Stores the name of token lengths output. The output will be + * a sequence of integers that contains the lengths for each + * token in the input string + */ + string tokenLengthsOutputFeatureName = 23; + + /* + * Stores the byte representation of learned model parameters + */ + bytes modelParameterData = 100; + + /* + * Stores the set of output tags + */ + oneof Tags { + StringVector stringTags = 200; + } + + + +} + diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Bf16.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Bf16.hpp new file mode 100644 index 000000000..125e59c45 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Bf16.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include +#include + +namespace MILBlob { + +/** + * Struct for holding bytes that represent a bf16 number. + * Floating point interface treats "bytes" as brain float16 floating point + * (https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) + */ +struct Bf16 { + explicit Bf16(uint16_t bs) : bytes(bs) {} + Bf16() : bytes(0) {} + + static Bf16 FromFloat(float f); + + float GetFloat() const; + void SetFloat(float f); + + // NOLINTNEXTLINE(misc-non-private-member-variables-in-classes) + uint16_t bytes; +}; + +inline bool operator==(const Bf16& first, const Bf16& second) noexcept +{ + // Note this comparison is quick and dirty - it will give incorrect results + // for (-0.0 == 0.0) and, depending on bit pattern, (NaN == NaN). 
+ return first.bytes == second.bytes; +} + +inline bool operator!=(const Bf16& first, const Bf16& second) noexcept +{ + // Note this comparison is quick and dirty - it will give incorrect results + // for (-0.0 != 0.0) and, depending on bit pattern, (NaN != NaN). + return first.bytes != second.bytes; +} + +} // namespace MILBlob + +namespace std { + +template <> +struct hash { + size_t operator()(const MILBlob::Bf16& fp) const + { + return fp.bytes; + } +}; + +} // namespace std diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/BlobDataType.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/BlobDataType.hpp new file mode 100644 index 000000000..4dee4cc06 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/BlobDataType.hpp @@ -0,0 +1,131 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Fp8.hpp" +#include "MILBlob/SubByteTypes.hpp" + +namespace MILBlob { +namespace Blob { + +enum class BlobDataType : uint32_t { + // *** WARNING *** + // For binary compatibility, values should ONLY be added at the end. + // + // this file needs to remain in sync across multiple repos. + // please be cognizant of that when making changes to the + // format. 
+ Float16 = 1, + Float32 = 2, + UInt8 = 3, + Int8 = 4, + BFloat16 = 5, + Int16 = 6, + UInt16 = 7, + Int4 = 8, + UInt1 = 9, + UInt2 = 10, + UInt4 = 11, + UInt3 = 12, + UInt6 = 13, + Int32 = 14, + UInt32 = 15, + Float8E4M3FN = 16, + Float8E5M2 = 17, +}; + +template +struct BlobDataTypeTraits; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float32; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float8E4M3FN; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Float8E5M2; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::BFloat16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt8; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int8; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt16; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int32; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt32; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::Int4; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt6; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt4; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt3; +}; + 
+template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt2; +}; + +template <> +struct BlobDataTypeTraits { + static constexpr BlobDataType DataType = BlobDataType::UInt1; +}; + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.cpp new file mode 100644 index 000000000..f30e7352f --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.cpp @@ -0,0 +1,94 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Blob/FileWriter.hpp" +#include "MILBlob/Blob/StorageFormat.hpp" + +#include +#include +#include +#include + +using namespace MILBlob; +using namespace MILBlob::Blob; +using namespace MILBlob::Util; + +namespace { +std::ios_base::openmode GetWriterMode(bool truncate) +{ + std::ios_base::openmode result = (std::ios::in | std::ios::out | std::ios::binary); + if (truncate) { + result |= std::ios::trunc; + } + return result; +} +} // anonymous namespace + +FileWriter::~FileWriter() = default; + +FileWriter::FileWriter(const std::string& filePath, bool truncateFile) +{ + m_fileStream.open(filePath, GetWriterMode(truncateFile)); + if (!m_fileStream) { + // If file does not exists, ios::in does not create one + // Let's create a file and re-open with required flags + m_fileStream.open(filePath, std::ofstream::binary | std::ios::out); + m_fileStream.close(); + m_fileStream.open(filePath, GetWriterMode(truncateFile)); + } + MILVerifyIsTrue(m_fileStream, + std::runtime_error, + "[MIL FileWriter]: Unable to open " + filePath + " file stream for writing"); +} + +uint64_t FileWriter::GetNextAlignedOffset() +{ + m_fileStream.seekg(0, std::ios::end); + uint64_t 
offset = static_cast(m_fileStream.tellg()); + if (offset % DefaultStorageAlignment == 0) { + return offset; + } + auto pad = DefaultStorageAlignment - (offset % DefaultStorageAlignment); + return offset + pad; +} + +uint64_t FileWriter::GetFileSize() +{ + m_fileStream.seekg(0, std::ios::end); + return static_cast(m_fileStream.tellg()); +} + +uint64_t FileWriter::AppendData(Span data) +{ + auto offset = GetNextAlignedOffset(); + m_fileStream.seekp(static_cast(offset), std::ios::beg); + m_fileStream.write(reinterpret_cast(data.Data()), static_cast(data.Size())); + MILVerifyIsTrue(m_fileStream.good(), + std::runtime_error, + "[MIL FileWriter]: Unknown error occurred while writing data to the file."); + return offset; +} + +void FileWriter::WriteData(Span data, uint64_t offset) +{ + MILVerifyIsTrue(offset % DefaultStorageAlignment == 0, + std::runtime_error, + "[MIL FileWriter]: Provided offset not aligned. offset=" + std::to_string(offset) + + " alignment=" + std::to_string(DefaultStorageAlignment) + "."); + m_fileStream.seekp(static_cast(offset), std::ios::beg); + m_fileStream.write(reinterpret_cast(data.Data()), static_cast(data.Size())); + MILVerifyIsTrue(m_fileStream.good(), + std::runtime_error, + "[MIL FileWriter]: Unknown error occurred while writing data to the file."); +} + +void FileWriter::ReadData(uint64_t offset, Util::Span destData) +{ + m_fileStream.seekg(static_cast(offset), std::ios::beg); + m_fileStream.read(reinterpret_cast(destData.Data()), static_cast(destData.Size())); + MILVerifyIsTrue(m_fileStream.good(), + std::runtime_error, + "[MIL FileWriter]: Unknown error occurred while reading data from the file."); +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.hpp new file mode 100644 index 000000000..2bc994033 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/FileWriter.hpp @@ -0,0 +1,63 @@ +// Copyright (c) 
2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Util/Span.hpp" + +#include +#include +#include + +namespace MILBlob { +namespace Blob { +/** + * Utility for interfacing with files + */ +class FileWriter final { +public: + FileWriter() = delete; + FileWriter(const FileWriter&) = delete; + FileWriter(FileWriter&&) = delete; + FileWriter& operator=(const FileWriter&) = delete; + FileWriter& operator=(FileWriter&&) = delete; + + FileWriter(const std::string& filePath, bool truncateFile); + ~FileWriter(); + + /** + * Appends given data to file at next aligned offset + * @throws std::runtime_error if error occurs while writing to file stream + */ + uint64_t AppendData(Util::Span data); + + /** + * Writes data to given offset + * @throws std::runtime_error if error occurs while writing to file stream or offset is not aligned + */ + void WriteData(Util::Span data, uint64_t offset); + + /** + * Returns next available aligned offset for writing + */ + uint64_t GetNextAlignedOffset(); + + /** + * Returns size in byte of file currently open + */ + uint64_t GetFileSize(); + + /** + * Reads data from current stream from given offset and writes into destData + * @throws std:runtime_error if error occurs during reading data + */ + void ReadData(uint64_t offset, Util::Span destData); + +private: + std::fstream m_fileStream; +}; + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.cpp new file mode 100644 index 000000000..e773c98e6 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.cpp @@ -0,0 +1,62 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Blob/MMapFileReader.hpp" + +#include +#include +#include + +using namespace MILBlob; +using namespace MILBlob::Blob; + +MMapFileReader::~MMapFileReader() = default; + +MMapFileReader::MMapFileReader(const std::string& filename) : m_isEncrypted(false) +{ + // verify file exists and find its length + struct stat fileInfo; + if (stat(filename.c_str(), &fileInfo) != 0) { + throw std::runtime_error("Could not open " + filename); + } + + // mmap works in size_t units to be compatible with virtual address space units + auto fileLength = static_cast(fileInfo.st_size); + + // wrap fopen/fclose in exception-safe type + std::unique_ptr f(fopen(filename.c_str(), "r"), fclose); + + MILVerifyIsTrue(f != nullptr, std::runtime_error, "Unable to read file " + filename); + + // wrap mmap/munmap in exception-safe type + std::unique_ptr> mmapPtr( + mmap(nullptr, fileLength, PROT_READ, MAP_PRIVATE, fileno(f.get()), 0 /*offset*/), + [length = fileLength](void* ptr) { munmap(ptr, length); }); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast) -- MAP_FAILED is (void*) -1. 
+ MILVerifyIsTrue(mmapPtr.get() != nullptr && mmapPtr.get() != MAP_FAILED, + std::runtime_error, + "Unable to mmap file " + filename); + + m_dataSpan = Util::Span(reinterpret_cast(mmapPtr.get()), fileLength); + + // Keep mmaping alive + m_mmap = std::move(mmapPtr); +} + +uint64_t MMapFileReader::GetLength() const +{ + return m_dataSpan.Size(); +} + +Util::Span MMapFileReader::ReadData(uint64_t offset, uint64_t length) const +{ + return m_dataSpan.Slice(static_cast(offset), static_cast(length)); +} + +bool MMapFileReader::IsEncrypted() const +{ + return m_isEncrypted; +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.hpp new file mode 100644 index 000000000..3a4f72522 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReader.hpp @@ -0,0 +1,67 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Util/Span.hpp" + +#include +#include +#include + +namespace MILBlob { +namespace Blob { + +/** + * Memory-mapped file reader. + */ +class MMapFileReader { +public: + MMapFileReader() = delete; + MMapFileReader(const MMapFileReader&) = delete; + MMapFileReader(MMapFileReader&&) = delete; + MMapFileReader& operator=(const MMapFileReader&) = delete; + MMapFileReader& operator=(MMapFileReader&&) = delete; + + /** + * Maps the file specified into virtual memory space. + * @throws std::runtime_error if the file cannot be loaded or mapping fails. + */ + MMapFileReader(const std::string& filename); + + /** Unmaps the loaded file from virtual memory space. */ + ~MMapFileReader(); + + uint64_t GetLength() const; + + /** + * Provides a read-only Span of bytes at the requested offset and length. 
+ * @throws std::range_error if offset or length are invalid. + */ + Util::Span ReadData(uint64_t offset, uint64_t length) const; + + /** + * Interprets mapped data as a C++ struct at the provided offset. + */ + template + const T& ReadStruct(uint64_t offset) const + { + auto region = ReadData(offset, sizeof(T)); + return *reinterpret_cast(region.Data()); + } + + /** Returns true if the underlying file is encrypted. */ + bool IsEncrypted() const; + +protected: + std::unique_ptr> m_mmap; + + Util::Span m_dataSpan; + + bool m_isEncrypted; +}; + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.cpp new file mode 100644 index 000000000..fe932d37f --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.cpp @@ -0,0 +1,16 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Blob/MMapFileReader.hpp" +#include "MILBlob/Blob/MMapFileReaderFactory.hpp" + +namespace MILBlob::Blob { + +std::unique_ptr MakeMMapFileReader(const std::string& filePath) +{ + return std::make_unique(filePath); +} + +} // namespace MILBlob::Blob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.hpp new file mode 100644 index 000000000..4c031ac96 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/MMapFileReaderFactory.hpp @@ -0,0 +1,19 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include + +namespace MILBlob::Blob { + +class MMapFileReader; + +/** + * MakeMMapFileReader: Returns MMapedFileReader for file present at given filePath + */ +std::unique_ptr MakeMMapFileReader(const std::string& filePath); + +} // namespace MILBlob::Blob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageFormat.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageFormat.hpp new file mode 100644 index 000000000..135669b97 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageFormat.hpp @@ -0,0 +1,92 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Blob/BlobDataType.hpp" + +namespace MILBlob { +namespace Blob { + +// +// ---: Blob Storage File Format :--- +// Default file format for CoreML (iOS15 onwards) +// +// ---: File sturcture :--- +// File is structured as below: +// 1. Storage header: `struct storage_header` +// 2. Followed by pair: `struct blob_metadata` and `raw_data` +// Each entry i.e. blob_metadata and raw data is 64 bytes aligned. +// +// Example file structure: +// ||||...|||| +// +// Example (file structure and associated mil_program usage): +// |storage_header>|,|...|,| // file structure +// | |64 ,128 | |256 ,320 | // byte offset +// +// Example usage in MIL program: +// a = const(BlobFile(file_path="weights/file.wt", offset=64)) +// b = const(BlobFile(file_path="weights/file.wt", offset=256)) +// +// Reference: https://quip-apple.com/V5zFA91jmjL3 +// + +// Default alignment being used for reading-writing Blob Storage format. 
+constexpr uint64_t DefaultStorageAlignment = 64; +// Default sentinel for validation for metadata +constexpr uint64_t BlobMetadataSentinel = 0xDEADBEEF; + +/** + * blob_metadata: stores information of blob present in weight file + * + * Before ios18, the reserved fields were uninitialized and could have any values if not specified. + * From ios18 on, the reserved fields are initialized to 0 by default. + * To extend the format, make sure to bump the version number in storage_header. + */ +struct blob_metadata { + uint32_t sentinel = BlobMetadataSentinel; // for validating correctness of metadata. + + BlobDataType mil_dtype = BlobDataType::Float16; // data type of the blob data. + uint64_t sizeInBytes = 0; // size of the blob data in bytes. + uint64_t offset = 0; // offset in file for blob data. + uint64_t padding_size_in_bits = 0; // describes the number of unused bits in this blob, + // required to calculate the actual size for spans of + // sub-btye-sized types. Unused otherwise + // Reserve fields + uint64_t reserved_1 = 0; + uint64_t reserved_2 = 0; + uint64_t reserved_3 = 0; + uint64_t reserved_4 = 0; +}; + +/** + * storage_header: Header for MIL Blob Storage format + * - stores count of number of blobs present in current weight file + * - stores version (this format currently only supports version=2) + * version=1 in file header is Espresso `blob_v1` format + */ +struct storage_header { + uint32_t count = 0; // Number of blob data. + uint32_t version = 2; // default=2 + + uint64_t reserved_0 = 0; + uint64_t reserved_1 = 0; + uint64_t reserved_2 = 0; + uint64_t reserved_3 = 0; + uint64_t reserved_4 = 0; + uint64_t reserved_5 = 0; + uint64_t reserved_6 = 0; +}; + +// storage_header and blob_metadata are 64 bytes aligned. +// This allows first metadata to be aligned by default +// and data following blob_metadata aligned by default as well. 
+static_assert(sizeof(blob_metadata) == sizeof(uint64_t) * 8, "blob_metadata must be of size 64 bytes"); +static_assert(sizeof(storage_header) == sizeof(uint64_t) * 8, "storage_header must be of size 64 bytes"); + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.cpp new file mode 100644 index 000000000..3f4147035 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.cpp @@ -0,0 +1,309 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Blob/MMapFileReader.hpp" +#include "MILBlob/Blob/MMapFileReaderFactory.hpp" +#include "MILBlob/Blob/StorageFormat.hpp" +#include "MILBlob/Blob/StorageReader.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Fp8.hpp" +#include "MILBlob/Util/SpanCast.hpp" + +#include +#include +#include + +using namespace MILBlob; +using namespace MILBlob::Blob; + +class StorageReader::Impl final { +public: + Impl(const Impl&) = delete; + Impl(Impl&&) = delete; + Impl& operator=(const Impl&) = delete; + Impl& operator=(Impl&&) = delete; + + explicit Impl(std::string filename) : m_filePath(std::move(filename)) {} + ~Impl() = default; + + const std::string& GetFilename() const + { + return m_filePath; + } + + blob_metadata GetMetadata(uint64_t offset) const + { + EnsureLoaded(); + + blob_metadata metadata = m_reader->ReadStruct(offset); + + // validate sentinel + MILVerifyIsTrue(metadata.sentinel == BlobMetadataSentinel, + std::runtime_error, + "Invalid sentinel in blob_metadata."); + return metadata; + } + + Util::Span GetRawDataView(uint64_t offset) const + { + auto metadata = GetMetadata(offset); + + return 
m_reader->ReadData(metadata.offset, metadata.sizeInBytes); + } + + template + Util::Span GetDataViewForByteAligned(uint64_t offset) const + { + auto metadata = GetAndCheckMetadata(offset, BlobDataTypeTraits::DataType); + + return Util::SpanCast(m_reader->ReadData(metadata.offset, metadata.sizeInBytes)); + } + + template + Util::Span GetDataViewForSubByteSized(uint64_t offset) const + { + auto metadata = GetAndCheckMetadata(offset, BlobDataTypeTraits::DataType); + + Util::Span rawSpan = m_reader->ReadData(metadata.offset, metadata.sizeInBytes); + + MILVerifyIsTrue(metadata.padding_size_in_bits < 8, + std::runtime_error, + "8 or more bits of padding for sub-byte sized data is incorrect"); + + if constexpr (MILBlob::SubByteIsByteAligned()) { + MILVerifyIsTrue(metadata.padding_size_in_bits % T::SizeInBits == 0, + std::runtime_error, + "Invalid padding for byte-aligned sub-byte-sized type"); + } + + // metadata.sizeInBytes includes the padding to make the data byte aligned + + size_t numBits = metadata.sizeInBytes * 8; + numBits -= metadata.padding_size_in_bits; + MILVerifyIsTrue(numBits % T::SizeInBits == 0, std::runtime_error, "Invalid padding for blob"); + size_t numElements = numBits / T::SizeInBits; + + return Util::CastToBitSpan(rawSpan, numElements); + } + + template + Util::Span GetDataView(uint64_t offset) const + { + if constexpr (MILBlob::IsSubByteSized::value) { + return this->GetDataViewForSubByteSized(offset); + } else { + return this->GetDataViewForByteAligned(offset); + } + } + + uint64_t GetDataOffset(uint64_t offset) const + { + auto metadata = GetMetadata(offset); + return metadata.offset; + } + + uint64_t GetDataPaddingInBits(uint64_t offset) const + { + auto metadata = GetMetadata(offset); + return metadata.padding_size_in_bits; + } + + uint64_t GetDataSize(uint64_t metadataOffset) const + { + auto metadata = GetMetadata(metadataOffset); + return metadata.sizeInBytes; + } + + bool IsEncrypted() const + { + EnsureLoaded(); + return 
m_reader->IsEncrypted(); + } + + BlobDataType GetDataType(uint64_t metadataOffset) const + { + auto metadata = GetMetadata(metadataOffset); + return metadata.mil_dtype; + } + + std::vector GetAllOffsets() const + { + EnsureLoaded(); + + const auto& header = m_reader->ReadStruct(0); + auto numBlobs = header.count; + + std::vector allOffsets; + allOffsets.reserve(numBlobs); + // The first metadata offset lies just after the file header. + uint64_t currMetadataOffset = sizeof(storage_header); + for (uint32_t i = 0; i < numBlobs; ++i) { + allOffsets.push_back(currMetadataOffset); + auto metadata = GetMetadata(currMetadataOffset); + // Update offset for next iteration to aligned value. + currMetadataOffset = metadata.offset + metadata.sizeInBytes; + if (currMetadataOffset % DefaultStorageAlignment != 0) { + currMetadataOffset += DefaultStorageAlignment - currMetadataOffset % DefaultStorageAlignment; + } + } + return allOffsets; + } + +private: + void EnsureLoaded() const + { + auto load = [this]() { + auto reader = MakeMMapFileReader(m_filePath); + const auto& header = reader->ReadStruct(0); + MILVerifyIsTrue(header.version == 2, std::runtime_error, "Storage Reader expects file format version 2."); + + // once we're good with the structure of the file, then set class state + m_reader = std::move(reader); + }; + + std::call_once(m_loadedFlag, [&load]() { load(); }); + } + + blob_metadata GetAndCheckMetadata(uint64_t offset, MILBlob::Blob::BlobDataType blobDType) const + { + auto metadata = GetMetadata(offset); + + MILVerifyIsTrue(metadata.mil_dtype == blobDType, + std::runtime_error, + "Metadata data type does not match requested type."); + + return metadata; + } + + const std::string m_filePath; + + mutable std::once_flag m_loadedFlag; + mutable std::unique_ptr m_reader; +}; + +// -------------------------------------------------------------------------------------- + +StorageReader::~StorageReader() = default; + +StorageReader::StorageReader(std::string filename) : 
m_impl(std::make_unique(std::move(filename))) {} + +const std::string& StorageReader::GetFilename() const +{ + return m_impl->GetFilename(); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +// StorageReader::GetDataView specializations for sub byte types +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) \ + template <> \ + Util::Span StorageReader::GetDataView(uint64_t offset) const \ + { \ + return m_impl->GetDataView(offset); \ + } + +#include "MILBlob/SubByteTypeList.hpp" + +#undef DECLARE_SUB_BYTE_TYPE + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +template <> +Util::Span StorageReader::GetDataView(uint64_t offset) const +{ + return m_impl->GetDataView(offset); +} + +Util::Span StorageReader::GetRawDataView(uint64_t offset) const +{ + return m_impl->GetRawDataView(offset); +} + +uint64_t StorageReader::GetDataOffset(uint64_t metadataOffset) const +{ + return 
m_impl->GetDataOffset(metadataOffset); +} + +uint64_t StorageReader::GetDataSize(uint64_t metadataOffset) const +{ + return m_impl->GetDataSize(metadataOffset); +} + +bool StorageReader::IsEncrypted() const +{ + return m_impl->IsEncrypted(); +} + +BlobDataType StorageReader::GetDataType(uint64_t metadataOffset) const +{ + return m_impl->GetDataType(metadataOffset); +} + +std::vector StorageReader::GetAllOffsets() const +{ + return m_impl->GetAllOffsets(); +} + +uint64_t StorageReader::GetDataPaddingInBits(uint64_t metadataOffset) const +{ + return m_impl->GetDataPaddingInBits(metadataOffset); +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.hpp new file mode 100644 index 000000000..bc8c7b687 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageReader.hpp @@ -0,0 +1,137 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Blob/BlobDataType.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Fp8.hpp" +#include "MILBlob/SubByteTypes.hpp" +#include "MILBlob/Util/Span.hpp" +#include +#include +#include + +namespace MILBlob { +namespace Blob { + +/** + * StorageReader encapsulates memory-mapped reading of the Storage Blob Format. + * + * Memory-mapping is performed lazily on first access to the underlying data. 
+ * + * This file format supports the following types: + * - uint1,2,4 + * - int4 + * - uint8_t + * - Bf16 + * - Fp16 + * - float + * - int16_t + * - uint16_t + * - int32_t + * - uint32_t + */ +class StorageReader final { +public: + StorageReader() = delete; + StorageReader(const StorageReader&) = delete; + StorageReader(StorageReader&&) = delete; + StorageReader& operator=(const StorageReader&) = delete; + StorageReader& operator=(StorageReader&&) = delete; + + StorageReader(std::string filename); + ~StorageReader(); + + const std::string& GetFilename() const; + + /** + * Returns a Span view into the underlying memory-mapped storage. The + * file will be mapped into memory on first access. This is valid for the + * supported types noted above. + * NOTE: `offset` should be the metadata offset. + * @throws std::range_error if offset is not valid. + */ + template + Util::Span GetDataView(uint64_t offset) const; + + /** + * Returns an uint8_t Span view into the underlying memory-mapped storage. The + * file will be mapped into memory on first access. This is valid for the + * supported types noted above. + * NOTE: `offset` should be the metadata offset. + * @throws std::range_error if offset is not valid. + */ + Util::Span GetRawDataView(uint64_t offset) const; + + /** + * Returns file offset of data from given metadata offset + * @throws std::range_error if metadataOffset is not valid. + */ + uint64_t GetDataOffset(uint64_t metadataOffset) const; + + /** + * Returns the size of the data blob for the given metadata offset + * @throws std::range_error if metadataOffset is not valid. + */ + uint64_t GetDataSize(uint64_t metadataOffset) const; + + /** Returns true if the underlying file is encrypted. */ + bool IsEncrypted() const; + + /** + * Returns the storage type of the data blob for the given metadata offset + * @throws std::range_error if metadataOffset is not valid. 
+ */ + BlobDataType GetDataType(uint64_t metadataOffset) const; + + /** Returns a vector containing the metadata offsets for all blobs in the file, in order. */ + std::vector GetAllOffsets() const; + + uint64_t GetDataPaddingInBits(uint64_t metadataOffset) const; + +private: + class Impl; + const std::unique_ptr m_impl; +}; + +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; +template <> +Util::Span StorageReader::GetDataView(uint64_t) const; + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.cpp new file mode 100644 index 000000000..2cc077e9c --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.cpp @@ -0,0 +1,234 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Blob/FileWriter.hpp" +#include "MILBlob/Blob/StorageFormat.hpp" +#include "MILBlob/Blob/StorageWriter.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Fp8.hpp" +#include "MILBlob/Util/Span.hpp" +#include "MILBlob/Util/SpanCast.hpp" + +#include +#include + +using namespace MILBlob; +using namespace MILBlob::Blob; + +namespace { +template +Util::Span CastAndMakeSpan(T& x) +{ + return Util::Span(reinterpret_cast(&x), sizeof(x)); +} +} // anonymous namespace + +class StorageWriter::Impl final { +public: + Impl(const Impl&) = delete; + Impl(Impl&&) = delete; + Impl& operator=(const Impl&) = delete; + Impl& operator=(Impl&&) = delete; + + ~Impl() = default; + + Impl(const std::string& filePath, bool truncateFile) + : m_filePath(filePath) + , m_fileWriter(std::make_unique(filePath, truncateFile)) + { + if (truncateFile) { + m_fileWriter->WriteData(CastAndMakeSpan(m_header), 0); + } else { + auto fileSize = m_fileWriter->GetFileSize(); + if (fileSize == 0) { + // File exists and is empty + m_fileWriter->WriteData(CastAndMakeSpan(m_header), 0); + } else if (static_cast(fileSize) >= sizeof(m_header)) { + m_fileWriter->ReadData(0, CastAndMakeSpan(m_header)); + if (m_header.version != 2) { + // File exists and header is incorrect + // File is not empty, please use truncate option + throw std::runtime_error( + "[MIL StorageWriter]: Incorrect file header, please use truncateFile=true"); + } + } else { + // File is not empty, please use truncate option + throw std::runtime_error("[MIL StorageWriter]: Incorrect file header, please use truncateFile=true"); + } + } + } + + template + uint64_t WriteData(Util::Span data); + + std::string GetFilePath() const + { + return m_filePath; + } + +private: + std::string m_filePath; + std::unique_ptr m_fileWriter; + 
storage_header m_header; +}; + +template +uint64_t SpanSizeInBytes(Util::Span data) +{ + if constexpr (MILBlob::IsSubByteSized::value) { + auto uint8Span = MILBlob::Util::CastFromBitSpan(data); + return SpanSizeInBytes(uint8Span); + } else { + return data.Size() * sizeof(T); + } +} + +template +void WritePaddingBits(blob_metadata& metadata, size_t numElements) +{ + // types aligned to byte boundaries don't need this padding + if constexpr (MILBlob::IsSubByteSized::value) { + metadata.padding_size_in_bits = 0; + std::size_t numBitsRemaining = (numElements * T::SizeInBits) % 8; + if (numBitsRemaining != 0) { + metadata.padding_size_in_bits = 8 - numBitsRemaining; + } + } +} + +template +uint64_t StorageWriter::Impl::WriteData(Util::Span data) +{ + // 1. Write data + blob_metadata metadata; + metadata.mil_dtype = BlobDataTypeTraits::type>::DataType; + metadata.sizeInBytes = SpanSizeInBytes(data); + + // populate padding_size_in_bits, if we're writing a sub-byte-sized type + WritePaddingBits>(metadata, data.Size()); + + // Get offset for data + auto metadataOffset = m_fileWriter->GetNextAlignedOffset(); + // metadata is 64 bit aligned. 
+ auto dataOffset = metadataOffset + sizeof(metadata); + MILVerifyIsTrue(dataOffset % DefaultStorageAlignment == 0, + std::runtime_error, + "[MIL StorageWriter]: dataOffset is expected to be 64 bits aligned."); + metadata.offset = dataOffset; + // We don't expect m_fileWriter to produce different offset for metadata and data + auto actualMetadataOffset = m_fileWriter->AppendData(CastAndMakeSpan(metadata)); + MILVerifyIsTrue(metadataOffset == actualMetadataOffset, + std::runtime_error, + "[MIL StorageWriter]: Metadata written to different offset than expected."); + Util::Span byteSpan; + if constexpr (MILBlob::IsSubByteSized::value) { + byteSpan = Util::CastFromBitSpan(data); + } else { + byteSpan = Util::SpanCast(data); + } + auto actualDataOffset = m_fileWriter->AppendData(byteSpan); + MILVerifyIsTrue(dataOffset == actualDataOffset, + std::runtime_error, + "[MIL StorageWriter]: Metadata written to different offset than expected."); + + // 2. Update count in header + m_header.count++; + // Write header with new count + m_fileWriter->WriteData(CastAndMakeSpan(m_header), 0); + // return offset in file to blob_metadata + return metadataOffset; +} + +// -------------------------------------------------------------------------------------- + +StorageWriter::~StorageWriter() = default; + +StorageWriter::StorageWriter(const std::string& filePath, bool truncateFile) + : m_impl(std::make_unique(filePath, truncateFile)) +{} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t 
StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +// Implement WriteData forwarding stubs for all sub byte types +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) \ + template <> \ + uint64_t StorageWriter::WriteData(Util::Span data) \ + { \ + return m_impl->WriteData(data); \ + } + +#include "MILBlob/SubByteTypeList.hpp" + +#undef DECLARE_SUB_BYTE_TYPE + +template <> +uint64_t StorageWriter::WriteData(Util::Span data) +{ + return m_impl->WriteData(data); +} + +std::string StorageWriter::GetFilePath() const +{ + return m_impl->GetFilePath(); +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.hpp new file mode 100644 index 000000000..58e3c95ca --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Blob/StorageWriter.hpp @@ -0,0 +1,88 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Bf16.hpp" +#include "MILBlob/Fp16.hpp" +#include "MILBlob/Fp8.hpp" +#include "MILBlob/SubByteTypes.hpp" +#include "MILBlob/Util/Span.hpp" +#include +#include + +namespace MILBlob { +namespace Blob { + +/** + * Utility for writing MIL Blob Storage format + * details of new file format: MIL/Blob/StorageFormat.hpp + */ +class StorageWriter final { +public: + StorageWriter() = delete; + StorageWriter(const StorageWriter&) = delete; + StorageWriter(StorageWriter&&) = delete; + StorageWriter& operator=(const StorageWriter&) = delete; + StorageWriter& operator=(StorageWriter&&) = delete; + + StorageWriter(const std::string& filePath, bool truncateFile = true); + ~StorageWriter(); + + /** + * Writes data to the next available aligned location into opened file stream + * Writes blob_metadata followed by data (both at next aligned offset specified by MILBlob::Blob::DefaultAlignment) + * @throws std::runtime_error if error occurs while writing data to file + */ + template + uint64_t WriteData(Util::Span data); + + /** + * Returns the file path of the blob storage file. 
+ */ + std::string GetFilePath() const; + +private: + class Impl; + const std::unique_ptr m_impl; +}; + +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); +template <> +uint64_t StorageWriter::WriteData(Util::Span); + +} // namespace Blob +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.cpp new file mode 100644 index 000000000..ae1e71a10 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Fp16.hpp" + +// fp16 lib code has some conversion warnings we don't want to globally ignore +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wincompatible-pointer-types" +#pragma clang diagnostic ignored "-Wsign-conversion" +#pragma clang diagnostic ignored "-Wconversion" +#include "fp16/fp16.h" +#pragma clang diagnostic pop + +using namespace MILBlob; + +/* static */ Fp16 Fp16::FromFloat(float f) +{ + return Fp16(fp16_ieee_from_fp32_value(f)); +} + +float Fp16::GetFloat() const +{ + return fp16_ieee_to_fp32_value(bytes); +} + +void Fp16::SetFloat(float f) +{ + bytes = fp16_ieee_from_fp32_value(f); +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.hpp new file mode 100644 index 000000000..300e4566f --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp16.hpp @@ -0,0 +1,53 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include +#include + +namespace MILBlob { + +/** + * Struct for holding bytes that represent a fp16 number. 
+ * Floating point interface treats "bytes" as IEEE 754 half precision floating point + * (https://ieeexplore.ieee.org/document/8766229) + */ +struct Fp16 { + explicit Fp16(uint16_t bs) : bytes(bs) {} + Fp16() : bytes(0) {} + + static Fp16 FromFloat(float f); + + float GetFloat() const; + void SetFloat(float f); + + // NOLINTNEXTLINE(misc-non-private-member-variables-in-classes) + uint16_t bytes; +}; + +inline bool operator==(const Fp16& first, const Fp16& second) noexcept +{ + return first.bytes == second.bytes; +} + +inline bool operator!=(const Fp16& first, const Fp16& second) noexcept +{ + return first.bytes != second.bytes; +} + +} // namespace MILBlob + +namespace std { + +template <> +struct hash { + size_t operator()(const MILBlob::Fp16& fp) const + { + return fp.bytes; + } +}; + +} // namespace std diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.cpp new file mode 100644 index 000000000..2176fad97 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.cpp @@ -0,0 +1,189 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Fp8.hpp" + +#include +#include + +using namespace MILBlob; + +// Some global constants. +constexpr uint8_t fp32MantissaBits = 23; +constexpr int8_t fp32ExponentBias = 127; + +// Helper function to handle Fp32 -> Fp8 exponent and mantissa. +template +void HandleFp32ToFp8ExponentMantissa(FP8_CAST& fp8, FloatCast& fp32) +{ + int32_t unbiasedExponent = fp32.components.exponent - fp32ExponentBias; + if (unbiasedExponent + FP8_TYPE::fp8ExponentBias > 0) { + // Normal. + fp8.components.exponent = uint8_t(fp32.components.exponent - fp32ExponentBias + FP8_TYPE::fp8ExponentBias); + } else { + // Denormal. 
+ FloatCast fp32_bias; + fp32_bias.components.sign = fp32.components.sign; + fp32_bias.components.exponent = -1 * FP8_TYPE::fp8ExponentBias + fp32ExponentBias + 1; + fp32_bias.components.mantissa = 0; + fp32.f += fp32_bias.f; + fp8.components.exponent = 0; + } + if ((fp32.components.mantissa & ((0x1 << (fp32MantissaBits - FP8_TYPE::fp8MantissaBits)) - 1)) != 0) { + throw std::range_error("FP8 SetFloat requires rounding for the given value."); + } + fp8.components.mantissa = fp32.components.mantissa >> (fp32MantissaBits - FP8_TYPE::fp8MantissaBits); +} + +// Helper function to handle normalizing the denormalized case for fp8. +// For denormalized fp8's, we need to normalize by subtracting a bias of 2^(1 - fp8ExponentBias) +template +void HandleFp8ToFp32Denormalize(FP8_CAST& fp8, FloatCast& fp32) +{ + if (fp8.components.exponent == 0 && fp8.components.mantissa != 0) { + fp32.components.exponent++; + FloatCast fp32_bias; + fp32_bias.components.sign = fp8.components.sign; + fp32_bias.components.exponent = fp32.components.exponent; + fp32_bias.components.mantissa = 0; + fp32.f -= fp32_bias.f; + } +} + +// Helper function to handle exponent and mantissa for Fp8 -> Fp32 conversion. +template +void HandleFp8ToFp32ExponentMantissa(const FP8_CAST& fp8, FloatCast& fp32) +{ + if (fp8.components.exponent == 0 && fp8.components.mantissa == 0) { + fp32.components.exponent = 0; + fp32.components.mantissa = 0; + return; + } + int32_t unbiasedExponent = fp8.components.exponent - FP8_TYPE::fp8ExponentBias; + fp32.components.exponent = uint32_t(unbiasedExponent + fp32ExponentBias); + fp32.components.mantissa = + uint32_t(int32_t(fp8.components.mantissa << (fp32MantissaBits - FP8_TYPE::fp8MantissaBits))); +} + +float Fp8E5M2::GetFloat() const +{ + FloatCast fp32 = {.f = 0}; + // Set the sign bit. + fp32.components.sign = data.components.sign; + + // Standard NaN/Inf case. We just use the fp8 mantissa as there's + // no strong requirements for mantissa in the NaN case. 
+ if (data.components.exponent == (0x1 << fp8ExponentBits) - 1) { + fp32.components.exponent = 0xFF; + fp32.components.mantissa = data.components.mantissa; + return fp32.f; + } + HandleFp8ToFp32ExponentMantissa(data, fp32); + HandleFp8ToFp32Denormalize(data, fp32); + return fp32.f; +} + +float Fp8E4M3FN::GetFloat() const +{ + FloatCast fp32 = {.f = 0}; + // Set the sign bit. + fp32.components.sign = data.components.sign; + // NaN case, infinity is not supported. We just use the mantissa from the fp8. + if (data.components.exponent == (0x1 << fp8ExponentBits) - 1 && data.components.mantissa == 0x7) { + fp32.components.exponent = 0xFF; + fp32.components.mantissa = data.components.mantissa; + return fp32.f; + } + HandleFp8ToFp32ExponentMantissa(data, fp32); + HandleFp8ToFp32Denormalize(data, fp32); + return fp32.f; +} + +void Fp8E5M2::SetFloat(float f) +{ + FloatCast fp32 = {.f = f}; + data = {.byte = 0}; + // Set sign bit. + data.components.sign = fp32.components.sign; + + // If f is nan or inf, set exponent to all 1's. + if (std::isnan(f)) { + data.components.exponent = (0x1 << fp8ExponentBits) - 1; + data.components.mantissa = 1; + } else if (std::isinf(f)) { + data.components.exponent = (0x1 << fp8ExponentBits) - 1; + data.components.mantissa = 0; + } else if (f == 0) { + data.components.exponent = 0; + data.components.mantissa = 0; + } else { + int32_t unbiasedExponent = fp32.components.exponent - fp32ExponentBias; + // Float is normal or denormal, check the exponent and set it. + // For now, we throw on over/underflows. There are alternative ways to handle + // this (round to zero). 
+ if (unbiasedExponent > fp8ExponentBias) { + throw std::range_error("Fp8E5M2 SetFloat exponent overflow."); + } else if (unbiasedExponent < (-1 * fp8ExponentBias - int32_t(fp8MantissaBits) + 1)) { + throw std::range_error("Fp8E5M2 SetFloat exponent underflow."); + } + HandleFp32ToFp8ExponentMantissa(data, fp32); + } +} + +void Fp8E4M3FN::SetFloat(float f) +{ + FloatCast fp32 = {.f = f}; + data = {.byte = 0}; + // Set sign bit. + data.components.sign = fp32.components.sign; + + // If f is nan or inf, set exponent to all 1's. + if (std::isnan(f)) { + data.components.exponent = (0x1 << fp8ExponentBits) - 1; + data.components.mantissa = 7; + } else if (std::isinf(f)) { + throw std::range_error("Fp8E4M3FN SetFloat infinity not supported."); + } else if (f == 0) { + data.components.exponent = 0; + data.components.mantissa = 0; + } else { + int32_t unbiasedExponent = fp32.components.exponent - fp32ExponentBias; + // Float is normal or denormal, check the exponent and set it. + // For now, we throw on over/underflows. There are alternative ways to handle + // this (round to zero). + if (unbiasedExponent > fp8ExponentBias + 1) { + throw std::range_error("Fp8E4M3FN SetFloat exponent overflow."); + } else if (unbiasedExponent < (-1 * fp8ExponentBias - int32_t(fp8MantissaBits) + 1)) { + // Underflow occurs when the exponent is below the minimum denormal value. 
+ // This means unbiased exponent is less than -fp8ExponentBias - fp8MantissaBits + 1 + throw std::range_error("Fp8E4M3FN SetFloat exponent underflow."); + } + HandleFp32ToFp8ExponentMantissa(data, fp32); + } +} + +Fp8E5M2 Fp8E5M2::FromFloat(float f) +{ + Fp8E5M2 result; + result.SetFloat(f); + return result; +} + +Fp8E4M3FN Fp8E4M3FN::FromFloat(float f) +{ + Fp8E4M3FN result; + result.SetFloat(f); + return result; +} + +bool Fp8E5M2::IsNaN() const +{ + return (data.components.exponent == 0x1F && data.components.mantissa != 0); +} + +bool Fp8E4M3FN::IsNaN() const +{ + return (data.components.exponent == 0xF && data.components.mantissa == 7); +} diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.hpp new file mode 100644 index 000000000..1a99e9e69 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Fp8.hpp @@ -0,0 +1,107 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include +#include + +namespace MILBlob { + +// General helper typedef to help process an FP32 in different forms/its +// constituent components. +typedef union { + float f; + uint32_t bytes; + struct { + uint32_t mantissa : 23; + uint32_t exponent : 8; + uint32_t sign : 1; + } components; +} FloatCast; + +// Macro for FP8 types. 
+#define DECLARE_FP8_TYPE(NAME, EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS) \ + struct NAME { \ + typedef union { \ + uint8_t byte; \ + struct { \ + uint8_t mantissa : MANTISSA_BITS; \ + uint8_t exponent : EXPONENT_BITS; \ + uint8_t sign : 1; \ + } components; \ + } Cast; \ + explicit NAME(uint8_t d) \ + { \ + data.byte = d; \ + }; \ + NAME() \ + { \ + data.byte = 0; \ + } \ + static NAME FromFloat(float f); \ + float GetFloat() const; \ + void SetFloat(float f); \ + uint8_t GetByte() const \ + { \ + return data.byte; \ + } \ + void SetByte(uint8_t byte) \ + { \ + data.byte = byte; \ + } \ + bool IsNaN() const; \ + Cast data; \ + static constexpr int8_t fp8ExponentBias = EXPONENT_BIAS; \ + static constexpr uint8_t fp8ExponentBits = EXPONENT_BITS; \ + static constexpr uint8_t fp8MantissaBits = MANTISSA_BITS; \ + static_assert(fp8ExponentBits + fp8MantissaBits == 7, "Number of exponent and mantissa bits should be 7"); \ + }; \ + inline bool operator==(const NAME& first, const NAME& second) noexcept \ + { \ + if ((first.data.byte & 0x7F) == 0 && (second.data.byte & 0x7F) == 0) { \ + return true; \ + } \ + if (first.IsNaN() && second.IsNaN()) { \ + return false; \ + } \ + return first.data.byte == second.data.byte; \ + } \ + inline bool operator!=(const NAME& first, const NAME& second) noexcept \ + { \ + if ((first.data.byte & 0x7F) == 0 && (second.data.byte & 0x7F) == 0) { \ + return false; \ + } \ + if (first.IsNaN() && second.IsNaN()) { \ + return true; \ + } \ + return first.data.byte != second.data.byte; \ + } + +// Define the types. 
+DECLARE_FP8_TYPE(Fp8E5M2, 5, 2, 15) +DECLARE_FP8_TYPE(Fp8E4M3FN, 4, 3, 7) + +} // namespace MILBlob + +namespace std { + +template <> +struct hash { + size_t operator()(const MILBlob::Fp8E5M2& fp) const + { + return fp.data.byte; + } +}; + +template <> +struct hash { + size_t operator()(const MILBlob::Fp8E4M3FN& fp) const + { + return fp.data.byte; + } +}; + +} // namespace std diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypeList.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypeList.hpp new file mode 100644 index 000000000..295313c33 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypeList.hpp @@ -0,0 +1,13 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +// Listing of sub-byte-sized types in MIL +// Template file used for generating stub functionality +DECLARE_SUB_BYTE_TYPE(Int4) +DECLARE_SUB_BYTE_TYPE(UInt6) +DECLARE_SUB_BYTE_TYPE(UInt4) +DECLARE_SUB_BYTE_TYPE(UInt3) +DECLARE_SUB_BYTE_TYPE(UInt2) +DECLARE_SUB_BYTE_TYPE(UInt1) diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.cpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.cpp new file mode 100644 index 000000000..e2611bd6e --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#include "MILBlob/Util/Verify.hpp" + +#include "MILBlob/SubByteTypes.hpp" +#include "MILBlob/Util/SubByteConversionUtils.hpp" +#include +#include + +namespace MILBlob { + +struct IndexAndOffset { + uint64_t index; + uint8_t offset; +}; + +static IndexAndOffset GetIndexAndOffsetForSubByteValue(uint64_t i, uint8_t numBits) +{ + IndexAndOffset ret; + + uint64_t startBit = numBits * i; + + ret.index = startBit / 8; + ret.offset = startBit % 8; + + return ret; +} + +template +std::vector PackSubByteVecForNonByteAligned(Util::Span span) +{ + std::vector ret(MILBlob::SizeInBytes(span.Size()), 0); + + for (uint64_t i = 0; i < span.Size(); i++) { + MILVerifyIsTrue(span[i] <= T::MAX && span[i] >= T::MIN, + std::range_error, + "Value " + std::to_string(span[i]) + " is outside allowed subbyte datatype range [" + + std::to_string(T::MIN) + ", " + std::to_string(T::MAX) + "]."); + + auto indexAndOffset = GetIndexAndOffsetForSubByteValue(i, T::SizeInBits); + auto idx = indexAndOffset.index; + auto offset = indexAndOffset.offset; + + ret[idx] |= ((uint8_t)(span[i] << offset)); + if (offset > 8 - T::SizeInBits) { + // part of the i'th element of span spills over to idx+1 + // uint8_t rshift = T::SizeInBits - (8 - offset); + uint8_t rshift = 8 - offset; + ret[idx + 1] |= ((uint8_t)span[i] >> rshift); + } + } + + return ret; +} + +template +std::vector PackSubByteVecImpl(Util::Span vec) +{ + if constexpr (!MILBlob::SubByteIsByteAligned()) { + return PackSubByteVecForNonByteAligned(vec); + } + const auto ElementsPerByte = 8 / T::SizeInBits; + std::vector ret(MILBlob::SizeInBytes(vec.Size())); + for (size_t i = 0; i < vec.Size(); i++) { + size_t shiftAmmount = T::SizeInBits * (i % ElementsPerByte); + MILVerifyIsTrue(vec[i] <= T::MAX && vec[i] >= T::MIN, + std::range_error, + "Value " + std::to_string(vec[i]) + " is 
outside allowed subbyte datatype range [" + + std::to_string(T::MIN) + ", " + std::to_string(T::MAX) + "]."); + ret[i / ElementsPerByte] |= (static_cast((vec[i] & T::BitMask) << shiftAmmount)); + } + return ret; +} + +#define DEFINE_PACK_SUB_BYTE_VEC(TYPE) \ + std::vector PackSubByteVec(const std::vector& vec) \ + { \ + using impl_t = decltype(TYPE::data); \ + Util::Span int8Span(reinterpret_cast(vec.data()), vec.size()); \ + return PackSubByteVecImpl(int8Span); \ + } + +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) DEFINE_PACK_SUB_BYTE_VEC(TYPE_NAME) +#include "MILBlob/SubByteTypeList.hpp" +#undef DECLARE_SUB_BYTE_TYPE + +#define DEFINE_UNPACK_SUB_BYTE_VEC(TYPE) \ + template <> \ + std::vector UnPackSubByteVec(const std::vector& vec, size_t numElements) \ + { \ + return UnPackSubByteVecImpl(vec, numElements); \ + } + +template +std::vector UnPackSubByteVecImpl(const std::vector& vec, size_t numElements) +{ + std::vector ret(numElements); + MILVerifyIsTrue( + vec.size() == MILBlob::SizeInBytes(numElements), + std::invalid_argument, + "Unpacking to sub-byte type vector has invalid number of elements. 
Sub-byte vector with NumElements " + "requires exactly vec.size() bytes."); + Util::Span subByteSpan((typename MILBlob::Util::voidType::type)(vec.data()), numElements); + for (size_t i = 0; i < numElements; i++) { + ret[i] = subByteSpan.ValueAt(i); + } + return ret; +} + +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) DEFINE_UNPACK_SUB_BYTE_VEC(TYPE_NAME) +#include "MILBlob/SubByteTypeList.hpp" +#undef DECLARE_SUB_BYTE_TYPE + +template <> +std::vector PackInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues) +{ + return PackSubByteVecImpl(unpackedValues); +} + +// Class methods for Int4, UInt4, etc. 
+#define IMPLEMENT_METHODS_FOR_SUB_BYTE_TYPE(TYPE_NAME) \ + TYPE_NAME::TYPE_NAME(decltype(TYPE_NAME::data) d) \ + { \ + MILVerifyIsTrue(d <= TYPE_NAME::MAX && d >= TYPE_NAME::MIN, \ + std::range_error, \ + #TYPE_NAME " value is out of range."); \ + data = d; \ + } \ + /* static */ TYPE_NAME TYPE_NAME::FromInt(int i) \ + { \ + TYPE_NAME result; \ + result.SetInt(i); \ + return result; \ + } \ + int TYPE_NAME::GetInt() const \ + { \ + return static_cast(data); \ + } \ + void TYPE_NAME::SetInt(int i) \ + { \ + MILVerifyIsTrue(i <= TYPE_NAME::MAX && i >= TYPE_NAME::MIN, \ + std::range_error, \ + #TYPE_NAME " value is out of range."); \ + data = static_cast(i); \ + return; \ + } \ + bool operator==(const TYPE_NAME& first, const TYPE_NAME& second) noexcept \ + { \ + return first.data == second.data; \ + } \ + bool operator!=(const TYPE_NAME& first, const TYPE_NAME& second) noexcept \ + { \ + return first.data != second.data; \ + } \ + static_assert(sizeof(TYPE_NAME) == 1, #TYPE_NAME " struct must be of size 1 byte"); + +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) IMPLEMENT_METHODS_FOR_SUB_BYTE_TYPE(TYPE_NAME) +#include "MILBlob/SubByteTypeList.hpp" +#undef DECLARE_SUB_BYTE_TYPE + +}; // namespace MILBlob + +namespace std { + +// +128 here so that casting i.data to size_t, for T==Int4, is safe +#define DEFINE_HASH_FOR_SUB_BYTE_TYPE(TYPE) \ + size_t hash::operator()(const MILBlob::TYPE& i) const \ + { \ + return static_cast(i.data + 128); \ + } + +#define DECLARE_SUB_BYTE_TYPE(TYPE_NAME) DEFINE_HASH_FOR_SUB_BYTE_TYPE(TYPE_NAME) +#include "MILBlob/SubByteTypeList.hpp" +#undef DECLARE_SUB_BYTE_TYPE + +} // namespace std diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.hpp new file mode 100644 index 000000000..96be5e7a5 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/SubByteTypes.hpp @@ -0,0 +1,134 @@ +// Copyright (c) 2021, Apple Inc. 
All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include +#include +#include +#include + +// A sub-byte type of is represented in MIL by a byte-sized struct which wraps +// an value of type IMPL_TYPE +#define DEFINE_SUB_BYTE_TYPE(NAME, IMPL_TYPE, BIT_SIZE, MASK, MAX_VAL, MIN_VAL) \ + struct NAME { \ + explicit NAME(IMPL_TYPE d); \ + NAME() : data(0) {} \ + static NAME FromInt(int i); \ + int GetInt() const; \ + void SetInt(int i); \ + IMPL_TYPE data; \ + static constexpr uint8_t SizeInBits = BIT_SIZE; \ + static constexpr uint8_t BitMask = MASK; \ + static constexpr IMPL_TYPE MAX = MAX_VAL; \ + static constexpr IMPL_TYPE MIN = MIN_VAL; \ + static_assert(MAX >= MIN, "Incompatible values for MIN and MAX"); \ + }; + +// Declares the following exports for sub-byte-type NAME +// operator == +// operator != +// +// Packs a sub byte vector into uint8_t representation since a vector of sub byte type +// cannot be packed. +// std::vector PackSubByteVec(const std::vector& vec); +// +// Unpacks a sub byte vector in uint8_t representation to a vector of the sub byte type. 
+// template <> +// std::vector UnPackSubByteVec(const std::vector& vec, size_t numElements); +#define DECLARE_SUB_BYTE_TYPE_METHODS(NAME) \ + bool operator==(const NAME& first, const NAME& second) noexcept; \ + bool operator!=(const NAME& first, const NAME& second) noexcept; \ + std::vector PackSubByteVec(const std::vector& vec); \ + template <> \ + std::vector UnPackSubByteVec(const std::vector& vec, size_t numElements); + +namespace MILBlob { + +template +class IsSubByteSized { + struct S { + char a; + char b; + }; + template + static char Tester(decltype(&U::SizeInBits)); + template + static S Tester(...); + +public: + enum { + value = sizeof(Tester(0)) == sizeof(char) + }; +}; + +template +constexpr bool SubByteIsByteAligned() +{ + return (8 / T::SizeInBits) * T::SizeInBits == 8; +} + +template +constexpr std::size_t SizeInBytes(std::size_t numElements) +{ + return (std::size_t)std::ceil((numElements * T::SizeInBits) / 8.0); +} + +template +std::vector UnPackSubByteVec(const std::vector& vec, std::size_t numElements); + +DEFINE_SUB_BYTE_TYPE(Int4, int8_t, 4, 0xF, 7, -8) +DECLARE_SUB_BYTE_TYPE_METHODS(Int4) + +DEFINE_SUB_BYTE_TYPE(UInt6, uint8_t, 6, 0b111111, 63, 0) +DECLARE_SUB_BYTE_TYPE_METHODS(UInt6) + +DEFINE_SUB_BYTE_TYPE(UInt4, uint8_t, 4, 0xF, 15, 0) +DECLARE_SUB_BYTE_TYPE_METHODS(UInt4) + +DEFINE_SUB_BYTE_TYPE(UInt3, uint8_t, 3, 0b111, 7, 0) +DECLARE_SUB_BYTE_TYPE_METHODS(UInt3) + +DEFINE_SUB_BYTE_TYPE(UInt2, uint8_t, 2, 0b11, 3, 0) +DECLARE_SUB_BYTE_TYPE_METHODS(UInt2) + +DEFINE_SUB_BYTE_TYPE(UInt1, uint8_t, 1, 0b1, 1, 0) +DECLARE_SUB_BYTE_TYPE_METHODS(UInt1) + +} // namespace MILBlob + +namespace std { + +template <> +struct hash { + size_t operator()(const MILBlob::Int4& i) const; +}; + +template <> +struct hash { + size_t operator()(const MILBlob::UInt6& i) const; +}; + +template <> +struct hash { + size_t operator()(const MILBlob::UInt4& i) const; +}; + +template <> +struct hash { + size_t operator()(const MILBlob::UInt3& i) const; +}; + +template 
<> +struct hash { + size_t operator()(const MILBlob::UInt2& i) const; +}; + +template <> +struct hash { + size_t operator()(const MILBlob::UInt1& i) const; +}; + +} // namespace std diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Span.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Span.hpp new file mode 100644 index 000000000..9ce9a8596 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Span.hpp @@ -0,0 +1,674 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/SubByteTypes.hpp" +#include "MILBlob/Util/Verify.hpp" +#include +#include +#include +#include +#include +#include + +namespace MILBlob { +namespace Util { + +constexpr std::size_t DynamicExtent = std::numeric_limits::max(); + +namespace span_helpers { + +//---------------------------------------------------------------------- +// helper traits +//---------------------------------------------------------------------- + +template +struct IsDynamicExtent { + static constexpr bool value = false; +}; + +template <> +struct IsDynamicExtent { + static constexpr bool value = true; +}; + +template +struct IsIndexValid { + static constexpr bool value = (Index < Extent); +}; + +template +struct IsIndexValid { + static constexpr bool value = false; +}; + +//---------------------------------------------------------------------- +// helper storage size +//---------------------------------------------------------------------- + +template +class SpanSize final { +public: + SpanSize() = default; + ~SpanSize() = default; + SpanSize(const SpanSize&) = default; + SpanSize(SpanSize&&) noexcept = default; + SpanSize& operator=(const SpanSize&) = default; + SpanSize& operator=(SpanSize&&) noexcept = default; + + constexpr size_t Size() const + { + return 
m_size; + } + +private: + static constexpr size_t m_size = Extent; +}; + +template <> +class SpanSize final { +public: + SpanSize() = delete; + ~SpanSize() = default; + SpanSize(const SpanSize&) = default; + SpanSize(SpanSize&&) noexcept = default; + SpanSize& operator=(const SpanSize&) = default; + SpanSize& operator=(SpanSize&&) noexcept = default; + + explicit SpanSize(size_t size) : m_size(size) {} + + size_t Size() const + { + return m_size; + } + +private: + size_t m_size; +}; + +} // namespace span_helpers + +//---------------------------------------------------------------------- +// Span is a custom implementation of an array view, similar +// to std::span introduced in C++20. +// +// If Extent is specified, Span supports compile-time bounds checking +// when the Get<> method is used. +// +// For underlying types of at least byte-size, this version of Span also +// supports iterating slices and dimensions of multi-dimensional +// contiguous memory blocks. +// +// For sub-byte types, only basic access to the data pointer and size +// are supported. +//---------------------------------------------------------------------- + +// Span types of at least byte-size. 
+template +class Span final { +public: + using value_type = T; + using pointer = typename std::add_pointer::type; + using reference = typename std::add_lvalue_reference::type; + using iterator = pointer; + + using const_value_type = typename std::add_const::type; + using const_pointer = typename std::add_pointer::type; + using const_iterator = const_pointer; + + template + using SpanSize = span_helpers::SpanSize; + + template + using IsDynamicExtent = span_helpers::IsDynamicExtent; + + template + using IsIndexValid = span_helpers::IsIndexValid; + + static_assert(!MILBlob::IsSubByteSized::value, "Sub byte-sized types must use the reduced Span implementation"); + + class SliceIterator final { + public: + SliceIterator(pointer p, size_t stride) : m_ptr(p), m_stride(stride) {} + + bool operator==(const SliceIterator& other) const + { + return m_ptr == other.m_ptr && m_stride == other.m_stride; + } + + bool operator!=(const SliceIterator& other) const + { + return !(*this == other); + } + + SliceIterator& operator++() + { + m_ptr += m_stride; + return *this; + } + + // NOLINTNEXTLINE(cert-dcl21-cpp) + SliceIterator operator++(int) const + { + return SliceIterator(m_ptr + m_stride, m_stride); + } + + Span operator*() const + { + return Span(m_ptr, m_stride); + } + + private: + pointer m_ptr; + size_t m_stride; + }; + + template + class StaticSliceIterator final { + public: + explicit StaticSliceIterator(pointer p) : m_ptr(p) {} + + bool operator==(const StaticSliceIterator& other) const + { + return m_ptr == other.m_ptr; + } + + bool operator!=(const StaticSliceIterator& other) const + { + return !(*this == other); + } + + StaticSliceIterator& operator++() + { + m_ptr += Stride; + return *this; + } + + // NOLINTNEXTLINE(cert-dcl21-cpp) + StaticSliceIterator operator++(int) const + { + return StaticSliceIterator(m_ptr + Stride); + } + + Span operator*() const + { + return Span(m_ptr); + } + + private: + pointer m_ptr; + }; + + template + class IteratorProvider final { + 
public: + IteratorProvider(Iterator begin, Iterator end) : m_begin(begin), m_end(end) {} + + Iterator begin() const + { + return m_begin; + } + + Iterator end() const + { + return m_end; + } + + private: + Iterator m_begin; + Iterator m_end; + }; + + ~Span() = default; + + Span(const Span&) = default; + Span(Span&&) noexcept = default; + + Span& operator=(const Span&) = default; + Span& operator=(Span&&) noexcept = default; + + /** Implicit copy constructor for converting a mutable span to a const span. Extent and type must be the same. */ + template ::value && + std::is_same::type>::value, + int>::type = 0> + Span(const Span& other) : m_ptr(other.Data()) + , m_size(other.Size()) + {} + + /** Implicit move constructor for converting a mutable span to a const span. Extent and type must be the same. */ + template ::value && + std::is_same::type>::value, + int>::type = 0> + Span(Span&& other) : m_ptr(other.Data()) + , m_size(other.Size()) + {} + + template ::value, int>::type = 0> + Span() : m_ptr(nullptr) + , m_size(0) + {} + + template ::value, int>::type = 0> + explicit Span(pointer p) : m_ptr(p) + {} + + template ::value, int>::type = 0> + Span(pointer p, size_t size) : m_ptr(size == 0 ? nullptr : p) + , m_size(size) + {} + + // + // properties + // + + pointer Data() const + { + return m_ptr; + } + + size_t Size() const + { + return m_size.Size(); + } + + constexpr bool IsEmpty() const + { + return Size() == 0; + } + + // + // random access + // + + reference operator[](size_t index) const + { + MILDebugVerifyIsTrue(index < Size(), std::range_error, "index out of bounds"); + return m_ptr[index]; + } + + reference At(size_t index) const + { + MILVerifyIsTrue(index < Size(), std::range_error, "index out of bounds"); + return m_ptr[index]; + } + + // Get() returns a reference to the value at index N. + // This method only exists for fixed-sized Span instantiations. + // The bounds of N are compile-time checked. 
+ template < + size_t Index, + typename std::enable_if::value && IsIndexValid::value, int>::type = 0> + reference Get() const + { + return (*this)[Index]; + } + + // + // slicing + // + + /** Gets a sub-span starting at index */ + Span Slice(size_t index) const + { + MILVerifyIsTrue(index < Size(), std::range_error, "index out of bounds"); + return Span(Data() + index, Size() - index); + } + + /** Gets a sub-span starting at index with length size */ + Span Slice(size_t index, size_t size) const + { + MILVerifyIsTrue(size > 0 && index < Size() && index + size <= Size(), std::range_error, "index out of bounds"); + return Span(Data() + index, size); + } + + /** Slices into num_slices dimensions, and returns the span corresponding to slice_index */ + Span SliceByDimension(size_t num_slices, size_t slice_index) const + { + MILVerifyIsTrue(Size() % num_slices == 0, std::range_error, "index out of bounds"); + size_t stride = Size() / num_slices; + return Slice(slice_index * stride, stride); + } + + // + // reinterpreting data + // + + template + Span StaticResize() const + { + MILVerifyIsTrue(NewExtent <= Size(), std::range_error, "index out of bounds"); + return Span(Data()); + } + + // + // basic C++ iterators + // + + iterator begin() const + { + return Data(); + } + + iterator end() const + { + return Data() + Size(); + } + + const_iterator cbegin() const + { + return Data(); + } + + const_iterator cend() const + { + return Data() + Size(); + } + + std::reverse_iterator rbegin() const + { + return std::reverse_iterator(Data() + Size()); + } + + std::reverse_iterator rend() const + { + return std::reverse_iterator(Data()); + } + + std::reverse_iterator crbegin() const + { + return std::reverse_iterator(Data() + Size()); + } + + std::reverse_iterator crend() const + { + return std::reverse_iterator(Data()); + } + + // + // complex C++ iterators + // + + /** Iterates based on slices. This iterator will produce Size() % sliceSice slices. 
*/ + IteratorProvider IterateSlices(size_t sliceSize) const + { + MILVerifyIsTrue(Size() % sliceSize == 0, std::range_error, "index out of bounds"); + + return IteratorProvider(SliceIterator(Data(), sliceSize), + SliceIterator(Data() + Size(), sliceSize)); + } + + template + IteratorProvider> IterateSlices() const + { + MILVerifyIsTrue(Size() % SliceSize == 0, std::range_error, "index out of bounds"); + + return IteratorProvider>(StaticSliceIterator(Data()), + StaticSliceIterator(Data() + Size())); + } + + /** + Iterates based on dimensions. Similar to IterateBySlices, but based on the number of slices (dimensions) rather + than the size of the slice. + */ + IteratorProvider IterateByDimension(size_t dim) const + { + return IterateSlices(Size() / dim); + } + +private: + pointer m_ptr; + SpanSize m_size; +}; + +template +struct voidType { + using type = void*; +}; +template +struct voidType::value>::type> { + using type = const void*; +}; +// Specializations for sub-byte types. +// This should ideally be implemented with std::enable_if but that involves an ABI breaking change. +// The pointer referenced by m_ptr and returned by Data() is byte aligned and packed, with possible +// padding in the last byte. +#define DEFINE_SPAN_CLASS_FOR_SUBBYTE(subByteType) \ +public: \ + template \ + using SpanSize = span_helpers::SpanSize; \ + \ + template \ + using IsDynamicExtent = span_helpers::IsDynamicExtent; \ + \ + ~Span() = default; \ + \ + Span(const Span&) = default; \ + Span(Span&&) noexcept = default; \ + \ + Span& operator=(const Span&) = default; \ + Span& operator=(Span&&) noexcept = default; \ + \ + /** Implicit copy constructor for converting a mutable span to a const span. Extent and type must be the same. */ \ + template ::value && \ + std::is_same::type>::value, \ + int>::type = 0> \ + Span(const Span& other) : m_ptr(other.Data()) \ + , m_size(other.Size()) \ + {} \ + \ + /** Implicit move constructor for converting a mutable span to a const span. 
Extent and type must be the same. */ \ + template ::value && \ + std::is_same::type>::value, \ + int>::type = 0> \ + Span(Span&& other) : m_ptr(other.Data()) \ + , m_size(other.Size()) \ + {} \ + \ + template ::value, int>::type = 0> \ + Span() : m_ptr(nullptr) \ + , m_size(0) \ + {} \ + \ + template ::value, int>::type = 0> \ + explicit Span(voidType::type p) : m_ptr(p) \ + {} \ + \ + template ::value, int>::type = 0> \ + Span(voidType::type p, size_t size) : m_ptr(size == 0 ? nullptr : p) \ + , m_size(size) \ + {} \ + \ + voidType::type Data() const \ + { \ + return m_ptr; \ + } \ + \ + size_t Size() const \ + { \ + return m_size.Size(); \ + } \ + \ + constexpr bool IsEmpty() const \ + { \ + return Size() == 0; \ + } \ + template \ + Span StaticResize() const \ + { \ + MILVerifyIsTrue(NewExtent <= Size(), std::range_error, "index out of bounds"); \ + return Span(Data()); \ + } \ + \ + std::remove_const::type ValueAt(std::size_t index) \ + { \ + if (index >= Size()) { \ + throw std::out_of_range("index out of bounds."); \ + } \ + using nonConstSubByteType = std::remove_const::type; \ + using impl_t = decltype(nonConstSubByteType::data); \ + \ + uint8_t bitSize = nonConstSubByteType::SizeInBits; \ + size_t elementIndex = index % Size(); \ + size_t packedBitsIndex = elementIndex * bitSize / 8; \ + size_t startBitIndex = elementIndex * bitSize % 8; \ + uint8_t bitMask = static_cast(nonConstSubByteType::BitMask << startBitIndex); \ + uint8_t restoredElement_uint8 = (*((const uint8_t*)Data() + packedBitsIndex) & bitMask) >> startBitIndex; \ + \ + /* For non-byte-aligned dtypes like UInt3, the required bits can be spread across 2 bytes. \ + Create mask and retrieve bits from the second byte if needed. 
\ + Look at SpanTests::testSubByteUIntValueAt*/ \ + size_t retrievedBits = 8 - startBitIndex; \ + if (retrievedBits < bitSize) { \ + bitMask = 0; \ + for (size_t i = 0; i < (bitSize - retrievedBits); ++i) { \ + bitMask |= 1 << i; \ + } \ + restoredElement_uint8 |= (*((const uint8_t*)Data() + packedBitsIndex + 1) & bitMask) << retrievedBits; \ + } \ + \ + /* If sign=1, fill all 1s in the prefix. \ + e.g., say the Int4 value is 1011 which is -5 in 2s complement. At this point, restoredElement_uint8 is \ + 00001011. To represent -5 correctly in 1 byte, we fill prefix 1s, resulting in 11111011. */ \ + if (nonConstSubByteType::MIN < 0) { \ + uint8_t sign_bit = (restoredElement_uint8 >> (bitSize - 1)) & 1; \ + if (sign_bit == 1) { \ + for (size_t i = 0; i < 8 - bitSize; ++i) { \ + restoredElement_uint8 |= 1 << (i + bitSize); \ + } \ + } \ + } \ + return nonConstSubByteType(*reinterpret_cast(&restoredElement_uint8)); \ + } \ + \ +private: \ + voidType::type m_ptr; \ + SpanSize m_size; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(Int4) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const Int4) +}; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(UInt6) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const UInt6) +}; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(UInt4) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const UInt4) +}; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(UInt3) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const UInt3) +}; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(UInt2) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const UInt2) +}; + +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(UInt1) +}; +template +class Span final { + DEFINE_SPAN_CLASS_FOR_SUBBYTE(const UInt1) +}; + +// MakeSpan for std::vector yields Span +// Examples: +// (1) create a mutable 
span +// std::vector v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span +// (2) create an immutable span +// const std::vector v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span +// (3) create an immutable span from a mutable vector +// std::vector v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span + +template class C, typename... Args> +Span MakeSpan(C& c) +{ + return Span(c.data(), c.size()); +} + +template class C, typename... Args> +Span MakeSpan(const C& c) +{ + return Span(c.data(), c.size()); +} + +template + class C, + typename... Args, + std::enable_if_t::value, bool> = true> +Span MakeSpan(const C& c) +{ + return Span(c.data(), c.size()); +} + +// MakeSpan for std::array yields Span. +// Examples: +// (1) create a mutable span +// std::array v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span +// (2) create an immutable span from a mutable vector +// std::array v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span +// (3) create an immutable span +// const std::array v = { 1, 2, 3 }; +// auto span = MakeSpan(v); // span is Span + +template +Span MakeSpan(std::array& v) +{ + return Span(v.data()); +} + +template ::type> +Span MakeSpan(const std::array& v) +{ + return Span(v.data()); +} + +template ::type> +Span MakeSpan(const std::array& v) +{ + return Span(v.data()); +} + +} // namespace Util +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SpanCast.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SpanCast.hpp new file mode 100644 index 000000000..d6337eef6 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SpanCast.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/SubByteTypes.hpp" +#include "MILBlob/Util/Span.hpp" +#include + +namespace MILBlob { +namespace Util { + +/** + reinterpret_casts the underlying pointer in Span to Span. Callers are responsible for ensuring + that SourceT can be interpreted as TargetT in a meaningful way as there are neither compile- nor run-time safety + guards in place. +*/ + +template +Span SpanCast(Span span) +{ + static_assert(!MILBlob::IsSubByteSized::value && !MILBlob::IsSubByteSized::value, + "SpanCast for sub-byte sized types is not supported"); + auto ptr = reinterpret_cast(span.Data()); + auto size = (span.Size() * sizeof(SourceT)) / sizeof(TargetT); + return Span(ptr, size); +} + +/** + Reinterpret casts the underlying Span to a sub-byte type span. numElements indicates the number of + sub-byte elements in the case where the last byte contains some padding due to round to nearest byte. +*/ + +template ::value, bool> = true> +Span CastToBitSpan(Span span, size_t numElements) +{ + static_assert(std::is_same::value || std::is_same::value, + "CastToBitSpan is only possible when casting from a uint8_t span"); + if (span.Size() != MILBlob::SizeInBytes(numElements)) { + throw std::invalid_argument( + "BitSpanCast to sub-byte type span has invalid number of elements. 
Sub-byte span with NumElements " + "requires exactly Span.Size() bytes."); + } + return Span((typename MILBlob::Util::voidType::type)(span.Data()), numElements); +} + +/** + Reinterpret casts the underlying sub-byte-sized Span to a Span +*/ +template ::value, bool> = true> +Span CastFromBitSpan(Span span) +{ + size_t numBits = span.Size() * SourceT::SizeInBits; + size_t numElements = numBits / 8; + // need 1 more byte-sized element to hold remainder, if it exists + if (numBits % 8 != 0) { + numElements++; + } + return Span((const uint8_t*)span.Data(), numElements); +} + +} // namespace Util +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SubByteConversionUtils.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SubByteConversionUtils.hpp new file mode 100644 index 000000000..1a5bb8c82 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/SubByteConversionUtils.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#include "MILBlob/Util/Span.hpp" +#include + +namespace MILBlob { + +// This header contains the utils used by coremltools to pack subbyte datatype values. 
+ +// Packs a span of int8_t containing unpacked values into a packed uint8_t vector +template +std::vector PackInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackInt8Span(Util::Span unpackedValues); + +// Packs a span of uint8_t containing unpacked values into a packed uint8_t vector +template +std::vector PackUInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues); + +template <> +std::vector PackUInt8Span(Util::Span unpackedValues); + +} // namespace MILBlob diff --git a/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Verify.hpp b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Verify.hpp new file mode 100644 index 000000000..59125422c --- /dev/null +++ b/cpp/external/katagocoreml/vendor/mlmodel/src/MILBlob/Util/Verify.hpp @@ -0,0 +1,31 @@ +// Copyright (c) 2021, Apple Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-3-clause license that can be +// found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +#pragma once + +#define MILVerifyImpl(condition, ex_type, ...) \ + do { \ + if (!(condition)) { \ + throw ex_type(__VA_ARGS__); \ + } \ + } while (0) + +#if defined(NDEBUG) +#define MILVerifyDebugImpl(condition, ex_type, ...) +#else +#define MILVerifyDebugImpl(condition, ex_type, ...) MILVerifyImpl(condition, ex_type, __VA_ARGS__) +#endif + +// MILVerifyIsNotNull verifies a pointer is not null. Upon failure, it throws the exception +// with the provided arguments. +#define MILVerifyIsNotNull(pointer, ex_type, ...) MILVerifyImpl(pointer != nullptr, ex_type, __VA_ARGS__) + +// MILVerifyIsTrue verifies condition is true. Upon failure, it throws the exception +// with the provided arguments. 
+#define MILVerifyIsTrue(condition, ex_type, ...) MILVerifyImpl(condition, ex_type, __VA_ARGS__) + +// MILDebugVerifyIsTrue verifies condition is true in debug builds only. Upon failure, +// it throws the exception with the provided arguments. +#define MILDebugVerifyIsTrue(condition, ex_type, ...) MILVerifyDebugImpl(condition, ex_type, __VA_ARGS__) diff --git a/cpp/external/katagocoreml/vendor/modelpackage/LICENSE.txt b/cpp/external/katagocoreml/vendor/modelpackage/LICENSE.txt new file mode 100644 index 000000000..b4570ec56 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/modelpackage/LICENSE.txt @@ -0,0 +1,11 @@ +Copyright (c) 2017, Apple Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder(s) nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.cpp b/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.cpp new file mode 100644 index 000000000..2bd2e89d9 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.cpp @@ -0,0 +1,603 @@ +// +// ModelPackage.cpp +// modelpackage +// +// Copyright © 2021 Apple Inc. All rights reserved. +// + +#include "ModelPackage.hpp" + +#include "utils/JsonMap.hpp" + +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#elif __has_include() +#include +namespace std { + namespace filesystem = std::experimental::filesystem; +} +#else +#error "missing required header " +#endif +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +static const char *kModelPackageManifestFileName = "Manifest.json"; +static const char *kModelPackageFileFormatVersionKey = "fileFormatVersion"; + +static const int kModelPackageFileFormatMajorVersion = 1; +static const int kModelPackageFileFormatMinorVersion = 0; +static const int kModelPackageFileFormatPatchVersion = 0; + +static const char *kModelPackageItemInfoEntriesKey = "itemInfoEntries"; + +static const char *kModelPackageItemInfoPathKey = "path"; +static const char *kModelPackageItemInfoNameKey = "name"; +static const char *kModelPackageItemInfoAuthorKey = "author"; +static const char *kModelPackageItemInfoDescriptionKey = "description"; + +static const char 
*kModelPackageDataDir = "Data"; + +static const char *kModelPackageRootModelKey = "rootModelIdentifier"; + +using namespace MPL; +using namespace detail; +using namespace std::filesystem; + +class detail::ModelPackageItemInfoImpl { + +private: + + std::string m_identifier; + std::string m_path; + std::string m_name; + std::string m_author; + std::string m_description; + +public: + + ModelPackageItemInfoImpl(const std::string& identifier, const std::string& path, const std::string& name, const std::string& author, const std::string& description); + + ~ModelPackageItemInfoImpl(); + + inline const std::string& identifier() { + return m_identifier; + } + + inline const std::string& path() { + return m_path; + } + + inline const std::string& name() { + return m_name; + } + + inline const std::string& author() { + return m_author; + } + + inline const std::string& description() { + return m_description; + } +}; + +ModelPackageItemInfoImpl::ModelPackageItemInfoImpl(const std::string& identifier, const std::string& path, const std::string& name, const std::string& author, const std::string& description) +: m_identifier(identifier), + m_path(path), + m_name(name), + m_author(author), + m_description(description) +{ +} + +ModelPackageItemInfoImpl::~ModelPackageItemInfoImpl() +{ +} + +ModelPackageItemInfo::ModelPackageItemInfo(std::shared_ptr modelPackageItemInfoImpl) +: m_modelPackageItemInfoImpl(modelPackageItemInfoImpl) +{ +} + +ModelPackageItemInfo::~ModelPackageItemInfo() +{ +} + +const std::string& ModelPackageItemInfo::identifier() const +{ + return m_modelPackageItemInfoImpl->identifier(); +} + +const std::string& ModelPackageItemInfo::path() const +{ + return m_modelPackageItemInfoImpl->path(); +} + +const std::string& ModelPackageItemInfo::name() const +{ + return m_modelPackageItemInfoImpl->name(); +} + +const std::string& ModelPackageItemInfo::author() const +{ + return m_modelPackageItemInfoImpl->author(); +} + +const std::string& 
ModelPackageItemInfo::description() const +{ + return m_modelPackageItemInfoImpl->description(); +} + +class detail::ModelPackageImpl { + +private: + + std::filesystem::path m_packagePath; + std::filesystem::path m_manifestPath; + std::filesystem::path m_packageDataDirPath; + + std::unique_ptr m_manifest; + + bool m_readOnly; + + void validate(); + + std::unique_ptr getItemInfoEntries() const; + std::unique_ptr getItemInfoEntry(const std::string& identifier) const; + + void createItemInfoEntry(const std::string& identifier, const std::string& path, const std::string& name, const std::string& author, const std::string& description); + void removeItemInfoEntry(const std::string& identifier); + + std::string generateIdentifier() const; + + std::filesystem::path getItemPath(const std::string& name, const std::string& author) const; + +public: + + ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary = true, bool readOnly = false); + ~ModelPackageImpl(); + + inline const std::filesystem::path& path() const { + return m_packagePath; + } + + std::string setRootModel(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description); + std::string replaceRootModel(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description); + std::shared_ptr getRootModel() const; + + std::string addItem(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description); + std::shared_ptr findItem(const std::string& identifier) const; + std::shared_ptr findItem(const std::string& name, const std::string& author) const; + std::vector findItemsByAuthor(const std::string& author) const; + + void removeItem(const std::string& identifier); + static bool isValid(const std::filesystem::path& path); + + ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description); 
+}; + +#pragma mark ModelPackageImpl + +ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly) +: m_packagePath(path), + m_manifestPath(path / kModelPackageManifestFileName), + m_packageDataDirPath(path / kModelPackageDataDir), + m_manifest(nullptr), + m_readOnly(readOnly) +{ + if (std::filesystem::exists(m_packagePath)) { + if (std::filesystem::exists(m_manifestPath)) { + std::ifstream manifestStream(m_manifestPath, std::ios::binary); + m_manifest = std::make_unique(manifestStream); + manifestStream.close(); + } else { + throw std::runtime_error("A valid manifest does not exist at path: " + m_manifestPath.string() + ". Remove the .mlpackage directory and try again."); + } + } + // Create the package structure at specified path + else if (createIfNecessary) { + if (false == create_directory(m_packagePath)) { + throw std::runtime_error("Failed to create model package at path: " + m_packagePath.string()); + } + + if (false == create_directory(m_packageDataDirPath)) { + throw std::runtime_error("Failed to create data directory at path: " + m_packageDataDirPath.string()); + } + + m_manifest = std::make_unique(); + std::stringstream ss; + ss << kModelPackageFileFormatMajorVersion << "." << kModelPackageFileFormatMinorVersion << "." 
<< kModelPackageFileFormatPatchVersion; + m_manifest->setString(kModelPackageFileFormatVersionKey, ss.str()); + } + // Error out since package does not exist + else { + throw std::runtime_error("Failed to open model package at path: " + m_packagePath.string()); + } + + validate(); +} + +ModelPackageImpl::~ModelPackageImpl() +{ + if (m_readOnly) { + return; + } + + std::filesystem::path uniquedDestination(m_manifestPath); + std::filesystem::path suffix(generateIdentifier()); // std::filesystem::path from stringified UUID + uniquedDestination.replace_extension(suffix); // unique filename in the presumed writable directory where Manifest.json is sited + + std::ofstream uniquedStream(uniquedDestination, std::ios::binary); + m_manifest->serialize(uniquedStream); + uniquedStream.close(); + if (uniquedStream.fail()) { // If any of the above fail do not go on to move uniquedDestination to m_manifestPath. + return; + } + + std::error_code ecode; + std::filesystem::rename(uniquedDestination, m_manifestPath, ecode); // On failure sets ecode and makes no changes. Does not throw. 
+ if (ecode.value()) { + std::filesystem::remove(uniquedDestination); + } +} + +void ModelPackageImpl::validate() +{ + const std::string versionString = m_manifest->getString(kModelPackageFileFormatVersionKey); + + std::istringstream versionStringStream(versionString); + std::vector versionTokens; + for (std::string token; std::getline(versionStringStream, token, '.');) { + versionTokens.push_back(token); + } + + if (versionTokens.size() != 3) { + throw std::runtime_error("File format version must be in the form of major.minor.patch, but the specified value was: " + versionString); + } + + int majorVersion = 0; + int minorVersion = 0; + int patchVersion = 0; + try { + majorVersion = std::stoi(versionTokens[0]); + minorVersion = std::stoi(versionTokens[1]); + patchVersion = std::stoi(versionTokens[2]); + } catch (std::invalid_argument& e) { + throw std::runtime_error("Failed to parse file format version: " + versionString + " because: " + e.what()); + } + + if (majorVersion < 0 || + minorVersion < 0 || + patchVersion < 0 ) { + throw std::runtime_error("File format version uses negative number(s): " + versionString); + } + + if ((majorVersion > kModelPackageFileFormatMajorVersion) || + (majorVersion == kModelPackageFileFormatMajorVersion && minorVersion > kModelPackageFileFormatMinorVersion) || + (minorVersion == kModelPackageFileFormatMinorVersion && patchVersion > kModelPackageFileFormatPatchVersion)) { + throw std::runtime_error("Unsupported version: " + versionString); + } + + // Validate 1.0.0 model package + + auto itemInfoEntries = getItemInfoEntries(); + if (itemInfoEntries != nullptr) { + std::vector identifiers; + itemInfoEntries->getKeys(identifiers); + for (const auto& identifier : identifiers) { + auto itemInfoEntry = getItemInfoEntry(identifier); + + if (false == itemInfoEntry->hasKey(kModelPackageItemInfoPathKey) || + false == itemInfoEntry->hasKey(kModelPackageItemInfoNameKey) || + false == itemInfoEntry->hasKey(kModelPackageItemInfoAuthorKey) || + 
false == itemInfoEntry->hasKey(kModelPackageItemInfoDescriptionKey)) { + throw std::runtime_error("Invalid itemInfo for identifier: " + identifier); + } + + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); + if (false == exists(path)) { + throw std::runtime_error("Item does not exist for identifier: " + identifier); + } + } + } +} + +std::unique_ptr ModelPackageImpl::getItemInfoEntries() const +{ + if (m_manifest->hasKey(kModelPackageItemInfoEntriesKey)) { + return m_manifest->getObject(kModelPackageItemInfoEntriesKey); + } + + return std::make_unique(); +} + +std::unique_ptr ModelPackageImpl::getItemInfoEntry(const std::string& identifier) const +{ + auto itemInfoEntries = getItemInfoEntries(); + + if (itemInfoEntries->hasKey(identifier)) { + return itemInfoEntries->getObject(identifier); + } + + return nullptr; +} + +void ModelPackageImpl::removeItemInfoEntry(const std::string& identifier) +{ + auto itemInfoEntries = getItemInfoEntries(); + + std::vector identifiers; + itemInfoEntries->getKeys(identifiers); + + auto newItemInfoEntries = std::make_unique(); + for (const auto& localIdentifier : identifiers) { + if (localIdentifier != identifier) { + newItemInfoEntries->setObject(localIdentifier, itemInfoEntries->getObject(localIdentifier)); + } + } + + m_manifest->setObject(kModelPackageItemInfoEntriesKey, std::move(newItemInfoEntries)); +} + +void ModelPackageImpl::createItemInfoEntry(const std::string& identifier, const std::string& path, const std::string& name, const std::string& author, const std::string& description) { + auto itemInfoEntry = getItemInfoEntry(identifier); + + if (nullptr == itemInfoEntry) { + itemInfoEntry = std::make_unique(); + } + + itemInfoEntry->setString(kModelPackageItemInfoPathKey, path); + itemInfoEntry->setString(kModelPackageItemInfoNameKey, name); + itemInfoEntry->setString(kModelPackageItemInfoAuthorKey, author); + itemInfoEntry->setString(kModelPackageItemInfoDescriptionKey, description); 
+ + auto itemInfoEntries = getItemInfoEntries(); + itemInfoEntries->setObject(identifier, std::move(itemInfoEntry)); + m_manifest->setObject(kModelPackageItemInfoEntriesKey, std::move(itemInfoEntries)); +} + +std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, const std::string& author) const { + return std::filesystem::path(author) / name; +} + +std::string ModelPackageImpl::generateIdentifier() const { + uuid_t uuid; + + // uuid_unparse generates a 36-character null-terminated string (37 bytes). + // they provide no mechanisms for us to deduce this length, therefore + // we have to hardcode it here. + char buf[37] = ""; + + uuid_generate(uuid); + uuid_unparse(uuid, buf); + + return std::string(buf); +} + +ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) { + + if (findItem(name, author) != nullptr) { + throw std::runtime_error("The package already contains a file with name: " + name + " author: " + author); + } + + auto filePath = getItemPath(name, author); + auto dstPath = m_packageDataDirPath / filePath; + + create_directories(dstPath.parent_path()); + + std::ofstream stream(dstPath, std::ios::binary); + if (!stream.is_open()) { + throw std::runtime_error("Failed to create file at path: " + dstPath.string()); + } + + auto identifier = generateIdentifier(); + createItemInfoEntry(identifier, filePath.string(), name, author, description); + return *(findItem(identifier)); +} + +std::string ModelPackageImpl::addItem(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description) +{ + if (findItem(name, author) != nullptr) { + throw std::runtime_error("The package already contains a file with name: " + name + " author: " + author); + } + + auto filePath = getItemPath(name, author); + auto dstPath = m_packageDataDirPath / filePath; + + create_directories(dstPath.parent_path()); + 
std::filesystem::copy(path, dstPath); + + auto identifier = generateIdentifier(); + createItemInfoEntry(identifier, filePath.string(), name, author, description); + return identifier; +} + +std::string ModelPackageImpl::setRootModel(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description) +{ + if (m_manifest->hasKey(kModelPackageRootModelKey)) { + throw std::runtime_error("A root model already exists in this package. Remove the existing root model or the .mlpackage directory and try again."); + } + + auto identifier = addItem(path, name, author, description); + m_manifest->setString(kModelPackageRootModelKey, identifier); + return identifier; +} + +std::string ModelPackageImpl::replaceRootModel(const std::filesystem::path& path, const std::string& name, const std::string& author, const std::string& description) +{ + if (m_manifest->hasKey(kModelPackageRootModelKey)) { + auto rootModelIdentifier = m_manifest->getString(kModelPackageRootModelKey); + removeItem(rootModelIdentifier); + } + + auto identifier = addItem(path, name, author, description); + m_manifest->setString(kModelPackageRootModelKey, identifier); + return identifier; +} + +std::shared_ptr ModelPackageImpl::getRootModel() const +{ + if (false == m_manifest->hasKey(kModelPackageRootModelKey)) { + throw std::runtime_error("Failed to look up root model"); + } + + auto rootModelIdentifier = m_manifest->getString(kModelPackageRootModelKey); + return findItem(rootModelIdentifier); +} + +std::shared_ptr ModelPackageImpl::findItem(const std::string& identifier) const +{ + auto itemInfoEntry = getItemInfoEntry(identifier); + if (itemInfoEntry == nullptr) { + return nullptr; + } + + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); + auto name = itemInfoEntry->getString(kModelPackageItemInfoNameKey); + auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey); + auto description = 
itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey); + + return std::make_shared(std::make_shared(identifier, path, name, author, description)); +} + +std::shared_ptr ModelPackageImpl::findItem(const std::string& name, const std::string& author) const +{ + auto itemInfoEntries = getItemInfoEntries(); + if (itemInfoEntries != nullptr) { + std::vector identifiers; + itemInfoEntries->getKeys(identifiers); + for (const auto& identifier : identifiers) { + auto itemInfo = findItem(identifier); + if (itemInfo->author() == author && itemInfo->name() == name) { + return itemInfo; + } + } + } + + return nullptr; +} + +std::vector ModelPackageImpl::findItemsByAuthor(const std::string& author) const +{ + auto itemInfoVector = std::vector(); + auto itemInfoEntries = getItemInfoEntries(); + if (itemInfoEntries != nullptr) { + std::vector identifiers; + itemInfoEntries->getKeys(identifiers); + for (const auto& identifier : identifiers) { + auto itemInfo = findItem(identifier); + if (itemInfo->author() == author) { + itemInfoVector.push_back(*itemInfo); + } + } + } + + return itemInfoVector; +} + +void ModelPackageImpl::removeItem(const std::string& identifier) +{ + auto itemInfoEntry = getItemInfoEntry(identifier); + if (itemInfoEntry == nullptr) { + throw std::runtime_error("Failed to look up file with identifier: " + identifier); + } + + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); + if (0 != std::remove(path.c_str())) { + throw std::runtime_error("Failed to remove file at path: " + path.string()); + } + + removeItemInfoEntry(identifier); +} + +bool ModelPackageImpl::isValid(const std::filesystem::path& path) +{ + try { + ModelPackageImpl(path, false, true); + } catch (std::runtime_error& e) { + return false; + } + return true; +} + +#pragma mark ModelPackage + +ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly) +: m_modelPackageImpl(std::make_shared(packagePath, 
createIfNecessary, readOnly)) +{ +} + +ModelPackage::~ModelPackage() +{ +} + +std::string ModelPackage::path() const +{ + return m_modelPackageImpl->path(); +} + +std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description) +{ + return m_modelPackageImpl->setRootModel(path, name, author, description); +} + +std::string ModelPackage::replaceRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description) +{ + return m_modelPackageImpl->replaceRootModel(path, name, author, description); +} + +std::shared_ptr ModelPackage::getRootModel() const +{ + return m_modelPackageImpl->getRootModel(); +} + +std::string ModelPackage::addItem(const std::string& path, const std::string& name, const std::string& author, const std::string& description) +{ + return m_modelPackageImpl->addItem(path, name, author, description); +} + +std::shared_ptr ModelPackage::findItem(const std::string& identifier) const +{ + return m_modelPackageImpl->findItem(identifier); +} + +std::shared_ptr ModelPackage::findItem(const std::string& name, const std::string& author) const +{ + return m_modelPackageImpl->findItem(name, author); +} + +std::vector ModelPackage::findItemsByAuthor(const std::string& author) const +{ + return m_modelPackageImpl->findItemsByAuthor(author); +} + +void ModelPackage::removeItem(const std::string& identifier) +{ + return m_modelPackageImpl->removeItem(identifier); +} + +bool ModelPackage::isValid(const std::string& path) +{ + return ModelPackageImpl::isValid(path); +} + +ModelPackageItemInfo ModelPackage::createFile(const std::string& name, const std::string& author, const std::string& description) +{ + return m_modelPackageImpl->createFile(name, author, description); +} + +#if defined(__cplusplus) +} // extern "C" +#endif + diff --git a/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.hpp 
b/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.hpp new file mode 100644 index 000000000..2e44a0fb9 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/modelpackage/src/ModelPackage.hpp @@ -0,0 +1,160 @@ +// +// ModelPackage.hpp +// modelpackage +// +// Copyright © 2021 Apple Inc. All rights reserved. +// + +#ifndef ModelPackage_hpp +#define ModelPackage_hpp + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/** MPL = Model Package Library. */ +namespace MPL { + +namespace detail { + +class ModelPackageItemInfoImpl; +class ModelPackageImpl; + +} // namespace detail + +class ModelPackageItemInfo { + +private: + + std::shared_ptr m_modelPackageItemInfoImpl; + +public: + + /** Creates an instance of file info to hold information about a file that exists in a model package. */ + ModelPackageItemInfo(std::shared_ptr modelPackageItemInfoImpl); + ~ModelPackageItemInfo(); + + /** Unique file identifier of the file in the model package. */ + const std::string& identifier() const; + + /** Path of the file inside the model package. */ + const std::string& path() const; + + /** Name specified while storing the file in the model package. */ + const std::string& name() const; + + /** Author specified while storing the file in the model package. */ + const std::string& author() const; + + /** Description specified while storing the file in the model package. Defaults to "". */ + const std::string& description() const; +}; + + +class ModelPackage { + +private: + + std::shared_ptr m_modelPackageImpl; + +public: + + /** Creates an instance of model package that exists at the specified path. + @param path Path of the model package (with extension .mlpackage). + @param createIfNecessary Create a new model package if one does not exist at the specified path. Defaults to true. + @param readOnly The model package will not be mutated Defaults to false. + @throw Runtime exception if an invalid model package exists at the specified path. 
*/ + explicit ModelPackage(const std::string& path, bool createIfNecessary = true, bool readOnly = false); + + ~ModelPackage(); + + /** Returns the path of the model package. */ + std::string path() const; + + /** + Set a root model in model package. Each model package has a unique root model, which can be retrieved without needing for an identifier. + @param path Path of the model file. + @param name Name of the model file. + @param author Author of the model file. Reverse DNS identifier of the author application is recommended. Example: com.apple.coremltools. + @param description Optional description to describe the model file. + @return Unique file identifier that can be used to retrieve the model file. + @throw a runtime exception if the model package already contains a root model. */ + std::string setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description = ""); + + /** + replace a root model in model package. model package may or may not already contain a root model. Each model package has a unique root model, which can be retrieved without needing for an identifier. + @param path Path of the model file. + @param name Name of the model file. + @param author Author of the model file. Reverse DNS identifier of the author application is recommended. Example: com.apple.coremltools. + @param description Optional description to describe the model file. + @return Unique file identifier that can be used to retrieve the model file. */ + std::string replaceRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description = ""); + + /** + Retrieve previously set root model from the model package. + @return ModelPackageItemInfo with information about the retrieved root model file. + @throw Runtime exception if the model package does not contain a root model. 
*/ + std::shared_ptr getRootModel() const; + + /** + Add a file or directory in the model package using name and author as a uniqueing key. + @param path Path of the file. + @param name Name of the file. + @param author Author of the file. Reverse DNS identifier of the author application is recommended. Example: com.apple.coremltools. + @param description Optional description to describe the file. + @return Unique file identifier that can be used to look up the file. + @throw a runtime exception if the model package already contains a file with provided name and author. */ + std::string addItem(const std::string& path, const std::string& name, const std::string& author, const std::string& description = ""); + + /** + Retrieve previously added file or directory from the model package by providing an identifier. + @param identifier Unique identifier of a previous added file + @return A pointer to ModelPackageItemInfo with information about the retrieved file or directory. nullptr if a file or directory with given identifier does not exist. */ + std::shared_ptr findItem(const std::string& identifier) const; + + /** + Retrieve previously added file or directory from the model package by providing name and author. + @param name Name of a previous added file + @param author Author of a previous added file + @return A pointer to ModelPackageItemInfo with information about the retrieved file or directory by providing name and author. nullptr if a file or directory with given name and author does not exist. */ + std::shared_ptr findItem(const std::string& name, const std::string& author) const; + + /** + Retrieve previously added files or directories from the model package by providing an author. + @param author Name of the author. + @return Vector of ModelPackageItemInfo objects with information about the retrieved files by providing the author. 
*/ + std::vector findItemsByAuthor(const std::string& author) const; + + /** + Remove previously added file or directory from the model package by providing an identifier. + @param identifier Unique file identifier corresponding to a file that was added previously. + @throw Runtime exception if the model package does not contain file with provided identifier. */ + void removeItem(const std::string& identifier); + + /** + Tells if the input path corresponds to a valid model package. + @param path Path of model package. + @return True if the path corresponds to a valid model package. False, otherwise. */ + static bool isValid(const std::string& path); + + /** + Creates an empty file in the model package and returns corresponding file identifier. + @param name Name of the file. + @param author Author of the file. Reverse DNS identifier of the author application is recommended. Example: com.apple.coremltools. + @param description Optional description to describe the file. + @return ModelPackageItemInfo with information about the created file. */ + ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description); +}; + +} // namespace MPL + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif /* ModelPackage_hpp */ + diff --git a/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.cpp b/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.cpp new file mode 100644 index 000000000..400dcf222 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.cpp @@ -0,0 +1,171 @@ +// JsonMap.cpp +// modelpackage +// +// Copyright © 2021 Apple. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "JsonMap.hpp" +#include "json.hpp" + +using namespace nlohmann; + +class JsonMapImpl { + +public: + + nlohmann::json m_jsonObject; + + JsonMapImpl(); + JsonMapImpl(std::istream& stream); + JsonMapImpl(nlohmann::json j_init); + + JsonMapImpl(const JsonMapImpl&) = delete; + JsonMapImpl(JsonMapImpl&&) = delete; + JsonMapImpl& operator=(const JsonMapImpl&) = delete; + JsonMapImpl& operator=(JsonMapImpl&&) = delete; + + /* ==== Key operations ==== */ + + bool hasKey(const std::string& key) const; + void getKeys(std::vector& keys); + + /* ==== Getter methods ==== */ + + std::string getString(const std::string& key) const; + std::unique_ptr getObject(const std::string& key) const; + + /* ==== Setter methods ==== */ + + void setString(const std::string& key, const std::string& value); + void setObject(const std::string& key, std::unique_ptr value); + + void serialize(std::ostream& stream); + void deserialize(std::istream& stream); +}; + +JsonMapImpl::JsonMapImpl() { + m_jsonObject = nlohmann::json({}); +} + +JsonMapImpl::JsonMapImpl(std::istream& stream) { + deserialize(stream); +} + +JsonMapImpl::JsonMapImpl(nlohmann::json j_init) +: m_jsonObject(j_init) { +} + +/* ==== Key operations ==== */ + +bool JsonMapImpl::hasKey(const std::string& key) const { + return m_jsonObject.count(key) > 0; +} + +void JsonMapImpl::getKeys(std::vector& keys) { + for(json::iterator it = m_jsonObject.begin(); it != m_jsonObject.end(); ++it) { + keys.push_back(it.key()); + } +} + +/* ==== Getter methods ==== */ + +std::string JsonMapImpl::getString(const std::string& key) const { + return m_jsonObject.at(key).get(); +} + +std::unique_ptr JsonMapImpl::getObject(const std::string& key) const { + auto childCopy = m_jsonObject.at(key); + return std::make_unique(childCopy); +} + +/* ==== Setter methods ==== */ + +void JsonMapImpl::setString(const std::string& key, const std::string& value) { + m_jsonObject[key] = value; +} + 
+void JsonMapImpl::setObject(const std::string& key, std::unique_ptr value) { + m_jsonObject[key] = value->m_jsonObject; +} + +void JsonMapImpl::deserialize(std::istream& stream) { + if(!stream.good()) { + throw std::runtime_error("Input stream is not valid"); + } + + try { + stream >> m_jsonObject; + } catch (std::exception& e) { + // nlohmann::json raises std::exception on parser errors, but the client of JsonMap only + // handles std::runtime_error because they don't want to "handle" programming errors + // (std::logic_error). + // + // As such, we translate the exception type here. + throw std::runtime_error(e.what()); + } +} + +void JsonMapImpl::serialize(std::ostream& stream) { + // write prettified JSON to another file + stream << std::setw(4) << m_jsonObject << std::endl; + +} + +/* ==== JsonMap ==== */ + +JsonMap::JsonMap() +: m_jsonMapImpl(std::make_unique()) +{ +} + +JsonMap::JsonMap(std::istream& stream) +: m_jsonMapImpl(std::make_unique(stream)) +{ +} + +JsonMap::JsonMap(std::unique_ptr jsonMapImpl) +: m_jsonMapImpl(std::move(jsonMapImpl)) +{ +} + +JsonMap::~JsonMap() = default; + +/* ==== Key operations ==== */ + +bool JsonMap::hasKey(const std::string& key) const { + return m_jsonMapImpl->hasKey(key); +} + +void JsonMap::getKeys(std::vector& keys) { + return m_jsonMapImpl->getKeys(keys); +} + +/* ==== Getter methods ==== */ + +std::string JsonMap::getString(const std::string& key) const { + return m_jsonMapImpl->getString(key); +} + +std::unique_ptr JsonMap::getObject(const std::string& key) const { + return std::make_unique(m_jsonMapImpl->getObject(key)); +} + +/* ==== Setter methods ==== */ + +void JsonMap::setString(const std::string& key, const std::string& value) { + return m_jsonMapImpl->setString(key, value); +} + +void JsonMap::setObject(const std::string& key, std::unique_ptr value) { + m_jsonMapImpl->setObject(key, std::move(value->m_jsonMapImpl)); +} + +void JsonMap::serialize(std::ostream& stream) { + return 
m_jsonMapImpl->serialize(stream); +} diff --git a/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.hpp b/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.hpp new file mode 100644 index 000000000..962d25b61 --- /dev/null +++ b/cpp/external/katagocoreml/vendor/modelpackage/src/utils/JsonMap.hpp @@ -0,0 +1,52 @@ +// +// JsonMap.hpp +// modelpackage +// +// Copyright © 2021 Apple. All rights reserved. +// + +#pragma once + +#include +#include +#include +#include + +class JsonMapImpl; + +class JsonMap { + +private: + + std::unique_ptr m_jsonMapImpl; + +public: + + JsonMap(); + JsonMap(std::istream& stream); + JsonMap(std::unique_ptr jsonMapImpl); + + ~JsonMap(); + + JsonMap(const JsonMap&) = delete; + JsonMap(JsonMap&&) = delete; + JsonMap& operator=(const JsonMap&) = delete; + JsonMap& operator=(JsonMap&&) = delete; + + /* ==== Key operations ==== */ + + bool hasKey(const std::string& key) const; + void getKeys(std::vector& keys); + + /* ==== Getter methods ==== */ + + std::string getString(const std::string& key) const; + std::unique_ptr getObject(const std::string& key) const; + + /* ==== Setter methods ==== */ + + void setString(const std::string& key, const std::string& value); + void setObject(const std::string& key, std::unique_ptr value); + + void serialize(std::ostream& stream); +}; diff --git a/cpp/neuralnet/metalbackend.cpp b/cpp/neuralnet/metalbackend.cpp index 58ff2c4a3..ac061429c 100644 --- a/cpp/neuralnet/metalbackend.cpp +++ b/cpp/neuralnet/metalbackend.cpp @@ -5,62 +5,149 @@ #include "../neuralnet/nninputs.h" #include "../neuralnet/nninterface.h" #include "../neuralnet/metalbackend.h" -#include "../core/test.h" -/// Converts a ConvLayerDesc instance from C++ to Swift by creating a new SWConvLayerDesc instance with the same properties. -/// - Parameter desc: The ConvLayerDesc instance to convert. -/// - Returns: A SWConvLayerDesc instance with the same properties as the input ConvLayerDesc. 
-SWConvLayerDesc MetalProcess::convLayerDescToSwift(const ConvLayerDesc * desc) { +#include +#include +#include +#include +#include +#include // For getpid() - SWConvLayerDesc swDesc = createSWConvLayerDesc(desc->convYSize, - desc->convXSize, - desc->inChannels, - desc->outChannels, - desc->dilationY, - desc->dilationX, - (float*)desc->weights.data()); +using namespace std; - return swDesc; +//------------------------------------------------------------------------------ +// CoreML Model Conversion - Native C++ using katagocoreml library +//------------------------------------------------------------------------------ + +namespace gfs = ghc::filesystem; + +namespace CoreMLConversion { + +// Get temp directory for model conversion +static string getTempDirectory() { + gfs::path tempDir = gfs::temp_directory_path() / "katago_coreml"; + std::error_code ec; + gfs::create_directories(tempDir, ec); + if(ec) { + throw runtime_error("Failed to create temp directory: " + ec.message()); + } + return tempDir.string(); } -/// Converts a BatchNormLayerDesc instance from C++ to Swift by creating a new SWBatchNormLayerDesc instance with the same properties. -/// - Parameter desc: The BatchNormLayerDesc instance to convert. -/// - Returns: A SWBatchNormLayerDesc instance with the same properties as the input BatchNormLayerDesc. 
-SWBatchNormLayerDesc MetalProcess::batchNormLayerDescToSwift(const BatchNormLayerDesc * desc) { +// Generate unique temporary path for model conversion +static string generateTempPath(int serverThreadIdx) { + auto now = chrono::steady_clock::now().time_since_epoch().count(); + return getTempDirectory() + "/model_" + to_string(getpid()) + "_" + + to_string(serverThreadIdx) + "_" + to_string(now) + ".mlpackage"; +} - SWBatchNormLayerDesc swDesc = - createSWBatchNormLayerDesc(desc->numChannels, - (float*)desc->mergedScale.data(), - (float*)desc->mergedBias.data()); +// CoreML model metadata constants +static const string COREML_MODEL_AUTHOR = "KataGo"; +static const string COREML_MODEL_LICENSE = "See original model file for license terms"; - return swDesc; +// Convert KataGo model to CoreML in temp directory, returns path to .mlpackage +// The caller (Swift side) is responsible for deleting the temp file after loading +static string convertModelToTemp( + const string& modelPath, + int boardX, + int boardY, + bool useFP16, + bool optimizeMask, + int maxBatchSize, + int serverThreadIdx +) { + // maxBatchSize is validated upstream: cfg.getInt("nnMaxBatchSize", 1, 65536) in setup.cpp + // and NNEvaluator constructor throws if maxBatchSize <= 0. Assert for defensive documentation. + assert(maxBatchSize >= 1); + + string tempPath = generateTempPath(serverThreadIdx); + cerr << "Metal backend " << serverThreadIdx << ": Converting model to " << tempPath << endl; + + katagocoreml::ConversionOptions opts; + opts.board_x_size = boardX; + opts.board_y_size = boardY; + opts.compute_precision = useFP16 ? 
"FLOAT16" : "FLOAT32"; + opts.optimize_identity_mask = optimizeMask; + opts.min_batch_size = 1; + opts.max_batch_size = maxBatchSize; + opts.author = COREML_MODEL_AUTHOR; + opts.license = COREML_MODEL_LICENSE; + + try { + katagocoreml::KataGoConverter::convert(modelPath, tempPath, opts); + } catch(const exception& e) { + // Clean up partial conversion on failure + std::error_code ec; + gfs::remove_all(tempPath, ec); + if(ec) { + cerr << "Metal backend " << serverThreadIdx << ": Warning: Failed to clean up partial conversion at " << tempPath << ": " << ec.message() << endl; + } + throw runtime_error(string("Metal backend ") + to_string(serverThreadIdx) + ": Core ML model conversion failed: " + e.what()); + } + + cerr << "Metal backend " << serverThreadIdx << ": Conversion completed" << endl; + return tempPath; } -/// Convert an activation layer description from C++ to Swift -/// - Parameter desc: An activation layer description -ActivationKind MetalProcess::activationLayerDescToSwift(const ActivationLayerDesc * desc) { +} // namespace CoreMLConversion + +//------------------------------------------------------------------------------ +// Model Descriptor Conversion - C++ to Swift types for MPSGraph +//------------------------------------------------------------------------------ + +namespace MetalProcess { + +/// Converts a ConvLayerDesc instance from C++ to Swift +SWConvLayerDesc convLayerDescToSwift(const ConvLayerDesc* desc) { + return createSWConvLayerDesc( + desc->convYSize, + desc->convXSize, + desc->inChannels, + desc->outChannels, + desc->dilationY, + desc->dilationX, + (float*)desc->weights.data()); +} + +/// Converts a BatchNormLayerDesc instance from C++ to Swift +SWBatchNormLayerDesc batchNormLayerDescToSwift(const BatchNormLayerDesc* desc) { + return createSWBatchNormLayerDesc( + desc->numChannels, + (float*)desc->mergedScale.data(), + (float*)desc->mergedBias.data()); +} - switch (desc->activation) { +/// Convert an activation layer description from 
C++ to Swift +ActivationKind activationLayerDescToSwift(const ActivationLayerDesc* desc) { + switch(desc->activation) { case ACTIVATION_RELU: return ActivationKind::relu(); case ACTIVATION_MISH: return ActivationKind::mish(); case ACTIVATION_MISH_SCALE8: - testAssert(false); // Metal does not use scaled mish activations due to no fp16 - return ActivationKind::identity(); // Placeholder for compilation + return ActivationKind::identity(); // Metal/CoreML does not use scaled mish case ACTIVATION_IDENTITY: return ActivationKind::identity(); default: - testAssert(false); - return ActivationKind::identity(); // Placeholder for compilation + throw StringError("Unhandled activation kind: " + std::to_string(desc->activation)); } } -/// Convert a residual block description from C++ to Swift -/// - Parameter desc: A residual block description -/// - Returns: The residual block description converted to SWResidualBlockDesc -SWResidualBlockDesc MetalProcess::residualBlockDescToSwift(const ResidualBlockDesc * desc) { +/// Convert a matrix multiplication layer description from C++ to Swift +SWMatMulLayerDesc matMulLayerDescToSwift(const MatMulLayerDesc* desc) { + return createSWMatMulLayerDesc( + desc->inChannels, + desc->outChannels, + (float*)desc->weights.data()); +} + +/// Convert a matrix bias layer description from C++ to Swift +SWMatBiasLayerDesc matBiasLayerDescToSwift(const MatBiasLayerDesc* desc) { + return createSWMatBiasLayerDesc(desc->numChannels, (float*)desc->weights.data()); +} +/// Convert a residual block description from C++ to Swift +SWResidualBlockDesc residualBlockDescToSwift(const ResidualBlockDesc* desc) { SWBatchNormLayerDesc preBN = batchNormLayerDescToSwift(&desc->preBN); ActivationKind preActivationKind = activationLayerDescToSwift(&desc->preActivation); SWConvLayerDesc regularConv = convLayerDescToSwift(&desc->regularConv); @@ -68,34 +155,17 @@ SWResidualBlockDesc MetalProcess::residualBlockDescToSwift(const ResidualBlockDe ActivationKind 
midActivationKind = activationLayerDescToSwift(&desc->midActivation); SWConvLayerDesc finalConv = convLayerDescToSwift(&desc->finalConv); - SWResidualBlockDesc swDesc = - createSWResidualBlockDesc(preBN, - preActivationKind, - regularConv, - midBN, - midActivationKind, - finalConv); - - return swDesc; -} - -/// Convert a matrix multiplication layer description from C++ to Swift -/// - Parameter desc: A matrix multiplication layer description -/// - Returns: The matrix multiplication layer description converted to SWMatMulLayerDesc -SWMatMulLayerDesc MetalProcess::matMulLayerDescToSwift(const MatMulLayerDesc * desc) { - - SWMatMulLayerDesc swDesc = createSWMatMulLayerDesc(desc->inChannels, - desc->outChannels, - (float*)desc->weights.data()); - - return swDesc; + return createSWResidualBlockDesc( + preBN, + preActivationKind, + regularConv, + midBN, + midActivationKind, + finalConv); } /// Convert a global pooling residual block description from C++ to Swift -/// - Parameter desc: A global pooling residual block description -/// - Returns: The global pooling residual block description converted to SWGlobalPoolingResidualBlockDesc -SWGlobalPoolingResidualBlockDesc MetalProcess::globalPoolingResidualBlockDescToSwift(const GlobalPoolingResidualBlockDesc* desc) { - +SWGlobalPoolingResidualBlockDesc globalPoolingResidualBlockDescToSwift(const GlobalPoolingResidualBlockDesc* desc) { SWBatchNormLayerDesc preBN = batchNormLayerDescToSwift(&desc->preBN); ActivationKind preActivationKind = activationLayerDescToSwift(&desc->preActivation); SWConvLayerDesc regularConv = convLayerDescToSwift(&desc->regularConv); @@ -107,37 +177,53 @@ SWGlobalPoolingResidualBlockDesc MetalProcess::globalPoolingResidualBlockDescToS ActivationKind midActivationKind = activationLayerDescToSwift(&desc->midActivation); SWConvLayerDesc finalConv = convLayerDescToSwift(&desc->finalConv); - SWGlobalPoolingResidualBlockDesc swDesc = - createSWGlobalPoolingResidualBlockDesc(preBN, - preActivationKind, - 
regularConv, - gpoolConv, - gpoolBN, - gpoolActivationKind, - gpoolToBiasMul, - midBN, - midActivationKind, - finalConv); + return createSWGlobalPoolingResidualBlockDesc( + preBN, + preActivationKind, + regularConv, + gpoolConv, + gpoolBN, + gpoolActivationKind, + gpoolToBiasMul, + midBN, + midActivationKind, + finalConv); +} + +// Forward declaration for mutual recursion +swift::Array residualBlocksToSwift(const vector>& blocks); - return swDesc; +/// Convert a nested bottleneck residual block description from C++ to Swift +SWNestedBottleneckResidualBlockDesc nestedBottleneckResidualBlockDescToSwift(const NestedBottleneckResidualBlockDesc* desc) { + SWBatchNormLayerDesc preBN = batchNormLayerDescToSwift(&desc->preBN); + ActivationKind preActivationKind = activationLayerDescToSwift(&desc->preActivation); + SWConvLayerDesc preConv = convLayerDescToSwift(&desc->preConv); + auto swBlocks = residualBlocksToSwift(desc->blocks); + SWBatchNormLayerDesc postBN = batchNormLayerDescToSwift(&desc->postBN); + ActivationKind postActivationKind = activationLayerDescToSwift(&desc->postActivation); + SWConvLayerDesc postConv = convLayerDescToSwift(&desc->postConv); + + return createSWNestedBottleneckResidualBlockDesc( + preBN, + preActivationKind, + preConv, + swBlocks, + postBN, + postActivationKind, + postConv); } /// Convert residual blocks from C++ to Swift -/// - Parameters: -/// - blocks: Residual blocks -/// - swBlocks: A pointer to an array of BlockDescriptor -swift::Array MetalProcess::residualBlocksToSwift(const vector>& blocks) { - +swift::Array residualBlocksToSwift(const vector>& blocks) { auto builder = createBlockDescriptorBuilder(); - for (int i = 0; i < blocks.size(); i++) { + for(size_t i = 0; i < blocks.size(); i++) { + void* blockDesc = blocks[i].second.get(); - void * blockDesc = blocks[i].second.get(); - - if (blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) { + if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) { BlockDescriptor descriptor = 
globalPoolingResidualBlockDescToSwift((GlobalPoolingResidualBlockDesc*)blockDesc); builder.enque(descriptor); - } else if (blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) { + } else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) { BlockDescriptor descriptor = nestedBottleneckResidualBlockDescToSwift((NestedBottleneckResidualBlockDesc*)blockDesc); builder.enque(descriptor); } else { @@ -149,35 +235,8 @@ swift::Array MetalProcess::residualBlocksToSwift(const vector

preBN); - ActivationKind preActivationKind = activationLayerDescToSwift(&desc->preActivation); - SWConvLayerDesc preConv = convLayerDescToSwift(&desc->preConv); - auto swBlocks = residualBlocksToSwift(desc->blocks); - SWBatchNormLayerDesc postBN = batchNormLayerDescToSwift(&desc->postBN); - ActivationKind postActivationKind = activationLayerDescToSwift(&desc->postActivation); - SWConvLayerDesc postConv = convLayerDescToSwift(&desc->postConv); - - SWNestedBottleneckResidualBlockDesc swDesc = - createSWNestedBottleneckResidualBlockDesc(preBN, - preActivationKind, - preConv, - swBlocks, - postBN, - postActivationKind, - postConv); - - return swDesc; -} - /// Convert a SGF metadata encoder description from C++ to Swift -/// - Parameter desc: A SGF metadata encoder description -/// - Returns: The SGF metadata encoder description converted to SWSGFMetadataEncoderDesc -swift::Optional MetalProcess::sGFMetadataEncoderDescToSwift(const SGFMetadataEncoderDesc * desc) { - +swift::Optional sGFMetadataEncoderDescToSwift(const SGFMetadataEncoderDesc* desc) { SWMatMulLayerDesc mul1 = matMulLayerDescToSwift(&desc->mul1); SWMatBiasLayerDesc bias1 = matBiasLayerDescToSwift(&desc->bias1); ActivationKind act1 = activationLayerDescToSwift(&desc->act1); @@ -186,24 +245,20 @@ swift::Optional MetalProcess::sGFMetadataEncoderDescTo ActivationKind act2 = activationLayerDescToSwift(&desc->act2); SWMatMulLayerDesc mul3 = matMulLayerDescToSwift(&desc->mul3); - auto swSGFMetadataEncoderDesc = createSWSGFMetadataEncoderDesc(desc->metaEncoderVersion, - desc->numInputMetaChannels, - mul1, - bias1, - act1, - mul2, - bias2, - act2, - mul3); - - return swSGFMetadataEncoderDesc; + return createSWSGFMetadataEncoderDesc( + desc->metaEncoderVersion, + desc->numInputMetaChannels, + mul1, + bias1, + act1, + mul2, + bias2, + act2, + mul3); } /// Convert a trunk description from C++ to Swift -/// - Parameter trunk: A trunk description -/// - Returns: The trunk description converted to SWTrunkDesc 
-SWTrunkDesc MetalProcess::trunkDescToSwift(const TrunkDesc * trunk) { - +SWTrunkDesc trunkDescToSwift(const TrunkDesc* trunk) { SWConvLayerDesc initialConv = convLayerDescToSwift(&trunk->initialConv); SWMatMulLayerDesc initialMatMul = matMulLayerDescToSwift(&trunk->initialMatMul); auto sgfMetadataEncoder = sGFMetadataEncoderDescToSwift(&trunk->sgfMetadataEncoder); @@ -211,26 +266,22 @@ SWTrunkDesc MetalProcess::trunkDescToSwift(const TrunkDesc * trunk) { SWBatchNormLayerDesc trunkTipBN = batchNormLayerDescToSwift(&trunk->trunkTipBN); ActivationKind trunkTipActivation = activationLayerDescToSwift(&trunk->trunkTipActivation); - SWTrunkDesc swTrunkDesc = createSWTrunkDesc(trunk->modelVersion, - trunk->trunkNumChannels, - trunk->midNumChannels, - trunk->regularNumChannels, - trunk->gpoolNumChannels, - initialConv, - initialMatMul, - sgfMetadataEncoder, - swBlocks, - trunkTipBN, - trunkTipActivation); - - return swTrunkDesc; + return createSWTrunkDesc( + trunk->modelVersion, + trunk->trunkNumChannels, + trunk->midNumChannels, + trunk->regularNumChannels, + trunk->gpoolNumChannels, + initialConv, + initialMatMul, + sgfMetadataEncoder, + swBlocks, + trunkTipBN, + trunkTipActivation); } /// Convert a policy head description from C++ to Swift -/// - Parameter policyHead: A policy head description -/// - Returns: The policy head description converted to SWPolicyHeadDesc -SWPolicyHeadDesc MetalProcess::policyHeadDescToSwift(const PolicyHeadDesc * policyHead) { - +SWPolicyHeadDesc policyHeadDescToSwift(const PolicyHeadDesc* policyHead) { SWConvLayerDesc p1Conv = convLayerDescToSwift(&policyHead->p1Conv); SWConvLayerDesc g1Conv = convLayerDescToSwift(&policyHead->g1Conv); SWBatchNormLayerDesc g1BN = batchNormLayerDescToSwift(&policyHead->g1BN); @@ -244,38 +295,24 @@ SWPolicyHeadDesc MetalProcess::policyHeadDescToSwift(const PolicyHeadDesc * poli ActivationKind passActivation = activationLayerDescToSwift(&policyHead->passActivation); SWMatMulLayerDesc gpoolToPassMul2 = 
matMulLayerDescToSwift(&policyHead->gpoolToPassMul2); - SWPolicyHeadDesc swPolicyHead = createSWPolicyHeadDesc(policyHead->modelVersion, - p1Conv, - g1Conv, - g1BN, - g1Activation, - gpoolToBiasMul, - p1BN, - p1Activation, - p2Conv, - gpoolToPassMul, - gpoolToPassBias, - passActivation, - gpoolToPassMul2); - - return swPolicyHead; -} - -/// Convert a matrix bias layer description from C++ to Swift -/// - Parameter desc: A matrix bias layer description -/// - Returns: The matrix bias layer description converted to SWMatBiasLayerDesc -SWMatBiasLayerDesc MetalProcess::matBiasLayerDescToSwift(const MatBiasLayerDesc * desc) { - - SWMatBiasLayerDesc swDesc = createSWMatBiasLayerDesc(desc->numChannels, (float*)desc->weights.data()); - - return swDesc; + return createSWPolicyHeadDesc( + policyHead->modelVersion, + p1Conv, + g1Conv, + g1BN, + g1Activation, + gpoolToBiasMul, + p1BN, + p1Activation, + p2Conv, + gpoolToPassMul, + gpoolToPassBias, + passActivation, + gpoolToPassMul2); } /// Convert a value head description from C++ to Swift -/// - Parameter valueHead: A value head description -/// - Returns: The value head description converted to SWValueHeadDesc -SWValueHeadDesc MetalProcess::valueHeadDescToSwift(const ValueHeadDesc * valueHead) { - +SWValueHeadDesc valueHeadDescToSwift(const ValueHeadDesc* valueHead) { SWConvLayerDesc v1Conv = convLayerDescToSwift(&valueHead->v1Conv); SWBatchNormLayerDesc v1BN = batchNormLayerDescToSwift(&valueHead->v1BN); ActivationKind v1Activation = activationLayerDescToSwift(&valueHead->v1Activation); @@ -288,136 +325,90 @@ SWValueHeadDesc MetalProcess::valueHeadDescToSwift(const ValueHeadDesc * valueHe SWMatBiasLayerDesc sv3Bias = matBiasLayerDescToSwift(&valueHead->sv3Bias); SWConvLayerDesc vOwnershipConv = convLayerDescToSwift(&valueHead->vOwnershipConv); - SWValueHeadDesc swDesc = createSWValueHeadDesc(valueHead->modelVersion, - v1Conv, - v1BN, - v1Activation, - v2Mul, - v2Bias, - v2Activation, - v3Mul, - v3Bias, - sv3Mul, - sv3Bias, 
- vOwnershipConv); - - return swDesc; -} - -SWModelDesc MetalProcess::modelDescToSwift(const ModelDesc* modelDesc) { - return createSWModelDesc(modelDesc->modelVersion, - swift::String(modelDesc->name), - modelDesc->numInputChannels, - modelDesc->numInputGlobalChannels, - modelDesc->numInputMetaChannels, - modelDesc->numValueChannels, - modelDesc->numScoreValueChannels, - modelDesc->numOwnershipChannels, - trunkDescToSwift(&modelDesc->trunk), - policyHeadDescToSwift(&modelDesc->policyHead), - valueHeadDescToSwift(&modelDesc->valueHead)); -} - -//--------------------------------------------------------------------------------------------------------- - -/** - * @brief This function initializes the global state of the NeuralNet class upon program startup. - * This function should be called only once upon program startup. It ensures that the global state - * of the NeuralNet class is properly initialized, enabling it to function correctly throughout - * the lifetime of the program. - * Note that this function does not take any input parameters or return any values. 
- */ + return createSWValueHeadDesc( + valueHead->modelVersion, + v1Conv, + v1BN, + v1Activation, + v2Mul, + v2Bias, + v2Activation, + v3Mul, + v3Bias, + sv3Mul, + sv3Bias, + vOwnershipConv); +} + +/// Convert a model description from C++ to Swift +SWModelDesc modelDescToSwift(const ModelDesc* modelDesc) { + return createSWModelDesc( + modelDesc->modelVersion, + swift::String(modelDesc->name), + modelDesc->numInputChannels, + modelDesc->numInputGlobalChannels, + modelDesc->numInputMetaChannels, + modelDesc->numValueChannels, + modelDesc->numScoreValueChannels, + modelDesc->numOwnershipChannels, + modelDesc->numPolicyChannels, + trunkDescToSwift(&modelDesc->trunk), + policyHeadDescToSwift(&modelDesc->policyHead), + valueHeadDescToSwift(&modelDesc->valueHead)); +} + +} // namespace MetalProcess + +//------------------------------------------------------------------------------ +// LoadedModel implementation +//------------------------------------------------------------------------------ + +LoadedModel::LoadedModel(const string& fileName, const string& expectedSha256) { + modelPath = fileName; + ModelDesc::loadFromFileMaybeGZipped(fileName, modelDesc, expectedSha256); +} + +//------------------------------------------------------------------------------ +// NeuralNet namespace - Global functions +//------------------------------------------------------------------------------ + void NeuralNet::globalInitialize() { - // Do nothing. + // No global initialization needed for Metal backend } -/** - * @brief This function cleans up the global state of the NeuralNet class at program termination. - * This function should be called once at program termination. It ensures that the global state of - * the NeuralNet class is properly cleaned up, freeing any resources that were allocated during the - * lifetime of the program. - * Note that this function does not take any input parameters or return any values. - */ void NeuralNet::globalCleanup() { - // Do nothing. 
-} - -/** - * @brief Loads a neural network model from a file. - * This function creates a LoadedModel object by loading a neural network model from a file specified by - * the `file` parameter and expected SHA-256 hash specified by the `expectedSha256` parameter. The LoadedModel - * object is returned as a pointer. - * @param file The name of the file containing the neural network model. - * @param expectedSha256 The expected SHA-256 hash of the model file. - * @return A pointer to the LoadedModel object created by loading the model file. - */ + // No cleanup needed - temp files are deleted immediately after loading +} + LoadedModel* NeuralNet::loadModelFile(const string& file, const string& expectedSha256) { LoadedModel* loadedModel = new LoadedModel(file, expectedSha256); return loadedModel; } -/** - * @brief Frees memory used by a LoadedModel object. - * This function deallocates memory used by a LoadedModel object specified by the `loadedModel` parameter. - * @param loadedModel A pointer to the LoadedModel object to deallocate memory for. - */ void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) { delete loadedModel; } -/** - * @brief Retrieves the model description associated with the loaded model. - * - * This function accesses the model description from a given LoadedModel instance. - * It returns a constant reference to the ModelDesc, which contains details - * about the structure and parameters of the neural network model. - * - * @param loadedModel Pointer to the LoadedModel instance from which to retrieve - * the model description. This should not be null. - * @return const ModelDesc& A constant reference to the model description of - * the loaded model. 
- */ const ModelDesc& NeuralNet::getModelDesc(const LoadedModel* loadedModel) { return loadedModel->modelDesc; } +//------------------------------------------------------------------------------ +// ComputeContext implementation //------------------------------------------------------------------------------ ComputeContext::ComputeContext(int nnX, int nnY, enabled_t useFP16Mode, enabled_t useNHWCMode): -metalComputeContext(createMetalComputeContext(nnX, nnY)) { +metalContext(createMetalComputeContext(nnX, nnY, useFP16Mode != enabled_t::False)) { this->useFP16Mode = useFP16Mode; - - SWEnable swUseFP16Mode = - (useFP16Mode == enabled_t::False) ? SWEnable::False() : - (useFP16Mode == enabled_t::True) ? SWEnable::True() : - SWEnable::Auto(); - - SWEnable swUseNHWCMode = - (useNHWCMode == enabled_t::False) ? SWEnable::False() : - (useNHWCMode == enabled_t::True) ? SWEnable::True() : - SWEnable::Auto(); + this->nnXLen = nnX; + this->nnYLen = nnY; + // Metal backend only supports NCHW layout (MPSGraph native format) + (void)useNHWCMode; } ComputeContext::~ComputeContext() { } -/** - * @brief Creates a ComputeContext object for computing neural network operations. - * This function creates a ComputeContext object by setting configuration settings for neural network computations, - * such as whether to use half-precision floating-point (FP16) mode and whether to use the NHWC format for input - * tensors. The ComputeContext object is returned as a pointer. - * @param gpuIdxs (Unused) A vector of GPU indices to use for computations. - * @param logger (Unused) A pointer to a Logger object to use for logging messages. - * @param nnXLen The width of the input tensor. - * @param nnYLen The height of the input tensor. - * @param openCLTunerFile (Unused) The name of a file containing OpenCL tuning parameters. - * @param homeDataDirOverride (Unused) A directory to use for storing data. 
- * @param openCLReTunePerBoardSize (Unused) Whether to re-tune OpenCL parameters for different board sizes. - * @param useFP16Mode Whether to use half-precision floating-point (FP16) mode for computations. - * @param useNHWCMode Whether to use the NHWC format for input tensors. - * @param loadedModel (Unused) A pointer to a LoadedModel object containing a loaded neural network model. - * @return A pointer to the ComputeContext object created. - */ ComputeContext* NeuralNet::createComputeContext( const vector& gpuIdxs, Logger* logger, @@ -440,29 +431,125 @@ ComputeContext* NeuralNet::createComputeContext( return new ComputeContext(nnXLen, nnYLen, useFP16Mode, useNHWCMode); } -/** - * @brief Frees memory used by a ComputeContext object. - * This function deallocates memory used by a ComputeContext object specified by the `computeContext` parameter. - * @param computeContext A pointer to the ComputeContext object to deallocate memory for. - */ void NeuralNet::freeComputeContext(ComputeContext* computeContext) { delete computeContext; } -//-------------------------------------------------------------- +//------------------------------------------------------------------------------ +// ComputeHandle implementation +//------------------------------------------------------------------------------ + +static mutex computeHandleMutex; -ComputeHandle::ComputeHandle(ComputeContext* context, - const LoadedModel* loadedModel, - bool inputsUseNHWC, - int gpuIdx, - int serverThreadIdx): -metalhandle(maybeCreateMetalComputeHandle((gpuIdx < 100), - serverThreadIdx, - MetalProcess::modelDescToSwift(&loadedModel->modelDesc), - context->metalComputeContext)) { +// Helper function to convert model and create CoreML-only compute handle (for mux ANE thread) +static swift::Optional convertAndCreateCoreMLOnlyHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + bool requireExactNNLen, + int maxBatchSize, + int serverThreadIdx +) { + auto metalContext = 
context->metalContext; + int nnXLen = metalContext.getNnXLen(); + int nnYLen = metalContext.getNnYLen(); + bool useFP16 = (context->useFP16Mode != enabled_t::False); + bool optimizeMask = requireExactNNLen; + + // Convert model to CoreML format in temp directory + string coremlModelPath = CoreMLConversion::convertModelToTemp( + loadedModel->modelPath, + nnXLen, + nnYLen, + useFP16, + optimizeMask, + maxBatchSize, + serverThreadIdx + ); + + // Create CoreML-only compute handle (CPU+ANE) + return createCoreMLComputeHandle( + swift::String(coremlModelPath), + serverThreadIdx, + requireExactNNLen, + loadedModel->modelDesc.numInputChannels, + loadedModel->modelDesc.numInputGlobalChannels, + loadedModel->modelDesc.numInputMetaChannels, + loadedModel->modelDesc.numPolicyChannels, + loadedModel->modelDesc.numValueChannels, + loadedModel->modelDesc.numScoreValueChannels, + loadedModel->modelDesc.numOwnershipChannels, + metalContext + ); +} + +// Helper function to create CoreML-only handle when gpuIdx == METAL_MUX_ANE +static swift::Optional createCoreMLOnlyHandleIfNeeded( + ComputeContext* context, + const LoadedModel* loadedModel, + bool requireExactNNLen, + int maxBatchSize, + int gpuIdx, + int serverThreadIdx +) { + if(gpuIdx != METAL_MUX_ANE) { + return swift::Optional::none(); + } + + if(context->useFP16Mode == enabled_t::False) { + cerr << "Metal backend " << serverThreadIdx << ": Warning: ANE mode with FP32 - " + << "CoreML FP32 runs on CPU only (no ANE acceleration) and is significantly slower. " + << "Consider using GPU mode (metalDeviceToUseThread=0) or setting metalUseFP16=true." 
<< endl; + } + + cerr << "Metal backend " << serverThreadIdx << ": Mux ANE mode - using CoreML (CPU+ANE)" << endl; + return convertAndCreateCoreMLOnlyHandle(context, loadedModel, requireExactNNLen, maxBatchSize, serverThreadIdx); +} + +// Helper function to create MPSGraph-only handle for all non-ANE modes +static swift::Optional createMPSGraphHandleIfNeeded( + ComputeContext* context, + const LoadedModel* loadedModel, + bool requireExactNNLen, + int maxBatchSize, + int gpuIdx, + int serverThreadIdx +) { + (void)maxBatchSize; // MPSGraph handles dynamic batches internally + + // Skip if this is an ANE thread - CoreML-only handle will be created instead + if(gpuIdx == METAL_MUX_ANE) { + return swift::Optional::none(); + } + + cerr << "Metal backend " << serverThreadIdx << ": GPU mode - using MPSGraph (GPU)" << endl; + + SWModelDesc swModelDesc = MetalProcess::modelDescToSwift(&loadedModel->modelDesc); + return createMPSGraphOnlyHandle( + swModelDesc, + serverThreadIdx, + requireExactNNLen, + context->metalContext + ); +} + +ComputeHandle::ComputeHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + bool inputsUseNHWC, + int gpuIdx, + int serverThreadIdx, + bool requireExactNNLen, + int maxBatchSize): +mpsGraphOnlyHandle(createMPSGraphHandleIfNeeded(context, loadedModel, requireExactNNLen, maxBatchSize, gpuIdx, serverThreadIdx)), +coremlOnlyHandle(createCoreMLOnlyHandleIfNeeded(context, loadedModel, requireExactNNLen, maxBatchSize, gpuIdx, serverThreadIdx)) { + bool hasMPSGraph = static_cast(mpsGraphOnlyHandle); + bool hasCoreML = static_cast(coremlOnlyHandle); + if(hasMPSGraph == hasCoreML) { + throw runtime_error("Metal backend: Logic error - expected exactly one compute handle, got " + string(hasMPSGraph && hasCoreML ? 
"both" : "neither") + " (gpuIdx=" + to_string(gpuIdx) + ")"); + } const ModelDesc* modelDesc = &loadedModel->modelDesc; - auto metalContext = context->metalComputeContext; + auto metalContext = context->metalContext; nnXLen = metalContext.getNnXLen(); nnYLen = metalContext.getNnYLen(); @@ -470,34 +557,13 @@ metalhandle(maybeCreateMetalComputeHandle((gpuIdx < 100), version = modelDesc->modelVersion; metaEncoderVersion = modelDesc->metaEncoderVersion; this->inputsUseNHWC = inputsUseNHWC; - - /* Use FP16 mode if the model supports it and the user has not explicitly - * disabled it. */ + this->requireExactNNLen = requireExactNNLen; useFP16 = (context->useFP16Mode != enabled_t::False); - - (void)serverThreadIdx; } ComputeHandle::~ComputeHandle() { } -static mutex computeHandleMutex; - -/** - * @brief Create a new ComputeHandle object for performing neural network computations. - * This function creates a new ComputeHandle object for performing neural network computations, - * using the specified parameters and settings. The object is allocated on the heap using the - * 'new' operator and returned as a pointer. - * @param context A pointer to the ComputeContext object to use for computation. - * @param loadedModel A pointer to the LoadedModel object containing the neural network model to use. - * @param logger A pointer to the Logger object to use for logging messages. - * @param maxBatchSize The maximum batch size to use for computation. - * @param requireExactNNLen Whether the neural network length must match the input data length exactly. - * @param inputsUseNHWC Whether the input data uses NHWC format. - * @param gpuIdxForThisThread The index of the GPU to use for computation. - * @param serverThreadIdx The index of the server thread to use for computation. - * @return A pointer to the newly-created ComputeHandle object. 
- */ ComputeHandle* NeuralNet::createComputeHandle( ComputeContext* context, const LoadedModel* loadedModel, @@ -509,63 +575,44 @@ ComputeHandle* NeuralNet::createComputeHandle( int serverThreadIdx) { (void)logger; - (void)maxBatchSize; - // Current implementation always tolerates excess nn len - (void)requireExactNNLen; - // Transfer the default GPU index into physical GPU index 0 int gpuIdx = (gpuIdxForThisThread == -1) ? 0 : gpuIdxForThisThread; + if(gpuIdx != METAL_MUX_GPU && gpuIdx != METAL_MUX_ANE) { + cerr << "Metal backend: Warning: Unrecognized gpuIdx=" << gpuIdx + << ", valid values are " << METAL_MUX_GPU << " (GPU) and " << METAL_MUX_ANE << " (ANE)" + << ". Defaulting to GPU mode." << endl; + gpuIdx = METAL_MUX_GPU; + } ComputeHandle* handle = nullptr; { lock_guard lock(computeHandleMutex); - handle = new ComputeHandle(context, loadedModel, inputsUseNHWC, gpuIdx, serverThreadIdx); + handle = new ComputeHandle(context, loadedModel, inputsUseNHWC, gpuIdx, serverThreadIdx, requireExactNNLen, maxBatchSize); } return handle; } -/** - * @brief Free the memory used by a ComputeHandle object. - * This function frees the memory used by the specified ComputeHandle object, which was - * previously allocated on the heap using the 'new' operator. - * @param handle A pointer to the ComputeHandle object to free. - */ void NeuralNet::freeComputeHandle(ComputeHandle* handle) { delete handle; } -/** - * @brief Check whether a ComputeHandle object is using 16-bit floating-point precision. - * This function checks whether the specified ComputeHandle object is using 16-bit floating-point - * precision for computation, and returns a boolean value indicating the result. - * @param handle A pointer to the ComputeHandle object to check. - * @return True if the ComputeHandle object is using 16-bit floating-point precision, false otherwise. 
- */ bool NeuralNet::isUsingFP16(const ComputeHandle* handle) { return handle->useFP16; } +//------------------------------------------------------------------------------ +// Device information //------------------------------------------------------------------------------ -/** - * @brief Print information about the available devices. - */ void NeuralNet::printDevices() { printMetalDevices(); } -//-------------------------------------------------------------- +//------------------------------------------------------------------------------ +// InputBuffers implementation +//------------------------------------------------------------------------------ -/** - * @brief Construct a new InputBuffers object for storing input data for neural network computation. - * This constructor initializes a new InputBuffers object for storing input data for neural network - * computation, based on the specified parameters and settings. - * @param loadedModel A pointer to the LoadedModel object containing the neural network model to use. - * @param maxBatchSz The maximum batch size to use for computation. - * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. 
- */ InputBuffers::InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) { const ModelDesc& m = loadedModel->modelDesc; @@ -587,6 +634,7 @@ InputBuffers::InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int n singleOwnershipResultElts = (size_t)m.numOwnershipChannels * nnXLen * nnYLen; singleOwnerMapElts = (size_t)m.numOwnershipChannels * nnXLen * nnYLen; singleScoreValuesResultElts = (size_t)m.numScoreValueChannels; + singleMaskElts = (size_t)nnXLen * nnYLen; assert(NNModelVersion::getNumSpatialFeatures(m.modelVersion) == m.numInputChannels); assert(NNModelVersion::getNumGlobalFeatures(m.modelVersion) == m.numInputGlobalChannels); @@ -603,10 +651,10 @@ InputBuffers::InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int n ownershipResultBufferElts = (size_t)maxBatchSize * singleOwnershipResultElts; ownerMapBufferElts = (size_t)maxBatchSz * singleOwnerMapElts; scoreValuesResultBufferElts = (size_t)maxBatchSize * singleScoreValuesResultElts; + userInputMaskBufferElts = (size_t)maxBatchSize * singleMaskElts; rowSpatialBuffer = new float[rowSpatialBufferElts]; userInputBuffer = new float[userInputBufferElts]; - // Zero out the input buffer for arbitrary board sizes memset(&userInputBuffer[0], 0, userInputBufferElts * sizeof(userInputBuffer[0])); userInputGlobalBuffer = new float[userInputGlobalBufferElts]; @@ -618,13 +666,10 @@ InputBuffers::InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int n ownershipResults = new float[ownershipResultBufferElts]; ownerMapBuffer = new float[ownerMapBufferElts]; scoreValuesResults = new float[scoreValuesResultBufferElts]; + userInputMaskBuffer = new float[userInputMaskBufferElts]; + memset(&userInputMaskBuffer[0], 0, userInputMaskBufferElts * sizeof(userInputMaskBuffer[0])); } -/** - * @brief Destroy the InputBuffers object and free all associated memory. 
- * This destructor destroys the InputBuffers object and frees all memory associated with it, - * including all input and output buffers used for neural network computation. - */ InputBuffers::~InputBuffers() { delete[] rowSpatialBuffer; delete[] userInputBuffer; @@ -637,48 +682,25 @@ InputBuffers::~InputBuffers() { delete[] ownershipResults; delete[] ownerMapBuffer; delete[] scoreValuesResults; + delete[] userInputMaskBuffer; } -/** - * @brief Create a new InputBuffers object for storing input data for neural network computation. - * This function creates a new InputBuffers object for storing input data for neural network computation, - * using the specified parameters and settings. The object is allocated on the heap using the 'new' operator - * and returned as a pointer. - * @param loadedModel A pointer to the LoadedModel object containing the neural network model to use. - * @param maxBatchSize The maximum batch size to use for computation. - * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. - * @return A pointer to the newly-created InputBuffers object. - */ InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) { return new InputBuffers(loadedModel, maxBatchSize, nnXLen, nnYLen); } -/** - * @brief Free the memory used by an InputBuffers object. - * This function frees the memory used by the specified InputBuffers object, which was - * previously allocated on the heap using the 'new' operator. - * @param inputBuffers A pointer to the InputBuffers object to free. 
- */ void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) { delete inputBuffers; } -//-------------------------------------------------------------- +//------------------------------------------------------------------------------ +// MetalProcess namespace - Helper functions +//------------------------------------------------------------------------------ void MetalProcess::copyRowData(float* dest, const float* src, size_t numElements) { copy(src, src + numElements, dest); } -/** - * @brief Convert input data from NHWC format to NCHW format in-place if necessary. - * - * @param rowSpatialInput Pointer to the input data (single batch element assumed). - * @param C Number of channels. - * @param H Height. - * @param W Width. - * @param inputsUseNHWC Flag indicating if the input data is currently in NHWC format. - */ void MetalProcess::convertNCHW( float* rowSpatialInput, const int C, @@ -766,6 +788,11 @@ void MetalProcess::processRowData(size_t row, ComputeHandle* gpuHandle, InputBuf nnYLen, nnXLen, gpuHandle->inputsUseNHWC); + + // Copy first channel of spatial input (mask) to dedicated mask buffer + // After NCHW conversion, the first nnXLen*nnYLen elements are the mask channel + float* rowMaskInput = &inputBuffers->userInputMaskBuffer[inputBuffers->singleMaskElts * row]; + copy(rowSpatialInput, rowSpatialInput + inputBuffers->singleMaskElts, rowMaskInput); } float MetalProcess::policyOptimismCalc(const double policyOptimism, const float p, const float pOpt) { @@ -782,7 +809,7 @@ void MetalProcess::processOptimism( float* targetBuffer = &buffers.policyProbsBuffer[row * singlePolicyResultElts]; float* policyOutputBuf = &buffers.policyResults[row * singlePolicyResultElts * buffers.policyResultChannels]; - for(auto i = 0; i < singlePolicyResultElts; ++i) { + for(size_t i = 0; i < singlePolicyResultElts; ++i) { const float p = policyOutputBuf[i]; const float pOpt = policyOutputBuf[i + singlePolicyResultElts]; targetBuffer[i] = 
MetalProcess::policyOptimismCalc(policyOptimism, p, pOpt); @@ -801,7 +828,6 @@ void MetalProcess::processPolicy( size_t row) { auto& buffers = *inputBuffers; float* targetBuffer = &buffers.policyResults[row * buffers.singlePolicyResultElts * buffers.policyResultChannels]; - const auto symmetry = inputBuf->symmetry; const auto policyOptimism = inputBuf->policyOptimism; if(buffers.policyResultChannels == 1) { @@ -813,7 +839,7 @@ void MetalProcess::processPolicy( } SymmetryHelpers::copyOutputsWithSymmetry( - targetBuffer, currentOutput->policyProbs, 1, gpuHandle->nnYLen, gpuHandle->nnXLen, symmetry); + targetBuffer, currentOutput->policyProbs, 1, gpuHandle->nnYLen, gpuHandle->nnXLen, inputBuf->symmetry); } void MetalProcess::processValue( @@ -839,7 +865,6 @@ void MetalProcess::processOwnership( const size_t singleOwnershipResultElts = inputBuffers->singleOwnershipResultElts; const size_t ownershipOutputBufOffset = row * singleOwnershipResultElts; - // Copy ownership results with symmetry if available if(currentOutput->whiteOwnerMap != nullptr) { const float* ownershipOutputBuf = &inputBuffers->ownershipResults[ownershipOutputBufOffset]; SymmetryHelpers::copyOutputsWithSymmetry( @@ -890,7 +915,6 @@ void MetalProcess::processScoreValues( size_t numScoreValueChannels = inputBuffers->singleScoreValuesResultElts; assert(numScoreValueChannels == 1); currentOutput->whiteScoreMean = currentScoreValueData[0]; - //Version 3 neural nets don't have any second moment currentOutput, implicitly already folding it in, so we just use the mean squared currentOutput->whiteScoreMeanSq = currentOutput->whiteScoreMean * currentOutput->whiteScoreMean; currentOutput->whiteLead = currentOutput->whiteScoreMean; currentOutput->varTimeLeft = 0; @@ -914,16 +938,6 @@ void MetalProcess::processRow( MetalProcess::processScoreValues(inputBuffers, currentOutput, gpuHandle->version, row); } -/** - * @brief Compute the neural network output using Metal API and the specified input data and GPU handle. 
- * This function computes the neural network output using the Metal API and the specified input data and ComputeHandle - * object for GPU acceleration. The computed output is stored in the specified vector of NNOutput pointers. - * @param gpuHandle A pointer to the ComputeHandle object to use for GPU computation. - * @param inputBuffers A pointer to the InputBuffers object containing the input data for computation. - * @param numBatchEltsFilled The number of batch elements filled in the input buffer. - * @param inputBufs An array of pointers to NNResultBuf objects containing the neural network input data. - * @param outputs A vector of NNOutput pointers to store the computed output. - */ void MetalProcess::getMetalOutput( ComputeHandle* gpuHandle, InputBuffers* inputBuffers, @@ -935,47 +949,54 @@ void MetalProcess::getMetalOutput( int batchSize = numBatchEltsFilled; assert(batchSize <= inputBuffers->maxBatchSize); - assert((NNModelVersion::getNumSpatialFeatures(gpuHandle->version) * gpuHandle->nnXLen * gpuHandle->nnYLen) <= inputBuffers->singleInputElts); - assert(NNModelVersion::getNumGlobalFeatures(gpuHandle->version) == inputBuffers->singleInputGlobalElts); + assert((NNModelVersion::getNumSpatialFeatures(gpuHandle->version) * gpuHandle->nnXLen * gpuHandle->nnYLen) <= (int)inputBuffers->singleInputElts); + assert(NNModelVersion::getNumGlobalFeatures(gpuHandle->version) == (int)inputBuffers->singleInputGlobalElts); if(gpuHandle->metaEncoderVersion > 0) { - assert(SGFMetadata::METADATA_INPUT_NUM_CHANNELS == inputBuffers->singleInputMetaElts); + assert(SGFMetadata::METADATA_INPUT_NUM_CHANNELS == (int)inputBuffers->singleInputMetaElts); } assert(inputBuffers->singleValueResultElts == 3); - for(size_t row = 0; row < batchSize; row++) { + for(int row = 0; row < batchSize; row++) { MetalProcess::processRowData(row, gpuHandle, inputBuffers, inputBufs); } - auto metalHandle = gpuHandle->metalhandle; - assert(metalHandle); - - 
metalHandle.get().apply(inputBuffers->userInputBuffer, - inputBuffers->userInputGlobalBuffer, - inputBuffers->userInputMetaBuffer, - inputBuffers->policyResults, - inputBuffers->policyPassResults, - inputBuffers->valueResults, - inputBuffers->scoreValuesResults, - inputBuffers->ownershipResults, - batchSize); + // Dispatch to appropriate handle based on mode + if(gpuHandle->coremlOnlyHandle) { + // ANE mode: Use CoreML (CPU+ANE) + gpuHandle->coremlOnlyHandle.get().apply( + inputBuffers->userInputBuffer, + inputBuffers->userInputGlobalBuffer, + inputBuffers->userInputMetaBuffer, + inputBuffers->userInputMaskBuffer, + inputBuffers->policyResults, + inputBuffers->policyPassResults, + inputBuffers->valueResults, + inputBuffers->scoreValuesResults, + inputBuffers->ownershipResults, + batchSize); + } else if(gpuHandle->mpsGraphOnlyHandle) { + // GPU mode: Use MPSGraph (GPU) + gpuHandle->mpsGraphOnlyHandle.get().apply( + inputBuffers->userInputBuffer, + inputBuffers->userInputGlobalBuffer, + inputBuffers->userInputMetaBuffer, + inputBuffers->policyResults, + inputBuffers->policyPassResults, + inputBuffers->valueResults, + inputBuffers->scoreValuesResults, + inputBuffers->ownershipResults, + batchSize); + } else { + throw runtime_error("Metal backend: No valid compute handle available"); + } - for(size_t row = 0; row < batchSize; row++) { + for(int row = 0; row < batchSize; row++) { MetalProcess::processRow(row, gpuHandle, inputBuffers, inputBufs, outputs); } } -/** - * @brief Compute the neural network output using the specified input data and GPU handle. - * This function computes the neural network output using the specified input data and ComputeHandle object - * for GPU acceleration. The computed output is stored in the specified vector of NNOutput pointers. - * @param gpuHandle A pointer to the ComputeHandle object to use for GPU computation. - * @param inputBuffers A pointer to the InputBuffers object containing the input data for computation. 
- * @param numBatchEltsFilled The number of batch elements filled in the input buffer. - * @param inputBufs An array of pointers to NNResultBuf objects containing the neural network input data. - * @param outputs A vector of NNOutput pointers to store the computed output. - */ void NeuralNet::getOutput( ComputeHandle* gpuHandle, InputBuffers* inputBuffers, @@ -986,41 +1007,254 @@ void NeuralNet::getOutput( MetalProcess::getMetalOutput(gpuHandle, inputBuffers, numBatchEltsFilled, inputBufs, outputs); } -bool MetalProcess::testEvaluateConv(const ConvLayerDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - vector& outputBuffer) { +//------------------------------------------------------------------------------ +// Test functions - Metal backend uses NCHW layout (not NHWC) +//------------------------------------------------------------------------------ + +namespace MetalProcess { + +// Helper function to compute merged scale and bias from raw values +// This is needed because test descriptors are created manually without computing merged values +static void computeMergedBatchNormValues( + const BatchNormLayerDesc* desc, + vector& mergedScale, + vector& mergedBias) { + + int numChannels = desc->numChannels; + mergedScale.resize(numChannels); + mergedBias.resize(numChannels); + + // If merged values are already computed, use them + if(!desc->mergedScale.empty() && !desc->mergedBias.empty()) { + mergedScale = desc->mergedScale; + mergedBias = desc->mergedBias; + return; + } + + // Otherwise compute from raw values: mergedScale = scale / sqrt(variance + epsilon) + // mergedBias = bias - mergedScale * mean + // Note: Use scale/bias values from vectors if available, regardless of hasScale/hasBias flags + // This matches how desc.cpp computes merged values during model loading + for(int c = 0; c < numChannels; c++) { + float scale = c < (int)desc->scale.size() ? desc->scale[c] : 1.0f; + float bias = c < (int)desc->bias.size() ? 
desc->bias[c] : 0.0f; + float mean = c < (int)desc->mean.size() ? desc->mean[c] : 0.0f; + float variance = c < (int)desc->variance.size() ? desc->variance[c] : 1.0f; + float epsilon = desc->epsilon; + + mergedScale[c] = scale / sqrt(variance + epsilon); + mergedBias[c] = bias - mergedScale[c] * mean; + } +} + +// Helper to convert BatchNormLayerDesc to Swift with computed merged values +static SWBatchNormLayerDesc batchNormLayerDescToSwiftWithMerge( + const BatchNormLayerDesc* desc, + vector& mergedScaleStorage, + vector& mergedBiasStorage) { + + computeMergedBatchNormValues(desc, mergedScaleStorage, mergedBiasStorage); + + return createSWBatchNormLayerDesc( + desc->numChannels, + mergedScaleStorage.data(), + mergedBiasStorage.data()); +} + +// Helper to convert ResidualBlockDesc to Swift with computed merged values +static SWResidualBlockDesc residualBlockDescToSwiftWithMerge( + const ResidualBlockDesc* desc, + vector& mergedScalePreBN, + vector& mergedBiasPreBN, + vector& mergedScaleMidBN, + vector& mergedBiasMidBN) { + + computeMergedBatchNormValues(&desc->preBN, mergedScalePreBN, mergedBiasPreBN); + computeMergedBatchNormValues(&desc->midBN, mergedScaleMidBN, mergedBiasMidBN); + + SWBatchNormLayerDesc preBN = createSWBatchNormLayerDesc( + desc->preBN.numChannels, + mergedScalePreBN.data(), + mergedBiasPreBN.data()); + + ActivationKind preActivationKind = MetalProcess::activationLayerDescToSwift(&desc->preActivation); + SWConvLayerDesc regularConv = MetalProcess::convLayerDescToSwift(&desc->regularConv); + + SWBatchNormLayerDesc midBN = createSWBatchNormLayerDesc( + desc->midBN.numChannels, + mergedScaleMidBN.data(), + mergedBiasMidBN.data()); + + ActivationKind midActivationKind = MetalProcess::activationLayerDescToSwift(&desc->midActivation); + SWConvLayerDesc finalConv = MetalProcess::convLayerDescToSwift(&desc->finalConv); + + return createSWResidualBlockDesc( + preBN, + preActivationKind, + regularConv, + midBN, + midActivationKind, + finalConv); +} + +// 
Helper to convert GlobalPoolingResidualBlockDesc to Swift with computed merged values +static SWGlobalPoolingResidualBlockDesc globalPoolingResidualBlockDescToSwiftWithMerge( + const GlobalPoolingResidualBlockDesc* desc, + vector& mergedScalePreBN, + vector& mergedBiasPreBN, + vector& mergedScaleMidBN, + vector& mergedBiasMidBN, + vector& mergedScaleGpoolBN, + vector& mergedBiasGpoolBN) { + + computeMergedBatchNormValues(&desc->preBN, mergedScalePreBN, mergedBiasPreBN); + computeMergedBatchNormValues(&desc->gpoolBN, mergedScaleGpoolBN, mergedBiasGpoolBN); + computeMergedBatchNormValues(&desc->midBN, mergedScaleMidBN, mergedBiasMidBN); + + SWBatchNormLayerDesc preBN = createSWBatchNormLayerDesc( + desc->preBN.numChannels, + mergedScalePreBN.data(), + mergedBiasPreBN.data()); + + ActivationKind preActivationKind = MetalProcess::activationLayerDescToSwift(&desc->preActivation); + SWConvLayerDesc regularConv = MetalProcess::convLayerDescToSwift(&desc->regularConv); + SWConvLayerDesc gpoolConv = MetalProcess::convLayerDescToSwift(&desc->gpoolConv); + + SWBatchNormLayerDesc gpoolBN = createSWBatchNormLayerDesc( + desc->gpoolBN.numChannels, + mergedScaleGpoolBN.data(), + mergedBiasGpoolBN.data()); + + ActivationKind gpoolActivationKind = MetalProcess::activationLayerDescToSwift(&desc->gpoolActivation); + SWMatMulLayerDesc gpoolToBiasMul = MetalProcess::matMulLayerDescToSwift(&desc->gpoolToBiasMul); + + SWBatchNormLayerDesc midBN = createSWBatchNormLayerDesc( + desc->midBN.numChannels, + mergedScaleMidBN.data(), + mergedBiasMidBN.data()); + + ActivationKind midActivationKind = MetalProcess::activationLayerDescToSwift(&desc->midActivation); + SWConvLayerDesc finalConv = MetalProcess::convLayerDescToSwift(&desc->finalConv); + + return createSWGlobalPoolingResidualBlockDesc( + preBN, + preActivationKind, + regularConv, + gpoolConv, + gpoolBN, + gpoolActivationKind, + gpoolToBiasMul, + midBN, + midActivationKind, + finalConv); +} + +bool testEvaluateConv( + const 
ConvLayerDesc* desc, + int batchSize, + int nnXLen, + int nnYLen, + const vector& inputBuffer, + vector& outputBuffer) { + + SWConvLayerDesc swDesc = MetalProcess::convLayerDescToSwift(desc); size_t numOutputFloats = (size_t)batchSize * nnXLen * nnYLen * desc->outChannels; outputBuffer.resize(numOutputFloats); - testConvLayer(convLayerDescToSwift(desc), - nnXLen, - nnYLen, - batchSize, - (float*)inputBuffer.data(), - (float*)outputBuffer.data()); - - return true; -} - -/** - * @brief Evaluate a convolutional layer using Metal API for testing purposes. - * This function evaluates a convolutional layer using the Metal API for testing purposes. - * The input buffer and output buffer are specified as vectors of floats, and the result of the computation - * is stored in the output buffer. The function returns true if the evaluation is implemented. - * @param desc A pointer to the ConvLayerDesc object describing the convolutional layer to evaluate. - * @param batchSize The batch size to use for computation. - * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. - * @param useFP16 A boolean indicating whether to use half-precision floating point format for computation. - * @param useNHWC A boolean indicating whether to use NHWC layout for input and output buffers. - * @param inputBuffer A vector of floats containing the input buffer data. - * @param outputBuffer A vector of floats to store the computed output. - * @return true if the convolutional layer evaluation is implemented, false otherwise. 
- */ + return testConvLayer( + swDesc, + batchSize, + nnXLen, + nnYLen, + (float*)inputBuffer.data(), + outputBuffer.data()); +} + +bool testEvaluateBatchNorm( + const BatchNormLayerDesc* desc, + int batchSize, + int nnXLen, + int nnYLen, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer) { + + vector mergedScaleStorage; + vector mergedBiasStorage; + SWBatchNormLayerDesc swDesc = batchNormLayerDescToSwiftWithMerge(desc, mergedScaleStorage, mergedBiasStorage); + + size_t numOutputFloats = (size_t)batchSize * nnXLen * nnYLen * desc->numChannels; + outputBuffer.resize(numOutputFloats); + + return testBatchNormLayer( + swDesc, + batchSize, + nnXLen, + nnYLen, + (float*)inputBuffer.data(), + (float*)maskBuffer.data(), + outputBuffer.data()); +} + +bool testEvaluateResidualBlock( + const ResidualBlockDesc* desc, + int batchSize, + int nnXLen, + int nnYLen, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer) { + + vector mergedScalePreBN, mergedBiasPreBN; + vector mergedScaleMidBN, mergedBiasMidBN; + SWResidualBlockDesc swDesc = residualBlockDescToSwiftWithMerge( + desc, mergedScalePreBN, mergedBiasPreBN, mergedScaleMidBN, mergedBiasMidBN); + + size_t numOutputFloats = (size_t)batchSize * nnXLen * nnYLen * desc->preBN.numChannels; + outputBuffer.resize(numOutputFloats); + + return testResidualBlock( + swDesc, + batchSize, + nnXLen, + nnYLen, + (float*)inputBuffer.data(), + (float*)maskBuffer.data(), + outputBuffer.data()); +} + +bool testEvaluateGlobalPoolingResidualBlock( + const GlobalPoolingResidualBlockDesc* desc, + int batchSize, + int nnXLen, + int nnYLen, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer) { + + vector mergedScalePreBN, mergedBiasPreBN; + vector mergedScaleMidBN, mergedBiasMidBN; + vector mergedScaleGpoolBN, mergedBiasGpoolBN; + SWGlobalPoolingResidualBlockDesc swDesc = globalPoolingResidualBlockDescToSwiftWithMerge( + desc, mergedScalePreBN, mergedBiasPreBN, 
mergedScaleMidBN, mergedBiasMidBN, + mergedScaleGpoolBN, mergedBiasGpoolBN); + + size_t numOutputFloats = (size_t)batchSize * nnXLen * nnYLen * desc->preBN.numChannels; + outputBuffer.resize(numOutputFloats); + + return testGlobalPoolingResidualBlock( + swDesc, + batchSize, + nnXLen, + nnYLen, + (float*)inputBuffer.data(), + (float*)maskBuffer.data(), + outputBuffer.data()); +} + +} // namespace MetalProcess + bool NeuralNet::testEvaluateConv( const ConvLayerDesc* desc, int batchSize, @@ -1031,49 +1265,16 @@ bool NeuralNet::testEvaluateConv( const vector& inputBuffer, vector& outputBuffer) { + // Metal backend only supports NCHW layout + if(useNHWC) + return false; + + // useFP16 is ignored - MPSGraph tests use FP32 (void)useFP16; - (void)useNHWC; + return MetalProcess::testEvaluateConv(desc, batchSize, nnXLen, nnYLen, inputBuffer, outputBuffer); } -bool MetalProcess::testEvaluateBatchNorm(const BatchNormLayerDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, - vector& outputBuffer) { - - size_t numOutputFloats = (size_t)batchSize * nnXLen * nnYLen * desc->numChannels; - outputBuffer.resize(numOutputFloats); - - testBatchNormLayer(batchNormLayerDescToSwift(desc), - nnXLen, - nnYLen, - batchSize, - (float*)inputBuffer.data(), - (float*)maskBuffer.data(), - (float*)outputBuffer.data()); - - return true; -} - -/** - * @brief Evaluate a batch normalization layer using Metal API for testing purposes. - * This function evaluates a batch normalization layer using the Metal API for testing purposes. - * The input buffer and output buffer are specified as vectors of floats, and the result of the computation - * is stored in the output buffer. The function returns true if the evaluation is implemented. - * @param desc A pointer to the BatchNormLayerDesc object describing the batch normalization layer to evaluate. - * @param batchSize The batch size to use for computation. 
- * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. - * @param useFP16 A boolean indicating whether to use half-precision floating point format for computation. - * @param useNHWC A boolean indicating whether to use NHWC layout for input and output buffers. - * @param inputBuffer A vector of floats containing the input buffer data. - * @param maskBuffer A vector of floats containing the mask buffer data. Mask should be in 'NHW' format (no "C" channel). - * @param outputBuffer A vector of floats to store the computed output. - * @return true if the batch normalization layer evaluation is implemented, false otherwise. - */ bool NeuralNet::testEvaluateBatchNorm( const BatchNormLayerDesc* desc, int batchSize, @@ -1085,49 +1286,16 @@ bool NeuralNet::testEvaluateBatchNorm( const vector& maskBuffer, vector& outputBuffer) { + // Metal backend only supports NCHW layout + if(useNHWC) + return false; + + // useFP16 is ignored - MPSGraph tests use FP32 (void)useFP16; - (void)useNHWC; + return MetalProcess::testEvaluateBatchNorm(desc, batchSize, nnXLen, nnYLen, inputBuffer, maskBuffer, outputBuffer); } -bool MetalProcess::testEvaluateResidualBlock(const ResidualBlockDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, - vector& outputBuffer) { - - size_t numTrunkFloats = (size_t)batchSize * nnXLen * nnYLen * desc->preBN.numChannels; - outputBuffer.resize(numTrunkFloats); - - testResidualBlock(residualBlockDescToSwift(desc), - batchSize, - nnXLen, - nnYLen, - (float*)inputBuffer.data(), - (float*)maskBuffer.data(), - (float*)outputBuffer.data()); - - return true; -} - -/** - * @brief Evaluate a residual block using Metal API for testing purposes. - * This function evaluates a residual block using the Metal API for testing purposes. 
- * The input buffer and output buffer are specified as vectors of floats, and the result of the computation - * is stored in the output buffer. The function returns true if the evaluation is implemented. - * @param desc A pointer to the ResidualBlockDesc object describing the residual block to evaluate. - * @param batchSize The batch size to use for computation. - * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. - * @param useFP16 A boolean indicating whether to use half-precision floating point format for computation. - * @param useNHWC A boolean indicating whether to use NHWC layout for input and output buffers. - * @param inputBuffer A vector of floats containing the input buffer data. - * @param maskBuffer A vector of floats containing the mask buffer data. - * @param outputBuffer A vector of floats to store the computed output. - * @return true if the residual block evaluation is implemented, false otherwise. 
- */ bool NeuralNet::testEvaluateResidualBlock( const ResidualBlockDesc* desc, int batchSize, @@ -1139,50 +1307,16 @@ bool NeuralNet::testEvaluateResidualBlock( const vector& maskBuffer, vector& outputBuffer) { + // Metal backend only supports NCHW layout + if(useNHWC) + return false; + + // useFP16 is ignored - MPSGraph tests use FP32 (void)useFP16; - (void)useNHWC; + return MetalProcess::testEvaluateResidualBlock(desc, batchSize, nnXLen, nnYLen, inputBuffer, maskBuffer, outputBuffer); } -bool MetalProcess::testEvaluateGlobalPoolingResidualBlock(const GlobalPoolingResidualBlockDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, - vector& outputBuffer) { - - size_t numTrunkFloats = (size_t)batchSize * nnXLen * nnYLen * desc->preBN.numChannels; - outputBuffer.resize(numTrunkFloats); - - testGlobalPoolingResidualBlock(globalPoolingResidualBlockDescToSwift(desc), - batchSize, - nnXLen, - nnYLen, - (float*)inputBuffer.data(), - (float*)maskBuffer.data(), - (float*)outputBuffer.data()); - - return true; -} - -/** - * @brief Evaluate a global pooling residual block using Metal API for testing purposes. - * This function evaluates a global pooling residual block using the Metal API for testing purposes. - * The input buffer and output buffer are specified as vectors of floats, and the result of the computation - * is stored in the output buffer. The function returns true if the evaluation is implemented. - * @param desc A pointer to the GlobalPoolingResidualBlockDesc object describing the global pooling residual block to - * evaluate. - * @param batchSize The batch size to use for computation. - * @param nnXLen The x length of the neural network computation context. - * @param nnYLen The y length of the neural network computation context. - * @param useFP16 A boolean indicating whether to use half-precision floating point format for computation. 
- * @param useNHWC A boolean indicating whether to use NHWC layout for input and output buffers. - * @param inputBuffer A vector of floats containing the input buffer data. - * @param maskBuffer A vector of floats containing the mask buffer data. - * @param outputBuffer A vector of floats to store the computed output. - * @return true if the global pooling residual block evaluation is implemented, false otherwise. - */ bool NeuralNet::testEvaluateGlobalPoolingResidualBlock( const GlobalPoolingResidualBlockDesc* desc, int batchSize, @@ -1194,9 +1328,14 @@ bool NeuralNet::testEvaluateGlobalPoolingResidualBlock( const vector& maskBuffer, vector& outputBuffer) { + // Metal backend only supports NCHW layout + if(useNHWC) + return false; + + // useFP16 is ignored - MPSGraph tests use FP32 (void)useFP16; - (void)useNHWC; + return MetalProcess::testEvaluateGlobalPoolingResidualBlock(desc, batchSize, nnXLen, nnYLen, inputBuffer, maskBuffer, outputBuffer); } -#endif // USE_METAL_BACKEND +#endif // USE_METAL_BACKEND diff --git a/cpp/neuralnet/metalbackend.h b/cpp/neuralnet/metalbackend.h index 34e44b8e7..0bf26f41b 100644 --- a/cpp/neuralnet/metalbackend.h +++ b/cpp/neuralnet/metalbackend.h @@ -1,4 +1,7 @@ -#pragma once +#ifndef NEURALNET_METALBACKEND_H_ +#define NEURALNET_METALBACKEND_H_ + +#ifdef USE_METAL_BACKEND #include #include "desc.h" @@ -12,52 +15,13 @@ using namespace std; using namespace KataGoSwift; +// Backend mode constants for multiplexer architecture. +// When used as gpuIdx, these select a specific backend for that server thread. +// Default gpuIdx=-1 maps to 0 (GPU-only) in createComputeHandle. 
+static constexpr int METAL_MUX_GPU = 0; // MPSGraph-only (GPU) - default +static constexpr int METAL_MUX_ANE = 100; // CoreML-only (CPU+ANE) + namespace MetalProcess { -SWConvLayerDesc convLayerDescToSwift(const ConvLayerDesc * desc); -SWBatchNormLayerDesc batchNormLayerDescToSwift(const BatchNormLayerDesc * desc); -ActivationKind activationLayerDescToSwift(const ActivationLayerDesc * desc); -SWResidualBlockDesc residualBlockDescToSwift(const ResidualBlockDesc * desc); -SWMatMulLayerDesc matMulLayerDescToSwift(const MatMulLayerDesc * desc); -SWGlobalPoolingResidualBlockDesc globalPoolingResidualBlockDescToSwift(const GlobalPoolingResidualBlockDesc* desc); -swift::Array residualBlocksToSwift(const vector>& blocks); -SWNestedBottleneckResidualBlockDesc nestedBottleneckResidualBlockDescToSwift(const NestedBottleneckResidualBlockDesc* desc); -swift::Optional sGFMetadataEncoderDescToSwift(const SGFMetadataEncoderDesc * desc); -SWTrunkDesc trunkDescToSwift(const TrunkDesc * trunk); -SWPolicyHeadDesc policyHeadDescToSwift(const PolicyHeadDesc * policyHead); -SWMatBiasLayerDesc matBiasLayerDescToSwift(const MatBiasLayerDesc * desc); -SWValueHeadDesc valueHeadDescToSwift(const ValueHeadDesc * valueHead); -SWModelDesc modelDescToSwift(const ModelDesc* modelDesc); - -bool testEvaluateConv(const ConvLayerDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - vector& outputBuffer); - -bool testEvaluateBatchNorm(const BatchNormLayerDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, - vector& outputBuffer); - -bool testEvaluateResidualBlock(const ResidualBlockDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, - vector& outputBuffer); - -bool testEvaluateGlobalPoolingResidualBlock(const GlobalPoolingResidualBlockDesc* desc, - int batchSize, - int nnXLen, - int nnYLen, - const vector& inputBuffer, - const vector& maskBuffer, 
- vector& outputBuffer); void copyRowData(float* dest, const float* src, size_t numElements); void convertNCHW(float* rowSpatialInput, int C, int H, int W, bool inputsUseNHWC); @@ -93,59 +57,40 @@ void getMetalOutput(ComputeHandle* gpuHandle, int numBatchEltsFilled, NNResultBuf** inputBufs, vector& outputs); -}; +} /** * @brief Represents a loaded neural network model. * A LoadedModel object contains a ModelDesc object that describes the characteristics of the loaded model. - * The default constructor, copy constructor, and assignment operator are deleted to prevent - * creation of an uninitialized LoadedModel object, copying of the loaded model, and potential memory leaks. + * For Metal backend, we also store the model path for on-demand conversion. */ struct LoadedModel { /** * @brief The description of the loaded model. - * The modelDesc field is a ModelDesc object that describes the characteristics of the loaded model. */ ModelDesc modelDesc; + /** + * @brief Path to the original .bin.gz model file for conversion. + */ + string modelPath; + /** * @brief Construct a new Loaded Model object - * This constructor loads a machine learning model from a file and sets the modelDesc field to the - * characteristics of the loaded model. + * This constructor loads a machine learning model from a file and sets the modelDesc field. * @param fileName The name of the file containing the machine learning model. * @param expectedSha256 The expected SHA-256 hash of the model file. */ - LoadedModel(const string& fileName, const string& expectedSha256) - { - ModelDesc::loadFromFileMaybeGZipped(fileName, modelDesc, expectedSha256); - } + LoadedModel(const string& fileName, const string& expectedSha256); - /** - * @brief Delete the default constructor - * The default constructor is deleted to prevent creation of an uninitialized LoadedModel object. 
- */ LoadedModel() = delete; - - /** - * @brief Delete the copy constructor - * The copy constructor is deleted to prevent copying of the loaded model. - */ LoadedModel(const LoadedModel&) = delete; - - /** - * @brief Delete the assignment operator - * The assignment operator is deleted to prevent copying of the loaded model. - */ LoadedModel& operator=(const LoadedModel&) = delete; }; /** - * @brief Context for computing neural network operations. - * A ComputeContext object contains configuration settings for neural network computations, such as - * whether to use half-precision floating-point (FP16) mode and whether to use the NHWC format for - * input tensors. The default constructor, copy constructor, and assignment operator are deleted - * to prevent creation of an uninitialized ComputeContext object, copying of the object, and potential - * memory leaks. + * @brief Context for computing neural network operations using Metal. + * Contains global configuration settings for neural network computations. */ struct ComputeContext { /** @@ -154,64 +99,47 @@ struct ComputeContext { enabled_t useFP16Mode; /** - * @brief ComputeContext ID + * @brief The width of the neural network input. */ - int identifier; + int nnXLen; /** - * @brief Metal compute context instance + * @brief The height of the neural network input. */ - MetalComputeContext metalComputeContext; + int nnYLen; + + /** + * @brief Metal compute context instance from Swift. + */ + MetalComputeContext metalContext; /** * @brief Constructs a ComputeContext object. - * This constructor creates a ComputeContext object and sets the configuration settings for neural network - * computations, including whether to use FP16 mode and whether to use the NHWC format for input tensors. * @param nnX The width of the input tensor. * @param nnY The height of the input tensor. - * @param useFP16Mode Whether to use half-precision floating-point (FP16) mode for computations. 
+ * @param useFP16Mode Whether to use half-precision floating-point (FP16) mode. * @param useNHWCMode Whether to use the NHWC format for input tensors. */ ComputeContext(int nnX, int nnY, enabled_t useFP16Mode, enabled_t useNHWCMode); - /** - * @brief Destroys the ComputeContext object. - */ ~ComputeContext(); - - /** - * @brief Deletes the default constructor. - */ ComputeContext() = delete; - - /** - * @brief Deletes the copy constructor. - */ ComputeContext(const ComputeContext&) = delete; - - /** - * @brief Deletes the copy constructor. - * - * @return ComputeContext& - */ ComputeContext& operator=(const ComputeContext&) = delete; }; /** - * @brief A handle for performing neural network computations. - * This struct represents a handle for computing neural network operations. It contains various - * parameters and settings that determine how the computation is performed. + * @brief A handle for performing neural network computations using Metal. + * This struct represents a per-thread handle for computing neural network operations. */ struct ComputeHandle { - int identifier; - /** - * @brief The x length of the neural network computation context. + * @brief The x length of the neural network. */ int nnXLen; /** - * @brief The y length of the neural network computation context. + * @brief The y length of the neural network. */ int nnYLen; @@ -236,53 +164,53 @@ struct ComputeHandle { bool inputsUseNHWC; /** - * @brief Whether to use 16-bit floating-point precision for computation. + * @brief Whether to use 16-bit floating-point precision. */ bool useFP16; /** - * @brief The Metal handle instance. + * @brief Whether exact neural net length is required (enables mask optimization). */ - swift::Optional metalhandle; + bool requireExactNNLen; + + /** + * @brief The MPSGraph-only handle instance from Swift (GPU-only mode). + */ + swift::Optional mpsGraphOnlyHandle; + + /** + * @brief The CoreML-only handle instance from Swift (ANE mode). 
+ */ + swift::Optional coremlOnlyHandle; /** * @brief Construct a new ComputeHandle object. - * This constructor initializes a new ComputeHandle object with the specified parameters and settings. * @param context The ComputeContext object to use for computation. - * @param loadedModel A pointer to the LoadedModel object containing the neural network model to use. + * @param loadedModel A pointer to the LoadedModel object. * @param inputsUseNHWC Whether the input data uses NHWC format. - * @param gpuIdx The index of the GPU to use for computation. - * @param serverThreadIdx The index of the server thread to use for computation. + * @param gpuIdx The index of the GPU to use. + * @param serverThreadIdx The index of the server thread. + * @param requireExactNNLen Whether exact NN length is required. + * @param maxBatchSize Maximum batch size for dynamic batch support. */ ComputeHandle( ComputeContext* context, const LoadedModel* loadedModel, bool inputsUseNHWC, int gpuIdx, - int serverThreadIdx); + int serverThreadIdx, + bool requireExactNNLen, + int maxBatchSize); - /** - * @brief Destroy the ComputeHandle object. - * This destructor frees any resources that were allocated for the ComputeHandle object. - */ ~ComputeHandle(); - - /** - * @brief Delete the default constructor. - */ ComputeHandle() = delete; - - /** - * @brief Delete the copy constructor. - */ ComputeHandle(const ComputeHandle&) = delete; - - /** - * @brief Delete the assignment operator. - */ ComputeHandle& operator=(const ComputeHandle&) = delete; }; +/** + * @brief Input and output buffers for neural network inference. 
+ */ struct InputBuffers { int maxBatchSize; size_t policyResultChannels; @@ -298,6 +226,7 @@ struct InputBuffers { size_t singleOwnershipResultElts; size_t singleOwnerMapElts; size_t singleScoreValuesResultElts; + size_t singleMaskElts; size_t rowSpatialBufferElts; size_t userInputBufferElts; @@ -310,6 +239,7 @@ struct InputBuffers { size_t ownershipResultBufferElts; size_t ownerMapBufferElts; size_t scoreValuesResultBufferElts; + size_t userInputMaskBufferElts; float* rowSpatialBuffer; float* userInputBuffer; @@ -322,6 +252,7 @@ struct InputBuffers { float* ownershipResults; float* ownerMapBuffer; float* scoreValuesResults; + float* userInputMaskBuffer; InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen); ~InputBuffers(); @@ -329,3 +260,7 @@ struct InputBuffers { InputBuffers(const InputBuffers&) = delete; InputBuffers& operator=(const InputBuffers&) = delete; }; + +#endif // USE_METAL_BACKEND + +#endif // NEURALNET_METALBACKEND_H_ diff --git a/cpp/neuralnet/metalbackend.swift b/cpp/neuralnet/metalbackend.swift index 97c6e181d..43e17fa56 100644 --- a/cpp/neuralnet/metalbackend.swift +++ b/cpp/neuralnet/metalbackend.swift @@ -1,3032 +1,546 @@ import Foundation +import CoreML import MetalPerformanceShaders import MetalPerformanceShadersGraph /// A class that handles output to standard error. class StandardError: TextOutputStream { - /// Outputs the specified string to the standard error stream. func write(_ string: String) { - /// Tries to write the UTF-8 encoded contents of the string to the standard error file handle. try? 
FileHandle.standardError.write(contentsOf: Data(string.utf8)) } } -/// A function to print error messages +/// Print to standard error func printError(_ item: Any) { - // Create an instance of StandardError to direct output to the standard error stream var instance = StandardError() - // Output the provided item to the standard error using the created instance print(item, to: &instance) } -/// An extension to the Data struct for handling float data with optional FP16 conversion. -extension Data { - /// Initializes a new Data instance using an UnsafeMutablePointer, with optional conversion to FP16 format. - /// - Parameters: - /// - floatsNoCopy: An UnsafeMutablePointer containing the float data. - /// - shape: An array of NSNumber objects representing the shape of the data. - init( - floatsNoCopy: UnsafeMutablePointer, - shape: [NSNumber] - ) { - self.init( - bytesNoCopy: floatsNoCopy, - count: shape.countBytesOfFloat32(), - deallocator: .none) - } -} - -/// Extension to MPSNDArray to convert from MPSGraphTensor, and to read/write bytes from/to UnsafeMutableRawPointer -extension MPSNDArray { - /// Read bytes from the buffer - /// - Parameter buffer: The buffer to read - func readBytes(_ buffer: UnsafeMutableRawPointer) { - self.readBytes(buffer, strideBytes: nil) - } - - /// Write bytes to the buffer - /// - Parameter buffer: The buffer to write - func writeBytes(_ buffer: UnsafeMutableRawPointer) { - self.writeBytes(buffer, strideBytes: nil) - } -} - -/// Extension to Array to count number of elements and bytes -extension Array where Element == NSNumber { - /// Count number of elements - /// - Returns: Number of elements - func countElements() -> Int { - return reduce(1, { $0 * $1.intValue }) - } - - /// Count number of bytes - /// - Parameter dataType: The data type - /// - Returns: Number of bytes - func countBytesOfFloat32() -> Int { - return countElements() * MemoryLayout.size - } -} - -/// Extension to MPSGraph to the mish activation function -extension 
MPSGraph { - /// This function applies the Mish activation function on the input tensor `x`. The Mish function is defined as - /// x * tanh(Softplus(x)), where Softplus(x) is defined as log(1 + exp(min(x, 10.39))) if x < 10.39 and x otherwise. - /// When FP16 is later used, the threshold of softplus will need to be modified to 10.39, which is different from - /// the original 20. This is because exp(10.39) = 32532.666936 < 32767.0 < 65504.0, so the result of exp(10.39) can - /// be represented by float16. If the threshold of softplus is 20, the result of exp(20) is 485165195.40979004, - /// which is out of range of float16. - /// - Parameter tensor: The input tensor of mish activation function - /// - Returns: The output tensor of mish activation function - func mish(tensor: MPSGraphTensor) -> MPSGraphTensor { - assert(tensor.dataType == .float32) - - let one = 1.0 - let threshold = 20.0 - let thresholdTensor = constant(threshold, dataType: tensor.dataType) - let minimumTensor = minimum(tensor, thresholdTensor, name: nil) - let expTensor = exponent(with: minimumTensor, name: nil) - let oneTensor = constant(one, dataType: tensor.dataType) - let addTensor = addition(expTensor, oneTensor, name: nil) - let logTensor = logarithm(with: addTensor, name: nil) - let lessTensor = lessThan(tensor, thresholdTensor, name: nil) - let selectTensor = select( - predicate: lessTensor, trueTensor: logTensor, falseTensor: tensor, name: nil) - let tanhTensor = tanh(with: selectTensor, name: nil) - let mulTensor = multiplication(tensor, tanhTensor, name: nil) - - return mulTensor - } -} - -/// A structure that represents the input shape -struct InputShape { - /// Create a shape for the input tensor - /// - Parameters: - /// - batchSize: Batch size - /// - numChannels: Number of channels - /// - nnYLen: Y length - /// - nnXLen: X length - /// - Returns: The shape - static func create( - batchSize: NSNumber, - numChannels: NSNumber, - nnYLen: NSNumber, - nnXLen: NSNumber - ) -> [NSNumber] 
{ - let shape = [ - batchSize, - numChannels, - nnYLen, - nnXLen, - ] - return shape - } - - /// Get the channel axis - /// - Returns: The channel axis - static func getChannelAxis() -> Int { - return 1 - } - - /// Get the HW axes - /// - Returns: The HW axes - static func getHWAxes() -> [NSNumber] { - let hwAxes = [2, 3] as [NSNumber] - return hwAxes - } -} - -/// A structure that represents the input layer -struct InputLayer { - let tensor: MPSGraphTensor - let shape: [NSNumber] - - /// Initialize a InputLayer object - /// - Parameters: - /// - graph: The graph - /// - nnXLen: X length - /// - nnYLen: Y length - /// - numChannels: Number of channels - /// - dataType: Data type - init( - graph: MPSGraph, - nnXLen: NSNumber, - nnYLen: NSNumber, - numChannels: NSNumber, - dataType: MPSDataType = .float32 - ) { - shape = InputShape.create( - batchSize: -1, - numChannels: numChannels, - nnYLen: nnYLen, - nnXLen: nnXLen) - - self.tensor = graph.placeholder( - shape: shape, - dataType: dataType, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A structure that represents an input global layer for a neural network model. -struct InputGlobalLayer { - let tensor: MPSGraphTensor - let shape: [NSNumber] - - /// Initializes an InputGlobalLayer object with a graph, batch size, number of global features, data type, and input shape. - /// - Parameters: - /// - graph: The graph. - /// - numGlobalFeatures: The number of global features. - /// - dataType: The data type. - init( - graph: MPSGraph, - numGlobalFeatures: NSNumber, - dataType: MPSDataType = .float32 - ) { - shape = InputShape.create( - batchSize: -1, - numChannels: numGlobalFeatures, - nnYLen: 1, - nnXLen: 1) - - self.tensor = graph.placeholder( - shape: shape, - dataType: dataType, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A structure representing the input meta layer for a neural network graph. 
-struct InputMetaLayer { - /// A `MPSGraphTensor` representing the placeholder tensor in the graph. - let tensor: MPSGraphTensor - /// An array of `NSNumber` representing the shape of the tensor placeholder. - let shape: [NSNumber] - - /// Initializes a new `InputMetaLayer` instance with the given graph and number of meta features. - /// - /// - Parameters: - /// - graph: The `MPSGraph` instance where the placeholder tensor will be created. - /// - numMetaFeatures: The number of meta features (channels) for the input tensor. - /// - dataType: The data type - /// - /// This initializer sets the shape of the input tensor using a helper function `InputShape.create` with - /// a dynamic batch size (-1), the specified number of channels, and a spatial size of 1x1 (nnYLen and nnXLen). - /// It also creates a placeholder tensor in the MPS graph with the specified shape and data type `float32`. - init( - graph: MPSGraph, - numMetaFeatures: NSNumber, - dataType: MPSDataType = .float32 - ) { - // Define the shape of the input tensor with dynamic batch size, specified number of channels, and spatial dimensions 1x1. - shape = InputShape.create( - batchSize: -1, - numChannels: numMetaFeatures, - nnYLen: 1, - nnXLen: 1) - - // Create a placeholder tensor in the graph with the above-defined shape and data type float32. - self.tensor = graph.placeholder( - shape: shape, - dataType: dataType, - name: nil) - } -} - -/// A structure that represents a mask layer for a neural network model. -struct MaskLayer { - let tensor: MPSGraphTensor - let shape: [NSNumber] - - /// Initializes a MaskLayer object with a graph, batch size, x and y lengths, data type, and input shape. - /// - Parameters: - /// - graph: The graph. - /// - nnXLen: The length of the x-axis. - /// - nnYLen: The length of the y-axis. - /// - dataType: The data type. 
- init( - graph: MPSGraph, - nnXLen: NSNumber, - nnYLen: NSNumber, - dataType: MPSDataType = .float32 - ) { - shape = InputShape.create( - batchSize: -1, - numChannels: 1, - nnYLen: nnYLen, - nnXLen: nnXLen) - - self.tensor = graph.placeholder( - shape: shape, - dataType: dataType, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A structure that represents a layer which performs the summation operation on a mask layer. -struct MaskSumLayer { - let tensor: MPSGraphTensor - - /// Initializes a MaskSumLayer object with a given tensor. - /// - Parameter tensor: The tensor to use for the layer. - init(tensor: MPSGraphTensor) { - self.tensor = tensor - assert(self.tensor.shape?.count == 4) - } - - /// Initializes a MaskSumLayer object with a graph, a mask layer, and a boolean flag indicating whether to use NHWC or NCHW format. - /// - Parameters: - /// - graph: The graph. - /// - maskTensor: The mask tensor. - init( - graph: MPSGraph, - maskTensor: MPSGraphTensor - ) { - let hwAxes = InputShape.getHWAxes() - - self.tensor = graph.reductionSum( - with: maskTensor, - axes: hwAxes, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A structure that represents a layer which performs square root, subtraction, and multiplication operations on a MaskSumLayer object. -struct MaskSumSqrtS14M01Layer { - let tensor: MPSGraphTensor - - /// Initializes a MaskSumSqrtS14M01Layer object with a given tensor. - /// - Parameter tensor: The tensor to use for the layer. - init(tensor: MPSGraphTensor) { - self.tensor = tensor - assert(self.tensor.shape?.count == 4) - } - - /// Initializes a MaskSumSqrtS14M01Layer object with a graph, a MaskSumLayer object, and a boolean flag indicating whether to use 16-bit floating-point data type. - /// - Parameters: - /// - graph: The graph. - /// - maskSum: The MaskSumLayer object. 
- init( - graph: MPSGraph, - maskSum: MaskSumLayer - ) { - let sqrtMaskSum = graph.squareRoot(with: maskSum.tensor, name: nil) - - let fourTeen = graph.constant( - 14.0, - shape: [1], - dataType: maskSum.tensor.dataType) - - let subtracted = graph.subtraction(sqrtMaskSum, fourTeen, name: nil) - - let zeroPointone = graph.constant( - 0.1, - shape: [1], - dataType: maskSum.tensor.dataType) - - self.tensor = graph.multiplication( - subtracted, - zeroPointone, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A structure that represents a layer which performs squaring and subtraction operations on a MaskSumSqrtS14M01Layer object. -struct MaskSumSqrtS14M01SquareS01Layer { - let tensor: MPSGraphTensor - - /// Initializes a MaskSumSqrtS14M01SquareS01Layer object with a given tensor. - /// - Parameter tensor: The tensor to use for the layer. - init(tensor: MPSGraphTensor) { - self.tensor = tensor - assert(self.tensor.shape?.count == 4) - } - - /// Initializes a MaskSumSqrtS14M01SquareS01Layer object with a graph, a MaskSumSqrtS14M01Layer object, and a boolean flag indicating whether to use 16-bit floating-point data type. - /// - Parameters: - /// - graph: The graph. - /// - maskSumSqrtS14M01: The MaskSumSqrtS14M01Layer object. - init( - graph: MPSGraph, - maskSumSqrtS14M01: MaskSumSqrtS14M01Layer - ) { - let squared = graph.square(with: maskSumSqrtS14M01.tensor, name: nil) - - let zeroPointone = graph.constant( - 0.1, - shape: [1], - dataType: maskSumSqrtS14M01.tensor.dataType) - - self.tensor = graph.subtraction( - squared, - zeroPointone, - name: nil) - - assert(self.tensor.shape?.count == 4) - } -} - -/// A Swift structure that represents a network tester, which tests various neural network configurations. -struct NetworkTester { - - /// A static function that tests a custom neural network configuration with the given parameters. - /// - Parameters: - /// - batchSize: The number of input batches. - /// - nnXLen: The width of the input tensor. 
- /// - nnYLen: The height of the input tensor. - /// - numChannels: The number of channels in the input tensor. - /// - input: A pointer to the input data. - /// - mask: A pointer to the mask data. - /// - output: A pointer to the output data. - /// - networkBuilder: A closure that takes an MPSGraph, InputLayer, and MaskLayer, and returns an MPSGraphTensor representing the custom network configuration. - static func test( - batchSize: NSNumber, - nnXLen: NSNumber, - nnYLen: NSNumber, - numChannels: NSNumber, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer, - networkBuilder: (MPSGraph, InputLayer, MaskLayer) -> MPSGraphTensor - ) { - - // Create a Metal device. - let device = MTLCreateSystemDefaultDevice()! - - // Create a MPSGraph. - let graph = MPSGraph() - - // Create the input and mask layers. - let inputLayer = InputLayer( - graph: graph, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: numChannels) - - let maskLayer = MaskLayer( - graph: graph, - nnXLen: nnXLen, - nnYLen: nnYLen) - - // Build the custom network configuration using the provided networkBuilder closure. - let resultTensor = networkBuilder(graph, inputLayer, maskLayer) - - // Create input shape - let inputShape = InputShape.create( - batchSize: batchSize, - numChannels: numChannels, - nnYLen: nnYLen, - nnXLen: nnXLen) - - // Create MPSNDArrayDescriptors from the input shape. - let sourceDescriptor = MPSNDArrayDescriptor( - dataType: inputLayer.tensor.dataType, - shape: inputShape) - - // Create MPSNDArray from the source descriptor. - let sourceArray = MPSNDArray( - device: device, - descriptor: sourceDescriptor) - - // Create a mask shape - let maskShape = InputShape.create( - batchSize: batchSize, - numChannels: 1, - nnYLen: nnYLen, - nnXLen: nnXLen) - - // Create MPSNDArrayDescriptors from the mask shape. 
- let maskDescriptor = MPSNDArrayDescriptor( - dataType: maskLayer.tensor.dataType, - shape: maskShape) - - // Create MPSNDArray from the mask descriptor. - let maskArray = MPSNDArray( - device: device, - descriptor: maskDescriptor) - - // Write input and mask data to their respective MPSNDArrays, converting to FP16 if necessary. - sourceArray.writeBytes(input) - maskArray.writeBytes(mask) - - // Create MPSGraphTensorData objects from the source and mask arrays. - let sourceTensorData = MPSGraphTensorData(sourceArray) - let maskTensorData = MPSGraphTensorData(maskArray) - - // Execute the graph and fetch the result. - let fetch = graph.run( - feeds: [ - inputLayer.tensor: sourceTensorData, - maskLayer.tensor: maskTensorData, - ], - targetTensors: [resultTensor], - targetOperations: nil) - - // Read the output data from the result tensor, converting from FP16 to FP32 if necessary. - fetch[resultTensor]?.mpsndarray().readBytes(output) - } -} - -/// A struct that represents a description of convolutional layer. -public struct SWConvLayerDesc { - let convYSize: NSNumber - let convXSize: NSNumber - let inChannels: NSNumber - let outChannels: NSNumber - let dilationY: Int - let dilationX: Int - let weights: UnsafeMutablePointer - - /// Initializes a SWConvLayerDesc object. - /// - Parameters: - /// - convYSize: The Y size of the convolution. - /// - convXSize: The X size of the convolution. - /// - inChannels: The number of input channels. - /// - outChannels: The number of output channels. - /// - dilationY: The dilation in the Y direction. - /// - dilationX: The dilation in the X direction. - /// - weights: A pointer to the weights. 
- init( - convYSize: NSNumber, - convXSize: NSNumber, - inChannels: NSNumber, - outChannels: NSNumber, - dilationY: Int, - dilationX: Int, - weights: UnsafeMutablePointer - ) { - self.convYSize = convYSize - self.convXSize = convXSize - self.inChannels = inChannels - self.outChannels = outChannels - self.dilationY = dilationY - self.dilationX = dilationX - self.weights = weights - } -} - -public func createSWConvLayerDesc( - convYSize: Int32, - convXSize: Int32, - inChannels: Int32, - outChannels: Int32, - dilationY: Int32, - dilationX: Int32, - weights: UnsafeMutablePointer -) -> SWConvLayerDesc { - return SWConvLayerDesc( - convYSize: convYSize as NSNumber, - convXSize: convXSize as NSNumber, - inChannels: inChannels as NSNumber, - outChannels: outChannels as NSNumber, - dilationY: Int(dilationY), - dilationX: Int(dilationX), - weights: weights) -} - -/// A class that represents a convolutional layer using MPSGraph -class ConvLayer { - /// The result tensor of the convolutional operation - let resultTensor: MPSGraphTensor - /// The convolution 2D operation descriptor - let convDescriptor = MPSGraphConvolution2DOpDescriptor( - strideInX: 1, - strideInY: 1, - dilationRateInX: 1, - dilationRateInY: 1, - groups: 1, - paddingStyle: .TF_SAME, - dataLayout: .NCHW, - weightsLayout: .OIHW)! - - /// Class method that tests the convolutional layer by running a forward pass - /// - Parameters: - /// - descriptor: A descriptor for the convolutional layer - /// - nnXLen: The width of the input tensor - /// - nnYLen: The height of the input tensor - /// - batchSize: The batch size of the input tensor - /// - input: A pointer to the input tensor data - /// - output: A pointer to the output tensor data - class func test( - descriptor: SWConvLayerDesc, - nnXLen: NSNumber, - nnYLen: NSNumber, - batchSize: NSNumber, - input: UnsafeMutablePointer, - output: UnsafeMutablePointer - ) { - let device = MTLCreateSystemDefaultDevice()! 
- let graph = MPSGraph() - - let source = InputLayer( - graph: graph, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.inChannels) - - let conv = ConvLayer( - graph: graph, - sourceTensor: source.tensor, - descriptor: descriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let inputShape = InputShape.create( - batchSize: batchSize, - numChannels: descriptor.inChannels, - nnYLen: nnYLen, - nnXLen: nnXLen) - - let sourceDescriptor = MPSNDArrayDescriptor( - dataType: source.tensor.dataType, - shape: inputShape) - - let sourceArray = MPSNDArray( - device: device, - descriptor: sourceDescriptor) - - sourceArray.writeBytes(input) - let sourceTensorData = MPSGraphTensorData(sourceArray) - - let fetch = graph.run( - feeds: [source.tensor: sourceTensorData], - targetTensors: [conv.resultTensor], - targetOperations: nil) - - fetch[conv.resultTensor]?.mpsndarray().readBytes(output) - } - - /// Initializes a ConvLayer object - /// - Parameters: - /// - graph: An MPSGraph object - /// - sourceTensor: The input tensor for the convolutional layer - /// - descriptor: A descriptor for the convolutional layer - /// - nnXLen: The width of the input tensor - /// - nnYLen: The height of the input tensor - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - descriptor: SWConvLayerDesc, - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - let weightsShape = [ - descriptor.outChannels, - descriptor.inChannels, - descriptor.convYSize, - descriptor.convXSize, - ] - - let weightsData = Data( - floatsNoCopy: descriptor.weights, - shape: weightsShape) - - let weightsTensor = graph.constant( - weightsData, - shape: weightsShape, - dataType: sourceTensor.dataType) - - resultTensor = graph.convolution2D( - sourceTensor, - weights: weightsTensor, - descriptor: convDescriptor, - name: nil) - - assert(resultTensor.shape?.count == 4) - } -} - -public func testConvLayer( - descriptor: SWConvLayerDesc, - nnXLen: Int32, - nnYLen: Int32, - batchSize: Int32, - input: UnsafeMutablePointer, - 
output: UnsafeMutablePointer -) { - ConvLayer.test( - descriptor: descriptor, - nnXLen: nnXLen as NSNumber, - nnYLen: nnYLen as NSNumber, - batchSize: batchSize as NSNumber, - input: input, - output: output) -} - -/// A struct that represents a description of a batch normalization layer. -public struct SWBatchNormLayerDesc { - let numChannels: NSNumber - let mergedScale: UnsafeMutablePointer - let mergedBias: UnsafeMutablePointer - - /// Initializes a SWBatchNormLayerDesc object. - /// - Parameters: - /// - numChannels: The number of channels in the input tensor. - /// - mergedScale: A pointer to the merged scale. - /// - mergedBias: A pointer to the merged bias. - init( - numChannels: NSNumber, - mergedScale: UnsafeMutablePointer, - mergedBias: UnsafeMutablePointer - ) { - self.numChannels = numChannels - self.mergedScale = mergedScale - self.mergedBias = mergedBias - } -} - -public func createSWBatchNormLayerDesc( - numChannels: Int32, - mergedScale: UnsafeMutablePointer, - mergedBias: UnsafeMutablePointer -) -> SWBatchNormLayerDesc { - return SWBatchNormLayerDesc( - numChannels: numChannels as NSNumber, - mergedScale: mergedScale, - mergedBias: mergedBias) -} - -/// A class that represents a batch normalization layer. -class BatchNormLayer { - let resultTensor: MPSGraphTensor - - /// Executes a test for the batch normalization layer. - /// - Parameters: - /// - descriptor: The description of the batch normalization layer. - /// - nnXLen: The width of the input tensor. - /// - nnYLen: The height of the input tensor. - /// - batchSize: The number of input batches. - /// - input: A pointer to the input data. - /// - mask: A pointer to the mask data. - /// - output: A pointer to the output data. 
- class func test( - descriptor: SWBatchNormLayerDesc, - nnXLen: NSNumber, - nnYLen: NSNumber, - batchSize: NSNumber, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer - ) { - - NetworkTester.test( - batchSize: batchSize, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.numChannels, - input: input, - mask: mask, - output: output - ) { graph, inputLayer, maskLayer in - - let batchNorm = BatchNormLayer( - graph: graph, - sourceTensor: inputLayer.tensor, - maskTensor: maskLayer.tensor, - descriptor: descriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - return batchNorm.resultTensor - } - } - - /// Initializes a BatchNormLayer object with the specified parameters, and computes the normalized and masked result tensor. - /// - Parameters: - /// - graph: The MPSGraph object used to build the BatchNormLayer. - /// - sourceTensor: The input tensor to the BatchNormLayer. - /// - maskTensor: The mask tensor to apply to the normalized tensor. - /// - descriptor: The BatchNormLayer descriptor containing parameters such as the number of channels, mean, variance, scale, and bias. - /// - nnXLen: The length of the input tensor in the X direction. - /// - nnYLen: The length of the input tensor in the Y direction. 
- init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - descriptor: SWBatchNormLayerDesc, - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - let scaleBiasShape = InputShape.create( - batchSize: 1, - numChannels: descriptor.numChannels, - nnYLen: 1, - nnXLen: 1) - - let mergedScaleData = Data( - floatsNoCopy: descriptor.mergedScale, - shape: scaleBiasShape) - - let mergedBiasData = Data( - floatsNoCopy: descriptor.mergedBias, - shape: scaleBiasShape) - - let scaleTensor = graph.constant( - mergedScaleData, - shape: scaleBiasShape, - dataType: sourceTensor.dataType) - - let biasTensor = graph.constant( - mergedBiasData, - shape: scaleBiasShape, - dataType: sourceTensor.dataType) - - let scaled = graph.multiplication( - sourceTensor, - scaleTensor, - name: nil) - - let normalized = graph.addition( - scaled, - biasTensor, - name: nil) - - resultTensor = graph.multiplication( - normalized, - maskTensor, - name: nil) - - assert(resultTensor.shape?.count == 4) - } -} - -public func testBatchNormLayer( - descriptor: SWBatchNormLayerDesc, - nnXLen: Int32, - nnYLen: Int32, - batchSize: Int32, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer -) { - BatchNormLayer.test( - descriptor: descriptor, - nnXLen: nnXLen as NSNumber, - nnYLen: nnYLen as NSNumber, - batchSize: batchSize as NSNumber, - input: input, - mask: mask, - output: output) -} - -/// An enumeration of the different kinds of activation function. 
-public enum ActivationKind { - case identity - case relu - case mish -} - -/// A structure that represents an activation layer -struct ActivationLayer { - let resultTensor: MPSGraphTensor - - /// Initialize an ActivationLayer object - /// - Parameters: - /// - graph: The MPSGraph - /// - sourceTensor: The input tensor - /// - activationKind: The activation kind - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - activationKind: ActivationKind - ) { - - switch activationKind { - case .relu: - resultTensor = graph.reLU(with: sourceTensor, name: nil) - case .mish: - resultTensor = graph.mish(tensor: sourceTensor) - default: - resultTensor = sourceTensor - } - - assert(resultTensor.shape == sourceTensor.shape) - } -} - -/// A class that represents a residual block in a convolutional neural network. -public class SWResidualBlockDesc: BlockDescriptor { - /// A description of the batch normalization layer that is applied before the first convolutional layer. - let preBN: SWBatchNormLayerDesc - - /// The type of activation function that is applied before the first convolutional layer. - let preActivation: ActivationKind - - /// A description of the convolutional layer that is applied in the middle of the residual block. - let regularConv: SWConvLayerDesc - - /// A description of the batch normalization layer that is applied after the middle convolutional layer. - let midBN: SWBatchNormLayerDesc - - /// The type of activation function that is applied after the middle convolutional layer. - let midActivation: ActivationKind - - /// A description of the convolutional layer that is applied at the end of the residual block. - let finalConv: SWConvLayerDesc - - /// Initializes a `SWResidualBlockDesc` object. - /// - Parameters: - /// - preBN: A description of the batch normalization layer that is applied before the first convolutional layer. - /// - preActivation: The type of activation function that is applied before the first convolutional layer. 
- /// - regularConv: A description of the convolutional layer that is applied in the middle of the residual block. - /// - midBN: A description of the batch normalization layer that is applied after the middle convolutional layer. - /// - midActivation: The type of activation function that is applied after the middle convolutional layer. - /// - finalConv: A description of the convolutional layer that is applied at the end of the residual block. - init( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - regularConv: SWConvLayerDesc, - midBN: SWBatchNormLayerDesc, - midActivation: ActivationKind, - finalConv: SWConvLayerDesc - ) { - self.preBN = preBN - self.preActivation = preActivation - self.regularConv = regularConv - self.midBN = midBN - self.midActivation = midActivation - self.finalConv = finalConv - } -} - -public func createSWResidualBlockDesc( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - regularConv: SWConvLayerDesc, - midBN: SWBatchNormLayerDesc, - midActivation: ActivationKind, - finalConv: SWConvLayerDesc -) -> SWResidualBlockDesc { - return SWResidualBlockDesc( - preBN: preBN, - preActivation: preActivation, - regularConv: regularConv, - midBN: midBN, - midActivation: midActivation, - finalConv: finalConv) -} - -/// A class that represents a Residual Block layer -class ResidualBlock { - let resultTensor: MPSGraphTensor - - /// A function that runs tests on the Residual Block layer - /// - /// - Parameters: - /// - descriptor: The Residual Block descriptor - /// - batchSize: Batch size - /// - nnXLen: X length - /// - nnYLen: Y length - /// - input: The input float32 pointer - /// - mask: The mask float32 pointer - /// - output: The output float32 pointer - class func test( - descriptor: SWResidualBlockDesc, - batchSize: NSNumber, - nnXLen: NSNumber, - nnYLen: NSNumber, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer - ) { - - NetworkTester.test( - batchSize: batchSize, - 
nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.preBN.numChannels, - input: input, - mask: mask, - output: output - ) { graph, inputLayer, maskLayer in - - let block = ResidualBlock( - graph: graph, - sourceTensor: inputLayer.tensor, - maskTensor: maskLayer.tensor, - descriptor: descriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - return block.resultTensor - } - } - - /// Initialize a ResidualBlock object - /// - /// - Parameters: - /// - graph: The MPSGraph - /// - sourceTensor: The input tensor - /// - maskTensor: The mask tensor - /// - descriptor: The Residual Block descriptor - /// - nnXLen: X length - /// - nnYLen: Y length - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - descriptor: SWResidualBlockDesc, - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - let preBN = BatchNormLayer( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - descriptor: descriptor.preBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let preActivation = ActivationLayer( - graph: graph, - sourceTensor: preBN.resultTensor, - activationKind: descriptor.preActivation) - - let regularConv = ConvLayer( - graph: graph, - sourceTensor: preActivation.resultTensor, - descriptor: descriptor.regularConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let midBN = BatchNormLayer( - graph: graph, - sourceTensor: regularConv.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.midBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let midActivation = ActivationLayer( - graph: graph, - sourceTensor: midBN.resultTensor, - activationKind: descriptor.midActivation) - - let finalConv = ConvLayer( - graph: graph, - sourceTensor: midActivation.resultTensor, - descriptor: descriptor.finalConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - resultTensor = graph.addition( - sourceTensor, - finalConv.resultTensor, - name: nil) - - assert(resultTensor.shape?.count == 4) - } -} - -public func testResidualBlock( - descriptor: SWResidualBlockDesc, - batchSize: 
Int32, - nnXLen: Int32, - nnYLen: Int32, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer -) { - ResidualBlock.test( - descriptor: descriptor, - batchSize: batchSize as NSNumber, - nnXLen: nnXLen as NSNumber, - nnYLen: nnYLen as NSNumber, - input: input, - mask: mask, - output: output) -} - -/// A structure that represents a global pooling layer -struct GlobalPoolingLayer { - /// The resulting tensor after applying the global pooling operation - let resultTensor: MPSGraphTensor - - /// Initialize a GlobalPoolingLayer object - /// - Parameters: - /// - graph: The graph - /// - sourceTensor: The source tensor to be pooled - /// - maskTensor: The mask tensor - /// - maskSumTensor: The sum of the mask - /// - maskSumSqrtS14M01Tensor: The multiplication of subtraction of square root of the sum of the mask - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor - ) { - let hwAxes = InputShape.getHWAxes() - let channelAxis = InputShape.getChannelAxis() - - let sumTensor = graph.reductionSum( - with: sourceTensor, - axes: hwAxes, - name: nil) - - let meanTensor = graph.division(sumTensor, maskSumTensor, name: nil) - - let meanMaskTensor = graph.multiplication( - meanTensor, - maskSumSqrtS14M01Tensor, - name: nil) - - let oneTensor = graph.constant(1.0, dataType: sourceTensor.dataType) - let maskM1Tensor = graph.subtraction(maskTensor, oneTensor, name: nil) - let addition = graph.addition(sourceTensor, maskM1Tensor, name: nil) - - let maxTensor = graph.reductionMaximum( - with: addition, - axes: hwAxes, - name: nil) - - resultTensor = graph.concatTensors( - [ - meanTensor, - meanMaskTensor, - maxTensor, - ], - dimension: channelAxis, - name: nil) - - assert(resultTensor.shape?.count == 4) - assert(resultTensor.shape?[2] == 1) - assert(resultTensor.shape?[3] == 1) - } -} - -/// A structure that represents a layer that performs 
global pooling on the input tensor -struct GlobalPoolingValueLayer { - let resultTensor: MPSGraphTensor - - /// Initialize a GlobalPoolingValueLayer object - /// - Parameters: - /// - graph: The graph - /// - sourceTensor: The input tensor - /// - maskSumTensor: The sum of the mask - /// - maskSumSqrtS14M01Tensor: The multiplication of subtraction of square root of the sum of the mask - /// - maskSumSqrtS14M01SquareS01Tensor: The subtraction of square of multiplication of subtraction of square root of the sum of the mask - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - maskSumSqrtS14M01SquareS01Tensor: MPSGraphTensor - ) { - let hwAxes = InputShape.getHWAxes() - let channelAxis = InputShape.getChannelAxis() - - let sumTensor = graph.reductionSum( - with: sourceTensor, - axes: hwAxes, - name: nil) - - let meanTensor = graph.division(sumTensor, maskSumTensor, name: nil) - - let meanMaskTensor = graph.multiplication( - meanTensor, - maskSumSqrtS14M01Tensor, - name: nil) - - let meanMaskSquareTensor = graph.multiplication( - meanTensor, - maskSumSqrtS14M01SquareS01Tensor, - name: nil) - - resultTensor = graph.concatTensors( - [ - meanTensor, - meanMaskTensor, - meanMaskSquareTensor, - ], - dimension: channelAxis, - name: nil) - - assert(resultTensor.shape?.count == 4) - assert(resultTensor.shape?[2] == 1) - assert(resultTensor.shape?[3] == 1) - } -} - -/// A struct that represents a matrix multiplication layer descriptor -public struct SWMatMulLayerDesc { - /// The number of input channels - let inChannels: NSNumber - /// The number of output channels - let outChannels: NSNumber - /// The weights used for the matrix multiplication - let weights: UnsafeMutablePointer - - /// Initialize a SWMatMulLayerDesc object - /// - Parameters: - /// - inChannels: The number of input channels - /// - outChannels: The number of output channels - /// - weights: The weights used for the matrix 
multiplication - init( - inChannels: NSNumber, - outChannels: NSNumber, - weights: UnsafeMutablePointer - ) { - self.inChannels = inChannels - self.outChannels = outChannels - self.weights = weights - } -} - -public func createSWMatMulLayerDesc( - inChannels: Int32, - outChannels: Int32, - weights: UnsafeMutablePointer -) -> SWMatMulLayerDesc { - return SWMatMulLayerDesc( - inChannels: inChannels as NSNumber, - outChannels: outChannels as NSNumber, - weights: weights) -} - -/// A structure representing a matrix multiplication layer. -struct MatMulLayer { - /// The resulting tensor from the layer. - let resultTensor: MPSGraphTensor - - /// Initializes a MatMulLayer object. - /// - Parameters: - /// - graph: The graph. - /// - descriptor: The matrix multiplication layer descriptor. - /// - sourceTensor: The input tensor to the layer. - init( - graph: MPSGraph, - descriptor: SWMatMulLayerDesc, - sourceTensor: MPSGraphTensor - ) { - - assert( - (sourceTensor.shape?.count == 4) || (sourceTensor.shape?[1] == descriptor.inChannels)) - assert( - (sourceTensor.shape?.count == 2) || (sourceTensor.shape?[1] == descriptor.inChannels)) - - let weightsShape = [ - descriptor.inChannels, - descriptor.outChannels, - ] - - let weightsData = Data( - floatsNoCopy: descriptor.weights, - shape: weightsShape) - - let weightsTensor = graph.constant( - weightsData, - shape: weightsShape, - dataType: sourceTensor.dataType) - - let shape = [-1, descriptor.inChannels] - - let reshapedSource = graph.reshape( - sourceTensor, - shape: shape, - name: nil) - - resultTensor = graph.matrixMultiplication( - primary: reshapedSource, - secondary: weightsTensor, - name: nil) - - assert(resultTensor.shape?.count == 2) - } -} - -/// An Objective-C class that represents the bias layer description used in Swift. -public struct SWMatBiasLayerDesc { - /// The number of channels. - let numChannels: NSNumber - /// The pointer to the weights. 
- let weights: UnsafeMutablePointer - - /// Initialize an instance of SWMatBiasLayerDesc. - /// - Parameters: - /// - numChannels: The number of channels. - /// - weights: The pointer to the weights. - init( - numChannels: NSNumber, - weights: UnsafeMutablePointer - ) { - self.numChannels = numChannels - self.weights = weights - } -} - -public func createSWMatBiasLayerDesc( - numChannels: Int32, - weights: UnsafeMutablePointer -) -> SWMatBiasLayerDesc { - return SWMatBiasLayerDesc( - numChannels: numChannels as NSNumber, - weights: weights) -} - -/// A structure that performs matrix bias operations -struct MatBiasLayer { - /// The resulting tensor from the layer. - let resultTensor: MPSGraphTensor - - /// Initializes a MatBiasLayer object. - /// - Parameters: - /// - graph: The graph. - /// - descriptor: The descriptor that contains information about the layer - /// - sourceTensor: The input tensor to the layer. - init( - graph: MPSGraph, - descriptor: SWMatBiasLayerDesc, - sourceTensor: MPSGraphTensor - ) { - - assert( - (sourceTensor.shape?.count == 2) && (sourceTensor.shape?[1] == descriptor.numChannels)) - - let weightsShape = [1, descriptor.numChannels] - - let weightsData = Data( - floatsNoCopy: descriptor.weights, - shape: weightsShape) - - let weightsTensor = graph.constant( - weightsData, - shape: weightsShape, - dataType: sourceTensor.dataType) - - resultTensor = graph.addition( - sourceTensor, - weightsTensor, - name: nil) - } -} - -/// A structure that performs bias operations in NC coordinates. -struct AddNCBiasLayer { - /// The resulting tensor from the layer. - let resultTensor: MPSGraphTensor - - /// Initializes an AddNCBiasLayer object. - /// - Parameters: - /// - graph: The graph. - /// - sourceTensor: The input tensor to the layer. - /// - biasTensor: The bias tensor. - /// - nnXLen: The x length. - /// - nnYLen: The y length. - /// - numChannels: The number of channels. 
- init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - biasTensor: MPSGraphTensor, - nnXLen: NSNumber, - nnYLen: NSNumber, - numChannels: NSNumber - ) { - let shape = InputShape.create( - batchSize: -1, - numChannels: numChannels, - nnYLen: 1, - nnXLen: 1) - - assert(biasTensor.shape?[1] == shape[1]) - - let reshaped = graph.reshape(biasTensor, shape: shape, name: nil) - resultTensor = graph.addition(sourceTensor, reshaped, name: nil) - - assert(resultTensor.shape?.count == 4) - assert(resultTensor.shape?[2] == nnYLen) - assert(resultTensor.shape?[3] == nnXLen) - } -} - -/// A class that represents a residual block with global pooling. -public class SWGlobalPoolingResidualBlockDesc: BlockDescriptor { - /// The batch normalization layer before the residual block. - let preBN: SWBatchNormLayerDesc - - /// The pre-activation function of the residual block. - let preActivation: ActivationKind - - /// The regular convolutional layer in the residual block. - let regularConv: SWConvLayerDesc - - /// The convolutional layer for global pooling. - let gpoolConv: SWConvLayerDesc - - /// The batch normalization layer after the global pooling convolutional layer. - let gpoolBN: SWBatchNormLayerDesc - - /// The activation function after the global pooling batch normalization layer. - let gpoolActivation: ActivationKind - - /// The matrix multiplication layer that multiplies the global pooled output with a bias. - let gpoolToBiasMul: SWMatMulLayerDesc - - /// The batch normalization layer after the matrix multiplication layer. - let midBN: SWBatchNormLayerDesc - - /// The activation function after the mid batch normalization layer. - let midActivation: ActivationKind - - /// The final convolutional layer in the residual block. - let finalConv: SWConvLayerDesc - - /// Initialize a SWGlobalPoolingResidualBlockDesc object. - /// - Parameters: - /// - preBN: The batch normalization layer before the residual block. 
- /// - preActivation: The pre-activation function of the residual block. - /// - regularConv: The regular convolutional layer in the residual block. - /// - gpoolConv: The convolutional layer for global pooling. - /// - gpoolBN: The batch normalization layer after the global pooling convolutional layer. - /// - gpoolActivation: The activation function after the global pooling batch normalization layer. - /// - gpoolToBiasMul: The matrix multiplication layer that multiplies the global pooled output with a bias. - /// - midBN: The batch normalization layer after the matrix multiplication layer. - /// - midActivation: The activation function after the mid batch normalization layer. - /// - finalConv: The final convolutional layer in the residual block. - init( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - regularConv: SWConvLayerDesc, - gpoolConv: SWConvLayerDesc, - gpoolBN: SWBatchNormLayerDesc, - gpoolActivation: ActivationKind, - gpoolToBiasMul: SWMatMulLayerDesc, - midBN: SWBatchNormLayerDesc, - midActivation: ActivationKind, - finalConv: SWConvLayerDesc - ) { - self.preBN = preBN - self.preActivation = preActivation - self.regularConv = regularConv - self.gpoolConv = gpoolConv - self.gpoolBN = gpoolBN - self.gpoolActivation = gpoolActivation - self.gpoolToBiasMul = gpoolToBiasMul - self.midBN = midBN - self.midActivation = midActivation - self.finalConv = finalConv - } -} - -public func createSWGlobalPoolingResidualBlockDesc( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - regularConv: SWConvLayerDesc, - gpoolConv: SWConvLayerDesc, - gpoolBN: SWBatchNormLayerDesc, - gpoolActivation: ActivationKind, - gpoolToBiasMul: SWMatMulLayerDesc, - midBN: SWBatchNormLayerDesc, - midActivation: ActivationKind, - finalConv: SWConvLayerDesc -) -> SWGlobalPoolingResidualBlockDesc { - - return SWGlobalPoolingResidualBlockDesc( - preBN: preBN, - preActivation: preActivation, - regularConv: regularConv, - gpoolConv: gpoolConv, - gpoolBN: 
gpoolBN, - gpoolActivation: gpoolActivation, - gpoolToBiasMul: gpoolToBiasMul, - midBN: midBN, - midActivation: midActivation, - finalConv: finalConv) -} - -/// A class representing a residual block with global pooling -class GlobalPoolingResidualBlock { - let resultTensor: MPSGraphTensor - - /// A method to test the global pooling residual block - /// - /// - Parameters: - /// - descriptor: The descriptor of the global pooling residual block - /// - batchSize: The batch size - /// - nnXLen: The X length - /// - nnYLen: The Y length - /// - input: The input pointer - /// - mask: The mask pointer - /// - output: The output pointer - class func test( - descriptor: SWGlobalPoolingResidualBlockDesc, - batchSize: NSNumber, - nnXLen: NSNumber, - nnYLen: NSNumber, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer - ) { - - NetworkTester.test( - batchSize: batchSize, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.preBN.numChannels, - input: input, - mask: mask, - output: output - ) { graph, inputLayer, maskLayer in - - let maskSum = MaskSumLayer( - graph: graph, - maskTensor: maskLayer.tensor) - - let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer( - graph: graph, - maskSum: maskSum) - - let block = - GlobalPoolingResidualBlock( - graph: graph, - sourceTensor: inputLayer.tensor, - maskTensor: maskLayer.tensor, - maskSumTensor: maskSum.tensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, - descriptor: descriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - return block.resultTensor - } - } - - /// Initialize a GlobalPoolingResidualBlock object - /// - /// - Parameters: - /// - graph: The graph - /// - sourceTensor: The source tensor - /// - maskTensor: The mask tensor - /// - maskSumTensor: The mask sum tensor - /// - maskSumSqrtS14M01Tensor: The mask sum square tensor - /// - descriptor: The descriptor of the global pooling residual block - /// - nnXLen: The X length - /// - nnYLen: The Y length - init( - graph: 
MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - descriptor: SWGlobalPoolingResidualBlockDesc, - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - let maskSum = MaskSumLayer(tensor: maskSumTensor) - let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer(tensor: maskSumSqrtS14M01Tensor) - - let preBN = BatchNormLayer( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - descriptor: descriptor.preBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let preActivation = ActivationLayer( - graph: graph, - sourceTensor: preBN.resultTensor, - activationKind: descriptor.preActivation) - - let regularConv = ConvLayer( - graph: graph, - sourceTensor: preActivation.resultTensor, - descriptor: descriptor.regularConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let gpoolConv = ConvLayer( - graph: graph, - sourceTensor: preActivation.resultTensor, - descriptor: descriptor.gpoolConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let gpoolBN = BatchNormLayer( - graph: graph, - sourceTensor: gpoolConv.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.gpoolBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let gpoolActivation = ActivationLayer( - graph: graph, - sourceTensor: gpoolBN.resultTensor, - activationKind: descriptor.gpoolActivation) - - let gpoolConcat = GlobalPoolingLayer( - graph: graph, - sourceTensor: gpoolActivation.resultTensor, - maskTensor: maskTensor, - maskSumTensor: maskSum.tensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor) - - assert(gpoolConcat.resultTensor.shape?[1] == descriptor.gpoolToBiasMul.inChannels) - - let gpoolToBiasMul = MatMulLayer( - graph: graph, - descriptor: descriptor.gpoolToBiasMul, - sourceTensor: gpoolConcat.resultTensor) - - let added = AddNCBiasLayer( - graph: graph, - sourceTensor: regularConv.resultTensor, - biasTensor: gpoolToBiasMul.resultTensor, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: 
descriptor.gpoolToBiasMul.outChannels) - - let midBN = BatchNormLayer( - graph: graph, - sourceTensor: added.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.midBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let midActivation = ActivationLayer( - graph: graph, - sourceTensor: midBN.resultTensor, - activationKind: descriptor.midActivation) - - let finalConv = ConvLayer( - graph: graph, - sourceTensor: midActivation.resultTensor, - descriptor: descriptor.finalConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - resultTensor = graph.addition( - sourceTensor, - finalConv.resultTensor, - name: nil) - - assert(resultTensor.shape?.count == 4) - } -} - -public func testGlobalPoolingResidualBlock( - descriptor: SWGlobalPoolingResidualBlockDesc, - batchSize: Int32, - nnXLen: Int32, - nnYLen: Int32, - input: UnsafeMutablePointer, - mask: UnsafeMutablePointer, - output: UnsafeMutablePointer -) { - GlobalPoolingResidualBlock.test( - descriptor: descriptor, - batchSize: batchSize as NSNumber, - nnXLen: nnXLen as NSNumber, - nnYLen: nnYLen as NSNumber, - input: input, - mask: mask, - output: output) -} - -/// A class that represents a nested bottleneck residual block -public class SWNestedBottleneckResidualBlockDesc: BlockDescriptor { - /// The batch normalization layer before the residual block. - let preBN: SWBatchNormLayerDesc - - /// The pre-activation function of the residual block. - let preActivation: ActivationKind - - /// The convolutional layer before the residual block. - let preConv: SWConvLayerDesc - - /// The list of blocks that make up the trunk - let blockDescriptors: [BlockDescriptor] - - /// The batch normalization layer after the residual block. - let postBN: SWBatchNormLayerDesc - - /// The activation function after the post batch normalization layer. - let postActivation: ActivationKind - - /// The convolutional layer after the post activation layer. - let postConv: SWConvLayerDesc - - /// Initialize a SWNestedBottleneckResidualBlockDesc object. 
- /// - Parameters: - /// - preBN: The batch normalization layer before the residual block. - /// - preActivation: The pre-activation function of the residual block. - /// - preConv: The convolutional layer before the residual block. - /// - postBN: The batch normalization layer after the residual block. - /// - postActivation: The activation function after the post batch normalization layer. - /// - postConv: The convolutional layer after the post activation layer. - init( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - preConv: SWConvLayerDesc, - blockDescriptors: [BlockDescriptor], - postBN: SWBatchNormLayerDesc, - postActivation: ActivationKind, - postConv: SWConvLayerDesc - ) { - self.preBN = preBN - self.preActivation = preActivation - self.preConv = preConv - self.blockDescriptors = blockDescriptors - self.postBN = postBN - self.postActivation = postActivation - self.postConv = postConv - } -} - -public func createSWNestedBottleneckResidualBlockDesc( - preBN: SWBatchNormLayerDesc, - preActivation: ActivationKind, - preConv: SWConvLayerDesc, - blockDescriptors: [BlockDescriptor], - postBN: SWBatchNormLayerDesc, - postActivation: ActivationKind, - postConv: SWConvLayerDesc -) -> SWNestedBottleneckResidualBlockDesc { - return SWNestedBottleneckResidualBlockDesc( - preBN: preBN, - preActivation: preActivation, - preConv: preConv, - blockDescriptors: blockDescriptors, - postBN: postBN, - postActivation: postActivation, - postConv: postConv) -} - -public class BlockDescriptor { -} - -public class BlockDescriptorBuilder { - public var blockDescriptors: [BlockDescriptor] = [] - - public func enque(with descriptor: BlockDescriptor) { - blockDescriptors.append(descriptor) - } -} - -public func createBlockDescriptorBuilder() -> BlockDescriptorBuilder { - return BlockDescriptorBuilder() -} - -/// A structure that represents a block stack -struct BlockStack { - /// The resulting tensor after processing the block stack - let resultTensor: MPSGraphTensor 
- - /// Process block descriptors - /// - Parameters: - /// - graph: The MPSGraph - /// - sourceTensor: The input tensor - /// - maskTensor: The mask tensor - /// - maskSumTensor: The sum of the mask tensor - /// - maskSumSqrtS14M01Tensor: The square root of the sum of the mask tensor - /// - blockDescriptors: The block descriptors - /// - index: The index of the block descriptor - /// - nnXLen: X length - /// - nnYLen: Y length - /// - Returns: The result tensor - static func processBlockDescriptors( - _ graph: MPSGraph, - _ sourceTensor: MPSGraphTensor, - _ maskTensor: MPSGraphTensor, - _ maskSumTensor: MPSGraphTensor, - _ maskSumSqrtS14M01Tensor: MPSGraphTensor, - _ blockDescriptors: [BlockDescriptor], - _ index: Int, - _ nnXLen: NSNumber, - _ nnYLen: NSNumber - ) -> MPSGraphTensor { - guard index < blockDescriptors.count else { - return sourceTensor - } - - let blockDescriptor = blockDescriptors[index] - let blockInput: MPSGraphTensor - - switch blockDescriptor { - case let globalPoolingDescriptor as SWGlobalPoolingResidualBlockDesc: - let globalPooling = GlobalPoolingResidualBlock( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, - descriptor: globalPoolingDescriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - blockInput = globalPooling.resultTensor - case let nestedBottleneckDescriptor as SWNestedBottleneckResidualBlockDesc: - let nestedBottleneck = NestedBottleneckResidualBlock( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, - descriptor: nestedBottleneckDescriptor, - nnXLen: nnXLen, - nnYLen: nnYLen) - - blockInput = nestedBottleneck.resultTensor - case let residualBlockDescriptor as SWResidualBlockDesc: - let ordinary = ResidualBlock( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - descriptor: residualBlockDescriptor, - 
nnXLen: nnXLen, - nnYLen: nnYLen) - - blockInput = ordinary.resultTensor - default: - blockInput = sourceTensor - } - - return processBlockDescriptors( - graph, - blockInput, - maskTensor, - maskSumTensor, - maskSumSqrtS14M01Tensor, - blockDescriptors, - index + 1, - nnXLen, - nnYLen) - } - - /// Initialize a BlockStack object - /// - Parameters: - /// - graph: The MPSGraph - /// - sourceTensor: The input tensor - /// - maskTensor: The mask tensor - /// - maskSumTensor: The sum of the mask tensor - /// - maskSumSqrtS14M01Tensor: The square root of the sum of the mask tensor - /// - blockDescriptors: The block descriptors - /// - nnXLen: X length - /// - nnYLen: Y length - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - blockDescriptors: [BlockDescriptor], - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - resultTensor = BlockStack.processBlockDescriptors( - graph, - sourceTensor, - maskTensor, - maskSumTensor, - maskSumSqrtS14M01Tensor, - blockDescriptors, - 0, - nnXLen, - nnYLen) - } -} - -/// A structure that represents a nested bottleneck residual block -struct NestedBottleneckResidualBlock { - /// The resulting tensor after processing the nested bottleneck residual block - let resultTensor: MPSGraphTensor - - /// Initialize a ResidualBlock object - /// - /// - Parameters: - /// - graph: The MPSGraph - /// - sourceTensor: The input tensor - /// - maskTensor: The mask tensor - /// - maskSumTensor: The sum of the mask tensor - /// - maskSumSqrtS14M01Tensor: The square root of the sum of the mask tensor - /// - descriptor: The nested bottleneck residual block descriptor - /// - nnXLen: X length - /// - nnYLen: Y length - init( - graph: MPSGraph, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - descriptor: SWNestedBottleneckResidualBlockDesc, - nnXLen: NSNumber, - 
nnYLen: NSNumber - ) { - - let preBN = BatchNormLayer( - graph: graph, - sourceTensor: sourceTensor, - maskTensor: maskTensor, - descriptor: descriptor.preBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let preActivation = ActivationLayer( - graph: graph, - sourceTensor: preBN.resultTensor, - activationKind: descriptor.preActivation) - - let preConv = ConvLayer( - graph: graph, - sourceTensor: preActivation.resultTensor, - descriptor: descriptor.preConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let blocks = BlockStack( - graph: graph, - sourceTensor: preConv.resultTensor, - maskTensor: maskTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, - blockDescriptors: descriptor.blockDescriptors, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let postBN = BatchNormLayer( - graph: graph, - sourceTensor: blocks.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.postBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let postActivation = ActivationLayer( - graph: graph, - sourceTensor: postBN.resultTensor, - activationKind: descriptor.postActivation) - - let postConv = ConvLayer( - graph: graph, - sourceTensor: postActivation.resultTensor, - descriptor: descriptor.postConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - resultTensor = graph.addition( - sourceTensor, - postConv.resultTensor, - name: nil) - - assert(resultTensor.shape?.count == 4) - } -} - -/// Class representing the description of the SGF Metadata Encoder. -/// -/// This encoder consists of three matrix multiplication layers, each followed by a bias and an activation function. -public class SWSGFMetadataEncoderDesc { - /// Version of the SGF Metadata Encoder. - let version: Int - - /// Number of input metadata channels. - let numInputMetaChannels: Int - - /// Description of the first multiplication layer. - let mul1: SWMatMulLayerDesc - - /// Description of the bias for the first layer. - let bias1: SWMatBiasLayerDesc - - /// Activation kind for the first layer. 
- let act1: ActivationKind - - /// Description of the second multiplication layer. - let mul2: SWMatMulLayerDesc - - /// Description of the bias for the second layer. - let bias2: SWMatBiasLayerDesc - - /// Activation kind for the second layer. - let act2: ActivationKind - - /// Description of the third multiplication layer. - let mul3: SWMatMulLayerDesc - - /// Initializes a new instance of the `SWSGFMetadataEncoderDesc` class. - /// - /// - Parameters: - /// - version: The version of the SGF Metadata Encoder. - /// - numInputMetaChannels: The number of input metadata channels. - /// - mul1: Description of the first multiplication layer. - /// - bias1: Description of the bias for the first layer. - /// - act1: Activation kind for the first layer. - /// - mul2: Description of the second multiplication layer. - /// - bias2: Description of the bias for the second layer. - /// - act2: Activation kind for the second layer. - /// - mul3: Description of the third multiplication layer. - init( - version: Int, - numInputMetaChannels: Int, - mul1: SWMatMulLayerDesc, - bias1: SWMatBiasLayerDesc, - act1: ActivationKind, - mul2: SWMatMulLayerDesc, - bias2: SWMatBiasLayerDesc, - act2: ActivationKind, - mul3: SWMatMulLayerDesc - ) { - self.version = version - self.numInputMetaChannels = numInputMetaChannels - self.mul1 = mul1 - self.bias1 = bias1 - self.act1 = act1 - self.mul2 = mul2 - self.bias2 = bias2 - self.act2 = act2 - self.mul3 = mul3 - } -} - -/// Creates an instance of `SWSGFMetadataEncoderDesc` using the specified parameters. -/// -/// - Parameters: -/// - version: An `Int32` representing the version of the encoder descriptor. -/// - numInputMetaChannels: An `Int32` specifying the number of input metadata channels. -/// - mul1: A `SWMatMulLayerDesc` representing the description of the first matrix multiplication layer. -/// - bias1: A `SWMatBiasLayerDesc` representing the description of the bias for the first layer. 
-/// - act1: An `ActivationKind` specifying the activation function applied after the first layer. -/// - mul2: A `SWMatMulLayerDesc` representing the description of the second matrix multiplication layer. -/// - bias2: A `SWMatBiasLayerDesc` representing the description of the bias for the second layer. -/// - act2: An `ActivationKind` specifying the activation function applied after the second layer. -/// - mul3: A `SWMatMulLayerDesc` representing the description of the third matrix multiplication layer. -/// -/// - Returns: -/// An instance of `SWSGFMetadataEncoderDesc` initialized with the provided parameters. -public func createSWSGFMetadataEncoderDesc( - version: Int32, - numInputMetaChannels: Int32, - mul1: SWMatMulLayerDesc, - bias1: SWMatBiasLayerDesc, - act1: ActivationKind, - mul2: SWMatMulLayerDesc, - bias2: SWMatBiasLayerDesc, - act2: ActivationKind, - mul3: SWMatMulLayerDesc -) -> SWSGFMetadataEncoderDesc? { - return SWSGFMetadataEncoderDesc( - version: Int(version), - numInputMetaChannels: Int(numInputMetaChannels), - mul1: mul1, - bias1: bias1, - act1: act1, - mul2: mul2, - bias2: bias2, - act2: act2, - mul3: mul3) -} - -/// A class that describes SGF metadata encoder. -/// SGFMetadataEncoder takes a graph, a descriptor object defining various parameters for the encoding process, -/// and an input tensor, and performs a sequence of matrix multiplications, bias additions, and activation functions -/// to produce a final encoded tensor. -class SGFMetadataEncoder { - /// The resulting tensor after encoding the metadata. - let resultTensor: MPSGraphTensor - - /// Initializes an `SGFMetadataEncoder` instance and performs the encoding process. - /// - /// - Parameters: - /// - graph: The computational graph object used to define and manage tensor operations. - /// - descriptor: An object holding all the required parameters, including matrix multiplication, biases, - /// and activation functions for each layer. 
- /// - sourceTensor: The initial input tensor containing the metadata to be encoded. - init( - graph: MPSGraph, - descriptor: SWSGFMetadataEncoderDesc, - sourceTensor: MPSGraphTensor - ) { - - // First matrix multiplication layer. - let mul1 = MatMulLayer( - graph: graph, - descriptor: descriptor.mul1, - sourceTensor: sourceTensor) - - // Adding bias to the result of the first matrix multiplication. - let bias1 = MatBiasLayer( - graph: graph, - descriptor: descriptor.bias1, - sourceTensor: mul1.resultTensor) +// NOTE: Model caching and conversion are now handled in C++ using the native katagocoreml library. +// The Python-based CoreMLConverter and ModelCacheManager have been removed to eliminate Python dependency. - // Applying the first activation function to the biased tensor. - let act1 = ActivationLayer( - graph: graph, - sourceTensor: bias1.resultTensor, - activationKind: descriptor.act1) - - // Second matrix multiplication layer taking the output of the first activation layer. - let mul2 = MatMulLayer( - graph: graph, - descriptor: descriptor.mul2, - sourceTensor: act1.resultTensor) - - // Adding bias to the result of the second matrix multiplication. - let bias2 = MatBiasLayer( - graph: graph, - descriptor: descriptor.bias2, - sourceTensor: mul2.resultTensor) - - // Applying the second activation function to the biased tensor. - let act2 = ActivationLayer( - graph: graph, - sourceTensor: bias2.resultTensor, - activationKind: descriptor.act2) - - // Third and final matrix multiplication layer taking the output of the second activation layer. - let mul3 = MatMulLayer( - graph: graph, - descriptor: descriptor.mul3, - sourceTensor: act2.resultTensor) - - // Setting the final result tensor to the output of the last matrix multiplication layer. 
- resultTensor = mul3.resultTensor - - assert(resultTensor.shape?.count == 2) - } -} +/// Context storing board dimensions and settings +public class MetalComputeContext { + public let nnXLen: Int32 + public let nnYLen: Int32 + public let useFP16: Bool -/// A class that describes a trunk for a neural network -public class SWTrunkDesc { - /// The version of the ResNet trunk - let version: Int - /// Number of channels for the trunk - let trunkNumChannels: NSNumber - /// Number of channels for the mid section - let midNumChannels: NSNumber - /// Number of channels for the regular section - let regularNumChannels: NSNumber - /// Number of channels for the global pooling section - let gpoolNumChannels: NSNumber - /// The description of the initial convolutional layer - let initialConv: SWConvLayerDesc - /// The description of the initial matrix multiplication layer - let initialMatMul: SWMatMulLayerDesc - /// The description of the SGF metadata encoder - let sgfMetadataEncoder: SWSGFMetadataEncoderDesc? 
- /// The list of blocks that make up the trunk - let blockDescriptors: [BlockDescriptor] - /// The description of the batch normalization layer that is applied at the end of the trunk - let trunkTipBN: SWBatchNormLayerDesc - /// The activation function that is applied at the end of the trunk - let trunkTipActivation: ActivationKind - - /// Initializes a SWTrunkDesc object - /// - Parameters: - /// - version: The version of the ResNet trunk - /// - trunkNumChannels: Number of channels for the trunk - /// - midNumChannels: Number of channels for the mid section - /// - regularNumChannels: Number of channels for the regular section - /// - gpoolNumChannels: Number of channels for the global pooling section - /// - initialConv: The description of the initial convolutional layer - /// - initialMatMul: The description of the initial matrix multiplication layer - /// - sgfMetadataEncoder: The description of the SGF metadata encoder - /// - blockDescriptors: The list of blocks that make up the trunk - /// - trunkTipBN: The description of the batch normalization layer that is applied at the end of the trunk - /// - trunkTipActivation: The activation function that is applied at the end of the trunk - init( - version: Int, - trunkNumChannels: NSNumber, - midNumChannels: NSNumber, - regularNumChannels: NSNumber, - gpoolNumChannels: NSNumber, - initialConv: SWConvLayerDesc, - initialMatMul: SWMatMulLayerDesc, - sgfMetadataEncoder: SWSGFMetadataEncoderDesc?, - blockDescriptors: [BlockDescriptor], - trunkTipBN: SWBatchNormLayerDesc, - trunkTipActivation: ActivationKind - ) { - self.version = version - self.trunkNumChannels = trunkNumChannels - self.midNumChannels = midNumChannels - self.regularNumChannels = regularNumChannels - self.gpoolNumChannels = gpoolNumChannels - self.initialConv = initialConv - self.initialMatMul = initialMatMul - self.sgfMetadataEncoder = sgfMetadataEncoder - self.blockDescriptors = blockDescriptors - self.trunkTipBN = trunkTipBN - 
self.trunkTipActivation = trunkTipActivation + init(nnXLen: Int32, nnYLen: Int32, useFP16: Bool) { + self.nnXLen = nnXLen + self.nnYLen = nnYLen + self.useFP16 = useFP16 } } -public func createSWTrunkDesc( - version: Int32, - trunkNumChannels: Int32, - midNumChannels: Int32, - regularNumChannels: Int32, - gpoolNumChannels: Int32, - initialConv: SWConvLayerDesc, - initialMatMul: SWMatMulLayerDesc, - sgfMetadataEncoder: SWSGFMetadataEncoderDesc?, - blockDescriptors: [BlockDescriptor], - trunkTipBN: SWBatchNormLayerDesc, - trunkTipActivation: ActivationKind -) -> SWTrunkDesc { - return SWTrunkDesc( - version: Int(version), - trunkNumChannels: trunkNumChannels as NSNumber, - midNumChannels: midNumChannels as NSNumber, - regularNumChannels: regularNumChannels as NSNumber, - gpoolNumChannels: gpoolNumChannels as NSNumber, - initialConv: initialConv, - initialMatMul: initialMatMul, - sgfMetadataEncoder: sgfMetadataEncoder, - blockDescriptors: blockDescriptors, - trunkTipBN: trunkTipBN, - trunkTipActivation: trunkTipActivation) +/// Create a Metal compute context +public func createMetalComputeContext( + nnXLen: Int32, + nnYLen: Int32, + useFP16: Bool +) -> MetalComputeContext { + return MetalComputeContext(nnXLen: nnXLen, nnYLen: nnYLen, useFP16: useFP16) } -/// A structure representing a ResNet trunk for a neural network -struct Trunk { - /// The resulting tensor after processing the trunk - let resultTensor: MPSGraphTensor - - /// Returns the block source tensor by processing the input meta tensor, if available, and adding a bias term. - /// - /// - Parameters: - /// - graph: The Metal Performance Shaders (MPS) graph. - /// - descriptor: The SGF metadata encoder descriptor. - /// - initialAdd: The initial add operation result tensor. - /// - inputMetaTensor: The input meta tensor. - /// - nnXLen: The X length of the neural network (NN). - /// - nnYLen: The Y length of the neural network (NN). 
- /// - numChannels: The number of channels of the initial add operation result tensor. - /// - /// - Returns: - /// - blockSourceTensor: The processed block source tensor. - /// - /// This function is used to get the block source tensor by processing the input meta tensor, if available. - /// If the input meta tensor is not available, it returns the result tensor from the initial add operation. - /// The function uses SGF metadata encoder and AddNCBiasLayer to process the input meta tensor. - static func getBlockSourceTensor( - graph: MPSGraph, - descriptor: SWSGFMetadataEncoderDesc?, - initialAdd: AddNCBiasLayer, - inputMetaTensor: MPSGraphTensor?, - nnXLen: NSNumber, - nnYLen: NSNumber, - numChannels: NSNumber - ) -> MPSGraphTensor { - var blockSourceTensor: MPSGraphTensor - - if let inputMetaTensor, - let descriptor, descriptor.numInputMetaChannels > 0 - { - let encoded = SGFMetadataEncoder( - graph: graph, - descriptor: descriptor, - sourceTensor: inputMetaTensor) - - let encodedAdd = AddNCBiasLayer( - graph: graph, - sourceTensor: initialAdd.resultTensor, - biasTensor: encoded.resultTensor, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: numChannels) - - blockSourceTensor = encodedAdd.resultTensor - } else { - blockSourceTensor = initialAdd.resultTensor - } - - return blockSourceTensor +/// Handle that wraps the loaded MLModel for inference +public class CoreMLComputeHandle { + let model: MLModel + let nnXLen: Int32 + let nnYLen: Int32 + let optimizeIdentityMask: Bool + let numInputChannels: Int + let numInputGlobalChannels: Int + let numInputMetaChannels: Int + let numPolicyChannels: Int + let numValueChannels: Int + let numScoreValueChannels: Int + let numOwnershipChannels: Int + + /// Model input/output names matching KataGoCoremltools output + struct IONames { + static let spatialInput = "spatial_input" + static let globalInput = "global_input" + static let inputMask = "input_mask" + static let metaInput = "meta_input" + + static let policyOutput = 
"policy_p2_conv" + static let policyPassOutput = "policy_pass" + static let valueOutput = "value_v3_bias" + static let ownershipOutput = "value_ownership_conv" + static let scoreValueOutput = "value_sv3_bias" + } + + init(model: MLModel, nnXLen: Int32, nnYLen: Int32, + optimizeIdentityMask: Bool, + numInputChannels: Int, + numInputGlobalChannels: Int, + numInputMetaChannels: Int, + numPolicyChannels: Int, + numValueChannels: Int, + numScoreValueChannels: Int, + numOwnershipChannels: Int) { + self.model = model + self.nnXLen = nnXLen + self.nnYLen = nnYLen + self.optimizeIdentityMask = optimizeIdentityMask + self.numInputChannels = numInputChannels + self.numInputGlobalChannels = numInputGlobalChannels + self.numInputMetaChannels = numInputMetaChannels + self.numPolicyChannels = numPolicyChannels + self.numValueChannels = numValueChannels + self.numScoreValueChannels = numScoreValueChannels + self.numOwnershipChannels = numOwnershipChannels } - /// Initializes a Trunk object - /// - Parameters: - /// - graph: The graph used to build the trunk - /// - descriptor: A SWTrunkDesc object that describes the trunk - /// - inputTensor: The input tensor - /// - inputGlobalTensor: The input global tensor - /// - inputMetaTensor: The input meta tensor - /// - maskTensor: The tensor used to mask input activations - /// - maskSumTensor: The sum of the mask tensor - /// - maskSumSqrtS14M01Tensor: The square root of the sum of the mask tensor - /// - nnXLen: The length of the X dimension of the input tensor - /// - nnYLen: The length of the Y dimension of the input tensor - init( - graph: MPSGraph, - descriptor: SWTrunkDesc, - inputTensor: MPSGraphTensor, - inputGlobalTensor: MPSGraphTensor, - inputMetaTensor: MPSGraphTensor?, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - nnXLen: NSNumber, - nnYLen: NSNumber + /// Run inference on a batch of inputs + public func apply( + spatialInput: UnsafeMutablePointer, + 
globalInput: UnsafeMutablePointer, + metaInput: UnsafeMutablePointer, + maskInput: UnsafeMutablePointer, + policy: UnsafeMutablePointer, + policyPass: UnsafeMutablePointer, + value: UnsafeMutablePointer, + scoreValue: UnsafeMutablePointer, + ownership: UnsafeMutablePointer, + batchSize: Int ) { + // Process batch elements in parallel using Grand Central Dispatch + // Each inference is independent, reading/writing to different buffer offsets + DispatchQueue.concurrentPerform(iterations: batchSize) { b in + autoreleasepool { + do { + try runSingleInference( + batchIndex: b, + spatialInput: spatialInput, + globalInput: globalInput, + metaInput: metaInput, + maskInput: maskInput, + policy: policy, + policyPass: policyPass, + value: value, + scoreValue: scoreValue, + ownership: ownership + ) + } catch { + fatalError("Metal backend: CoreML inference error: \(error)") + } + } + } + } - let initialConv = ConvLayer( - graph: graph, - sourceTensor: inputTensor, - descriptor: descriptor.initialConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let initialMatMul = MatMulLayer( - graph: graph, - descriptor: descriptor.initialMatMul, - sourceTensor: inputGlobalTensor) - - let initialAdd = AddNCBiasLayer( - graph: graph, - sourceTensor: initialConv.resultTensor, - biasTensor: initialMatMul.resultTensor, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.initialMatMul.outChannels) - - let blockSourceTensor = Trunk.getBlockSourceTensor( - graph: graph, - descriptor: descriptor.sgfMetadataEncoder, - initialAdd: initialAdd, - inputMetaTensor: inputMetaTensor, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.initialMatMul.outChannels) - - let blocks = BlockStack( - graph: graph, - sourceTensor: blockSourceTensor, - maskTensor: maskTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, - blockDescriptors: descriptor.blockDescriptors, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let trunkTipBN = BatchNormLayer( - graph: graph, - 
sourceTensor: blocks.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.trunkTipBN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let trunkTipActivation = ActivationLayer( - graph: graph, - sourceTensor: trunkTipBN.resultTensor, - activationKind: descriptor.trunkTipActivation) - - resultTensor = trunkTipActivation.resultTensor + private func runSingleInference( + batchIndex: Int, + spatialInput: UnsafeMutablePointer, + globalInput: UnsafeMutablePointer, + metaInput: UnsafeMutablePointer, + maskInput: UnsafeMutablePointer, + policy: UnsafeMutablePointer, + policyPass: UnsafeMutablePointer, + value: UnsafeMutablePointer, + scoreValue: UnsafeMutablePointer, + ownership: UnsafeMutablePointer + ) throws { + let spatialSize = Int(nnXLen) * Int(nnYLen) * numInputChannels + let spatialOffset = batchIndex * spatialSize + + // Create MLMultiArray for spatial input (1, C, H, W) + let spatialArray = try MLMultiArray( + shape: [1, NSNumber(value: numInputChannels), + NSNumber(value: nnYLen), NSNumber(value: nnXLen)], + dataType: .float32) + + // Copy spatial data using fast memcpy + let spatialPtr = spatialArray.dataPointer.assumingMemoryBound(to: Float32.self) + memcpy(spatialPtr, spatialInput.advanced(by: spatialOffset), spatialSize * MemoryLayout.size) + + // Create global input array (1, C) - rank 2 as expected by converter + let globalArray = try MLMultiArray( + shape: [1, NSNumber(value: numInputGlobalChannels)], + dataType: .float32) + let globalPtr = globalArray.dataPointer.assumingMemoryBound(to: Float32.self) + let globalOffset = batchIndex * numInputGlobalChannels + memcpy(globalPtr, globalInput.advanced(by: globalOffset), numInputGlobalChannels * MemoryLayout.size) + + // Build feature provider dictionary + var inputDict: [String: MLFeatureValue] = [ + IONames.spatialInput: MLFeatureValue(multiArray: spatialArray), + IONames.globalInput: MLFeatureValue(multiArray: globalArray) + ] - assert(resultTensor.shape?.count == 4) - } -} + // Add mask input (always 
required, even with optimize_identity_mask=True) + // When optimize_identity_mask=True, the mask is still required as input but + // internal mask operations are optimized away for ~6.5% speedup + let maskArray = try MLMultiArray( + shape: [1, 1, NSNumber(value: nnYLen), NSNumber(value: nnXLen)], + dataType: .float32) + let maskPtr = maskArray.dataPointer.assumingMemoryBound(to: Float32.self) + let maskSize = Int(nnXLen) * Int(nnYLen) + let maskOffset = batchIndex * maskSize + memcpy(maskPtr, maskInput.advanced(by: maskOffset), maskSize * MemoryLayout.size) + inputDict[IONames.inputMask] = MLFeatureValue(multiArray: maskArray) + + // Add meta input if model has it + if numInputMetaChannels > 0 { + let metaArray = try MLMultiArray( + shape: [1, NSNumber(value: numInputMetaChannels)], + dataType: .float32) + let metaPtr = metaArray.dataPointer.assumingMemoryBound(to: Float32.self) + let metaOffset = batchIndex * numInputMetaChannels + memcpy(metaPtr, metaInput.advanced(by: metaOffset), numInputMetaChannels * MemoryLayout.size) + inputDict[IONames.metaInput] = MLFeatureValue(multiArray: metaArray) + } -/// A class that describes a policy head for a neural network, responsible for predicting -/// the best moves for the current player and the opposing player on the subsequent turn. 
-public struct SWPolicyHeadDesc { - /// The version of the policy head - let version: Int - /// The 1x1 convolution layer for P - let p1Conv: SWConvLayerDesc - /// The 1x1 convolution layer for G - let g1Conv: SWConvLayerDesc - /// The batch normalization layer for G - let g1BN: SWBatchNormLayerDesc - /// The activation function for G - let g1Activation: ActivationKind - /// The global pooling bias structure that pools the output of G to bias the output of P - let gpoolToBiasMul: SWMatMulLayerDesc - /// The batch normalization layer for P - let p1BN: SWBatchNormLayerDesc - /// The activation function for P - let p1Activation: ActivationKind - /// The 1x1 convolution layer with 2 channels for outputting two policy distributions - let p2Conv: SWConvLayerDesc - /// The fully connected linear layer for outputting logits for the pass move - let gpoolToPassMul: SWMatMulLayerDesc - /// The description of the bias layer that is applied to the output of the matrix multiplication layer for model version >= 15 - let gpoolToPassBias: SWMatBiasLayerDesc? - /// The activation function for the bias layer in model version >= 15 - let passActivation: ActivationKind? - /// The fully connected linear layer for outputting logits for the pass move in model version >= 15 - let gpoolToPassMul2: SWMatMulLayerDesc? 
- - /// Initializes a SWPolicyHeadDesc object with the given parameters - /// - Parameters: - /// - version: The version of the policy head - /// - p1Conv: The 1x1 convolution layer for P - /// - g1Conv: The 1x1 convolution layer for G - /// - g1BN: The batch normalization layer for G - /// - g1Activation: The activation function for G - /// - gpoolToBiasMul: The global pooling bias structure that pools the output of G to bias the output of P - /// - p1BN: The batch normalization layer for P - /// - p1Activation: The activation function for P - /// - p2Conv: The 1x1 convolution layer with 2 channels for outputting two policy distributions - /// - gpoolToPassMul: The fully connected linear layer for outputting logits for the pass move - init( - version: Int, - p1Conv: SWConvLayerDesc, - g1Conv: SWConvLayerDesc, - g1BN: SWBatchNormLayerDesc, - g1Activation: ActivationKind, - gpoolToBiasMul: SWMatMulLayerDesc, - p1BN: SWBatchNormLayerDesc, - p1Activation: ActivationKind, - p2Conv: SWConvLayerDesc, - gpoolToPassMul: SWMatMulLayerDesc, - gpoolToPassBias: SWMatBiasLayerDesc?, - passActivation: ActivationKind?, - gpoolToPassMul2: SWMatMulLayerDesc? 
- ) { - self.version = version - self.p1Conv = p1Conv - self.g1Conv = g1Conv - self.g1BN = g1BN - self.g1Activation = g1Activation - self.gpoolToBiasMul = gpoolToBiasMul - self.p1BN = p1BN - self.p1Activation = p1Activation - self.p2Conv = p2Conv - self.gpoolToPassMul = gpoolToPassMul - self.gpoolToPassBias = gpoolToPassBias - self.passActivation = passActivation - self.gpoolToPassMul2 = gpoolToPassMul2 - - assert( - (version >= 15) - || ((gpoolToPassBias == nil) && (passActivation == nil) && (gpoolToPassMul2 == nil)) + // Run prediction + let featureProvider = try MLDictionaryFeatureProvider(dictionary: inputDict) + let prediction = try model.prediction(from: featureProvider) + + // Extract outputs and copy to output buffers + extractOutputs( + prediction: prediction, + batchIndex: batchIndex, + policy: policy, + policyPass: policyPass, + value: value, + scoreValue: scoreValue, + ownership: ownership ) - assert( - (version < 15) - || ((gpoolToPassBias != nil) && (passActivation != nil) && (gpoolToPassMul2 != nil)) - ) - } -} - -public func createSWPolicyHeadDesc( - version: Int32, - p1Conv: SWConvLayerDesc, - g1Conv: SWConvLayerDesc, - g1BN: SWBatchNormLayerDesc, - g1Activation: ActivationKind, - gpoolToBiasMul: SWMatMulLayerDesc, - p1BN: SWBatchNormLayerDesc, - p1Activation: ActivationKind, - p2Conv: SWConvLayerDesc, - gpoolToPassMul: SWMatMulLayerDesc, - gpoolToPassBias: SWMatBiasLayerDesc, - passActivation: ActivationKind, - gpoolToPassMul2: SWMatMulLayerDesc -) -> SWPolicyHeadDesc { - if version >= 15 { - return SWPolicyHeadDesc( - version: Int(version), - p1Conv: p1Conv, - g1Conv: g1Conv, - g1BN: g1BN, - g1Activation: g1Activation, - gpoolToBiasMul: gpoolToBiasMul, - p1BN: p1BN, - p1Activation: p1Activation, - p2Conv: p2Conv, - gpoolToPassMul: gpoolToPassMul, - gpoolToPassBias: gpoolToPassBias, - passActivation: passActivation, - gpoolToPassMul2: gpoolToPassMul2) - } else { - return SWPolicyHeadDesc( - version: Int(version), - p1Conv: p1Conv, - g1Conv: 
g1Conv, - g1BN: g1BN, - g1Activation: g1Activation, - gpoolToBiasMul: gpoolToBiasMul, - p1BN: p1BN, - p1Activation: p1Activation, - p2Conv: p2Conv, - gpoolToPassMul: gpoolToPassMul, - gpoolToPassBias: nil, - passActivation: nil, - gpoolToPassMul2: nil) } -} - -/// A structure that represents a policy head of a neural network. -struct PolicyHead { - /// The tensor that holds the policy prediction of the neural network - let policyTensor: MPSGraphTensor - /// The tensor that holds the policy pass of the neural network - let policyPassTensor: MPSGraphTensor - - /// Initializes a PolicyHead object - /// - Parameters: - /// - graph: The MPSGraph object to which the policy head is added - /// - descriptor: The description of the policy head - /// - sourceTensor: The input tensor to the policy head - /// - maskTensor: The mask tensor for the input tensor - /// - maskSumTensor: The sum of the mask tensor - /// - maskSumSqrtS14M01Tensor: The square root of the sum of the mask tensor and a small epsilon - /// - nnXLen: The number of X pixels in the input tensor - /// - nnYLen: The number of Y pixels in the input tensor - init( - graph: MPSGraph, - descriptor: SWPolicyHeadDesc, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - nnXLen: NSNumber, - nnYLen: NSNumber - ) { - - let p1Conv = ConvLayer( - graph: graph, - sourceTensor: sourceTensor, - descriptor: descriptor.p1Conv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let g1Conv = ConvLayer( - graph: graph, - sourceTensor: sourceTensor, - descriptor: descriptor.g1Conv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let g1BN = BatchNormLayer( - graph: graph, - sourceTensor: g1Conv.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.g1BN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let g1Activation = ActivationLayer( - graph: graph, - sourceTensor: g1BN.resultTensor, - activationKind: descriptor.g1Activation) - - let g1Concat = 
GlobalPoolingLayer( - graph: graph, - sourceTensor: g1Activation.resultTensor, - maskTensor: maskTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor) - - assert(g1Concat.resultTensor.shape?[1] == descriptor.gpoolToBiasMul.inChannels) - - let gpoolToBiasMul = MatMulLayer( - graph: graph, - descriptor: descriptor.gpoolToBiasMul, - sourceTensor: g1Concat.resultTensor) - - let added = AddNCBiasLayer( - graph: graph, - sourceTensor: p1Conv.resultTensor, - biasTensor: gpoolToBiasMul.resultTensor, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.gpoolToBiasMul.outChannels) - - let p1BN = BatchNormLayer( - graph: graph, - sourceTensor: added.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.p1BN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let p1Activation = ActivationLayer( - graph: graph, - sourceTensor: p1BN.resultTensor, - activationKind: descriptor.p1Activation) - let p2Conv = ConvLayer( - graph: graph, - sourceTensor: p1Activation.resultTensor, - descriptor: descriptor.p2Conv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - policyTensor = p2Conv.resultTensor - - assert(g1Concat.resultTensor.shape?[1] == descriptor.gpoolToPassMul.inChannels) - - let gpoolToPassMul = MatMulLayer( - graph: graph, - descriptor: descriptor.gpoolToPassMul, - sourceTensor: g1Concat.resultTensor) - - if let gpoolToPassBias = descriptor.gpoolToPassBias, - let passActivation = descriptor.passActivation, - let gpoolToPassMul2 = descriptor.gpoolToPassMul2 - { - assert(descriptor.version >= 15) - - let gpoolToPassBiasLayer = MatBiasLayer( - graph: graph, - descriptor: gpoolToPassBias, - sourceTensor: gpoolToPassMul.resultTensor) - - let passActivationLayer = ActivationLayer( - graph: graph, - sourceTensor: gpoolToPassBiasLayer.resultTensor, - activationKind: passActivation) - - let gpoolToPassMul2Layer = MatMulLayer( - graph: graph, - descriptor: gpoolToPassMul2, - sourceTensor: passActivationLayer.resultTensor) + /// Copy MLMultiArray data 
to destination buffer, respecting strides. + /// Core ML may return non-contiguous arrays, especially for spatial outputs after GPU computation. + private func copyMultiArray( + _ array: MLMultiArray, + to dest: UnsafeMutablePointer, + destOffset: Int + ) { + let shape = array.shape.map { $0.intValue } + let strides = array.strides.map { $0.intValue } + let ptr = array.dataPointer.assumingMemoryBound(to: Float32.self) + let totalElements = shape.reduce(1, *) + + // Check if contiguous (strides match expected for row-major C-order) + var isContiguous = true + var expectedStride = 1 + for i in (0...size) } else { - assert(descriptor.version < 15) - policyPassTensor = gpoolToPassMul.resultTensor + // Slow path: copy with strides (handles non-contiguous layouts) + copyWithStrides( + from: ptr, + to: dest, + destOffset: destOffset, + shape: shape, + strides: strides, + dim: 0, + srcOffset: 0, + destIdx: 0 + ) } - - assert(policyTensor.shape?.count == 4) - assert(policyPassTensor.shape?.count == 2) } -} -/// A struct that describes the value head of a neural network -public struct SWValueHeadDesc { - /// The version of the value head - let version: Int - /// The description of the first convolutional layer in the value head - let v1Conv: SWConvLayerDesc - /// The description of the batch normalization layer after the first convolutional layer in the value head - let v1BN: SWBatchNormLayerDesc - /// The activation function that is applied after the first batch normalization layer in the value head - let v1Activation: ActivationKind - /// The description of the matrix multiplication layer that is applied to the output of the first convolutional layer in the value head - let v2Mul: SWMatMulLayerDesc - /// The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - let v2Bias: SWMatBiasLayerDesc - /// The activation function that is applied after the bias layer in the value head - let v2Activation: ActivationKind - 
/// The description of the matrix multiplication layer that is applied to the output of the bias layer in the value head - let v3Mul: SWMatMulLayerDesc - /// The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - let v3Bias: SWMatBiasLayerDesc - /// The description of the matrix multiplication layer that is applied to the output of the third bias layer in the value head - let sv3Mul: SWMatMulLayerDesc - /// The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - let sv3Bias: SWMatBiasLayerDesc - /// The description of the convolutional layer that is applied to the board ownership map in the value head - let vOwnershipConv: SWConvLayerDesc - - /// Initializes a SWValueHeadDesc object - /// - Parameters: - /// - version: The version of the value head - /// - v1Conv: The description of the first convolutional layer in the value head - /// - v1BN: The description of the batch normalization layer after the first convolutional layer in the value head - /// - v1Activation: The activation function that is applied after the first batch normalization layer in the value head - /// - v2Mul: The description of the matrix multiplication layer that is applied to the output of the first convolutional layer in the value head - /// - v2Bias: The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - /// - v2Activation: The activation function that is applied after the bias layer in the value head - /// - v3Mul: The description of the matrix multiplication layer that is applied to the output of the bias layer in the value head - /// - v3Bias: The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - /// - sv3Mul: The description of the matrix multiplication layer that is applied to the output of the third bias layer in the value head - /// 
- sv3Bias: The description of the bias layer that is applied to the output of the matrix multiplication layer in the value head - /// - vOwnershipConv: The description of the convolutional layer that is applied to the board ownership map in the value head - init( - version: Int, - v1Conv: SWConvLayerDesc, - v1BN: SWBatchNormLayerDesc, - v1Activation: ActivationKind, - v2Mul: SWMatMulLayerDesc, - v2Bias: SWMatBiasLayerDesc, - v2Activation: ActivationKind, - v3Mul: SWMatMulLayerDesc, - v3Bias: SWMatBiasLayerDesc, - sv3Mul: SWMatMulLayerDesc, - sv3Bias: SWMatBiasLayerDesc, - vOwnershipConv: SWConvLayerDesc - ) { - self.version = version - self.v1Conv = v1Conv - self.v1BN = v1BN - self.v1Activation = v1Activation - self.v2Mul = v2Mul - self.v2Bias = v2Bias - self.v2Activation = v2Activation - self.v3Mul = v3Mul - self.v3Bias = v3Bias - self.sv3Mul = sv3Mul - self.sv3Bias = sv3Bias - self.vOwnershipConv = vOwnershipConv - } -} + /// Recursively copy array elements respecting strides (NCHW order) + @discardableResult + private func copyWithStrides( + from src: UnsafePointer, + to dest: UnsafeMutablePointer, + destOffset: Int, + shape: [Int], + strides: [Int], + dim: Int, + srcOffset: Int, + destIdx: Int + ) -> Int { + var currentDestIdx = destIdx + + if dim == shape.count - 1 { + // Innermost dimension: copy elements + for i in 0.. SWValueHeadDesc { - return SWValueHeadDesc( - version: Int(version), - v1Conv: v1Conv, - v1BN: v1BN, - v1Activation: v1Activation, - v2Mul: v2Mul, - v2Bias: v2Bias, - v2Activation: v2Activation, - v3Mul: v3Mul, - v3Bias: v3Bias, - sv3Mul: sv3Mul, - sv3Bias: sv3Bias, - vOwnershipConv: vOwnershipConv) -} + return currentDestIdx + } -/// A structure that creates a value head for the neural network, which produces the value, score value, and ownership tensors. 
-struct ValueHead { - /// The tensor that represents the value of the board - let valueTensor: MPSGraphTensor - /// The tensor that represents the score value of the board - let scoreValueTensor: MPSGraphTensor - /// The tensor that represents the ownership of the board - let ownershipTensor: MPSGraphTensor - - /// Initializes the value head using a graph, a descriptor, a source tensor, and other relevant tensors. - /// - Parameters: - /// - graph: The graph used to perform calculations on tensors - /// - descriptor: The SWValueHeadDesc object that describes the value head - /// - sourceTensor: The tensor used to source data to the neural network - /// - maskTensor: The tensor used to mask out invalid moves - /// - maskSumTensor: The tensor used to sum up the mask tensor values - /// - maskSumSqrtS14M01Tensor: The tensor used to calculate a square root value - /// - maskSumSqrtS14M01SquareS01Tensor: The tensor used to calculate a square value - /// - nnXLen: The x-axis length of the neural network - /// - nnYLen: The y-axis length of the neural network - init( - graph: MPSGraph, - descriptor: SWValueHeadDesc, - sourceTensor: MPSGraphTensor, - maskTensor: MPSGraphTensor, - maskSumTensor: MPSGraphTensor, - maskSumSqrtS14M01Tensor: MPSGraphTensor, - maskSumSqrtS14M01SquareS01Tensor: MPSGraphTensor, - nnXLen: NSNumber, - nnYLen: NSNumber + private func extractOutputs( + prediction: MLFeatureProvider, + batchIndex: Int, + policy: UnsafeMutablePointer, + policyPass: UnsafeMutablePointer, + value: UnsafeMutablePointer, + scoreValue: UnsafeMutablePointer, + ownership: UnsafeMutablePointer ) { + // Extract policy output (1, policyChannels, H, W) + // Must use stride-aware copy as Core ML may return non-contiguous arrays + if let policyArray = prediction.featureValue(for: IONames.policyOutput)?.multiArrayValue { + let policyOffset = batchIndex * Int(nnXLen) * Int(nnYLen) * numPolicyChannels + copyMultiArray(policyArray, to: policy, destOffset: policyOffset) + } - let v1Conv 
= ConvLayer( - graph: graph, - sourceTensor: sourceTensor, - descriptor: descriptor.v1Conv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let v1BN = BatchNormLayer( - graph: graph, - sourceTensor: v1Conv.resultTensor, - maskTensor: maskTensor, - descriptor: descriptor.v1BN, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let v1Activation = ActivationLayer( - graph: graph, - sourceTensor: v1BN.resultTensor, - activationKind: descriptor.v1Activation) - - let v1Mean = - GlobalPoolingValueLayer( - graph: graph, - sourceTensor: v1Activation.resultTensor, - maskSumTensor: maskSumTensor, - maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, - maskSumSqrtS14M01SquareS01Tensor: maskSumSqrtS14M01SquareS01Tensor) - - assert(v1Mean.resultTensor.shape?[1] == descriptor.v2Mul.inChannels) - - let v2Mul = MatMulLayer( - graph: graph, - descriptor: descriptor.v2Mul, - sourceTensor: v1Mean.resultTensor) - - let v2Bias = MatBiasLayer( - graph: graph, - descriptor: descriptor.v2Bias, - sourceTensor: v2Mul.resultTensor) - - let v2Activation = ActivationLayer( - graph: graph, - sourceTensor: v2Bias.resultTensor, - activationKind: descriptor.v2Activation) - - let v3Mul = MatMulLayer( - graph: graph, - descriptor: descriptor.v3Mul, - sourceTensor: v2Activation.resultTensor) - - let v3Bias = MatBiasLayer( - graph: graph, - descriptor: descriptor.v3Bias, - sourceTensor: v3Mul.resultTensor) + // Extract policy pass output (1, numPolicyChannels) + if let passArray = prediction.featureValue(for: IONames.policyPassOutput)?.multiArrayValue { + let passOffset = batchIndex * numPolicyChannels + copyMultiArray(passArray, to: policyPass, destOffset: passOffset) + } - let sv3Mul = MatMulLayer( - graph: graph, - descriptor: descriptor.sv3Mul, - sourceTensor: v2Activation.resultTensor) + // Extract value output (1, 3) + if let valueArray = prediction.featureValue(for: IONames.valueOutput)?.multiArrayValue { + let valueOffset = batchIndex * numValueChannels + copyMultiArray(valueArray, to: value, destOffset: 
valueOffset) + } - let sv3Bias = MatBiasLayer( - graph: graph, - descriptor: descriptor.sv3Bias, - sourceTensor: sv3Mul.resultTensor) + // Extract score value output (1, numScoreValueChannels) + if let svArray = prediction.featureValue(for: IONames.scoreValueOutput)?.multiArrayValue { + let svOffset = batchIndex * numScoreValueChannels + copyMultiArray(svArray, to: scoreValue, destOffset: svOffset) + } - let vOwnershipConv = ConvLayer( - graph: graph, - sourceTensor: v1Activation.resultTensor, - descriptor: descriptor.vOwnershipConv, - nnXLen: nnXLen, - nnYLen: nnYLen) - - valueTensor = v3Bias.resultTensor - scoreValueTensor = sv3Bias.resultTensor - ownershipTensor = vOwnershipConv.resultTensor - - assert(valueTensor.shape?.count == 2) - assert(scoreValueTensor.shape?.count == 2) - assert(ownershipTensor.shape?.count == 4) + // Extract ownership output (1, 1, H, W) + // Must use stride-aware copy as Core ML may return non-contiguous arrays + if let ownArray = prediction.featureValue(for: IONames.ownershipOutput)?.multiArrayValue { + let ownOffset = batchIndex * Int(nnXLen) * Int(nnYLen) * numOwnershipChannels + copyMultiArray(ownArray, to: ownership, destOffset: ownOffset) + } } } -/// A struct that describes a neural network model used for playing the game of Go. -public struct SWModelDesc { - /// The version of the model. - let version: Int - /// The name of the model. - let name: String - /// Number of channels for input features. - let numInputChannels: NSNumber - /// Number of channels for global input features. - let numInputGlobalChannels: NSNumber - /// Number of channels for meta input features. - let numInputMetaChannels: NSNumber - /// Number of channels for the value head output. - let numValueChannels: NSNumber - /// Number of channels for the score value head output. - let numScoreValueChannels: NSNumber - /// Number of channels for the ownership head output. 
- let numOwnershipChannels: NSNumber - /// The description of the trunk that makes up the backbone of the model. - let trunk: SWTrunkDesc - /// The description of the policy head that predicts the probability of playing at a particular position. - let policyHead: SWPolicyHeadDesc - /// The description of the value head that predicts the expected outcome of a game state. - let valueHead: SWValueHeadDesc - - /// Initializes an SWModelDesc object. - /// - Parameters: - /// - version: The version of the model. - /// - name: The name of the model. - /// - numInputChannels: Number of channels for input features. - /// - numInputGlobalChannels: Number of channels for global input features. - /// - numInputMetaChannels: Number of channels for meta input features. - /// - numValueChannels: Number of channels for the value head output. - /// - numScoreValueChannels: Number of channels for the score value head output. - /// - numOwnershipChannels: Number of channels for the ownership head output. - /// - trunk: The description of the trunk that makes up the backbone of the model. - /// - policyHead: The description of the policy head that predicts the probability of playing at a particular position. - /// - valueHead: The description of the value head that predicts the expected outcome of a game state. 
- init( - version: Int, - name: String, - numInputChannels: NSNumber, - numInputGlobalChannels: NSNumber, - numInputMetaChannels: NSNumber, - numValueChannels: NSNumber, - numScoreValueChannels: NSNumber, - numOwnershipChannels: NSNumber, - trunk: SWTrunkDesc, - policyHead: SWPolicyHeadDesc, - valueHead: SWValueHeadDesc - ) { - self.version = version - self.name = name - self.numInputChannels = numInputChannels - self.numInputGlobalChannels = numInputGlobalChannels - self.numInputMetaChannels = numInputMetaChannels - self.numValueChannels = numValueChannels - self.numScoreValueChannels = numScoreValueChannels - self.numOwnershipChannels = numOwnershipChannels - self.trunk = trunk - self.policyHead = policyHead - self.valueHead = valueHead +/// Delete the source .mlpackage after compilation +/// CoreML caches the compiled model, so the source is no longer needed +private func deleteSourceModel(at url: URL, serverThreadIdx: Int) { + do { + try FileManager.default.removeItem(at: url) + printError("Metal backend \(serverThreadIdx): Deleted temp model") + } catch { + printError("Metal backend \(serverThreadIdx): Warning: Failed to delete temp model: \(error)") } } -public func createSWModelDesc( - version: Int32, - name: String, +/// Create compute handle - loads pre-converted Core ML model +/// Model conversion is now handled in C++ using the native katagocoreml library +public func createCoreMLComputeHandle( + coremlModelPath: String, + serverThreadIdx: Int, + requireExactNNLen: Bool, numInputChannels: Int32, numInputGlobalChannels: Int32, numInputMetaChannels: Int32, + numPolicyChannels: Int32, numValueChannels: Int32, numScoreValueChannels: Int32, numOwnershipChannels: Int32, - trunk: SWTrunkDesc, - policyHead: SWPolicyHeadDesc, - valueHead: SWValueHeadDesc -) -> SWModelDesc { - return SWModelDesc( - version: Int(version), - name: name, - numInputChannels: numInputChannels as NSNumber, - numInputGlobalChannels: numInputGlobalChannels as NSNumber, - 
numInputMetaChannels: numInputMetaChannels as NSNumber, - numValueChannels: numValueChannels as NSNumber, - numScoreValueChannels: numScoreValueChannels as NSNumber, - numOwnershipChannels: numOwnershipChannels as NSNumber, - trunk: trunk, - policyHead: policyHead, - valueHead: valueHead) + context: MetalComputeContext +) -> CoreMLComputeHandle? { + + let optimizeMask = requireExactNNLen // When true: skips internal mask operations (~6.5% speedup) + let mlpackagePath = URL(fileURLWithPath: coremlModelPath) + + // Ensure temp file is deleted regardless of success/failure + defer { deleteSourceModel(at: mlpackagePath, serverThreadIdx: serverThreadIdx) } + + // Load Core ML model (already converted by C++ katagocoreml library) + do { + let config = MLModelConfiguration() + config.computeUnits = .cpuAndNeuralEngine // Exclude GPU for hybrid mode + + printError("Metal backend \(serverThreadIdx): Compiling model...") + let compiledURL = try MLModel.compileModel(at: mlpackagePath) + + printError("Metal backend \(serverThreadIdx): Loading compiled model...") + let model = try MLModel(contentsOf: compiledURL, configuration: config) + + printError("Metal backend \(serverThreadIdx): Model loaded successfully, \(context.nnXLen)x\(context.nnYLen)") + + return CoreMLComputeHandle( + model: model, + nnXLen: context.nnXLen, + nnYLen: context.nnYLen, + optimizeIdentityMask: optimizeMask, + numInputChannels: Int(numInputChannels), + numInputGlobalChannels: Int(numInputGlobalChannels), + numInputMetaChannels: Int(numInputMetaChannels), + numPolicyChannels: Int(numPolicyChannels), + numValueChannels: Int(numValueChannels), + numScoreValueChannels: Int(numScoreValueChannels), + numOwnershipChannels: Int(numOwnershipChannels) + ) + } catch { + printError("Metal backend: Failed to load model: \(error)") + return nil + } +} + +/// Print available Metal compute devices +public func printMetalDevices() { + printError("Metal backend: Available modes - GPU (MPSGraph), CPU+ANE (CoreML)") } 
-/// A structure representing a neural network model for processing Go game states. -struct Model { - /// The Metal device +// MARK: - MPSGraph-based Model for GPU Inference + +/// GPU-based model using MPSGraph for inference +public class MPSGraphModelHandle { let device: MTLDevice - /// The command queue used to execute the graph on the GPU let commandQueue: MTLCommandQueue - /// The Metal Performance Shaders graph object used for building and executing the graph let graph: MPSGraph - /// The length of the neural network input in the x dimension - let nnXLen: NSNumber - /// The length of the neural network input in the y dimension - let nnYLen: NSNumber - /// The version of the model - let version: Int - /// The number of channels in the value output layer - let numValueChannels: NSNumber - /// The number of channels in the score value output layer - let numScoreValueChannels: NSNumber - /// The number of channels in the ownership output layer - let numOwnershipChannels: NSNumber - /// The input layer of the neural network + let nnXLen: Int32 + let nnYLen: Int32 + let numInputChannels: Int + let numInputGlobalChannels: Int + let numInputMetaChannels: Int + let numPolicyChannels: Int + let numValueChannels: Int + let numScoreValueChannels: Int + let numOwnershipChannels: Int + + // Layers let input: InputLayer - /// The global input layer of the neural network let inputGlobal: InputGlobalLayer - /// The meta input layer of the neural network let inputMeta: InputMetaLayer - /// The mask layer of the neural network let mask: MaskLayer - /// The trunk of the neural network let trunk: Trunk - /// The policy head of the neural network let policyHead: PolicyHead - /// The value head of the neural network let valueHead: ValueHead - /// The dictionary that maps the output tensors to the tensor data let targetTensors: [MPSGraphTensor] - /// Initializes a Model object. - /// - Parameters: - /// - device: The Metal device to use for computations. 
- /// - graph: The Metal Performance Shaders graph object used for building and executing the graph. - /// - descriptor: The description of the model. - /// - nnXLen: The length of the neural network input in the x dimension. - /// - nnYLen: The length of the neural network input in the y dimension. - init( - device: MTLDevice, - graph: MPSGraph, - descriptor: SWModelDesc, - nnXLen: NSNumber, - nnYLen: NSNumber + public init?( + modelDesc: SWModelDesc, + nnXLen: Int32, + nnYLen: Int32, + optimizeIdentityMask: Bool = false ) { + guard let device = MTLCreateSystemDefaultDevice() else { + printError("Metal backend: Failed to create Metal device") + return nil + } + self.device = device - self.commandQueue = device.makeCommandQueue()! - self.graph = graph + guard let queue = device.makeCommandQueue() else { + printError("Metal backend: Failed to create command queue") + return nil + } + self.commandQueue = queue + self.graph = MPSGraph() self.nnXLen = nnXLen self.nnYLen = nnYLen - self.version = descriptor.version - self.numValueChannels = descriptor.numValueChannels - self.numScoreValueChannels = descriptor.numScoreValueChannels - self.numOwnershipChannels = descriptor.numOwnershipChannels + self.numInputChannels = modelDesc.numInputChannels.intValue + self.numInputGlobalChannels = modelDesc.numInputGlobalChannels.intValue + self.numInputMetaChannels = modelDesc.numInputMetaChannels.intValue + self.numPolicyChannels = modelDesc.numPolicyChannels.intValue + self.numValueChannels = modelDesc.numValueChannels.intValue + self.numScoreValueChannels = modelDesc.numScoreValueChannels.intValue + self.numOwnershipChannels = modelDesc.numOwnershipChannels.intValue + + let nnXLenNS = nnXLen as NSNumber + let nnYLenNS = nnYLen as NSNumber input = InputLayer( graph: graph, - nnXLen: nnXLen, - nnYLen: nnYLen, - numChannels: descriptor.numInputChannels) + nnXLen: nnXLenNS, + nnYLen: nnYLenNS, + numChannels: modelDesc.numInputChannels) inputGlobal = InputGlobalLayer( graph: graph, - 
numGlobalFeatures: descriptor.numInputGlobalChannels) + numGlobalFeatures: modelDesc.numInputGlobalChannels) inputMeta = InputMetaLayer( graph: graph, - numMetaFeatures: descriptor.numInputMetaChannels) + numMetaFeatures: modelDesc.numInputMetaChannels) mask = MaskLayer( graph: graph, - nnXLen: nnXLen, - nnYLen: nnYLen) - - let maskSum = MaskSumLayer( - graph: graph, - maskTensor: mask.tensor) + nnXLen: nnXLenNS, + nnYLen: nnYLenNS) - let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer( - graph: graph, - maskSum: maskSum) + // Use constant tensors when mask is all 1s (requireExactNNLen=true) + let maskSum: MaskSumLayer + let maskSumSqrtS14M01: MaskSumSqrtS14M01Layer + let maskSumSqrtS14M01SquareS01: MaskSumSqrtS14M01SquareS01Layer - let maskSumSqrtS14M01SquareS01 = MaskSumSqrtS14M01SquareS01Layer( - graph: graph, - maskSumSqrtS14M01: maskSumSqrtS14M01) + if optimizeIdentityMask { + maskSum = MaskSumLayer( + graph: graph, + nnXLen: nnXLenNS, + nnYLen: nnYLenNS) + maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer( + graph: graph, + nnXLen: nnXLenNS, + nnYLen: nnYLenNS) + maskSumSqrtS14M01SquareS01 = MaskSumSqrtS14M01SquareS01Layer( + graph: graph, + nnXLen: nnXLenNS, + nnYLen: nnYLenNS) + } else { + maskSum = MaskSumLayer( + graph: graph, + maskTensor: mask.tensor) + maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer( + graph: graph, + maskSum: maskSum) + maskSumSqrtS14M01SquareS01 = MaskSumSqrtS14M01SquareS01Layer( + graph: graph, + maskSumSqrtS14M01: maskSumSqrtS14M01) + } trunk = Trunk( graph: graph, - descriptor: descriptor.trunk, + descriptor: modelDesc.trunk, inputTensor: input.tensor, inputGlobalTensor: inputGlobal.tensor, inputMetaTensor: inputMeta.tensor, maskTensor: mask.tensor, maskSumTensor: maskSum.tensor, maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, - nnXLen: nnXLen, - nnYLen: nnYLen) + nnXLen: nnXLenNS, + nnYLen: nnYLenNS, + optimizeIdentityMask: optimizeIdentityMask) policyHead = PolicyHead( graph: graph, - descriptor: descriptor.policyHead, + descriptor: 
modelDesc.policyHead, sourceTensor: trunk.resultTensor, maskTensor: mask.tensor, maskSumTensor: maskSum.tensor, maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, - nnXLen: nnXLen, - nnYLen: nnYLen) + nnXLen: nnXLenNS, + nnYLen: nnYLenNS, + optimizeIdentityMask: optimizeIdentityMask) valueHead = ValueHead( graph: graph, - descriptor: descriptor.valueHead, + descriptor: modelDesc.valueHead, sourceTensor: trunk.resultTensor, maskTensor: mask.tensor, maskSumTensor: maskSum.tensor, maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, maskSumSqrtS14M01SquareS01Tensor: maskSumSqrtS14M01SquareS01.tensor, - nnXLen: nnXLen, - nnYLen: nnYLen) + nnXLen: nnXLenNS, + nnYLen: nnYLenNS, + optimizeIdentityMask: optimizeIdentityMask) targetTensors = [ policyHead.policyTensor, @@ -3035,20 +549,12 @@ struct Model { valueHead.scoreValueTensor, valueHead.ownershipTensor, ] + + printError("Metal backend: MPSGraph initialized on \(device.name)\(optimizeIdentityMask ? " (mask optimized)" : "")") } - /// Applies the model to the given input data, and generates predictions for policy, value and ownership - /// - Parameters: - /// - inputPointer: UnsafeMutablePointer to a flattened 2D array of floats representing the input state - /// - inputGlobalPointer: UnsafeMutablePointer to a flattened array of floats representing global state features - /// - inputMetaPointer: UnsafeMutablePointer to a flattened array of floats representing the metadata - /// - policy: UnsafeMutablePointer to a flattened 2D array of floats representing predicted policy - /// - policyPass: UnsafeMutablePointer to a flattened array of floats representing predicted probability of passing - /// - value: UnsafeMutablePointer to a flattened array of floats representing predicted value - /// - scoreValue: UnsafeMutablePointer to a flattened array of floats representing predicted score value - /// - ownership: UnsafeMutablePointer to a flattened 2D array of floats representing predicted ownership - /// - batchSize: The batch 
size - func apply( + /// Run inference on a batch using MPSGraph (GPU). + public func apply( input inputPointer: UnsafeMutablePointer, inputGlobal inputGlobalPointer: UnsafeMutablePointer, inputMeta inputMetaPointer: UnsafeMutablePointer, @@ -3059,84 +565,64 @@ struct Model { ownership: UnsafeMutablePointer, batchSize: Int ) { - let channelAxis = InputShape.getChannelAxis() - let numInputChannels = input.shape[channelAxis] - - let inputShape = InputShape.create( - batchSize: batchSize as NSNumber, - numChannels: numInputChannels, - nnYLen: nnYLen, - nnXLen: nnXLen) + let numInputChannelsNS = input.shape[channelAxis] + let numInputGlobalChannelsNS = inputGlobal.shape[channelAxis] + let numInputMetaChannelsNS = inputMeta.shape[channelAxis] + let nnXLenNS = nnXLen as NSNumber + let nnYLenNS = nnYLen as NSNumber - let inputDescriptor = MPSNDArrayDescriptor( - dataType: input.tensor.dataType, - shape: inputShape) + // Mask strides describe the source (input) memory layout for extracting channel 0. 
+ var maskStrideArray = [ + MemoryLayout.size, + Int(nnXLen) * MemoryLayout.size, + Int(nnYLen) * Int(nnXLen) * MemoryLayout.size, + numInputChannels * Int(nnYLen) * Int(nnXLen) * MemoryLayout.size, + ] - let inputArray = MPSNDArray( - device: device, - descriptor: inputDescriptor) + guard let mtlCommandBuffer = commandQueue.makeCommandBuffer() else { + fatalError("Metal backend: Failed to create command buffer") + } + let commandBuffer = MPSCommandBuffer(commandBuffer: mtlCommandBuffer) + // Spatial input + let inputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: numInputChannelsNS, + nnYLen: nnYLenNS, + nnXLen: nnXLenNS) + let inputDescriptor = MPSNDArrayDescriptor(dataType: input.tensor.dataType, shape: inputShape) + let inputArray = MPSNDArray(device: device, descriptor: inputDescriptor) inputArray.writeBytes(inputPointer) - let numInputGlobalChannels = inputGlobal.shape[channelAxis] - + // Global input let inputGlobalShape = InputShape.create( batchSize: batchSize as NSNumber, - numChannels: numInputGlobalChannels, + numChannels: numInputGlobalChannelsNS, nnYLen: 1, nnXLen: 1) - - let inputGlobalDescriptor = MPSNDArrayDescriptor( - dataType: inputGlobal.tensor.dataType, - shape: inputGlobalShape) - - let inputGlobalArray = MPSNDArray( - device: device, - descriptor: inputGlobalDescriptor) - + let inputGlobalDescriptor = MPSNDArrayDescriptor(dataType: inputGlobal.tensor.dataType, shape: inputGlobalShape) + let inputGlobalArray = MPSNDArray(device: device, descriptor: inputGlobalDescriptor) inputGlobalArray.writeBytes(inputGlobalPointer) - let numInputMetaChannels = inputMeta.shape[channelAxis] - + // Meta input let inputMetaShape = InputShape.create( batchSize: batchSize as NSNumber, - numChannels: numInputMetaChannels, + numChannels: numInputMetaChannelsNS, nnYLen: 1, nnXLen: 1) - - let inputMetaDescriptor = MPSNDArrayDescriptor( - dataType: inputMeta.tensor.dataType, - shape: inputMetaShape) - - let inputMetaArray = MPSNDArray( - 
device: device, - descriptor: inputMetaDescriptor) - + let inputMetaDescriptor = MPSNDArrayDescriptor(dataType: inputMeta.tensor.dataType, shape: inputMetaShape) + let inputMetaArray = MPSNDArray(device: device, descriptor: inputMetaDescriptor) inputMetaArray.writeBytes(inputMetaPointer) + // Mask (extracted from first channel of spatial input) let maskShape = InputShape.create( batchSize: batchSize as NSNumber, numChannels: 1, - nnYLen: nnYLen, - nnXLen: nnXLen) - - let maskDescriptor = MPSNDArrayDescriptor( - dataType: mask.tensor.dataType, - shape: maskShape) - - let maskArray = MPSNDArray( - device: device, - descriptor: maskDescriptor) - - var maskStrideArray = [ - MemoryLayout.size, - nnXLen.intValue * MemoryLayout.size, - nnYLen.intValue * nnXLen.intValue * MemoryLayout.size, - numInputChannels.intValue * nnYLen.intValue * nnXLen.intValue - * MemoryLayout.size, - ] - + nnYLen: nnYLenNS, + nnXLen: nnXLenNS) + let maskDescriptor = MPSNDArrayDescriptor(dataType: mask.tensor.dataType, shape: maskShape) + let maskArray = MPSNDArray(device: device, descriptor: maskDescriptor) maskArray.writeBytes(inputPointer, strideBytes: &maskStrideArray) let feeds = [ @@ -3146,18 +632,21 @@ struct Model { mask.tensor: MPSGraphTensorData(maskArray), ] - let fetch = graph.run( - with: commandQueue, + let fetch = graph.encode( + to: commandBuffer, feeds: feeds, targetTensors: targetTensors, - targetOperations: nil) + targetOperations: nil, + executionDescriptor: nil) + + commandBuffer.commit() + commandBuffer.waitUntilCompleted() - assert(fetch[policyHead.policyTensor] != nil) - assert(fetch[policyHead.policyPassTensor] != nil) - assert(fetch[valueHead.valueTensor] != nil) - assert(fetch[valueHead.scoreValueTensor] != nil) - assert(fetch[valueHead.ownershipTensor] != nil) + if let error = commandBuffer.error { + fatalError("Metal backend: GPU error: \(error)") + } + // Copy results into output buffers fetch[policyHead.policyTensor]?.mpsndarray().readBytes(policy) 
fetch[policyHead.policyPassTensor]?.mpsndarray().readBytes(policyPass) fetch[valueHead.valueTensor]?.mpsndarray().readBytes(value) @@ -3166,101 +655,23 @@ struct Model { } } -// A enum to represent enabled/disabled/auto option of a feature. -public enum SWEnable { - case False - case True - case Auto -} - -/// A class that represents context of GPU devices. -public class MetalComputeContext { - public let nnXLen: Int32 - public let nnYLen: Int32 - - /// Initialize a context. - /// - Parameters: - /// - nnXLen: The width of the input tensor. - /// - nnYLen: The height of the input tensor. - init( - nnXLen: Int32, - nnYLen: Int32 - ) { - self.nnXLen = nnXLen - self.nnYLen = nnYLen - } -} - -public func createMetalComputeContext( - nnXLen: Int32, - nnYLen: Int32 -) -> MetalComputeContext { - return MetalComputeContext( - nnXLen: nnXLen, - nnYLen: nnYLen) -} - -/// A class that represents a handle of GPU device. -public class MetalComputeHandle { - let model: Model - - init(model: Model) { - self.model = model - } - - public func apply( - input inputPointer: UnsafeMutablePointer, - inputGlobal inputGlobalPointer: UnsafeMutablePointer, - inputMeta inputMetaPointer: UnsafeMutablePointer, - policy: UnsafeMutablePointer, - policyPass: UnsafeMutablePointer, - value: UnsafeMutablePointer, - scoreValue: UnsafeMutablePointer, - ownership: UnsafeMutablePointer, - batchSize: Int - ) { - autoreleasepool { - model.apply( - input: inputPointer, - inputGlobal: inputGlobalPointer, - inputMeta: inputMetaPointer, - policy: policy, - policyPass: policyPass, - value: value, - scoreValue: scoreValue, - ownership: ownership, - batchSize: batchSize) - } - } -} - -public func maybeCreateMetalComputeHandle( - condition: Bool, - serverThreadIdx: Int = 0, - descriptor: SWModelDesc, +/// Create a GPU-only compute handle using MPSGraph +public func createMPSGraphOnlyHandle( + modelDesc: SWModelDesc, + serverThreadIdx: Int, + requireExactNNLen: Bool, context: MetalComputeContext -) -> 
MetalComputeHandle? { - guard condition else { return nil } - - let device = MTLCreateSystemDefaultDevice()! - - let model = Model( - device: device, - graph: MPSGraph(), - descriptor: descriptor, - nnXLen: context.nnXLen as NSNumber, - nnYLen: context.nnYLen as NSNumber) - - let handle = MetalComputeHandle(model: model) - - printError( - "Metal backend \(serverThreadIdx): \(device.name), Model version \(descriptor.version) \(descriptor.name), \(context.nnXLen)x\(context.nnYLen)" - ) - - return handle -} - -public func printMetalDevices() { - let device = MTLCreateSystemDefaultDevice()! - printError("Found Metal Device: \(device.name)") +) -> MPSGraphModelHandle? { + guard let mpsGraphHandle = MPSGraphModelHandle( + modelDesc: modelDesc, + nnXLen: context.nnXLen, + nnYLen: context.nnYLen, + optimizeIdentityMask: requireExactNNLen + ) else { + printError("Metal backend \(serverThreadIdx): Failed to create MPSGraph handle") + return nil + } + + printError("Metal backend \(serverThreadIdx): Initialized MPSGraph GPU-only mode") + return mpsGraphHandle } diff --git a/cpp/neuralnet/metallayers.swift b/cpp/neuralnet/metallayers.swift new file mode 100644 index 000000000..811fe0915 --- /dev/null +++ b/cpp/neuralnet/metallayers.swift @@ -0,0 +1,2911 @@ +// MPSGraph layer implementations shared between Metal and CoreML backends +// Extracted from metalbackend.swift to enable hybrid CoreML + MPSGraph execution + +import Foundation +import MetalPerformanceShaders +import MetalPerformanceShadersGraph + +// MARK: - Helper Extensions + +/// An extension to the Data struct for handling float data with optional FP16 conversion. +extension Data { + /// Initializes a new Data instance using an UnsafeMutablePointer, with optional conversion to FP16 format. 
+ init( + floatsNoCopy: UnsafeMutablePointer, + shape: [NSNumber] + ) { + self.init( + bytesNoCopy: floatsNoCopy, + count: shape.countBytesOfFloat32(), + deallocator: .none) + } +} + +/// Extension to MPSNDArray to convert from MPSGraphTensor, and to read/write bytes from/to UnsafeMutableRawPointer +extension MPSNDArray { + /// Read bytes from the buffer + func readBytes(_ buffer: UnsafeMutableRawPointer) { + self.readBytes(buffer, strideBytes: nil) + } + + /// Write bytes to the buffer + func writeBytes(_ buffer: UnsafeMutableRawPointer) { + self.writeBytes(buffer, strideBytes: nil) + } +} + +/// Extension to Array to count number of elements and bytes +extension Array where Element == NSNumber { + /// Count number of elements + func countElements() -> Int { + return reduce(1, { $0 * $1.intValue }) + } + + /// Count number of bytes + func countBytesOfFloat32() -> Int { + return countElements() * MemoryLayout.size + } +} + +/// Extension to MPSGraph to the mish activation function +extension MPSGraph { + /// Mish activation: x * tanh(softplus(x)) + func mish(tensor: MPSGraphTensor) -> MPSGraphTensor { + assert(tensor.dataType == .float32) + + let one = 1.0 + let threshold = 20.0 + let thresholdTensor = constant(threshold, dataType: tensor.dataType) + let minimumTensor = minimum(tensor, thresholdTensor, name: nil) + let expTensor = exponent(with: minimumTensor, name: nil) + let oneTensor = constant(one, dataType: tensor.dataType) + let addTensor = addition(expTensor, oneTensor, name: nil) + let logTensor = logarithm(with: addTensor, name: nil) + let lessTensor = lessThan(tensor, thresholdTensor, name: nil) + let selectTensor = select( + predicate: lessTensor, trueTensor: logTensor, falseTensor: tensor, name: nil) + let tanhTensor = tanh(with: selectTensor, name: nil) + let mulTensor = multiplication(tensor, tanhTensor, name: nil) + + return mulTensor + } +} + +// MARK: - Input Shape Utilities + +/// A structure that represents the input shape (internal - not exposed 
to C++) +struct InputShape { + /// Create a shape for the input tensor + static func create( + batchSize: NSNumber, + numChannels: NSNumber, + nnYLen: NSNumber, + nnXLen: NSNumber + ) -> [NSNumber] { + return [batchSize, numChannels, nnYLen, nnXLen] + } + + /// Get the channel axis + static func getChannelAxis() -> Int { + return 1 + } + + /// Get the HW axes + static func getHWAxes() -> [NSNumber] { + return [2, 3] as [NSNumber] + } +} + +// MARK: - Input Layers + +/// A structure that represents the input layer +struct InputLayer { + let tensor: MPSGraphTensor + let shape: [NSNumber] + + init( + graph: MPSGraph, + nnXLen: NSNumber, + nnYLen: NSNumber, + numChannels: NSNumber, + dataType: MPSDataType = .float32 + ) { + shape = InputShape.create( + batchSize: -1, + numChannels: numChannels, + nnYLen: nnYLen, + nnXLen: nnXLen) + + self.tensor = graph.placeholder( + shape: shape, + dataType: dataType, + name: nil) + + assert(self.tensor.shape?.count == 4) + } +} + +/// A structure that represents an input global layer for a neural network model. +struct InputGlobalLayer { + let tensor: MPSGraphTensor + let shape: [NSNumber] + + init( + graph: MPSGraph, + numGlobalFeatures: NSNumber, + dataType: MPSDataType = .float32 + ) { + shape = InputShape.create( + batchSize: -1, + numChannels: numGlobalFeatures, + nnYLen: 1, + nnXLen: 1) + + self.tensor = graph.placeholder( + shape: shape, + dataType: dataType, + name: nil) + + assert(self.tensor.shape?.count == 4) + } +} + +/// A structure representing the input meta layer for a neural network graph. 
+struct InputMetaLayer { + let tensor: MPSGraphTensor + let shape: [NSNumber] + + init( + graph: MPSGraph, + numMetaFeatures: NSNumber, + dataType: MPSDataType = .float32 + ) { + shape = InputShape.create( + batchSize: -1, + numChannels: numMetaFeatures, + nnYLen: 1, + nnXLen: 1) + + self.tensor = graph.placeholder( + shape: shape, + dataType: dataType, + name: nil) + } +} + +/// A structure that represents a mask layer for a neural network model. +struct MaskLayer { + let tensor: MPSGraphTensor + let shape: [NSNumber] + + init( + graph: MPSGraph, + nnXLen: NSNumber, + nnYLen: NSNumber, + dataType: MPSDataType = .float32 + ) { + shape = InputShape.create( + batchSize: -1, + numChannels: 1, + nnYLen: nnYLen, + nnXLen: nnXLen) + + self.tensor = graph.placeholder( + shape: shape, + dataType: dataType, + name: nil) + + assert(self.tensor.shape?.count == 4) + } +} + +// MARK: - Mask Processing Layers + +/// A structure that represents a layer which performs the summation operation on a mask layer. 
+struct MaskSumLayer { + let tensor: MPSGraphTensor + + init(tensor: MPSGraphTensor) { + self.tensor = tensor + assert(self.tensor.shape?.count == 4) + } + + init( + graph: MPSGraph, + maskTensor: MPSGraphTensor + ) { + let hwAxes = InputShape.getHWAxes() + + self.tensor = graph.reductionSum( + with: maskTensor, + axes: hwAxes, + name: nil) + + assert(self.tensor.shape?.count == 4) + } + + /// Optimized init for when mask is all 1s (requireExactNNLen=true) + /// Returns constant tensor with boardSize value + init( + graph: MPSGraph, + nnXLen: NSNumber, + nnYLen: NSNumber, + dataType: MPSDataType = .float32 + ) { + let boardSize = Double(nnXLen.intValue * nnYLen.intValue) + self.tensor = graph.constant( + boardSize, + shape: [1, 1, 1, 1], + dataType: dataType) + + assert(self.tensor.shape?.count == 4) + } +} + +/// A structure that represents sqrt(maskSum) * 0.1 - 1.4 +struct MaskSumSqrtS14M01Layer { + let tensor: MPSGraphTensor + + init(tensor: MPSGraphTensor) { + self.tensor = tensor + assert(self.tensor.shape?.count == 4) + } + + init( + graph: MPSGraph, + maskSum: MaskSumLayer + ) { + let sqrtMaskSum = graph.squareRoot(with: maskSum.tensor, name: nil) + + let fourTeen = graph.constant( + 14.0, + shape: [1], + dataType: maskSum.tensor.dataType) + + let subtracted = graph.subtraction(sqrtMaskSum, fourTeen, name: nil) + + let zeroPointone = graph.constant( + 0.1, + shape: [1], + dataType: maskSum.tensor.dataType) + + self.tensor = graph.multiplication( + subtracted, + zeroPointone, + name: nil) + + assert(self.tensor.shape?.count == 4) + } + + /// Optimized init for when mask is all 1s (requireExactNNLen=true) + /// Returns constant tensor: (sqrt(boardSize) - 14) * 0.1 + init( + graph: MPSGraph, + nnXLen: NSNumber, + nnYLen: NSNumber, + dataType: MPSDataType = .float32 + ) { + let boardSize = Double(nnXLen.intValue * nnYLen.intValue) + let value = (sqrt(boardSize) - 14.0) * 0.1 + self.tensor = graph.constant( + value, + shape: [1, 1, 1, 1], + dataType: dataType) + 
+ assert(self.tensor.shape?.count == 4) + } +} + +/// A structure for (sqrt(maskSum) * 0.1 - 1.4)^2 - 0.1 +struct MaskSumSqrtS14M01SquareS01Layer { + let tensor: MPSGraphTensor + + init(tensor: MPSGraphTensor) { + self.tensor = tensor + assert(self.tensor.shape?.count == 4) + } + + init( + graph: MPSGraph, + maskSumSqrtS14M01: MaskSumSqrtS14M01Layer + ) { + let squared = graph.square(with: maskSumSqrtS14M01.tensor, name: nil) + + let zeroPointone = graph.constant( + 0.1, + shape: [1], + dataType: maskSumSqrtS14M01.tensor.dataType) + + self.tensor = graph.subtraction( + squared, + zeroPointone, + name: nil) + + assert(self.tensor.shape?.count == 4) + } + + /// Optimized init for when mask is all 1s (requireExactNNLen=true) + /// Returns constant tensor: ((sqrt(boardSize) - 14) * 0.1)^2 - 0.1 + init( + graph: MPSGraph, + nnXLen: NSNumber, + nnYLen: NSNumber, + dataType: MPSDataType = .float32 + ) { + let boardSize = Double(nnXLen.intValue * nnYLen.intValue) + let sqrtS14M01 = (sqrt(boardSize) - 14.0) * 0.1 + let value = sqrtS14M01 * sqrtS14M01 - 0.1 + self.tensor = graph.constant( + value, + shape: [1, 1, 1, 1], + dataType: dataType) + + assert(self.tensor.shape?.count == 4) + } +} + +// MARK: - Layer Descriptors + +/// An enumeration of the different kinds of activation function. +public enum ActivationKind { + case identity + case relu + case mish +} + +/// A struct that represents a description of convolutional layer. 
+public struct SWConvLayerDesc { + let convYSize: NSNumber + let convXSize: NSNumber + let inChannels: NSNumber + let outChannels: NSNumber + let dilationY: Int + let dilationX: Int + let weights: UnsafeMutablePointer + + init( + convYSize: NSNumber, + convXSize: NSNumber, + inChannels: NSNumber, + outChannels: NSNumber, + dilationY: Int, + dilationX: Int, + weights: UnsafeMutablePointer + ) { + self.convYSize = convYSize + self.convXSize = convXSize + self.inChannels = inChannels + self.outChannels = outChannels + self.dilationY = dilationY + self.dilationX = dilationX + self.weights = weights + } +} + +public func createSWConvLayerDesc( + convYSize: Int32, + convXSize: Int32, + inChannels: Int32, + outChannels: Int32, + dilationY: Int32, + dilationX: Int32, + weights: UnsafeMutablePointer +) -> SWConvLayerDesc { + return SWConvLayerDesc( + convYSize: convYSize as NSNumber, + convXSize: convXSize as NSNumber, + inChannels: inChannels as NSNumber, + outChannels: outChannels as NSNumber, + dilationY: Int(dilationY), + dilationX: Int(dilationX), + weights: weights) +} + +/// A struct that represents a description of a batch normalization layer. 
+public struct SWBatchNormLayerDesc { + let numChannels: NSNumber + let mergedScale: UnsafeMutablePointer + let mergedBias: UnsafeMutablePointer + + init( + numChannels: NSNumber, + mergedScale: UnsafeMutablePointer, + mergedBias: UnsafeMutablePointer + ) { + self.numChannels = numChannels + self.mergedScale = mergedScale + self.mergedBias = mergedBias + } +} + +public func createSWBatchNormLayerDesc( + numChannels: Int32, + mergedScale: UnsafeMutablePointer, + mergedBias: UnsafeMutablePointer +) -> SWBatchNormLayerDesc { + return SWBatchNormLayerDesc( + numChannels: numChannels as NSNumber, + mergedScale: mergedScale, + mergedBias: mergedBias) +} + +/// A struct that represents a matrix multiplication layer descriptor +public struct SWMatMulLayerDesc { + let inChannels: NSNumber + let outChannels: NSNumber + let weights: UnsafeMutablePointer + + init( + inChannels: NSNumber, + outChannels: NSNumber, + weights: UnsafeMutablePointer + ) { + self.inChannels = inChannels + self.outChannels = outChannels + self.weights = weights + } +} + +public func createSWMatMulLayerDesc( + inChannels: Int32, + outChannels: Int32, + weights: UnsafeMutablePointer +) -> SWMatMulLayerDesc { + return SWMatMulLayerDesc( + inChannels: inChannels as NSNumber, + outChannels: outChannels as NSNumber, + weights: weights) +} + +/// A struct that represents the bias layer description. 
+public struct SWMatBiasLayerDesc { + let numChannels: NSNumber + let weights: UnsafeMutablePointer + + init( + numChannels: NSNumber, + weights: UnsafeMutablePointer + ) { + self.numChannels = numChannels + self.weights = weights + } +} + +public func createSWMatBiasLayerDesc( + numChannels: Int32, + weights: UnsafeMutablePointer +) -> SWMatBiasLayerDesc { + return SWMatBiasLayerDesc( + numChannels: numChannels as NSNumber, + weights: weights) +} + +// MARK: - Core Layers + +/// A class that represents a convolutional layer using MPSGraph +class ConvLayer { + let resultTensor: MPSGraphTensor + let convDescriptor = MPSGraphConvolution2DOpDescriptor( + strideInX: 1, + strideInY: 1, + dilationRateInX: 1, + dilationRateInY: 1, + groups: 1, + paddingStyle: .TF_SAME, + dataLayout: .NCHW, + weightsLayout: .OIHW)! + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + descriptor: SWConvLayerDesc, + nnXLen: NSNumber, + nnYLen: NSNumber + ) { + assert(descriptor.dilationX == 1 && descriptor.dilationY == 1) + + let weightsShape = [ + descriptor.outChannels, + descriptor.inChannels, + descriptor.convYSize, + descriptor.convXSize, + ] + + let weightsData = Data( + floatsNoCopy: descriptor.weights, + shape: weightsShape) + + let weightsTensor = graph.constant( + weightsData, + shape: weightsShape, + dataType: sourceTensor.dataType) + + resultTensor = graph.convolution2D( + sourceTensor, + weights: weightsTensor, + descriptor: convDescriptor, + name: nil) + + assert(resultTensor.shape?.count == 4) + } +} + +/// A class that represents a batch normalization layer. 
+class BatchNormLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + descriptor: SWBatchNormLayerDesc, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let scaleBiasShape = InputShape.create( + batchSize: 1, + numChannels: descriptor.numChannels, + nnYLen: 1, + nnXLen: 1) + + let mergedScaleData = Data( + floatsNoCopy: descriptor.mergedScale, + shape: scaleBiasShape) + + let mergedBiasData = Data( + floatsNoCopy: descriptor.mergedBias, + shape: scaleBiasShape) + + let scaleTensor = graph.constant( + mergedScaleData, + shape: scaleBiasShape, + dataType: sourceTensor.dataType) + + let biasTensor = graph.constant( + mergedBiasData, + shape: scaleBiasShape, + dataType: sourceTensor.dataType) + + let scaled = graph.multiplication( + sourceTensor, + scaleTensor, + name: nil) + + let normalized = graph.addition( + scaled, + biasTensor, + name: nil) + + // Skip mask multiplication when all mask values are 1 + if optimizeIdentityMask { + resultTensor = normalized + } else { + resultTensor = graph.multiplication( + normalized, + maskTensor, + name: nil) + } + + assert(resultTensor.shape?.count == 4) + } +} + +/// A structure that represents an activation layer +struct ActivationLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + activationKind: ActivationKind + ) { + switch activationKind { + case .relu: + resultTensor = graph.reLU(with: sourceTensor, name: nil) + case .mish: + resultTensor = graph.mish(tensor: sourceTensor) + default: + resultTensor = sourceTensor + } + + assert(resultTensor.shape == sourceTensor.shape) + } +} + +/// A structure representing a matrix multiplication layer. 
+struct MatMulLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + descriptor: SWMatMulLayerDesc, + sourceTensor: MPSGraphTensor + ) { + let weightsShape = [ + descriptor.inChannels, + descriptor.outChannels, + ] + + let weightsData = Data( + floatsNoCopy: descriptor.weights, + shape: weightsShape) + + let weightsTensor = graph.constant( + weightsData, + shape: weightsShape, + dataType: sourceTensor.dataType) + + let shape = [-1, descriptor.inChannels] + + let reshapedSource = graph.reshape( + sourceTensor, + shape: shape, + name: nil) + + resultTensor = graph.matrixMultiplication( + primary: reshapedSource, + secondary: weightsTensor, + name: nil) + + assert(resultTensor.shape?.count == 2) + } +} + +/// A structure that performs matrix bias operations +struct MatBiasLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + descriptor: SWMatBiasLayerDesc, + sourceTensor: MPSGraphTensor + ) { + assert( + (sourceTensor.shape?.count == 2) && (sourceTensor.shape?[1] == descriptor.numChannels)) + + let weightsShape = [1, descriptor.numChannels] + + let weightsData = Data( + floatsNoCopy: descriptor.weights, + shape: weightsShape) + + let weightsTensor = graph.constant( + weightsData, + shape: weightsShape, + dataType: sourceTensor.dataType) + + resultTensor = graph.addition( + sourceTensor, + weightsTensor, + name: nil) + } +} + +/// A structure that performs bias operations in NC coordinates. 
+struct AddNCBiasLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + biasTensor: MPSGraphTensor, + nnXLen: NSNumber, + nnYLen: NSNumber, + numChannels: NSNumber + ) { + let shape = InputShape.create( + batchSize: -1, + numChannels: numChannels, + nnYLen: 1, + nnXLen: 1) + + assert(biasTensor.shape?[1] == shape[1]) + + let reshaped = graph.reshape(biasTensor, shape: shape, name: nil) + resultTensor = graph.addition(sourceTensor, reshaped, name: nil) + + assert(resultTensor.shape?.count == 4) + assert(resultTensor.shape?[2] == nnYLen) + assert(resultTensor.shape?[3] == nnXLen) + } +} + +// MARK: - Pooling Layers + +/// A structure that represents a global pooling layer +struct GlobalPoolingLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + optimizeIdentityMask: Bool = false + ) { + let hwAxes = InputShape.getHWAxes() + let channelAxis = InputShape.getChannelAxis() + + let sumTensor = graph.reductionSum( + with: sourceTensor, + axes: hwAxes, + name: nil) + + let meanTensor = graph.division(sumTensor, maskSumTensor, name: nil) + + let meanMaskTensor = graph.multiplication( + meanTensor, + maskSumSqrtS14M01Tensor, + name: nil) + + let maxTensor: MPSGraphTensor + if optimizeIdentityMask { + // When all mask values are 1, directly compute max without mask adjustment + maxTensor = graph.reductionMaximum( + with: sourceTensor, + axes: hwAxes, + name: nil) + } else { + // Mask out invalid positions by subtracting 1 (making them very negative) + let oneTensor = graph.constant(1.0, dataType: sourceTensor.dataType) + let maskM1Tensor = graph.subtraction(maskTensor, oneTensor, name: nil) + let addition = graph.addition(sourceTensor, maskM1Tensor, name: nil) + + maxTensor = graph.reductionMaximum( + with: addition, + axes: hwAxes, + name: nil) + } + + resultTensor = 
graph.concatTensors( + [ + meanTensor, + meanMaskTensor, + maxTensor, + ], + dimension: channelAxis, + name: nil) + + assert(resultTensor.shape?.count == 4) + assert(resultTensor.shape?[2] == 1) + assert(resultTensor.shape?[3] == 1) + } +} + +/// A structure that represents a layer that performs global pooling on the input tensor +struct GlobalPoolingValueLayer { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + maskSumSqrtS14M01SquareS01Tensor: MPSGraphTensor + ) { + let hwAxes = InputShape.getHWAxes() + let channelAxis = InputShape.getChannelAxis() + + let sumTensor = graph.reductionSum( + with: sourceTensor, + axes: hwAxes, + name: nil) + + let meanTensor = graph.division(sumTensor, maskSumTensor, name: nil) + + let meanMaskTensor = graph.multiplication( + meanTensor, + maskSumSqrtS14M01Tensor, + name: nil) + + let meanMaskSquareTensor = graph.multiplication( + meanTensor, + maskSumSqrtS14M01SquareS01Tensor, + name: nil) + + resultTensor = graph.concatTensors( + [ + meanTensor, + meanMaskTensor, + meanMaskSquareTensor, + ], + dimension: channelAxis, + name: nil) + + assert(resultTensor.shape?.count == 4) + assert(resultTensor.shape?[2] == 1) + assert(resultTensor.shape?[3] == 1) + } +} + +// MARK: - Block Descriptors + +/// Base class for block descriptors +public class BlockDescriptor { +} + +/// A class that represents a residual block. 
+public class SWResidualBlockDesc: BlockDescriptor { + let preBN: SWBatchNormLayerDesc + let preActivation: ActivationKind + let regularConv: SWConvLayerDesc + let midBN: SWBatchNormLayerDesc + let midActivation: ActivationKind + let finalConv: SWConvLayerDesc + + init( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + regularConv: SWConvLayerDesc, + midBN: SWBatchNormLayerDesc, + midActivation: ActivationKind, + finalConv: SWConvLayerDesc + ) { + self.preBN = preBN + self.preActivation = preActivation + self.regularConv = regularConv + self.midBN = midBN + self.midActivation = midActivation + self.finalConv = finalConv + } +} + +public func createSWResidualBlockDesc( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + regularConv: SWConvLayerDesc, + midBN: SWBatchNormLayerDesc, + midActivation: ActivationKind, + finalConv: SWConvLayerDesc +) -> SWResidualBlockDesc { + return SWResidualBlockDesc( + preBN: preBN, + preActivation: preActivation, + regularConv: regularConv, + midBN: midBN, + midActivation: midActivation, + finalConv: finalConv) +} + +/// A class that represents a residual block with global pooling. 
+public class SWGlobalPoolingResidualBlockDesc: BlockDescriptor { + let preBN: SWBatchNormLayerDesc + let preActivation: ActivationKind + let regularConv: SWConvLayerDesc + let gpoolConv: SWConvLayerDesc + let gpoolBN: SWBatchNormLayerDesc + let gpoolActivation: ActivationKind + let gpoolToBiasMul: SWMatMulLayerDesc + let midBN: SWBatchNormLayerDesc + let midActivation: ActivationKind + let finalConv: SWConvLayerDesc + + init( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + regularConv: SWConvLayerDesc, + gpoolConv: SWConvLayerDesc, + gpoolBN: SWBatchNormLayerDesc, + gpoolActivation: ActivationKind, + gpoolToBiasMul: SWMatMulLayerDesc, + midBN: SWBatchNormLayerDesc, + midActivation: ActivationKind, + finalConv: SWConvLayerDesc + ) { + self.preBN = preBN + self.preActivation = preActivation + self.regularConv = regularConv + self.gpoolConv = gpoolConv + self.gpoolBN = gpoolBN + self.gpoolActivation = gpoolActivation + self.gpoolToBiasMul = gpoolToBiasMul + self.midBN = midBN + self.midActivation = midActivation + self.finalConv = finalConv + } +} + +public func createSWGlobalPoolingResidualBlockDesc( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + regularConv: SWConvLayerDesc, + gpoolConv: SWConvLayerDesc, + gpoolBN: SWBatchNormLayerDesc, + gpoolActivation: ActivationKind, + gpoolToBiasMul: SWMatMulLayerDesc, + midBN: SWBatchNormLayerDesc, + midActivation: ActivationKind, + finalConv: SWConvLayerDesc +) -> SWGlobalPoolingResidualBlockDesc { + return SWGlobalPoolingResidualBlockDesc( + preBN: preBN, + preActivation: preActivation, + regularConv: regularConv, + gpoolConv: gpoolConv, + gpoolBN: gpoolBN, + gpoolActivation: gpoolActivation, + gpoolToBiasMul: gpoolToBiasMul, + midBN: midBN, + midActivation: midActivation, + finalConv: finalConv) +} + +/// A class that represents a nested bottleneck residual block +public class SWNestedBottleneckResidualBlockDesc: BlockDescriptor { + let preBN: SWBatchNormLayerDesc + let preActivation: 
ActivationKind + let preConv: SWConvLayerDesc + let blockDescriptors: [BlockDescriptor] + let postBN: SWBatchNormLayerDesc + let postActivation: ActivationKind + let postConv: SWConvLayerDesc + + init( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + preConv: SWConvLayerDesc, + blockDescriptors: [BlockDescriptor], + postBN: SWBatchNormLayerDesc, + postActivation: ActivationKind, + postConv: SWConvLayerDesc + ) { + self.preBN = preBN + self.preActivation = preActivation + self.preConv = preConv + self.blockDescriptors = blockDescriptors + self.postBN = postBN + self.postActivation = postActivation + self.postConv = postConv + } +} + +public func createSWNestedBottleneckResidualBlockDesc( + preBN: SWBatchNormLayerDesc, + preActivation: ActivationKind, + preConv: SWConvLayerDesc, + blockDescriptors: [BlockDescriptor], + postBN: SWBatchNormLayerDesc, + postActivation: ActivationKind, + postConv: SWConvLayerDesc +) -> SWNestedBottleneckResidualBlockDesc { + return SWNestedBottleneckResidualBlockDesc( + preBN: preBN, + preActivation: preActivation, + preConv: preConv, + blockDescriptors: blockDescriptors, + postBN: postBN, + postActivation: postActivation, + postConv: postConv) +} + +public class BlockDescriptorBuilder { + public var blockDescriptors: [BlockDescriptor] = [] + + init() {} + + public func enque(with descriptor: BlockDescriptor) { + blockDescriptors.append(descriptor) + } +} + +public func createBlockDescriptorBuilder() -> BlockDescriptorBuilder { + return BlockDescriptorBuilder() +} + +// MARK: - Block Implementations + +/// A class that represents a Residual Block layer +class ResidualBlock { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + descriptor: SWResidualBlockDesc, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let preBN = BatchNormLayer( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + 
descriptor: descriptor.preBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let preActivation = ActivationLayer( + graph: graph, + sourceTensor: preBN.resultTensor, + activationKind: descriptor.preActivation) + + let regularConv = ConvLayer( + graph: graph, + sourceTensor: preActivation.resultTensor, + descriptor: descriptor.regularConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let midBN = BatchNormLayer( + graph: graph, + sourceTensor: regularConv.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.midBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let midActivation = ActivationLayer( + graph: graph, + sourceTensor: midBN.resultTensor, + activationKind: descriptor.midActivation) + + let finalConv = ConvLayer( + graph: graph, + sourceTensor: midActivation.resultTensor, + descriptor: descriptor.finalConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + resultTensor = graph.addition( + sourceTensor, + finalConv.resultTensor, + name: nil) + + assert(resultTensor.shape?.count == 4) + } +} + +/// A class representing a residual block with global pooling +class GlobalPoolingResidualBlock { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + descriptor: SWGlobalPoolingResidualBlockDesc, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let maskSum = MaskSumLayer(tensor: maskSumTensor) + let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer(tensor: maskSumSqrtS14M01Tensor) + + let preBN = BatchNormLayer( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + descriptor: descriptor.preBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let preActivation = ActivationLayer( + graph: graph, + sourceTensor: preBN.resultTensor, + activationKind: 
descriptor.preActivation) + + let regularConv = ConvLayer( + graph: graph, + sourceTensor: preActivation.resultTensor, + descriptor: descriptor.regularConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let gpoolConv = ConvLayer( + graph: graph, + sourceTensor: preActivation.resultTensor, + descriptor: descriptor.gpoolConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let gpoolBN = BatchNormLayer( + graph: graph, + sourceTensor: gpoolConv.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.gpoolBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let gpoolActivation = ActivationLayer( + graph: graph, + sourceTensor: gpoolBN.resultTensor, + activationKind: descriptor.gpoolActivation) + + let gpoolConcat = GlobalPoolingLayer( + graph: graph, + sourceTensor: gpoolActivation.resultTensor, + maskTensor: maskTensor, + maskSumTensor: maskSum.tensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, + optimizeIdentityMask: optimizeIdentityMask) + + assert(gpoolConcat.resultTensor.shape?[1] == descriptor.gpoolToBiasMul.inChannels) + + let gpoolToBiasMul = MatMulLayer( + graph: graph, + descriptor: descriptor.gpoolToBiasMul, + sourceTensor: gpoolConcat.resultTensor) + + let added = AddNCBiasLayer( + graph: graph, + sourceTensor: regularConv.resultTensor, + biasTensor: gpoolToBiasMul.resultTensor, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: descriptor.gpoolToBiasMul.outChannels) + + let midBN = BatchNormLayer( + graph: graph, + sourceTensor: added.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.midBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let midActivation = ActivationLayer( + graph: graph, + sourceTensor: midBN.resultTensor, + activationKind: descriptor.midActivation) + + let finalConv = ConvLayer( + graph: graph, + sourceTensor: midActivation.resultTensor, + descriptor: descriptor.finalConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + resultTensor = graph.addition( 
+ sourceTensor, + finalConv.resultTensor, + name: nil) + + assert(resultTensor.shape?.count == 4) + } +} + +/// A structure that represents a block stack +struct BlockStack { + let resultTensor: MPSGraphTensor + + static func processBlockDescriptors( + _ graph: MPSGraph, + _ sourceTensor: MPSGraphTensor, + _ maskTensor: MPSGraphTensor, + _ maskSumTensor: MPSGraphTensor, + _ maskSumSqrtS14M01Tensor: MPSGraphTensor, + _ blockDescriptors: [BlockDescriptor], + _ index: Int, + _ nnXLen: NSNumber, + _ nnYLen: NSNumber, + _ optimizeIdentityMask: Bool + ) -> MPSGraphTensor { + guard index < blockDescriptors.count else { + return sourceTensor + } + + let blockDescriptor = blockDescriptors[index] + let blockInput: MPSGraphTensor + + switch blockDescriptor { + case let globalPoolingDescriptor as SWGlobalPoolingResidualBlockDesc: + let globalPooling = GlobalPoolingResidualBlock( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + descriptor: globalPoolingDescriptor, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + blockInput = globalPooling.resultTensor + case let nestedBottleneckDescriptor as SWNestedBottleneckResidualBlockDesc: + let nestedBottleneck = NestedBottleneckResidualBlock( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + descriptor: nestedBottleneckDescriptor, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + blockInput = nestedBottleneck.resultTensor + case let residualBlockDescriptor as SWResidualBlockDesc: + let ordinary = ResidualBlock( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + descriptor: residualBlockDescriptor, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + blockInput = ordinary.resultTensor + default: + 
blockInput = sourceTensor + } + + return processBlockDescriptors( + graph, + blockInput, + maskTensor, + maskSumTensor, + maskSumSqrtS14M01Tensor, + blockDescriptors, + index + 1, + nnXLen, + nnYLen, + optimizeIdentityMask) + } + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + blockDescriptors: [BlockDescriptor], + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + resultTensor = BlockStack.processBlockDescriptors( + graph, + sourceTensor, + maskTensor, + maskSumTensor, + maskSumSqrtS14M01Tensor, + blockDescriptors, + 0, + nnXLen, + nnYLen, + optimizeIdentityMask) + } +} + +/// A structure that represents a nested bottleneck residual block +struct NestedBottleneckResidualBlock { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + descriptor: SWNestedBottleneckResidualBlockDesc, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let preBN = BatchNormLayer( + graph: graph, + sourceTensor: sourceTensor, + maskTensor: maskTensor, + descriptor: descriptor.preBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let preActivation = ActivationLayer( + graph: graph, + sourceTensor: preBN.resultTensor, + activationKind: descriptor.preActivation) + + let preConv = ConvLayer( + graph: graph, + sourceTensor: preActivation.resultTensor, + descriptor: descriptor.preConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let blocks = BlockStack( + graph: graph, + sourceTensor: preConv.resultTensor, + maskTensor: maskTensor, + maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + blockDescriptors: descriptor.blockDescriptors, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: 
optimizeIdentityMask) + + let postBN = BatchNormLayer( + graph: graph, + sourceTensor: blocks.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.postBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let postActivation = ActivationLayer( + graph: graph, + sourceTensor: postBN.resultTensor, + activationKind: descriptor.postActivation) + + let postConv = ConvLayer( + graph: graph, + sourceTensor: postActivation.resultTensor, + descriptor: descriptor.postConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + resultTensor = graph.addition( + sourceTensor, + postConv.resultTensor, + name: nil) + + assert(resultTensor.shape?.count == 4) + } +} + +// MARK: - SGF Metadata Encoder + +/// Class representing the description of the SGF Metadata Encoder. +public class SWSGFMetadataEncoderDesc { + let version: Int + let numInputMetaChannels: Int + let mul1: SWMatMulLayerDesc + let bias1: SWMatBiasLayerDesc + let act1: ActivationKind + let mul2: SWMatMulLayerDesc + let bias2: SWMatBiasLayerDesc + let act2: ActivationKind + let mul3: SWMatMulLayerDesc + + init( + version: Int, + numInputMetaChannels: Int, + mul1: SWMatMulLayerDesc, + bias1: SWMatBiasLayerDesc, + act1: ActivationKind, + mul2: SWMatMulLayerDesc, + bias2: SWMatBiasLayerDesc, + act2: ActivationKind, + mul3: SWMatMulLayerDesc + ) { + self.version = version + self.numInputMetaChannels = numInputMetaChannels + self.mul1 = mul1 + self.bias1 = bias1 + self.act1 = act1 + self.mul2 = mul2 + self.bias2 = bias2 + self.act2 = act2 + self.mul3 = mul3 + } +} + +public func createSWSGFMetadataEncoderDesc( + version: Int32, + numInputMetaChannels: Int32, + mul1: SWMatMulLayerDesc, + bias1: SWMatBiasLayerDesc, + act1: ActivationKind, + mul2: SWMatMulLayerDesc, + bias2: SWMatBiasLayerDesc, + act2: ActivationKind, + mul3: SWMatMulLayerDesc +) -> SWSGFMetadataEncoderDesc? 
{ + return SWSGFMetadataEncoderDesc( + version: Int(version), + numInputMetaChannels: Int(numInputMetaChannels), + mul1: mul1, + bias1: bias1, + act1: act1, + mul2: mul2, + bias2: bias2, + act2: act2, + mul3: mul3) +} + +/// A class that encodes SGF metadata. +class SGFMetadataEncoder { + let resultTensor: MPSGraphTensor + + init( + graph: MPSGraph, + descriptor: SWSGFMetadataEncoderDesc, + sourceTensor: MPSGraphTensor + ) { + let mul1 = MatMulLayer( + graph: graph, + descriptor: descriptor.mul1, + sourceTensor: sourceTensor) + + let bias1 = MatBiasLayer( + graph: graph, + descriptor: descriptor.bias1, + sourceTensor: mul1.resultTensor) + + let act1 = ActivationLayer( + graph: graph, + sourceTensor: bias1.resultTensor, + activationKind: descriptor.act1) + + let mul2 = MatMulLayer( + graph: graph, + descriptor: descriptor.mul2, + sourceTensor: act1.resultTensor) + + let bias2 = MatBiasLayer( + graph: graph, + descriptor: descriptor.bias2, + sourceTensor: mul2.resultTensor) + + let act2 = ActivationLayer( + graph: graph, + sourceTensor: bias2.resultTensor, + activationKind: descriptor.act2) + + let mul3 = MatMulLayer( + graph: graph, + descriptor: descriptor.mul3, + sourceTensor: act2.resultTensor) + + resultTensor = mul3.resultTensor + + assert(resultTensor.shape?.count == 2) + } +} + +// MARK: - Trunk + +/// A class that describes a trunk for a neural network +public class SWTrunkDesc { + let version: Int + let trunkNumChannels: NSNumber + let midNumChannels: NSNumber + let regularNumChannels: NSNumber + let gpoolNumChannels: NSNumber + let initialConv: SWConvLayerDesc + let initialMatMul: SWMatMulLayerDesc + let sgfMetadataEncoder: SWSGFMetadataEncoderDesc? 
+ let blockDescriptors: [BlockDescriptor] + let trunkTipBN: SWBatchNormLayerDesc + let trunkTipActivation: ActivationKind + + init( + version: Int, + trunkNumChannels: NSNumber, + midNumChannels: NSNumber, + regularNumChannels: NSNumber, + gpoolNumChannels: NSNumber, + initialConv: SWConvLayerDesc, + initialMatMul: SWMatMulLayerDesc, + sgfMetadataEncoder: SWSGFMetadataEncoderDesc?, + blockDescriptors: [BlockDescriptor], + trunkTipBN: SWBatchNormLayerDesc, + trunkTipActivation: ActivationKind + ) { + self.version = version + self.trunkNumChannels = trunkNumChannels + self.midNumChannels = midNumChannels + self.regularNumChannels = regularNumChannels + self.gpoolNumChannels = gpoolNumChannels + self.initialConv = initialConv + self.initialMatMul = initialMatMul + self.sgfMetadataEncoder = sgfMetadataEncoder + self.blockDescriptors = blockDescriptors + self.trunkTipBN = trunkTipBN + self.trunkTipActivation = trunkTipActivation + } +} + +public func createSWTrunkDesc( + version: Int32, + trunkNumChannels: Int32, + midNumChannels: Int32, + regularNumChannels: Int32, + gpoolNumChannels: Int32, + initialConv: SWConvLayerDesc, + initialMatMul: SWMatMulLayerDesc, + sgfMetadataEncoder: SWSGFMetadataEncoderDesc?, + blockDescriptors: [BlockDescriptor], + trunkTipBN: SWBatchNormLayerDesc, + trunkTipActivation: ActivationKind +) -> SWTrunkDesc { + return SWTrunkDesc( + version: Int(version), + trunkNumChannels: trunkNumChannels as NSNumber, + midNumChannels: midNumChannels as NSNumber, + regularNumChannels: regularNumChannels as NSNumber, + gpoolNumChannels: gpoolNumChannels as NSNumber, + initialConv: initialConv, + initialMatMul: initialMatMul, + sgfMetadataEncoder: sgfMetadataEncoder, + blockDescriptors: blockDescriptors, + trunkTipBN: trunkTipBN, + trunkTipActivation: trunkTipActivation) +} + +/// A structure representing a ResNet trunk for a neural network +struct Trunk { + let resultTensor: MPSGraphTensor + + static func getBlockSourceTensor( + graph: MPSGraph, + 
descriptor: SWSGFMetadataEncoderDesc?, + initialAdd: AddNCBiasLayer, + inputMetaTensor: MPSGraphTensor?, + nnXLen: NSNumber, + nnYLen: NSNumber, + numChannels: NSNumber + ) -> MPSGraphTensor { + var blockSourceTensor: MPSGraphTensor + + if let inputMetaTensor, + let descriptor, descriptor.numInputMetaChannels > 0 + { + let encoded = SGFMetadataEncoder( + graph: graph, + descriptor: descriptor, + sourceTensor: inputMetaTensor) + + let encodedAdd = AddNCBiasLayer( + graph: graph, + sourceTensor: initialAdd.resultTensor, + biasTensor: encoded.resultTensor, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: numChannels) + + blockSourceTensor = encodedAdd.resultTensor + } else { + blockSourceTensor = initialAdd.resultTensor + } + + return blockSourceTensor + } + + init( + graph: MPSGraph, + descriptor: SWTrunkDesc, + inputTensor: MPSGraphTensor, + inputGlobalTensor: MPSGraphTensor, + inputMetaTensor: MPSGraphTensor?, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let initialConv = ConvLayer( + graph: graph, + sourceTensor: inputTensor, + descriptor: descriptor.initialConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let initialMatMul = MatMulLayer( + graph: graph, + descriptor: descriptor.initialMatMul, + sourceTensor: inputGlobalTensor) + + let initialAdd = AddNCBiasLayer( + graph: graph, + sourceTensor: initialConv.resultTensor, + biasTensor: initialMatMul.resultTensor, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: descriptor.initialMatMul.outChannels) + + let blockSourceTensor = Trunk.getBlockSourceTensor( + graph: graph, + descriptor: descriptor.sgfMetadataEncoder, + initialAdd: initialAdd, + inputMetaTensor: inputMetaTensor, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: descriptor.initialMatMul.outChannels) + + let blocks = BlockStack( + graph: graph, + sourceTensor: blockSourceTensor, + maskTensor: maskTensor, + 
maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + blockDescriptors: descriptor.blockDescriptors, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let trunkTipBN = BatchNormLayer( + graph: graph, + sourceTensor: blocks.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.trunkTipBN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let trunkTipActivation = ActivationLayer( + graph: graph, + sourceTensor: trunkTipBN.resultTensor, + activationKind: descriptor.trunkTipActivation) + + resultTensor = trunkTipActivation.resultTensor + + assert(resultTensor.shape?.count == 4) + } +} + +// MARK: - Policy Head + +/// A class that describes a policy head for a neural network +public struct SWPolicyHeadDesc { + let version: Int + let p1Conv: SWConvLayerDesc + let g1Conv: SWConvLayerDesc + let g1BN: SWBatchNormLayerDesc + let g1Activation: ActivationKind + let gpoolToBiasMul: SWMatMulLayerDesc + let p1BN: SWBatchNormLayerDesc + let p1Activation: ActivationKind + let p2Conv: SWConvLayerDesc + let gpoolToPassMul: SWMatMulLayerDesc + let gpoolToPassBias: SWMatBiasLayerDesc? + let passActivation: ActivationKind? + let gpoolToPassMul2: SWMatMulLayerDesc? + + init( + version: Int, + p1Conv: SWConvLayerDesc, + g1Conv: SWConvLayerDesc, + g1BN: SWBatchNormLayerDesc, + g1Activation: ActivationKind, + gpoolToBiasMul: SWMatMulLayerDesc, + p1BN: SWBatchNormLayerDesc, + p1Activation: ActivationKind, + p2Conv: SWConvLayerDesc, + gpoolToPassMul: SWMatMulLayerDesc, + gpoolToPassBias: SWMatBiasLayerDesc?, + passActivation: ActivationKind?, + gpoolToPassMul2: SWMatMulLayerDesc? 
+ ) { + self.version = version + self.p1Conv = p1Conv + self.g1Conv = g1Conv + self.g1BN = g1BN + self.g1Activation = g1Activation + self.gpoolToBiasMul = gpoolToBiasMul + self.p1BN = p1BN + self.p1Activation = p1Activation + self.p2Conv = p2Conv + self.gpoolToPassMul = gpoolToPassMul + self.gpoolToPassBias = gpoolToPassBias + self.passActivation = passActivation + self.gpoolToPassMul2 = gpoolToPassMul2 + + assert( + (version >= 15) + || ((gpoolToPassBias == nil) && (passActivation == nil) && (gpoolToPassMul2 == nil)) + ) + assert( + (version < 15) + || ((gpoolToPassBias != nil) && (passActivation != nil) && (gpoolToPassMul2 != nil)) + ) + } +} + +public func createSWPolicyHeadDesc( + version: Int32, + p1Conv: SWConvLayerDesc, + g1Conv: SWConvLayerDesc, + g1BN: SWBatchNormLayerDesc, + g1Activation: ActivationKind, + gpoolToBiasMul: SWMatMulLayerDesc, + p1BN: SWBatchNormLayerDesc, + p1Activation: ActivationKind, + p2Conv: SWConvLayerDesc, + gpoolToPassMul: SWMatMulLayerDesc, + gpoolToPassBias: SWMatBiasLayerDesc, + passActivation: ActivationKind, + gpoolToPassMul2: SWMatMulLayerDesc +) -> SWPolicyHeadDesc { + if version >= 15 { + return SWPolicyHeadDesc( + version: Int(version), + p1Conv: p1Conv, + g1Conv: g1Conv, + g1BN: g1BN, + g1Activation: g1Activation, + gpoolToBiasMul: gpoolToBiasMul, + p1BN: p1BN, + p1Activation: p1Activation, + p2Conv: p2Conv, + gpoolToPassMul: gpoolToPassMul, + gpoolToPassBias: gpoolToPassBias, + passActivation: passActivation, + gpoolToPassMul2: gpoolToPassMul2) + } else { + return SWPolicyHeadDesc( + version: Int(version), + p1Conv: p1Conv, + g1Conv: g1Conv, + g1BN: g1BN, + g1Activation: g1Activation, + gpoolToBiasMul: gpoolToBiasMul, + p1BN: p1BN, + p1Activation: p1Activation, + p2Conv: p2Conv, + gpoolToPassMul: gpoolToPassMul, + gpoolToPassBias: nil, + passActivation: nil, + gpoolToPassMul2: nil) + } +} + +/// A structure that represents a policy head of a neural network. 
+struct PolicyHead { + let policyTensor: MPSGraphTensor + let policyPassTensor: MPSGraphTensor + + init( + graph: MPSGraph, + descriptor: SWPolicyHeadDesc, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let p1Conv = ConvLayer( + graph: graph, + sourceTensor: sourceTensor, + descriptor: descriptor.p1Conv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let g1Conv = ConvLayer( + graph: graph, + sourceTensor: sourceTensor, + descriptor: descriptor.g1Conv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let g1BN = BatchNormLayer( + graph: graph, + sourceTensor: g1Conv.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.g1BN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let g1Activation = ActivationLayer( + graph: graph, + sourceTensor: g1BN.resultTensor, + activationKind: descriptor.g1Activation) + + let g1Concat = GlobalPoolingLayer( + graph: graph, + sourceTensor: g1Activation.resultTensor, + maskTensor: maskTensor, + maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + optimizeIdentityMask: optimizeIdentityMask) + + assert(g1Concat.resultTensor.shape?[1] == descriptor.gpoolToBiasMul.inChannels) + + let gpoolToBiasMul = MatMulLayer( + graph: graph, + descriptor: descriptor.gpoolToBiasMul, + sourceTensor: g1Concat.resultTensor) + + let added = AddNCBiasLayer( + graph: graph, + sourceTensor: p1Conv.resultTensor, + biasTensor: gpoolToBiasMul.resultTensor, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: descriptor.gpoolToBiasMul.outChannels) + + let p1BN = BatchNormLayer( + graph: graph, + sourceTensor: added.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.p1BN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let p1Activation = ActivationLayer( + graph: graph, + 
sourceTensor: p1BN.resultTensor, + activationKind: descriptor.p1Activation) + + let p2Conv = ConvLayer( + graph: graph, + sourceTensor: p1Activation.resultTensor, + descriptor: descriptor.p2Conv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + policyTensor = p2Conv.resultTensor + + assert(g1Concat.resultTensor.shape?[1] == descriptor.gpoolToPassMul.inChannels) + + let gpoolToPassMul = MatMulLayer( + graph: graph, + descriptor: descriptor.gpoolToPassMul, + sourceTensor: g1Concat.resultTensor) + + if let gpoolToPassBias = descriptor.gpoolToPassBias, + let passActivation = descriptor.passActivation, + let gpoolToPassMul2 = descriptor.gpoolToPassMul2 + { + assert(descriptor.version >= 15) + + let gpoolToPassBiasLayer = MatBiasLayer( + graph: graph, + descriptor: gpoolToPassBias, + sourceTensor: gpoolToPassMul.resultTensor) + + let passActivationLayer = ActivationLayer( + graph: graph, + sourceTensor: gpoolToPassBiasLayer.resultTensor, + activationKind: passActivation) + + let gpoolToPassMul2Layer = MatMulLayer( + graph: graph, + descriptor: gpoolToPassMul2, + sourceTensor: passActivationLayer.resultTensor) + + policyPassTensor = gpoolToPassMul2Layer.resultTensor + } else { + assert(descriptor.version < 15) + policyPassTensor = gpoolToPassMul.resultTensor + } + + assert(policyTensor.shape?.count == 4) + assert(policyPassTensor.shape?.count == 2) + } +} + +// MARK: - Value Head + +/// A struct that describes the value head of a neural network +public struct SWValueHeadDesc { + let version: Int + let v1Conv: SWConvLayerDesc + let v1BN: SWBatchNormLayerDesc + let v1Activation: ActivationKind + let v2Mul: SWMatMulLayerDesc + let v2Bias: SWMatBiasLayerDesc + let v2Activation: ActivationKind + let v3Mul: SWMatMulLayerDesc + let v3Bias: SWMatBiasLayerDesc + let sv3Mul: SWMatMulLayerDesc + let sv3Bias: SWMatBiasLayerDesc + let vOwnershipConv: SWConvLayerDesc + + init( + version: Int, + v1Conv: SWConvLayerDesc, + v1BN: SWBatchNormLayerDesc, + v1Activation: ActivationKind, + v2Mul: 
SWMatMulLayerDesc, + v2Bias: SWMatBiasLayerDesc, + v2Activation: ActivationKind, + v3Mul: SWMatMulLayerDesc, + v3Bias: SWMatBiasLayerDesc, + sv3Mul: SWMatMulLayerDesc, + sv3Bias: SWMatBiasLayerDesc, + vOwnershipConv: SWConvLayerDesc + ) { + self.version = version + self.v1Conv = v1Conv + self.v1BN = v1BN + self.v1Activation = v1Activation + self.v2Mul = v2Mul + self.v2Bias = v2Bias + self.v2Activation = v2Activation + self.v3Mul = v3Mul + self.v3Bias = v3Bias + self.sv3Mul = sv3Mul + self.sv3Bias = sv3Bias + self.vOwnershipConv = vOwnershipConv + } +} + +public func createSWValueHeadDesc( + version: Int32, + v1Conv: SWConvLayerDesc, + v1BN: SWBatchNormLayerDesc, + v1Activation: ActivationKind, + v2Mul: SWMatMulLayerDesc, + v2Bias: SWMatBiasLayerDesc, + v2Activation: ActivationKind, + v3Mul: SWMatMulLayerDesc, + v3Bias: SWMatBiasLayerDesc, + sv3Mul: SWMatMulLayerDesc, + sv3Bias: SWMatBiasLayerDesc, + vOwnershipConv: SWConvLayerDesc +) -> SWValueHeadDesc { + return SWValueHeadDesc( + version: Int(version), + v1Conv: v1Conv, + v1BN: v1BN, + v1Activation: v1Activation, + v2Mul: v2Mul, + v2Bias: v2Bias, + v2Activation: v2Activation, + v3Mul: v3Mul, + v3Bias: v3Bias, + sv3Mul: sv3Mul, + sv3Bias: sv3Bias, + vOwnershipConv: vOwnershipConv) +} + +/// A structure that creates a value head for the neural network +struct ValueHead { + let valueTensor: MPSGraphTensor + let scoreValueTensor: MPSGraphTensor + let ownershipTensor: MPSGraphTensor + + init( + graph: MPSGraph, + descriptor: SWValueHeadDesc, + sourceTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + maskSumTensor: MPSGraphTensor, + maskSumSqrtS14M01Tensor: MPSGraphTensor, + maskSumSqrtS14M01SquareS01Tensor: MPSGraphTensor, + nnXLen: NSNumber, + nnYLen: NSNumber, + optimizeIdentityMask: Bool = false + ) { + let v1Conv = ConvLayer( + graph: graph, + sourceTensor: sourceTensor, + descriptor: descriptor.v1Conv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let v1BN = BatchNormLayer( + graph: graph, + sourceTensor: 
v1Conv.resultTensor, + maskTensor: maskTensor, + descriptor: descriptor.v1BN, + nnXLen: nnXLen, + nnYLen: nnYLen, + optimizeIdentityMask: optimizeIdentityMask) + + let v1Activation = ActivationLayer( + graph: graph, + sourceTensor: v1BN.resultTensor, + activationKind: descriptor.v1Activation) + + let v1Mean = + GlobalPoolingValueLayer( + graph: graph, + sourceTensor: v1Activation.resultTensor, + maskSumTensor: maskSumTensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01Tensor, + maskSumSqrtS14M01SquareS01Tensor: maskSumSqrtS14M01SquareS01Tensor) + + assert(v1Mean.resultTensor.shape?[1] == descriptor.v2Mul.inChannels) + + let v2Mul = MatMulLayer( + graph: graph, + descriptor: descriptor.v2Mul, + sourceTensor: v1Mean.resultTensor) + + let v2Bias = MatBiasLayer( + graph: graph, + descriptor: descriptor.v2Bias, + sourceTensor: v2Mul.resultTensor) + + let v2Activation = ActivationLayer( + graph: graph, + sourceTensor: v2Bias.resultTensor, + activationKind: descriptor.v2Activation) + + let v3Mul = MatMulLayer( + graph: graph, + descriptor: descriptor.v3Mul, + sourceTensor: v2Activation.resultTensor) + + let v3Bias = MatBiasLayer( + graph: graph, + descriptor: descriptor.v3Bias, + sourceTensor: v3Mul.resultTensor) + + let sv3Mul = MatMulLayer( + graph: graph, + descriptor: descriptor.sv3Mul, + sourceTensor: v2Activation.resultTensor) + + let sv3Bias = MatBiasLayer( + graph: graph, + descriptor: descriptor.sv3Bias, + sourceTensor: sv3Mul.resultTensor) + + let vOwnershipConv = ConvLayer( + graph: graph, + sourceTensor: v1Activation.resultTensor, + descriptor: descriptor.vOwnershipConv, + nnXLen: nnXLen, + nnYLen: nnYLen) + + valueTensor = v3Bias.resultTensor + scoreValueTensor = sv3Bias.resultTensor + ownershipTensor = vOwnershipConv.resultTensor + + assert(valueTensor.shape?.count == 2) + assert(scoreValueTensor.shape?.count == 2) + assert(ownershipTensor.shape?.count == 4) + } +} + +// MARK: - Model Descriptor + +/// A struct that describes a neural network model used for 
playing the game of Go. +public struct SWModelDesc { + let version: Int + let name: String + let numInputChannels: NSNumber + let numInputGlobalChannels: NSNumber + let numInputMetaChannels: NSNumber + let numValueChannels: NSNumber + let numScoreValueChannels: NSNumber + let numOwnershipChannels: NSNumber + let numPolicyChannels: NSNumber + let trunk: SWTrunkDesc + let policyHead: SWPolicyHeadDesc + let valueHead: SWValueHeadDesc + + init( + version: Int, + name: String, + numInputChannels: NSNumber, + numInputGlobalChannels: NSNumber, + numInputMetaChannels: NSNumber, + numValueChannels: NSNumber, + numScoreValueChannels: NSNumber, + numOwnershipChannels: NSNumber, + numPolicyChannels: NSNumber, + trunk: SWTrunkDesc, + policyHead: SWPolicyHeadDesc, + valueHead: SWValueHeadDesc + ) { + self.version = version + self.name = name + self.numInputChannels = numInputChannels + self.numInputGlobalChannels = numInputGlobalChannels + self.numInputMetaChannels = numInputMetaChannels + self.numValueChannels = numValueChannels + self.numScoreValueChannels = numScoreValueChannels + self.numOwnershipChannels = numOwnershipChannels + self.numPolicyChannels = numPolicyChannels + self.trunk = trunk + self.policyHead = policyHead + self.valueHead = valueHead + } +} + +public func createSWModelDesc( + version: Int32, + name: String, + numInputChannels: Int32, + numInputGlobalChannels: Int32, + numInputMetaChannels: Int32, + numValueChannels: Int32, + numScoreValueChannels: Int32, + numOwnershipChannels: Int32, + numPolicyChannels: Int32, + trunk: SWTrunkDesc, + policyHead: SWPolicyHeadDesc, + valueHead: SWValueHeadDesc +) -> SWModelDesc { + return SWModelDesc( + version: Int(version), + name: name, + numInputChannels: numInputChannels as NSNumber, + numInputGlobalChannels: numInputGlobalChannels as NSNumber, + numInputMetaChannels: numInputMetaChannels as NSNumber, + numValueChannels: numValueChannels as NSNumber, + numScoreValueChannels: numScoreValueChannels as NSNumber, + 
numOwnershipChannels: numOwnershipChannels as NSNumber, + numPolicyChannels: numPolicyChannels as NSNumber, + trunk: trunk, + policyHead: policyHead, + valueHead: valueHead) +} + +// MARK: - MPSGraph Model (for GPU inference) + +/// A structure representing a neural network model for processing Go game states using MPSGraph. +struct MPSGraphModel { + let device: MTLDevice + let commandQueue: MTLCommandQueue + let graph: MPSGraph + let nnXLen: NSNumber + let nnYLen: NSNumber + let version: Int + let numValueChannels: NSNumber + let numScoreValueChannels: NSNumber + let numOwnershipChannels: NSNumber + let input: InputLayer + let inputGlobal: InputGlobalLayer + let inputMeta: InputMetaLayer + let mask: MaskLayer + let trunk: Trunk + let policyHead: PolicyHead + let valueHead: ValueHead + let targetTensors: [MPSGraphTensor] + + init( + device: MTLDevice, + graph: MPSGraph, + descriptor: SWModelDesc, + nnXLen: NSNumber, + nnYLen: NSNumber + ) { + self.device = device + self.commandQueue = device.makeCommandQueue()! 
+ self.graph = graph + self.nnXLen = nnXLen + self.nnYLen = nnYLen + self.version = descriptor.version + self.numValueChannels = descriptor.numValueChannels + self.numScoreValueChannels = descriptor.numScoreValueChannels + self.numOwnershipChannels = descriptor.numOwnershipChannels + + input = InputLayer( + graph: graph, + nnXLen: nnXLen, + nnYLen: nnYLen, + numChannels: descriptor.numInputChannels) + + inputGlobal = InputGlobalLayer( + graph: graph, + numGlobalFeatures: descriptor.numInputGlobalChannels) + + inputMeta = InputMetaLayer( + graph: graph, + numMetaFeatures: descriptor.numInputMetaChannels) + + mask = MaskLayer( + graph: graph, + nnXLen: nnXLen, + nnYLen: nnYLen) + + let maskSum = MaskSumLayer( + graph: graph, + maskTensor: mask.tensor) + + let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer( + graph: graph, + maskSum: maskSum) + + let maskSumSqrtS14M01SquareS01 = MaskSumSqrtS14M01SquareS01Layer( + graph: graph, + maskSumSqrtS14M01: maskSumSqrtS14M01) + + trunk = Trunk( + graph: graph, + descriptor: descriptor.trunk, + inputTensor: input.tensor, + inputGlobalTensor: inputGlobal.tensor, + inputMetaTensor: inputMeta.tensor, + maskTensor: mask.tensor, + maskSumTensor: maskSum.tensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, + nnXLen: nnXLen, + nnYLen: nnYLen) + + policyHead = PolicyHead( + graph: graph, + descriptor: descriptor.policyHead, + sourceTensor: trunk.resultTensor, + maskTensor: mask.tensor, + maskSumTensor: maskSum.tensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, + nnXLen: nnXLen, + nnYLen: nnYLen) + + valueHead = ValueHead( + graph: graph, + descriptor: descriptor.valueHead, + sourceTensor: trunk.resultTensor, + maskTensor: mask.tensor, + maskSumTensor: maskSum.tensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, + maskSumSqrtS14M01SquareS01Tensor: maskSumSqrtS14M01SquareS01.tensor, + nnXLen: nnXLen, + nnYLen: nnYLen) + + targetTensors = [ + policyHead.policyTensor, + policyHead.policyPassTensor, + 
valueHead.valueTensor, + valueHead.scoreValueTensor, + valueHead.ownershipTensor, + ] + } + + /// Applies the model to the given input data + public func apply( + input inputPointer: UnsafeMutablePointer<Float32>, + inputGlobal inputGlobalPointer: UnsafeMutablePointer<Float32>, + inputMeta inputMetaPointer: UnsafeMutablePointer<Float32>, + policy: UnsafeMutablePointer<Float32>, + policyPass: UnsafeMutablePointer<Float32>, + value: UnsafeMutablePointer<Float32>, + scoreValue: UnsafeMutablePointer<Float32>, + ownership: UnsafeMutablePointer<Float32>, + batchSize: Int + ) { + let channelAxis = InputShape.getChannelAxis() + let numInputChannels = input.shape[channelAxis] + + let inputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: numInputChannels, + nnYLen: nnYLen, + nnXLen: nnXLen) + + let inputDescriptor = MPSNDArrayDescriptor( + dataType: input.tensor.dataType, + shape: inputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(inputPointer) + + let numInputGlobalChannels = inputGlobal.shape[channelAxis] + + let inputGlobalShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: numInputGlobalChannels, + nnYLen: 1, + nnXLen: 1) + + let inputGlobalDescriptor = MPSNDArrayDescriptor( + dataType: inputGlobal.tensor.dataType, + shape: inputGlobalShape) + + let inputGlobalArray = MPSNDArray( + device: device, + descriptor: inputGlobalDescriptor) + + inputGlobalArray.writeBytes(inputGlobalPointer) + + let numInputMetaChannels = inputMeta.shape[channelAxis] + + let inputMetaShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: numInputMetaChannels, + nnYLen: 1, + nnXLen: 1) + + let inputMetaDescriptor = MPSNDArrayDescriptor( + dataType: inputMeta.tensor.dataType, + shape: inputMetaShape) + + let inputMetaArray = MPSNDArray( + device: device, + descriptor: inputMetaDescriptor) + + inputMetaArray.writeBytes(inputMetaPointer) + + let maskShape = InputShape.create( + batchSize: batchSize as NSNumber, + 
numChannels: 1, + nnYLen: nnYLen, + nnXLen: nnXLen) + + let maskDescriptor = MPSNDArrayDescriptor( + dataType: mask.tensor.dataType, + shape: maskShape) + + let maskArray = MPSNDArray( + device: device, + descriptor: maskDescriptor) + + var maskStrideArray = [ + MemoryLayout<Float32>.size, + nnXLen.intValue * MemoryLayout<Float32>.size, + nnYLen.intValue * nnXLen.intValue * MemoryLayout<Float32>.size, + numInputChannels.intValue * nnYLen.intValue * nnXLen.intValue + * MemoryLayout<Float32>.size, + ] + + maskArray.writeBytes(inputPointer, strideBytes: &maskStrideArray) + + let feeds = [ + input.tensor: MPSGraphTensorData(inputArray), + inputGlobal.tensor: MPSGraphTensorData(inputGlobalArray), + inputMeta.tensor: MPSGraphTensorData(inputMetaArray), + mask.tensor: MPSGraphTensorData(maskArray), + ] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: targetTensors, + targetOperations: nil) + + assert(fetch[policyHead.policyTensor] != nil) + assert(fetch[policyHead.policyPassTensor] != nil) + assert(fetch[valueHead.valueTensor] != nil) + assert(fetch[valueHead.scoreValueTensor] != nil) + assert(fetch[valueHead.ownershipTensor] != nil) + + fetch[policyHead.policyTensor]?.mpsndarray().readBytes(policy) + fetch[policyHead.policyPassTensor]?.mpsndarray().readBytes(policyPass) + fetch[valueHead.valueTensor]?.mpsndarray().readBytes(value) + fetch[valueHead.scoreValueTensor]?.mpsndarray().readBytes(scoreValue) + fetch[valueHead.ownershipTensor]?.mpsndarray().readBytes(ownership) + } +} + +// MARK: - Test Infrastructure + +/// Helper struct for testing individual network layers using MPSGraph +struct NetworkTester { + let device: MTLDevice + let commandQueue: MTLCommandQueue + let graph: MPSGraph + let inputTensor: MPSGraphTensor + let maskTensor: MPSGraphTensor + let outputTensor: MPSGraphTensor + let inputShape: [NSNumber] + let maskShape: [NSNumber] + let outputShape: [NSNumber] + + /// Initialize a network tester for testing a single layer + init( + device: MTLDevice, + graph: 
MPSGraph, + inputTensor: MPSGraphTensor, + maskTensor: MPSGraphTensor, + outputTensor: MPSGraphTensor, + batchSize: NSNumber, + nnXLen: NSNumber, + nnYLen: NSNumber, + inChannels: NSNumber, + outChannels: NSNumber + ) { + self.device = device + self.commandQueue = device.makeCommandQueue()! + self.graph = graph + self.inputTensor = inputTensor + self.maskTensor = maskTensor + self.outputTensor = outputTensor + self.inputShape = InputShape.create( + batchSize: batchSize, + numChannels: inChannels, + nnYLen: nnYLen, + nnXLen: nnXLen) + self.maskShape = InputShape.create( + batchSize: batchSize, + numChannels: 1, + nnYLen: nnYLen, + nnXLen: nnXLen) + self.outputShape = InputShape.create( + batchSize: batchSize, + numChannels: outChannels, + nnYLen: nnYLen, + nnXLen: nnXLen) + } + + /// Run the test with given input and mask data, writing results to output + func run( + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer + ) { + let inputDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: inputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(UnsafeMutableRawPointer(mutating: inputPointer)) + + let maskDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: maskShape) + + let maskArray = MPSNDArray( + device: device, + descriptor: maskDescriptor) + + maskArray.writeBytes(UnsafeMutableRawPointer(mutating: maskPointer)) + + let feeds = [ + inputTensor: MPSGraphTensorData(inputArray), + maskTensor: MPSGraphTensorData(maskArray), + ] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: [outputTensor], + targetOperations: nil) + + fetch[outputTensor]?.mpsndarray().readBytes(outputPointer) + } +} + +// MARK: - ConvLayer Test Extension + +extension ConvLayer { + /// Test the convolution layer with given parameters + static func test( + descriptor: SWConvLayerDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: 
Int32, + inputPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer + ) -> Bool { + guard let device = MTLCreateSystemDefaultDevice() else { + return false + } + + let graph = MPSGraph() + + let inputShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: descriptor.inChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputTensor = graph.placeholder( + shape: inputShape, + dataType: .float32, + name: nil) + + let convLayer = ConvLayer( + graph: graph, + sourceTensor: inputTensor, + descriptor: descriptor, + nnXLen: nnXLen as NSNumber, + nnYLen: nnYLen as NSNumber) + + // Run the graph + let commandQueue = device.makeCommandQueue()! + + let actualInputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: descriptor.inChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualInputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(UnsafeMutableRawPointer(mutating: inputPointer)) + + let feeds = [inputTensor: MPSGraphTensorData(inputArray)] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: [convLayer.resultTensor], + targetOperations: nil) + + fetch[convLayer.resultTensor]?.mpsndarray().readBytes(outputPointer) + + return true + } +} + +// MARK: - BatchNormLayer Test Extension + +extension BatchNormLayer { + /// Test the batch normalization layer with given parameters + static func test( + descriptor: SWBatchNormLayerDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer + ) -> Bool { + guard let device = MTLCreateSystemDefaultDevice() else { + return false + } + + let graph = MPSGraph() + + let inputShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: descriptor.numChannels, + nnYLen: 
nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputTensor = graph.placeholder( + shape: inputShape, + dataType: .float32, + name: nil) + + let maskShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskTensor = graph.placeholder( + shape: maskShape, + dataType: .float32, + name: nil) + + let bnLayer = BatchNormLayer( + graph: graph, + sourceTensor: inputTensor, + maskTensor: maskTensor, + descriptor: descriptor, + nnXLen: nnXLen as NSNumber, + nnYLen: nnYLen as NSNumber) + + // Run the graph + let commandQueue = device.makeCommandQueue()! + + let actualInputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: descriptor.numChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualInputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(UnsafeMutableRawPointer(mutating: inputPointer)) + + let actualMaskShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualMaskShape) + + let maskArray = MPSNDArray( + device: device, + descriptor: maskDescriptor) + + maskArray.writeBytes(UnsafeMutableRawPointer(mutating: maskPointer)) + + let feeds = [ + inputTensor: MPSGraphTensorData(inputArray), + maskTensor: MPSGraphTensorData(maskArray), + ] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: [bnLayer.resultTensor], + targetOperations: nil) + + fetch[bnLayer.resultTensor]?.mpsndarray().readBytes(outputPointer) + + return true + } +} + +// MARK: - ResidualBlock Test Extension + +extension ResidualBlock { + /// Test the residual block with given parameters + static func test( + descriptor: 
SWResidualBlockDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer + ) -> Bool { + guard let device = MTLCreateSystemDefaultDevice() else { + return false + } + + let graph = MPSGraph() + + let inputShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: descriptor.preBN.numChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputTensor = graph.placeholder( + shape: inputShape, + dataType: .float32, + name: nil) + + let maskShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskTensor = graph.placeholder( + shape: maskShape, + dataType: .float32, + name: nil) + + let resBlock = ResidualBlock( + graph: graph, + sourceTensor: inputTensor, + maskTensor: maskTensor, + descriptor: descriptor, + nnXLen: nnXLen as NSNumber, + nnYLen: nnYLen as NSNumber) + + // Run the graph + let commandQueue = device.makeCommandQueue()! 
+ + let actualInputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: descriptor.preBN.numChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualInputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(UnsafeMutableRawPointer(mutating: inputPointer)) + + let actualMaskShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualMaskShape) + + let maskArray = MPSNDArray( + device: device, + descriptor: maskDescriptor) + + maskArray.writeBytes(UnsafeMutableRawPointer(mutating: maskPointer)) + + let feeds = [ + inputTensor: MPSGraphTensorData(inputArray), + maskTensor: MPSGraphTensorData(maskArray), + ] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: [resBlock.resultTensor], + targetOperations: nil) + + fetch[resBlock.resultTensor]?.mpsndarray().readBytes(outputPointer) + + return true + } +} + +// MARK: - GlobalPoolingResidualBlock Test Extension + +extension GlobalPoolingResidualBlock { + /// Test the global pooling residual block with given parameters + static func test( + descriptor: SWGlobalPoolingResidualBlockDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer + ) -> Bool { + guard let device = MTLCreateSystemDefaultDevice() else { + return false + } + + let graph = MPSGraph() + + let inputShape = InputShape.create( + batchSize: -1 as NSNumber, + numChannels: descriptor.preBN.numChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputTensor = graph.placeholder( + shape: inputShape, + dataType: .float32, + name: nil) + + let maskShape = 
InputShape.create( + batchSize: -1 as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskTensor = graph.placeholder( + shape: maskShape, + dataType: .float32, + name: nil) + + // Compute mask sum and related tensors from mask + let maskSum = MaskSumLayer(graph: graph, maskTensor: maskTensor) + let maskSumSqrtS14M01 = MaskSumSqrtS14M01Layer(graph: graph, maskSum: maskSum) + + let gpoolBlock = GlobalPoolingResidualBlock( + graph: graph, + sourceTensor: inputTensor, + maskTensor: maskTensor, + maskSumTensor: maskSum.tensor, + maskSumSqrtS14M01Tensor: maskSumSqrtS14M01.tensor, + descriptor: descriptor, + nnXLen: nnXLen as NSNumber, + nnYLen: nnYLen as NSNumber) + + // Run the graph + let commandQueue = device.makeCommandQueue()! + + let actualInputShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: descriptor.preBN.numChannels, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let inputDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualInputShape) + + let inputArray = MPSNDArray( + device: device, + descriptor: inputDescriptor) + + inputArray.writeBytes(UnsafeMutableRawPointer(mutating: inputPointer)) + + let actualMaskShape = InputShape.create( + batchSize: batchSize as NSNumber, + numChannels: 1, + nnYLen: nnYLen as NSNumber, + nnXLen: nnXLen as NSNumber) + + let maskDescriptor = MPSNDArrayDescriptor( + dataType: .float32, + shape: actualMaskShape) + + let maskArray = MPSNDArray( + device: device, + descriptor: maskDescriptor) + + maskArray.writeBytes(UnsafeMutableRawPointer(mutating: maskPointer)) + + let feeds = [ + inputTensor: MPSGraphTensorData(inputArray), + maskTensor: MPSGraphTensorData(maskArray), + ] + + let fetch = graph.run( + with: commandQueue, + feeds: feeds, + targetTensors: [gpoolBlock.resultTensor], + targetOperations: nil) + + fetch[gpoolBlock.resultTensor]?.mpsndarray().readBytes(outputPointer) + + return true + } +} + +// MARK: - Public 
Test Functions (callable from C++) + +/// Test the convolution layer +public func testConvLayer( + descriptor: SWConvLayerDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer +) -> Bool { + return ConvLayer.test( + descriptor: descriptor, + batchSize: batchSize, + nnXLen: nnXLen, + nnYLen: nnYLen, + inputPointer: inputPointer, + outputPointer: outputPointer) +} + +/// Test the batch normalization layer +public func testBatchNormLayer( + descriptor: SWBatchNormLayerDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer +) -> Bool { + return BatchNormLayer.test( + descriptor: descriptor, + batchSize: batchSize, + nnXLen: nnXLen, + nnYLen: nnYLen, + inputPointer: inputPointer, + maskPointer: maskPointer, + outputPointer: outputPointer) +} + +/// Test the residual block +public func testResidualBlock( + descriptor: SWResidualBlockDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer +) -> Bool { + return ResidualBlock.test( + descriptor: descriptor, + batchSize: batchSize, + nnXLen: nnXLen, + nnYLen: nnYLen, + inputPointer: inputPointer, + maskPointer: maskPointer, + outputPointer: outputPointer) +} + +/// Test the global pooling residual block +public func testGlobalPoolingResidualBlock( + descriptor: SWGlobalPoolingResidualBlockDesc, + batchSize: Int32, + nnXLen: Int32, + nnYLen: Int32, + inputPointer: UnsafePointer, + maskPointer: UnsafePointer, + outputPointer: UnsafeMutablePointer +) -> Bool { + return GlobalPoolingResidualBlock.test( + descriptor: descriptor, + batchSize: batchSize, + nnXLen: nnXLen, + nnYLen: nnYLen, + inputPointer: inputPointer, + maskPointer: maskPointer, + outputPointer: outputPointer) +}