ROCm · JeniferC99 · Apr 21, 2026 · Feb 6, 2026 · Feb 14, 2026 · Mar 7, 2026
@@ -1,7 +1,7 @@
 #####################################################################################
 # The MIT License (MIT)
 #
-# Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -77,6 +77,7 @@ add_library(migraphx
     fp8_ocp_to_fnuz.cpp
     fuse_attention.cpp
     fuse_concat.cpp
+    fuse_horizontal.cpp
     fuse_pointwise.cpp
     fuse_pointwise_reduce.cpp
     fuse_reduce.cpp

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -43,7 +43,10 @@ void adjust_allocation::apply(module& m) const
         if(ins->get_operator().is_context_free())
             continue;
 
-        auto alias_ins = instruction::get_output_alias(ins, true);
+        auto aliases = instruction::get_output_alias(ins, true);
+        if(aliases.size() != 1)
+            continue;
+        auto alias_ins = aliases.front();
         if(alias_ins->name() != model.name() and alias_ins->name() != "@param")
             continue;
         // shape allocated is different from actual shape

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -377,19 +377,9 @@ struct custom_operation
         return op.compute(std::move(ctx), std::move(output_shape), std::move(inputs));
     }
 
-    std::ptrdiff_t output_alias(std::vector<shape> inputs) const
+    std::vector<std::size_t> output_alias(std::vector<shape> inputs) const
     {
-        auto alias_vec = op.output_alias(std::move(inputs));
-        // TODO: For now, only support one output alias
-        if(alias_vec.empty())
-        {
-            return -1;
-        }
-        if(alias_vec.size() > 1)
-        {
-            MIGRAPHX_THROW("Currently, CustomOps in MIGraphX only supports one output_alias");
-        }
-        return alias_vec.front();
+        return op.output_alias(std::move(inputs));
     }
 
     bool runs_on_offload_target() const { return op.runs_on_offload_target(); }

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -97,25 +97,66 @@ static bool compute_common_dim(std::vector<std::size_t>& cd_dims,
     assert(state1.get() < state2.get());
     auto d2    = state2.get();
     auto dims  = state1.dims_for(d2);
-    auto n     = elements(dims);
     auto naxes = distance(dims);
+
     if(naxes == 0)
         return false;
+
+    // Check if state1 has a remainder from a previous split
+    bool has_remainder = (state1.rem != 1);
+
+    // Compute the product of dimensions, adjusting for remainder if needed
+    auto n = elements(dims);
+    if(has_remainder and naxes > 0)
+    {
+        n = n / *dims.begin() * (*dims.begin() / state1.rem);
+    }
+
     // If not divisible then we can't compute a common dim
     if((d2 % n) != 0)
         return false;
+
     auto rem = d2 / n;
-    state1.add_multi_axes(naxes, cd_dims.size());
-    state2.add_axes(rem == 1 ? naxes : naxes + 1, cd_dims.size());
+    auto start_pos = cd_dims.size();
 
+    // Add axes mappings
+    if(has_remainder)
+    {
+        // state1: dimension was split, keep axes together
+        state1.add_axes(naxes, start_pos);
+        // state2: axes should include the previous remainder dimension
+        state2.add_axes(rem == 1 ? naxes : naxes + 1, start_pos - 1);
+    }
+    else
+    {
+        // state1: separate axes for each dimension
+        state1.add_multi_axes(naxes, start_pos);
+        // state2: normal axes mapping
+        state2.add_axes(rem == 1 ? naxes : naxes + 1, start_pos);
+    }
+
+    // Add dimensions to cd_dims
+    if(has_remainder and naxes > 0)
+    {
+        // Adjust the first dimension by dividing by the remainder
+        cd_dims.push_back(*dims.begin() / state1.rem);
+        cd_dims.insert(cd_dims.end(), std::next(dims.begin()), dims.end());
+    }
+    else
+    {
+        cd_dims.insert(cd_dims.end(), dims.begin(), dims.end());
+    }
+
+    // Add remainder dimension if needed
+    if(rem != 1)
+        cd_dims.push_back(rem);
+
+    // Update states
     state1.rem = rem;
     state2.rem = 1;
-
-    cd_dims.insert(cd_dims.end(), dims.begin(), dims.end());
-    if(state1.rem != 1)
-        cd_dims.push_back(state1.rem);
-    state1.next(distance(dims));
+    state1.next(naxes);
     state2.next();
+
     return true;
 }
 
@@ -152,6 +193,22 @@ common_dims common_dims::compute(const std::vector<std::size_t>& dims1,
                 return {};
         }
     }
+
+    // Handle the case where one state has a remainder that equals the next dimension.
+    // Here the dimension was already added as a remainder; only the axes mapping is needed.
+    auto handle_remaining_dimension = [&cd](common_dim_state& state) {
+        if(not state.is_end() and state.rem != 1 and state.get() == 1)
+        {
+            // The remainder already added to cd.dims matches this dimension
+            // Add a single axes mapping
+            state.axes_map->push_back({cd.dims.size() - 1});
+            state.next();
+        }
+    };
+
+    handle_remaining_dimension(state1);
+    handle_remaining_dimension(state2);
+
     assert(elements(dims1) == elements(cd.dims));
     return cd;
 }

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -48,13 +48,11 @@ void dead_code_elimination::apply(module& m) const
         // Skip the last instruction
         if(i == last)
             break;
-        // Skip instruction with empty shape as output unless its [dynamic, builtin, undefined,
+        // Skip instruction with empty shape as output unless it is one of [builtin, undefined,
         // identity, allocate, or tuple_type]
-        if((not i->get_shape().dynamic() and
-            (i->get_shape().elements() == 0 and
-             i->get_shape().type() != migraphx::shape::tuple_type)) and
-           not(i->name().front() == '@') and not contains({"identity", "allocate"}, i->name()) and
-           not i->is_undefined())
+        if(i->get_shape().ndim() == 0 and not i->is_undefined() and
+           i->get_shape().type() != migraphx::shape::tuple_type and i->name().front() != '@' and
+           not contains({"identity", "allocate"}, i->name()))
             continue;
         assert(std::distance(m.begin(), i) <= std::distance(m.begin(), last));
         std::unordered_set<instruction_ref> visited;

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -118,19 +118,23 @@ bool is_offload_copy_set(const program& p)
     {
         if(i.name() == "hip::copy_to_gpu")
         {
-            auto copy_arg = instruction::get_output_alias(i.inputs().front(), true);
-            param_ins.erase(copy_arg);
+            auto copy_args = instruction::get_output_alias(i.inputs().front(), true);
+            for(auto copy_arg : copy_args)
+                param_ins.erase(copy_arg);
         }
         else if(i.name() == "@return")
         {
             auto return_args = i.inputs();
-            for(const auto& j : return_args)
-            {
-                auto alias_ins = instruction::get_output_alias(j, true);
-                if((alias_ins->name() == "@param" and param_ins.erase(alias_ins) == 0) or
-                   (alias_ins->name() != "hip::copy_from_gpu"))
+            return std::all_of(return_args.begin(), return_args.end(), [&](const auto& j) {
+                auto aliases = instruction::get_output_alias(j, true);
+                return std::all_of(aliases.begin(), aliases.end(), [&](instruction_ref alias_ins) {
+                    if(alias_ins->name() == "hip::copy_from_gpu")
+                        return true;
+                    if(alias_ins->name() == "@param")
+                        return not contains(param_ins, alias_ins);
                     return false;
-            }
+                });
+            });
         }
     }
     return param_ins.empty();

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -37,11 +37,11 @@ inline namespace MIGRAPHX_INLINE_NS {
 
 static instruction_ref capture_arg(std::unordered_set<instruction_ref>& s, instruction_ref ins)
 {
-    auto alias = instruction::get_output_alias(ins, true);
-    if(alias != ins)
+    auto aliases = instruction::get_output_alias(ins, true);
+    if(aliases.size() == 1 and aliases.front() != ins)
     {
         s.insert(ins);
-        return capture_arg(s, alias);
+        return capture_arg(s, aliases.front());
     }
     if(contains({"reshape", "contiguous"}, ins->name()))
     {

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -70,11 +70,14 @@ void eliminate_concat::apply(module& m) const
             // Where are the allocations for the tensors to be concatenated?
             std::vector<instruction_ref> allocations;
 
-            std::transform(
-                ins->inputs().begin(),
-                std::prev(ins->inputs().end()),
-                std::back_inserter(allocations),
-                [&](instruction_ref x) { return instruction::get_output_alias(x, true); });
+            std::transform(ins->inputs().begin(),
+                           std::prev(ins->inputs().end()),
+                           std::back_inserter(allocations),
+                           [&](instruction_ref x) {
+                               auto aliases = instruction::get_output_alias(x, true);
+                               // cppcheck-suppress returnDanglingLifetime
+                               return aliases.front();
+                           });
 
             if(std::any_of(allocations.begin(), allocations.end(), [&](auto x) {
                    return x->name() != concat_opt.allocate();