From f2404add767be035e4f2145a8a7aaca4d120ec07 Mon Sep 17 00:00:00 2001
From: Niko Maroulis <niko@hackthebox.eu>
Date: Fri, 30 Jan 2026 18:31:02 -0500
Subject: [PATCH] Add BFGS and Nelder-Mead multivariate optimization algorithms

- BFGS: Quasi-Newton method with automatic differentiation
- Nelder-Mead: Derivative-free simplex method
- Update optimize.livemd with multivariate section
- Add efficient_frontier.livemd portfolio optimization example
---
 lib/scholar/optimize/bfgs.ex               | 297 ++++++++++++++
 lib/scholar/optimize/nelder_mead.ex        | 437 +++++++++++++++++++++
 notebooks/efficient_frontier.livemd        | 298 ++++++++++++++
 notebooks/optimize.livemd                  | 131 +++++-
 test/scholar/optimize/bfgs_test.exs        | 178 +++++++++
 test/scholar/optimize/nelder_mead_test.exs | 171 ++++++++
 6 files changed, 1511 insertions(+), 1 deletion(-)
 create mode 100644 lib/scholar/optimize/bfgs.ex
 create mode 100644 lib/scholar/optimize/nelder_mead.ex
 create mode 100644 notebooks/efficient_frontier.livemd
 create mode 100644 test/scholar/optimize/bfgs_test.exs
 create mode 100644 test/scholar/optimize/nelder_mead_test.exs

diff --git a/lib/scholar/optimize/bfgs.ex b/lib/scholar/optimize/bfgs.ex
new file mode 100644
index 00000000..ea9d7eca
--- /dev/null
+++ b/lib/scholar/optimize/bfgs.ex
@@ -0,0 +1,297 @@
+defmodule Scholar.Optimize.BFGS do
+  @moduledoc """
+  BFGS (Broyden-Fletcher-Goldfarb-Shanno) algorithm for multivariate function minimization.
+
+  BFGS is a quasi-Newton optimization method that approximates the inverse Hessian
+  matrix using gradient information. It is well-suited for smooth, differentiable
+  objective functions and typically converges much faster than derivative-free methods
+  like Nelder-Mead.
+
+  ## Algorithm
+
+  At each iteration, the algorithm:
+  1. Computes the gradient using automatic differentiation
+  2. Determines a search direction from the inverse Hessian approximation
+  3. Performs a line search to find an acceptable step length
+  4. Updates the inverse Hessian approximation using the BFGS formula
+
+  ## Convergence
+
+  The algorithm converges when the gradient norm falls below the specified tolerance.
+
+  ## References
+
+  * Nocedal, J. and Wright, S. J. (2006). "Numerical Optimization"
+  * Fletcher, R. (1987). "Practical Methods of Optimization"
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:x, :fun, :converged, :iterations, :fun_evals, :grad_evals]}
+  defstruct [:x, :fun, :converged, :iterations, :fun_evals, :grad_evals]
+
+  @type t :: %__MODULE__{
+          x: Nx.Tensor.t(),
+          fun: Nx.Tensor.t(),
+          converged: Nx.Tensor.t(),
+          iterations: Nx.Tensor.t(),
+          fun_evals: Nx.Tensor.t(),
+          grad_evals: Nx.Tensor.t()
+        }
+
+  # Line search parameter (Armijo condition)
+  @c1 1.0e-4
+
+  opts = [
+    gtol: [
+      type: {:custom, Scholar.Options, :positive_number, []},
+      default: 1.0e-5,
+      doc: """
+      Gradient norm tolerance for convergence. The algorithm stops when
+      the infinity norm of the gradient is below this threshold.
+      """
+    ],
+    maxiter: [
+      type: :pos_integer,
+      default: 500,
+      doc: "Maximum number of iterations."
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts)
+
+  @doc """
+  Minimizes a multivariate function using the BFGS algorithm.
+
+  BFGS is a gradient-based method that uses automatic differentiation to compute
+  gradients and approximates the inverse Hessian matrix for fast convergence.
+
+  ## Arguments
+
+  * `x0` - Initial guess as a 1-D tensor of shape `{n}`.
+  * `fun` - The objective function to minimize. Must be a defn-compatible function
+    that takes a 1-D tensor of shape `{n}` and returns a scalar tensor.
+  * `opts` - Options (see below).
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Returns
+
+  A `Scholar.Optimize.BFGS` struct with the optimization result:
+
+  * `:x` - The optimal point found (shape `{n}`)
+  * `:fun` - The function value at the optimal point
+  * `:converged` - Whether the optimization converged (1 if true, 0 if false)
+  * `:iterations` - Number of iterations performed
+  * `:fun_evals` - Number of function evaluations
+  * `:grad_evals` - Number of gradient evaluations
+
+  ## Examples
+
+      iex> # Minimize a simple quadratic: f(x) = x1^2 + x2^2
+      iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      iex> x0 = Nx.tensor([1.0, 2.0])
+      iex> result = Scholar.Optimize.BFGS.minimize(x0, fun)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) |> Nx.to_number()
+      1
+
+  For higher precision, use f64 tensors:
+
+      iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      iex> x0 = Nx.tensor([1.0, 2.0], type: :f64)
+      iex> result = Scholar.Optimize.BFGS.minimize(x0, fun, gtol: 1.0e-10)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-8) |> Nx.to_number()
+      1
+
+  Minimizing the Rosenbrock function:
+
+      iex> # Rosenbrock: f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1)
+      iex> rosenbrock = fn x ->
+      ...>   x0 = x[0]
+      ...>   x1 = x[1]
+      ...>   term1 = Nx.pow(Nx.subtract(1, x0), 2)
+      ...>   term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2))
+      ...>   Nx.add(term1, term2)
+      ...> end
+      iex> x0 = Nx.tensor([0.0, 0.0], type: :f64)
+      iex> result = Scholar.Optimize.BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4) |> Nx.to_number()
+      1
+  """
+  defn minimize(x0, fun, opts \\ []) do
+    {gtol, maxiter} = transform_opts(opts)
+    minimize_n(x0, fun, gtol, maxiter)
+  end
+
+  deftransformp transform_opts(opts) do
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    {opts[:gtol], opts[:maxiter]}
+  end
+
+  defnp minimize_n(x0, fun, gtol, maxiter) do
+    x0 = Nx.flatten(x0)
+    {n} = Nx.shape(x0)
+
+    # Initial function value and gradient
+    {f0, g0} = value_and_grad(x0, fun)
+
+    # Initialize inverse Hessian as identity matrix
+    h_inv = Nx.eye(n, type: Nx.type(x0))
+
+    # Initial state
+    initial_state = %{
+      x: x0,
+      f: f0,
+      g: g0,
+      h_inv: h_inv,
+      iter: Nx.u32(0),
+      f_evals: Nx.u32(1),
+      g_evals: Nx.u32(1)
+    }
+
+    # Main optimization loop
+    {final_state, _} =
+      while {state = initial_state, {gtol, maxiter}},
+            not converged?(state, gtol) and state.iter < maxiter do
+        new_state = bfgs_step(state, fun)
+        {new_state, {gtol, maxiter}}
+      end
+
+    # Check convergence
+    converged = converged?(final_state, gtol)
+
+    %__MODULE__{
+      x: final_state.x,
+      fun: final_state.f,
+      converged: converged,
+      iterations: final_state.iter,
+      fun_evals: final_state.f_evals,
+      grad_evals: final_state.g_evals
+    }
+  end
+
+  # Check convergence: infinity norm of gradient < gtol
+  defnp converged?(state, gtol) do
+    grad_norm = Nx.reduce_max(Nx.abs(state.g))
+    Nx.less(grad_norm, gtol)
+  end
+
+  # Perform one BFGS iteration
+  defnp bfgs_step(state, fun) do
+    x = state.x
+    f = state.f
+    g = state.g
+    h_inv = state.h_inv
+
+    # Compute search direction: p = -H_inv * g
+    p = Nx.negate(Nx.dot(h_inv, g))
+
+    # Line search to find step length alpha
+    {alpha, f_new, g_new, ls_f_evals, ls_g_evals} =
+      line_search(x, f, g, p, fun)
+
+    # Compute step and gradient change
+    s = Nx.multiply(alpha, p)
+    x_new = Nx.add(x, s)
+    y = Nx.subtract(g_new, g)
+
+    # Update inverse Hessian using BFGS formula
+    h_inv_new = update_inverse_hessian(h_inv, s, y)
+
+    %{
+      state
+      | x: x_new,
+        f: f_new,
+        g: g_new,
+        h_inv: h_inv_new,
+        iter: Nx.add(state.iter, 1),
+        f_evals: Nx.add(state.f_evals, ls_f_evals),
+        g_evals: Nx.add(state.g_evals, ls_g_evals)
+    }
+  end
+
+  # Backtracking line search with Armijo condition (unrolled for defn compatibility)
+  defnp line_search(x, f, g, p, fun) do
+    # Directional derivative at alpha=0
+    slope = Nx.dot(g, p)
+
+    # Try step sizes: 1, 0.5, 0.25, 0.125, ...
+    # Unroll 10 iterations of backtracking
+    alpha = backtrack_step(x, f, p, slope, fun, 1.0)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+    alpha = backtrack_step(x, f, p, slope, fun, alpha)
+
+    # Evaluate function and gradient at final alpha
+    x_final = Nx.add(x, Nx.multiply(alpha, p))
+    {f_final, g_final} = value_and_grad(x_final, fun)
+
+    {alpha, f_final, g_final, Nx.u32(11), Nx.u32(1)}
+  end
+
+  # Single backtracking step
+  defnp backtrack_step(x, f, p, slope, fun, alpha) do
+    x_trial = Nx.add(x, Nx.multiply(alpha, p))
+    f_trial = fun.(x_trial)
+
+    # Check Armijo condition
+    armijo_ok =
+      Nx.less_equal(f_trial, Nx.add(f, Nx.multiply(@c1, Nx.multiply(alpha, slope))))
+
+    # If satisfied, keep alpha; otherwise halve it
+    Nx.select(armijo_ok, alpha, Nx.multiply(0.5, alpha))
+  end
+
+  # BFGS inverse Hessian update
+  # H_new = (I - rho*s*y') * H * (I - rho*y*s') + rho*s*s'
+  defnp update_inverse_hessian(h_inv, s, y) do
+    {n} = Nx.shape(s)
+
+    # rho = 1 / (y' * s)
+    ys = Nx.dot(y, s)
+
+    # Skip update if ys is too small (would cause numerical issues)
+    skip_update = Nx.less(Nx.abs(ys), 1.0e-10)
+
+    rho = Nx.select(skip_update, 0.0, Nx.divide(1.0, ys))
+
+    # Compute update terms
+    # I - rho*s*y' and I - rho*y*s'
+    eye = Nx.eye(n, type: Nx.type(s))
+
+    # s and y as column vectors for outer products
+    s_col = Nx.reshape(s, {n, 1})
+    y_col = Nx.reshape(y, {n, 1})
+    s_row = Nx.reshape(s, {1, n})
+    y_row = Nx.reshape(y, {1, n})
+
+    # (I - rho*s*y')
+    left = Nx.subtract(eye, Nx.multiply(rho, Nx.dot(s_col, y_row)))
+
+    # (I - rho*y*s')
+    right = Nx.subtract(eye, Nx.multiply(rho, Nx.dot(y_col, s_row)))
+
+    # rho*s*s'
+    ss_term = Nx.multiply(rho, Nx.dot(s_col, s_row))
+
+    # H_new = left * H * right + ss_term
+    h_new = Nx.add(Nx.dot(Nx.dot(left, h_inv), right), ss_term)
+
+    # Use old H if update skipped
+    Nx.select(skip_update, h_inv, h_new)
+  end
+end
diff --git a/lib/scholar/optimize/nelder_mead.ex b/lib/scholar/optimize/nelder_mead.ex
new file mode 100644
index 00000000..173d0af6
--- /dev/null
+++ b/lib/scholar/optimize/nelder_mead.ex
@@ -0,0 +1,437 @@
+defmodule Scholar.Optimize.NelderMead do
+  @moduledoc """
+  Nelder-Mead simplex algorithm for derivative-free multivariate function minimization.
+
+  The Nelder-Mead algorithm (also known as the downhill simplex method) is a
+  derivative-free optimization technique for finding the minimum of a function
+  of multiple variables. It maintains a simplex of n+1 points in n-dimensional
+  space and iteratively updates the simplex by reflecting, expanding, contracting,
+  or shrinking based on function values.
+
+  ## Algorithm
+
+  At each iteration, the algorithm:
+  1. Orders the simplex vertices by function value (best to worst)
+  2. Computes the centroid of all vertices except the worst
+  3. Attempts reflection of the worst point through the centroid
+  4. Based on the reflected point's value:
+     - If best so far: try expansion
+     - If better than second-worst: accept reflection
+     - If worse: try contraction
+     - If contraction fails: shrink simplex toward best point
+
+  ## Convergence
+
+  The algorithm converges when the standard deviation of function values
+  at simplex vertices falls below the specified tolerance.
+
+  ## References
+
+  * Nelder, J. A. and Mead, R. (1965). "A simplex method for function minimization"
+  * Press, W. H., et al. "Numerical Recipes: The Art of Scientific Computing"
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:x, :fun, :converged, :iterations, :fun_evals]}
+  defstruct [:x, :fun, :converged, :iterations, :fun_evals]
+
+  @type t :: %__MODULE__{
+          x: Nx.Tensor.t(),
+          fun: Nx.Tensor.t(),
+          converged: Nx.Tensor.t(),
+          iterations: Nx.Tensor.t(),
+          fun_evals: Nx.Tensor.t()
+        }
+
+  # Standard Nelder-Mead coefficients
+  @rho 1.0
+  @chi 2.0
+  @gamma 0.5
+  @sigma 0.5
+
+  opts = [
+    tol: [
+      type: {:custom, Scholar.Options, :positive_number, []},
+      default: 1.0e-5,
+      doc: """
+      Tolerance for convergence. The algorithm stops when the standard deviation
+      of function values at simplex vertices is below this threshold.
+      """
+    ],
+    maxiter: [
+      type: :pos_integer,
+      default: 500,
+      doc: "Maximum number of iterations."
+    ],
+    initial_simplex_step: [
+      type: {:custom, Scholar.Options, :positive_number, []},
+      default: 0.05,
+      doc: """
+      Step size for constructing the initial simplex. Each vertex (except the first)
+      is created by moving along one coordinate axis by this factor times the
+      corresponding coordinate value (or by this value if the coordinate is zero).
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts)
+
+  @doc """
+  Minimizes a multivariate function using the Nelder-Mead simplex algorithm.
+
+  This is a derivative-free method suitable for optimizing functions where
+  gradients are unavailable or expensive to compute.
+
+  ## Arguments
+
+  * `x0` - Initial guess as a 1-D tensor of shape `{n}`.
+  * `fun` - The objective function to minimize. Must be a defn-compatible function
+    that takes a 1-D tensor of shape `{n}` and returns a scalar tensor.
+  * `opts` - Options (see below).
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Returns
+
+  A `Scholar.Optimize.NelderMead` struct with the optimization result:
+
+  * `:x` - The optimal point found (shape `{n}`)
+  * `:fun` - The function value at the optimal point
+  * `:converged` - Whether the optimization converged (1 if true, 0 if false)
+  * `:iterations` - Number of iterations performed
+  * `:fun_evals` - Number of function evaluations
+
+  ## Examples
+
+      iex> # Minimize a simple quadratic: f(x) = x1^2 + x2^2
+      iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      iex> x0 = Nx.tensor([1.0, 2.0])
+      iex> result = Scholar.Optimize.NelderMead.minimize(x0, fun)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2) |> Nx.to_number()
+      1
+
+  For higher precision, use f64 tensors:
+
+      iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      iex> x0 = Nx.tensor([1.0, 2.0], type: :f64)
+      iex> result = Scholar.Optimize.NelderMead.minimize(x0, fun, tol: 1.0e-12)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) |> Nx.to_number()
+      1
+
+  Minimizing the Rosenbrock function:
+
+      iex> # Rosenbrock: f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1)
+      iex> rosenbrock = fn x ->
+      ...>   x0 = x[0]
+      ...>   x1 = x[1]
+      ...>   term1 = Nx.pow(Nx.subtract(1, x0), 2)
+      ...>   term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2))
+      ...>   Nx.add(term1, term2)
+      ...> end
+      iex> x0 = Nx.tensor([0.0, 0.0], type: :f64)
+      iex> result = Scholar.Optimize.NelderMead.minimize(x0, rosenbrock, tol: 1.0e-8, maxiter: 1000)
+      iex> Nx.to_number(result.converged)
+      1
+      iex> Nx.all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4) |> Nx.to_number()
+      1
+  """
+  defn minimize(x0, fun, opts \\ []) do
+    {tol, maxiter, initial_simplex_step} = transform_opts(opts)
+    minimize_n(x0, fun, tol, maxiter, initial_simplex_step)
+  end
+
+  deftransformp transform_opts(opts) do
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    {opts[:tol], opts[:maxiter], opts[:initial_simplex_step]}
+  end
+
+  defnp minimize_n(x0, fun, tol, maxiter, initial_simplex_step) do
+    x0 = Nx.flatten(x0)
+    {n} = Nx.shape(x0)
+
+    # Build initial simplex: n+1 vertices
+    simplex = build_initial_simplex(x0, initial_simplex_step)
+
+    # Evaluate function at all simplex vertices
+    f_values = evaluate_all_vertices(simplex, fun)
+
+    # Initial state
+    initial_state = %{
+      simplex: simplex,
+      f_values: f_values,
+      iter: Nx.u32(0),
+      f_evals: Nx.u32(n + 1)
+    }
+
+    # Main optimization loop
+    {final_state, _} =
+      while {state = initial_state, {tol, maxiter}},
+            not converged?(state, tol) and state.iter < maxiter do
+        new_state = nelder_mead_step(state, fun)
+        {new_state, {tol, maxiter}}
+      end
+
+    # Get best point
+    order = Nx.argsort(final_state.f_values)
+    best_idx = order[0]
+    x_opt = final_state.simplex[best_idx]
+    f_opt = final_state.f_values[best_idx]
+
+    # Check convergence
+    converged = converged?(final_state, tol)
+
+    %__MODULE__{
+      x: x_opt,
+      fun: f_opt,
+      converged: converged,
+      iterations: final_state.iter,
+      fun_evals: final_state.f_evals
+    }
+  end
+
+  # Build initial simplex with n+1 vertices
+  defnp build_initial_simplex(x0, step) do
+    {n} = Nx.shape(x0)
+    simplex_shape = {n + 1, n}
+
+    # Broadcast x0 to all rows
+    base = Nx.broadcast(x0, simplex_shape)
+
+    # Create identity-like steps for vertices 1 to n
+    indices = Nx.iota(simplex_shape, axis: 0)
+    col_indices = Nx.iota(simplex_shape, axis: 1)
+
+    # For row i (i > 0), add step to column i-1
+    should_add = Nx.equal(Nx.subtract(indices, 1), col_indices)
+
+    # Step size: use step * |x0[j]| if x0[j] != 0, else step
+    x0_abs = Nx.abs(x0)
+    step_sizes = Nx.select(Nx.greater(x0_abs, 1.0e-10), Nx.multiply(step, x0_abs), step)
+
+    # Broadcast step_sizes to simplex shape
+    step_matrix = Nx.broadcast(step_sizes, simplex_shape)
+
+    # Add steps where appropriate
+    Nx.select(should_add, Nx.add(base, step_matrix), base)
+  end
+
+  # Evaluate function at all simplex vertices
+  defnp evaluate_all_vertices(simplex, fun) do
+    {n_plus_1, _n} = Nx.shape(simplex)
+
+    # Evaluate first vertex to get type
+    f0 = fun.(simplex[0])
+    init_f = Nx.broadcast(f0, {n_plus_1})
+
+    {f_values, _} =
+      while {f_values = init_f, {simplex, i = Nx.u32(0)}}, i < n_plus_1 do
+        vertex = simplex[i]
+        f_val = fun.(vertex)
+        f_values = Nx.indexed_put(f_values, Nx.new_axis(i, 0), f_val)
+        {f_values, {simplex, i + 1}}
+      end
+
+    f_values
+  end
+
+  # Check convergence: std of function values < tol
+  defnp converged?(state, tol) do
+    f_values = state.f_values
+    mean = Nx.mean(f_values)
+    variance = Nx.mean(Nx.pow(Nx.subtract(f_values, mean), 2))
+    std = Nx.sqrt(variance)
+    Nx.less(std, tol)
+  end
+
+  # Perform one Nelder-Mead iteration
+  defnp nelder_mead_step(state, fun) do
+    simplex = state.simplex
+    f_values = state.f_values
+    {n_plus_1, _n} = Nx.shape(simplex)
+
+    # Sort vertices by function value
+    order = Nx.argsort(f_values)
+    sorted_simplex = Nx.take(simplex, order, axis: 0)
+    sorted_f = Nx.take(f_values, order)
+
+    # Best, second-worst, and worst points
+    f_best = sorted_f[0]
+    f_second_worst = sorted_f[n_plus_1 - 2]
+    x_worst = sorted_simplex[n_plus_1 - 1]
+    f_worst = sorted_f[n_plus_1 - 1]
+
+    # Compute centroid of all points except worst
+    centroid = Nx.mean(sorted_simplex[0..-2//1], axes: [0])
+
+    # Reflection: x_r = centroid + rho * (centroid - x_worst)
+    x_r = Nx.add(centroid, Nx.multiply(@rho, Nx.subtract(centroid, x_worst)))
+    f_r = fun.(x_r)
+
+    # Expansion: x_e = centroid + chi * (x_r - centroid)
+    x_e = Nx.add(centroid, Nx.multiply(@chi, Nx.subtract(x_r, centroid)))
+    f_e = fun.(x_e)
+
+    # Outside contraction: x_oc = centroid + gamma * (x_r - centroid)
+    x_oc = Nx.add(centroid, Nx.multiply(@gamma, Nx.subtract(x_r, centroid)))
+    f_oc = fun.(x_oc)
+
+    # Inside contraction: x_ic = centroid - gamma * (centroid - x_worst)
+    x_ic = Nx.subtract(centroid, Nx.multiply(@gamma, Nx.subtract(centroid, x_worst)))
+    f_ic = fun.(x_ic)
+
+    # Decide action based on f_r
+    {new_simplex, new_f_values, extra_evals} =
+      nelder_mead_update(
+        sorted_simplex,
+        sorted_f,
+        x_r,
+        f_r,
+        x_e,
+        f_e,
+        x_oc,
+        f_oc,
+        x_ic,
+        f_ic,
+        f_best,
+        f_second_worst,
+        f_worst,
+        fun
+      )
+
+    %{
+      state
+      | simplex: new_simplex,
+        f_values: new_f_values,
+        iter: Nx.add(state.iter, 1),
+        # 4 base evals (f_r, f_e, f_oc, f_ic) + extra for shrink
+        f_evals: Nx.add(state.f_evals, Nx.add(4, extra_evals))
+    }
+  end
+
+  # Update simplex based on reflection result
+  defnp nelder_mead_update(
+          sorted_simplex,
+          sorted_f,
+          x_r,
+          f_r,
+          x_e,
+          f_e,
+          x_oc,
+          f_oc,
+          x_ic,
+          f_ic,
+          f_best,
+          f_second_worst,
+          f_worst,
+          fun
+        ) do
+    {n_plus_1, n} = Nx.shape(sorted_simplex)
+
+    # Case 1: f_r < f_best -> try expansion
+    # Case 2: f_best <= f_r < f_second_worst -> accept reflection
+    # Case 3: f_second_worst <= f_r < f_worst -> outside contraction
+    # Case 4: f_r >= f_worst -> inside contraction
+    # If contraction fails -> shrink
+
+    # Determine which case applies
+    case1 = Nx.less(f_r, f_best)
+    case2 = Nx.logical_and(Nx.greater_equal(f_r, f_best), Nx.less(f_r, f_second_worst))
+    case3 = Nx.logical_and(Nx.greater_equal(f_r, f_second_worst), Nx.less(f_r, f_worst))
+    case4 = Nx.greater_equal(f_r, f_worst)
+
+    # Case 1: expansion - use x_e if better than x_r, else use x_r
+    use_expansion = Nx.logical_and(case1, Nx.less(f_e, f_r))
+    x_case1 = Nx.select(use_expansion, x_e, x_r)
+    f_case1 = Nx.select(use_expansion, f_e, f_r)
+
+    # Case 2: accept reflection
+    x_case2 = x_r
+    f_case2 = f_r
+
+    # Case 3: outside contraction - use if better than x_r
+    contraction_success_3 = Nx.less_equal(f_oc, f_r)
+
+    # Case 4: inside contraction - use if better than x_worst
+    contraction_success_4 = Nx.less(f_ic, f_worst)
+
+    # Determine if we need to shrink
+    need_shrink =
+      Nx.logical_or(
+        Nx.logical_and(case3, Nx.logical_not(contraction_success_3)),
+        Nx.logical_and(case4, Nx.logical_not(contraction_success_4))
+      )
+
+    # Select point for non-shrink cases
+    x_new_non_shrink =
+      Nx.select(
+        case1,
+        x_case1,
+        Nx.select(
+          case2,
+          x_case2,
+          Nx.select(case3, x_oc, x_ic)
+        )
+      )
+
+    f_new_non_shrink =
+      Nx.select(
+        case1,
+        f_case1,
+        Nx.select(
+          case2,
+          f_case2,
+          Nx.select(case3, f_oc, f_ic)
+        )
+      )
+
+    # Replace worst point (last row) with new point
+    # Use indexed_put with proper indices
+    worst_idx = Nx.u32(n_plus_1 - 1)
+    row_indices = Nx.broadcast(worst_idx, {n})
+    col_indices = Nx.iota({n}, type: :u32)
+    indices = Nx.stack([row_indices, col_indices], axis: 1)
+
+    new_simplex_replace = Nx.indexed_put(sorted_simplex, indices, x_new_non_shrink)
+    new_f_replace = Nx.indexed_put(sorted_f, Nx.new_axis(worst_idx, 0), f_new_non_shrink)
+
+    # Shrink simplex: x_i = x_best + sigma * (x_i - x_best)
+    x_best = sorted_simplex[0]
+
+    shrunk_simplex =
+      Nx.add(x_best, Nx.multiply(@sigma, Nx.subtract(sorted_simplex, x_best)))
+
+    # Evaluate shrunk simplex (except x_best which stays the same)
+    shrunk_f = evaluate_shrunk_vertices(shrunk_simplex, sorted_f, fun)
+
+    # Select between shrink and non-shrink
+    final_simplex = Nx.select(need_shrink, shrunk_simplex, new_simplex_replace)
+    final_f = Nx.select(need_shrink, shrunk_f, new_f_replace)
+
+    # Extra function evaluations for shrink: n evals (all except best)
+    extra_evals = Nx.select(need_shrink, Nx.u32(n), Nx.u32(0))
+
+    {final_simplex, final_f, extra_evals}
+  end
+
+  # Evaluate shrunk simplex vertices (skip first which is unchanged)
+  defnp evaluate_shrunk_vertices(shrunk_simplex, original_f, fun) do
+    {n_plus_1, _n} = Nx.shape(shrunk_simplex)
+
+    {shrunk_f, _} =
+      while {shrunk_f = original_f, {shrunk_simplex, i = Nx.u32(1)}}, i < n_plus_1 do
+        vertex = shrunk_simplex[i]
+        f_val = fun.(vertex)
+        shrunk_f = Nx.indexed_put(shrunk_f, Nx.new_axis(i, 0), f_val)
+        {shrunk_f, {shrunk_simplex, i + 1}}
+      end
+
+    shrunk_f
+  end
+end
diff --git a/notebooks/efficient_frontier.livemd b/notebooks/efficient_frontier.livemd
new file mode 100644
index 00000000..9cfe117b
--- /dev/null
+++ b/notebooks/efficient_frontier.livemd
@@ -0,0 +1,298 @@
+# Portfolio Optimization: Efficient Frontier
+
+```elixir
+Mix.install([
+  {:scholar, path: "."},
+  {:nx, "~> 0.9"},
+  {:kino_vega_lite, "~> 0.1"},
+  {:req, "~> 0.5"}
+])
+```
+
+## Introduction
+
+This notebook demonstrates portfolio optimization using Scholar's multivariate optimization algorithms. We'll compute the efficient frontier for a portfolio of stocks using the Markowitz mean-variance framework.
+
+The efficient frontier represents the set of portfolios that offer the highest expected return for a given level of risk, or equivalently, the lowest risk for a given expected return.
+
+## Fetch Stock Data
+
+We'll use historical stock data for a few major tech companies. For simplicity, we'll use pre-computed monthly returns.
+
+```elixir
+# Sample monthly returns data for 5 stocks (12 months)
+# In practice, you would fetch this from Yahoo Finance or another data source
+stock_names = ["AAPL", "GOOGL", "MSFT", "AMZN", "META"]
+
+# Monthly returns (12 months x 5 stocks) - simulated realistic data
+returns = Nx.tensor([
+  [0.05, 0.03, 0.04, 0.06, 0.02],
+  [-0.02, -0.01, 0.01, -0.03, -0.04],
+  [0.08, 0.06, 0.07, 0.09, 0.05],
+  [0.03, 0.02, 0.03, 0.04, 0.01],
+  [-0.01, 0.02, 0.01, -0.02, 0.03],
+  [0.04, 0.03, 0.05, 0.03, 0.04],
+  [0.02, 0.01, 0.02, 0.05, 0.02],
+  [-0.03, -0.02, -0.01, -0.04, -0.01],
+  [0.06, 0.04, 0.05, 0.07, 0.03],
+  [0.01, 0.02, 0.01, 0.02, 0.01],
+  [0.04, 0.03, 0.04, 0.05, 0.02],
+  [-0.01, 0.01, 0.00, -0.01, 0.02]
+], type: :f64)
+
+IO.puts("Returns matrix shape: #{inspect(Nx.shape(returns))}")
+IO.puts("Stocks: #{inspect(stock_names)}")
+```
+
+## Compute Expected Returns and Covariance
+
+```elixir
+# Expected returns (mean of historical returns)
+expected_returns = Nx.mean(returns, axes: [0])
+
+# Covariance matrix
+n_samples = Nx.axis_size(returns, 0)
+centered = Nx.subtract(returns, expected_returns)
+covariance = Nx.divide(Nx.dot(Nx.transpose(centered), centered), n_samples - 1)
+
+IO.puts("Expected Monthly Returns:")
+Enum.zip(stock_names, Nx.to_flat_list(expected_returns))
+|> Enum.each(fn {name, ret} ->
+  IO.puts("  #{name}: #{Float.round(ret * 100, 2)}%")
+end)
+
+IO.puts("\nCovariance Matrix:")
+covariance
+```
+
+## Portfolio Optimization Problem
+
+We want to find portfolio weights $w$ that minimize risk (variance) for a given target return, or equivalently, maximize the Sharpe ratio.
+
+The optimization problem is:
+- **Minimize:** $w^T \Sigma w$ (portfolio variance)
+- **Subject to:** $w^T \mu = r_{target}$ (target return) and $\sum w_i = 1$ (weights sum to 1)
+
+For the efficient frontier, we solve this for multiple target returns.
+
+### Unconstrained Formulation
+
+To handle constraints with unconstrained optimizers, we use a penalty method:
+
+$$\text{minimize } w^T \Sigma w - \lambda \cdot w^T \mu + \rho \cdot (1 - \sum w_i)^2$$
+
+where $\lambda$ controls the risk-return tradeoff and $\rho$ is a penalty parameter.
+
+```elixir
+alias Scholar.Optimize.BFGS
+
+defmodule Portfolio do
+  import Nx.Defn
+
+  @doc """
+  Portfolio objective function.
+
+  Minimizes variance while penalizing deviation from target return
+  and ensuring weights sum to 1.
+  """
+  def objective(cov, mu, lambda, penalty \\ 1000.0) do
+    fn w ->
+      # Portfolio variance: w' * Sigma * w
+      variance = Nx.dot(w, Nx.dot(cov, w))
+
+      # Portfolio return: w' * mu
+      portfolio_return = Nx.dot(w, mu)
+
+      # Constraint: weights sum to 1
+      sum_constraint = Nx.pow(Nx.subtract(Nx.sum(w), 1), 2)
+
+      # Objective: variance - lambda * return + penalty * constraint
+      Nx.add(
+        Nx.subtract(variance, Nx.multiply(lambda, portfolio_return)),
+        Nx.multiply(penalty, sum_constraint)
+      )
+    end
+  end
+
+  @doc """
+  Compute portfolio variance and return from weights.
+  """
+  def portfolio_stats(w, cov, mu) do
+    variance = Nx.dot(w, Nx.dot(cov, w)) |> Nx.to_number()
+    portfolio_return = Nx.dot(w, mu) |> Nx.to_number()
+    {portfolio_return, :math.sqrt(variance)}
+  end
+end
+
+IO.puts("Portfolio module defined.")
+```
+
+## Compute the Efficient Frontier
+
+We'll compute optimal portfolios for different values of $\lambda$ (risk-return tradeoff):
+
+```elixir
+# Range of lambda values (higher = more return-seeking)
+lambdas = [0.0, 0.5, 1.0, 2.0, 3.0, 5.0, 8.0, 12.0, 18.0, 25.0, 35.0, 50.0]
+
+# Initial weights (equal weighted)
+n_assets = Nx.axis_size(expected_returns, 0)
+w0 = Nx.broadcast(Nx.divide(1.0, n_assets), {n_assets}) |> Nx.as_type(:f64)
+
+# Compute optimal portfolios for each lambda
+efficient_portfolios = Enum.map(lambdas, fn lambda ->
+  objective = Portfolio.objective(covariance, expected_returns, lambda)
+
+  result = BFGS.minimize(w0, objective, gtol: 1.0e-8, maxiter: 500)
+
+  # Normalize weights to sum to 1
+  weights = Nx.divide(result.x, Nx.sum(result.x))
+
+  {ret, risk} = Portfolio.portfolio_stats(weights, covariance, expected_returns)
+
+  %{
+    lambda: lambda,
+    weights: Nx.to_list(weights),
+    return: ret * 12,  # Annualized
+    risk: risk * :math.sqrt(12),  # Annualized
+    converged: Nx.to_number(result.converged) == 1
+  }
+end)
+
+IO.puts("Efficient Frontier Portfolios:")
+IO.puts("")
+IO.puts("Lambda | Ann. Return | Ann. Risk | Converged")
+IO.puts("-------|-------------|-----------|----------")
+
+Enum.each(efficient_portfolios, fn p ->
+  ret_pct = Float.round(p.return * 100, 2)
+  risk_pct = Float.round(p.risk * 100, 2)
+  IO.puts("#{String.pad_leading("#{p.lambda}", 6)} | #{String.pad_leading("#{ret_pct}%", 11)} | #{String.pad_leading("#{risk_pct}%", 9)} | #{p.converged}")
+end)
+```
+
+## Visualize the Efficient Frontier
+
+```elixir
+alias VegaLite, as: Vl
+
+# Prepare data for plotting
+frontier_data = Enum.map(efficient_portfolios, fn p ->
+  %{
+    "Risk (%)" => Float.round(p.risk * 100, 2),
+    "Return (%)" => Float.round(p.return * 100, 2),
+    "Lambda" => p.lambda
+  }
+end)
+
+Vl.new(width: 600, height: 400, title: "Efficient Frontier")
+|> Vl.data_from_values(frontier_data)
+|> Vl.mark(:line, point: true)
+|> Vl.encode_field(:x, "Risk (%)", type: :quantitative, title: "Annualized Risk (%)")
+|> Vl.encode_field(:y, "Return (%)", type: :quantitative, title: "Annualized Return (%)")
+|> Vl.encode_field(:tooltip, "Lambda", type: :quantitative)
+```
+
+## Optimal Portfolio Weights
+
+Let's examine the weights for a balanced portfolio (middle of the frontier):
+
+```elixir
+# Select a balanced portfolio (lambda = 5.0)
+balanced = Enum.find(efficient_portfolios, fn p -> p.lambda == 5.0 end)
+
+IO.puts("Balanced Portfolio (λ = 5.0):")
+IO.puts("  Annualized Return: #{Float.round(balanced.return * 100, 2)}%")
+IO.puts("  Annualized Risk: #{Float.round(balanced.risk * 100, 2)}%")
+IO.puts("")
+IO.puts("Weights:")
+
+Enum.zip(stock_names, balanced.weights)
+|> Enum.each(fn {name, weight} ->
+  pct = Float.round(weight * 100, 1)
+  bar = String.duplicate("█", round(pct / 2))
+  IO.puts("  #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)} #{bar}")
+end)
+```
+
+## Compare BFGS vs Nelder-Mead
+
+Let's compare the two optimization methods on portfolio optimization:
+
+```elixir
+alias Scholar.Optimize.{BFGS, NelderMead}
+
+lambda = 5.0
+objective = Portfolio.objective(covariance, expected_returns, lambda)
+
+# BFGS
+bfgs_result = BFGS.minimize(w0, objective, gtol: 1.0e-8, maxiter: 500)
+bfgs_weights = Nx.divide(bfgs_result.x, Nx.sum(bfgs_result.x))
+{bfgs_ret, bfgs_risk} = Portfolio.portfolio_stats(bfgs_weights, covariance, expected_returns)
+
+# Nelder-Mead
+nm_result = NelderMead.minimize(w0, objective, tol: 1.0e-10, maxiter: 1000)
+nm_weights = Nx.divide(nm_result.x, Nx.sum(nm_result.x))
+{nm_ret, nm_risk} = Portfolio.portfolio_stats(nm_weights, covariance, expected_returns)
+
+IO.puts("Portfolio Optimization Comparison (λ = #{lambda}):")
+IO.puts("")
+IO.puts("Method       | Return (Ann) | Risk (Ann) | Fun Evals | Converged")
+IO.puts("-------------|--------------|------------|-----------|----------")
+IO.puts("BFGS         | #{String.pad_leading("#{Float.round(bfgs_ret * 1200, 2)}%", 12)} | #{String.pad_leading("#{Float.round(bfgs_risk * 100 * :math.sqrt(12), 2)}%", 10)} | #{String.pad_leading("#{Nx.to_number(bfgs_result.fun_evals)}", 9)} | #{Nx.to_number(bfgs_result.converged) == 1}")
+IO.puts("Nelder-Mead  | #{String.pad_leading("#{Float.round(nm_ret * 1200, 2)}%", 12)} | #{String.pad_leading("#{Float.round(nm_risk * 100 * :math.sqrt(12), 2)}%", 10)} | #{String.pad_leading("#{Nx.to_number(nm_result.fun_evals)}", 9)} | #{Nx.to_number(nm_result.converged) == 1}")
+```
+
+## Minimum Variance Portfolio
+
+The minimum variance portfolio is found by setting $\lambda = 0$:
+
+```elixir
+min_var = Enum.find(efficient_portfolios, fn p -> p.lambda == 0.0 end)
+
+IO.puts("Minimum Variance Portfolio:")
+IO.puts("  Annualized Return: #{Float.round(min_var.return * 100, 2)}%")
+IO.puts("  Annualized Risk: #{Float.round(min_var.risk * 100, 2)}%")
+IO.puts("")
+IO.puts("Weights:")
+
+Enum.zip(stock_names, min_var.weights)
+|> Enum.each(fn {name, weight} ->
+  pct = Float.round(weight * 100, 1)
+  IO.puts("  #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)}")
+end)
+```
+
+## Maximum Return Portfolio
+
+The maximum return portfolio is found with a high $\lambda$ value:
+
+```elixir
+max_ret = Enum.max_by(efficient_portfolios, fn p -> p.return end)
+
+IO.puts("Maximum Return Portfolio (λ = #{max_ret.lambda}):")
+IO.puts("  Annualized Return: #{Float.round(max_ret.return * 100, 2)}%")
+IO.puts("  Annualized Risk: #{Float.round(max_ret.risk * 100, 2)}%")
+IO.puts("")
+IO.puts("Weights:")
+
+Enum.zip(stock_names, max_ret.weights)
+|> Enum.each(fn {name, weight} ->
+  pct = Float.round(weight * 100, 1)
+  IO.puts("  #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)}")
+end)
+```
+
+## Summary
+
+This notebook demonstrated:
+
+1. **Portfolio optimization** using Markowitz mean-variance framework
+2. **Efficient frontier** computation using BFGS optimization
+3. **Penalty methods** to handle equality constraints with unconstrained optimizers
+4. **Comparison** between BFGS and Nelder-Mead for portfolio optimization
+
+Scholar's optimization algorithms are well-suited for financial applications:
+- **BFGS** provides fast convergence for smooth objectives
+- **Nelder-Mead** works when gradients are unavailable or noisy
+- Both are JIT-compatible for high performance
diff --git a/notebooks/optimize.livemd b/notebooks/optimize.livemd
index 167053b8..52f5b0d9 100644
--- a/notebooks/optimize.livemd
+++ b/notebooks/optimize.livemd
@@ -1,4 +1,4 @@
-# Scalar Optimization with Scholar.Optimize
+# Optimization with Scholar.Optimize
 
 ```elixir
 Mix.install([
@@ -10,6 +10,18 @@ Mix.install([
 
 ## Introduction
 
+Scholar provides optimization algorithms for both scalar (univariate) and multivariate functions:
+
+**Scalar Optimization:**
+- **`Scholar.Optimize.Brent`** - Brent's method (recommended for 1D)
+- **`Scholar.Optimize.GoldenSection`** - Golden Section search
+
+**Multivariate Optimization:**
+- **`Scholar.Optimize.BFGS`** - Quasi-Newton method with gradients (recommended)
+- **`Scholar.Optimize.NelderMead`** - Derivative-free simplex method
+
+## Scalar Optimization
+
 Scholar provides two algorithms for scalar (univariate) function minimization:
 
 - **`Scholar.Optimize.Brent`** - Brent's method (recommended)
@@ -283,3 +295,120 @@ Both methods:
 - Require no derivatives
 - Are JIT/GPU compatible
 - Guarantee convergence within the specified bracket
+
+## Multivariate Optimization
+
+For functions of multiple variables, Scholar provides two algorithms:
+
+### BFGS (Recommended)
+
+BFGS (Broyden-Fletcher-Goldfarb-Shanno) is a quasi-Newton method that uses automatic differentiation to compute gradients. It's the recommended choice for smooth, differentiable functions.
+
+```elixir
+alias Scholar.Optimize.BFGS
+
+# Minimize a simple quadratic: f(x) = x1^2 + x2^2
+fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+x0 = Nx.tensor([1.0, 2.0])
+
+result = BFGS.minimize(x0, fun)
+
+IO.puts("BFGS Result:")
+IO.puts("  x = #{inspect(Nx.to_list(result.x))}")
+IO.puts("  f(x) = #{Nx.to_number(result.fun)}")
+IO.puts("  Converged: #{Nx.to_number(result.converged) == 1}")
+IO.puts("  Iterations: #{Nx.to_number(result.iterations)}")
+IO.puts("  Function evals: #{Nx.to_number(result.fun_evals)}")
+IO.puts("  Gradient evals: #{Nx.to_number(result.grad_evals)}")
+```
+
+### Nelder-Mead
+
+Nelder-Mead is a derivative-free method (simplex algorithm). Use it when gradients are unavailable or expensive to compute.
+
+```elixir
+alias Scholar.Optimize.NelderMead
+
+# Same function, but without using gradients
+fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+x0 = Nx.tensor([1.0, 2.0])
+
+result = NelderMead.minimize(x0, fun)
+
+IO.puts("Nelder-Mead Result:")
+IO.puts("  x = #{inspect(Nx.to_list(result.x))}")
+IO.puts("  f(x) = #{Nx.to_number(result.fun)}")
+IO.puts("  Converged: #{Nx.to_number(result.converged) == 1}")
+IO.puts("  Iterations: #{Nx.to_number(result.iterations)}")
+IO.puts("  Function evals: #{Nx.to_number(result.fun_evals)}")
+```
+
+### The Rosenbrock Function
+
+The Rosenbrock function is a classic optimization test case. It has a global minimum at $(1, 1)$ inside a narrow, curved valley:
+
+$$f(x, y) = (1 - x)^2 + 100(y - x^2)^2$$
+
+```elixir
+alias Scholar.Optimize.{BFGS, NelderMead}
+
+# Rosenbrock function
+rosenbrock = fn x ->
+  x0 = x[0]
+  x1 = x[1]
+  term1 = Nx.pow(Nx.subtract(1, x0), 2)
+  term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2))
+  Nx.add(term1, term2)
+end
+
+x0 = Nx.tensor([0.0, 0.0], type: :f64)
+
+# Solve with BFGS
+bfgs_result = BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000)
+
+# Solve with Nelder-Mead
+nm_result = NelderMead.minimize(x0, rosenbrock, tol: 1.0e-10, maxiter: 2000)
+
+IO.puts("Rosenbrock minimum at (1, 1):")
+IO.puts("")
+IO.puts("BFGS:")
+IO.puts("  x = #{inspect(Nx.to_list(bfgs_result.x))}")
+IO.puts("  Function evals: #{Nx.to_number(bfgs_result.fun_evals)}")
+IO.puts("")
+IO.puts("Nelder-Mead:")
+IO.puts("  x = #{inspect(Nx.to_list(nm_result.x))}")
+IO.puts("  Function evals: #{Nx.to_number(nm_result.fun_evals)}")
+```
+
+### BFGS vs Nelder-Mead
+
+BFGS typically converges much faster for smooth functions because it uses gradient information:
+
+```elixir
+alias Scholar.Optimize.{BFGS, NelderMead}
+
+# 3D sphere function
+fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+x0 = Nx.tensor([1.0, 2.0, 3.0])
+
+bfgs_result = BFGS.minimize(x0, fun, gtol: 1.0e-6)
+nm_result = NelderMead.minimize(x0, fun, tol: 1.0e-6)
+
+IO.puts("3D Sphere function minimization:")
+IO.puts("  BFGS:        #{Nx.to_number(bfgs_result.fun_evals)} function evals")
+IO.puts("  Nelder-Mead: #{Nx.to_number(nm_result.fun_evals)} function evals")
+IO.puts("")
+IO.puts("BFGS speedup: #{Float.round(Nx.to_number(nm_result.fun_evals) / Nx.to_number(bfgs_result.fun_evals), 1)}x fewer evaluations")
+```
+
+### When to Use Which Method
+
+| Method | Use Case |
+|--------|----------|
+| **BFGS** | Default choice for smooth, differentiable functions |
+| **Nelder-Mead** | Non-differentiable functions, noisy functions, or when gradients are expensive |
+
+Both multivariate methods:
+- Work with functions of any number of variables
+- Are JIT/GPU compatible
+- Return similar result structs with `x`, `fun`, `converged`, `iterations`, `fun_evals`
diff --git a/test/scholar/optimize/bfgs_test.exs b/test/scholar/optimize/bfgs_test.exs
new file mode 100644
index 00000000..98b90cb6
--- /dev/null
+++ b/test/scholar/optimize/bfgs_test.exs
@@ -0,0 +1,178 @@
+defmodule Scholar.Optimize.BFGSTest do
+  use Scholar.Case, async: true
+  alias Scholar.Optimize.BFGS
+  doctest BFGS
+
+  describe "minimize/3" do
+    test "minimizes sphere function" do
+      # f(x) = sum(x^2), minimum at origin
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0, 3.0])
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-6, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0]), atol: 1.0e-4)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8)
+    end
+
+    test "minimizes Rosenbrock function" do
+      # f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1)
+      rosenbrock = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        term1 = Nx.pow(Nx.subtract(1, x0), 2)
+        term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2))
+        Nx.add(term1, term2)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0], type: :f64)
+
+      result = BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-6)
+    end
+
+    test "minimizes Booth function" do
+      # f(x,y) = (x + 2y - 7)^2 + (2x + y - 5)^2, minimum at (1, 3)
+      booth = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        # (x + 2y - 7)^2
+        term1 = Nx.pow(Nx.subtract(Nx.add(x0, Nx.multiply(2, x1)), 7), 2)
+        # (2x + y - 5)^2
+        term2 = Nx.pow(Nx.subtract(Nx.add(Nx.multiply(2, x0), x1), 5), 2)
+        Nx.add(term1, term2)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0])
+
+      result = BFGS.minimize(x0, booth, gtol: 1.0e-6, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([1.0, 3.0]), atol: 1.0e-4)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8)
+    end
+
+    test "minimizes Beale function" do
+      # f(x,y) = (1.5 - x + xy)^2 + (2.25 - x + xy^2)^2 + (2.625 - x + xy^3)^2
+      # minimum at (3, 0.5)
+      beale = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+
+        # 1.5 - x + xy
+        t1 = Nx.add(Nx.subtract(1.5, x0), Nx.multiply(x0, x1))
+        term1 = Nx.pow(t1, 2)
+
+        # 2.25 - x + xy^2
+        t2 = Nx.add(Nx.subtract(2.25, x0), Nx.multiply(x0, Nx.pow(x1, 2)))
+        term2 = Nx.pow(t2, 2)
+
+        # 2.625 - x + xy^3
+        t3 = Nx.add(Nx.subtract(2.625, x0), Nx.multiply(x0, Nx.pow(x1, 3)))
+        term3 = Nx.pow(t3, 2)
+
+        Nx.add(Nx.add(term1, term2), term3)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0], type: :f64)
+
+      result = BFGS.minimize(x0, beale, gtol: 1.0e-8, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([3.0, 0.5]), atol: 1.0e-3)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-6)
+    end
+
+    test "minimizes shifted quadratic" do
+      # f(x) = (x1 - 2)^2 + (x2 + 3)^2 + (x3 - 1)^2, minimum at (2, -3, 1)
+      fun = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        x2 = x[2]
+        term1 = Nx.pow(Nx.subtract(x0, 2), 2)
+        term2 = Nx.pow(Nx.add(x1, 3), 2)
+        term3 = Nx.pow(Nx.subtract(x2, 1), 2)
+        Nx.add(Nx.add(term1, term2), term3)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0, 0.0])
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-6, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([2.0, -3.0, 1.0]), atol: 1.0e-4)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8)
+    end
+
+    test "respects maxiter limit" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([100.0, 100.0])
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-15, maxiter: 5)
+
+      assert Nx.to_number(result.iterations) <= 5
+    end
+
+    test "works with jit_apply" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0])
+      opts = [gtol: 1.0e-6, maxiter: 500]
+
+      result = Nx.Defn.jit_apply(&BFGS.minimize/3, [x0, fun, opts])
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4)
+    end
+
+    test "returns correct struct" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0])
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-5, maxiter: 500)
+
+      assert %Scholar.Optimize.BFGS{} = result
+      assert Map.has_key?(result, :grad_evals)
+    end
+
+    test "achieves higher precision with f64" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0], type: :f64)
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-12, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-10)
+    end
+
+    test "handles higher dimensions" do
+      # 5-dimensional sphere function
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
+
+      result = BFGS.minimize(x0, fun, gtol: 1.0e-5, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0, 0.0, 0.0]), atol: 1.0e-3)
+    end
+
+    test "converges faster than Nelder-Mead on smooth functions" do
+      # BFGS should use significantly fewer function evaluations
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0, 3.0])
+
+      bfgs_result = BFGS.minimize(x0, fun, gtol: 1.0e-6)
+      nm_result = Scholar.Optimize.NelderMead.minimize(x0, fun, tol: 1.0e-6)
+
+      # Both should converge
+      assert Nx.to_number(bfgs_result.converged) == 1
+      assert Nx.to_number(nm_result.converged) == 1
+
+      # BFGS should use fewer function evaluations for this smooth problem
+      assert Nx.to_number(bfgs_result.fun_evals) < Nx.to_number(nm_result.fun_evals)
+    end
+  end
+end
diff --git a/test/scholar/optimize/nelder_mead_test.exs b/test/scholar/optimize/nelder_mead_test.exs
new file mode 100644
index 00000000..f3b8cd6e
--- /dev/null
+++ b/test/scholar/optimize/nelder_mead_test.exs
@@ -0,0 +1,171 @@
+defmodule Scholar.Optimize.NelderMeadTest do
+  use Scholar.Case, async: true
+  alias Scholar.Optimize.NelderMead
+  doctest NelderMead
+
+  describe "minimize/3" do
+    test "minimizes sphere function" do
+      # f(x) = sum(x^2), minimum at origin
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0, 3.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-6, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0]), atol: 1.0e-2)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4)
+    end
+
+    test "minimizes Rosenbrock function" do
+      # f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1)
+      rosenbrock = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        term1 = Nx.pow(Nx.subtract(1, x0), 2)
+        term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2))
+        Nx.add(term1, term2)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0], type: :f64)
+
+      result = NelderMead.minimize(x0, rosenbrock, tol: 1.0e-10, maxiter: 2000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-3)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4)
+    end
+
+    test "minimizes Booth function" do
+      # f(x,y) = (x + 2y - 7)^2 + (2x + y - 5)^2, minimum at (1, 3)
+      booth = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        # (x + 2y - 7)^2
+        term1 = Nx.pow(Nx.subtract(Nx.add(x0, Nx.multiply(2, x1)), 7), 2)
+        # (2x + y - 5)^2
+        term2 = Nx.pow(Nx.subtract(Nx.add(Nx.multiply(2, x0), x1), 5), 2)
+        Nx.add(term1, term2)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0])
+
+      result = NelderMead.minimize(x0, booth, tol: 1.0e-8, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([1.0, 3.0]), atol: 1.0e-2)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4)
+    end
+
+    test "minimizes Beale function" do
+      # f(x,y) = (1.5 - x + xy)^2 + (2.25 - x + xy^2)^2 + (2.625 - x + xy^3)^2
+      # minimum at (3, 0.5)
+      beale = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+
+        # 1.5 - x + xy
+        t1 = Nx.add(Nx.subtract(1.5, x0), Nx.multiply(x0, x1))
+        term1 = Nx.pow(t1, 2)
+
+        # 2.25 - x + xy^2
+        t2 = Nx.add(Nx.subtract(2.25, x0), Nx.multiply(x0, Nx.pow(x1, 2)))
+        term2 = Nx.pow(t2, 2)
+
+        # 2.625 - x + xy^3
+        t3 = Nx.add(Nx.subtract(2.625, x0), Nx.multiply(x0, Nx.pow(x1, 3)))
+        term3 = Nx.pow(t3, 2)
+
+        Nx.add(Nx.add(term1, term2), term3)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0], type: :f64)
+
+      result = NelderMead.minimize(x0, beale, tol: 1.0e-10, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([3.0, 0.5]), atol: 1.0e-2)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4)
+    end
+
+    test "minimizes shifted quadratic" do
+      # f(x) = (x1 - 2)^2 + (x2 + 3)^2 + (x3 - 1)^2, minimum at (2, -3, 1)
+      fun = fn x ->
+        x0 = x[0]
+        x1 = x[1]
+        x2 = x[2]
+        term1 = Nx.pow(Nx.subtract(x0, 2), 2)
+        term2 = Nx.pow(Nx.add(x1, 3), 2)
+        term3 = Nx.pow(Nx.subtract(x2, 1), 2)
+        Nx.add(Nx.add(term1, term2), term3)
+      end
+
+      x0 = Nx.tensor([0.0, 0.0, 0.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-6, maxiter: 500)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([2.0, -3.0, 1.0]), atol: 1.0e-2)
+      assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4)
+    end
+
+    test "respects maxiter limit" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([100.0, 100.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-10, maxiter: 5)
+
+      assert Nx.to_number(result.iterations) <= 5
+    end
+
+    test "works with jit_apply" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0])
+      opts = [tol: 1.0e-6, maxiter: 500]
+
+      result = Nx.Defn.jit_apply(&NelderMead.minimize/3, [x0, fun, opts])
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2)
+    end
+
+    test "returns correct struct" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-5, maxiter: 500)
+
+      assert %Scholar.Optimize.NelderMead{} = result
+    end
+
+    test "achieves higher precision with f64" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0], type: :f64)
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-12, maxiter: 1000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4)
+    end
+
+    test "handles higher dimensions" do
+      # 5-dimensional sphere function
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0, 3.0, 4.0, 5.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-5, maxiter: 2000)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0, 0.0, 0.0]), atol: 0.1)
+    end
+
+    test "custom initial_simplex_step" do
+      fun = fn x -> Nx.sum(Nx.pow(x, 2)) end
+      x0 = Nx.tensor([1.0, 2.0])
+
+      result = NelderMead.minimize(x0, fun, tol: 1.0e-6, initial_simplex_step: 0.1)
+
+      assert Nx.to_number(result.converged) == 1
+      assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2)
+    end
+  end
+end