From f2404add767be035e4f2145a8a7aaca4d120ec07 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Fri, 30 Jan 2026 18:31:02 -0500 Subject: [PATCH] Add BFGS and Nelder-Mead multivariate optimization algorithms - BFGS: Quasi-Newton method with automatic differentiation - Nelder-Mead: Derivative-free simplex method - Update optimize.livemd with multivariate section - Add efficient_frontier.livemd portfolio optimization example --- lib/scholar/optimize/bfgs.ex | 297 ++++++++++++++ lib/scholar/optimize/nelder_mead.ex | 437 +++++++++++++++++++++ notebooks/efficient_frontier.livemd | 298 ++++++++++++++ notebooks/optimize.livemd | 131 +++++- test/scholar/optimize/bfgs_test.exs | 178 +++++++++ test/scholar/optimize/nelder_mead_test.exs | 171 ++++++++ 6 files changed, 1511 insertions(+), 1 deletion(-) create mode 100644 lib/scholar/optimize/bfgs.ex create mode 100644 lib/scholar/optimize/nelder_mead.ex create mode 100644 notebooks/efficient_frontier.livemd create mode 100644 test/scholar/optimize/bfgs_test.exs create mode 100644 test/scholar/optimize/nelder_mead_test.exs diff --git a/lib/scholar/optimize/bfgs.ex b/lib/scholar/optimize/bfgs.ex new file mode 100644 index 00000000..ea9d7eca --- /dev/null +++ b/lib/scholar/optimize/bfgs.ex @@ -0,0 +1,297 @@ +defmodule Scholar.Optimize.BFGS do + @moduledoc """ + BFGS (Broyden-Fletcher-Goldfarb-Shanno) algorithm for multivariate function minimization. + + BFGS is a quasi-Newton optimization method that approximates the inverse Hessian + matrix using gradient information. It is well-suited for smooth, differentiable + objective functions and typically converges much faster than derivative-free methods + like Nelder-Mead. + + ## Algorithm + + At each iteration, the algorithm: + 1. Computes the gradient using automatic differentiation + 2. Determines a search direction from the inverse Hessian approximation + 3. Performs a line search to find an acceptable step length + 4. Updates the inverse Hessian approximation using the BFGS formula + + ## Convergence + + The algorithm converges when the gradient norm falls below the specified tolerance. + + ## References + + * Nocedal, J. and Wright, S. J. (2006). "Numerical Optimization" + * Fletcher, R. (1987). "Practical Methods of Optimization" + """ + + import Nx.Defn + + @derive {Nx.Container, containers: [:x, :fun, :converged, :iterations, :fun_evals, :grad_evals]} + defstruct [:x, :fun, :converged, :iterations, :fun_evals, :grad_evals] + + @type t :: %__MODULE__{ + x: Nx.Tensor.t(), + fun: Nx.Tensor.t(), + converged: Nx.Tensor.t(), + iterations: Nx.Tensor.t(), + fun_evals: Nx.Tensor.t(), + grad_evals: Nx.Tensor.t() + } + + # Line search parameter (Armijo condition) + @c1 1.0e-4 + + opts = [ + gtol: [ + type: {:custom, Scholar.Options, :positive_number, []}, + default: 1.0e-5, + doc: """ + Gradient norm tolerance for convergence. The algorithm stops when + the infinity norm of the gradient is below this threshold. + """ + ], + maxiter: [ + type: :pos_integer, + default: 500, + doc: "Maximum number of iterations." + ] + ] + + @opts_schema NimbleOptions.new!(opts) + + @doc """ + Minimizes a multivariate function using the BFGS algorithm. + + BFGS is a gradient-based method that uses automatic differentiation to compute + gradients and approximates the inverse Hessian matrix for fast convergence. + + ## Arguments + + * `x0` - Initial guess as a 1-D tensor of shape `{n}`. + * `fun` - The objective function to minimize. Must be a defn-compatible function + that takes a 1-D tensor of shape `{n}` and returns a scalar tensor. + * `opts` - Options (see below). + + ## Options + + #{NimbleOptions.docs(@opts_schema)} + + ## Returns + + A `Scholar.Optimize.BFGS` struct with the optimization result: + + * `:x` - The optimal point found (shape `{n}`) + * `:fun` - The function value at the optimal point + * `:converged` - Whether the optimization converged (1 if true, 0 if false) + * `:iterations` - Number of iterations performed + * `:fun_evals` - Number of function evaluations + * `:grad_evals` - Number of gradient evaluations + + ## Examples + + iex> # Minimize a simple quadratic: f(x) = x1^2 + x2^2 + iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + iex> x0 = Nx.tensor([1.0, 2.0]) + iex> result = Scholar.Optimize.BFGS.minimize(x0, fun) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) |> Nx.to_number() + 1 + + For higher precision, use f64 tensors: + + iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + iex> x0 = Nx.tensor([1.0, 2.0], type: :f64) + iex> result = Scholar.Optimize.BFGS.minimize(x0, fun, gtol: 1.0e-10) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-8) |> Nx.to_number() + 1 + + Minimizing the Rosenbrock function: + + iex> # Rosenbrock: f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1) + iex> rosenbrock = fn x -> + ...> x0 = x[0] + ...> x1 = x[1] + ...> term1 = Nx.pow(Nx.subtract(1, x0), 2) + ...> term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2)) + ...> Nx.add(term1, term2) + ...> end + iex> x0 = Nx.tensor([0.0, 0.0], type: :f64) + iex> result = Scholar.Optimize.BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4) |> Nx.to_number() + 1 + """ + defn minimize(x0, fun, opts \\ []) do + {gtol, maxiter} = transform_opts(opts) + minimize_n(x0, fun, gtol, maxiter) + end + + deftransformp transform_opts(opts) do + opts = NimbleOptions.validate!(opts, @opts_schema) + {opts[:gtol], opts[:maxiter]} + end + + defnp minimize_n(x0, fun, gtol, maxiter) do + x0 = Nx.flatten(x0) + {n} = Nx.shape(x0) + + # Initial function value and gradient + {f0, g0} = value_and_grad(x0, fun) + + # Initialize inverse Hessian as identity matrix + h_inv = Nx.eye(n, type: Nx.type(x0)) + + # Initial state + initial_state = %{ + x: x0, + f: f0, + g: g0, + h_inv: h_inv, + iter: Nx.u32(0), + f_evals: Nx.u32(1), + g_evals: Nx.u32(1) + } + + # Main optimization loop + {final_state, _} = + while {state = initial_state, {gtol, maxiter}}, + not converged?(state, gtol) and state.iter < maxiter do + new_state = bfgs_step(state, fun) + {new_state, {gtol, maxiter}} + end + + # Check convergence + converged = converged?(final_state, gtol) + + %__MODULE__{ + x: final_state.x, + fun: final_state.f, + converged: converged, + iterations: final_state.iter, + fun_evals: final_state.f_evals, + grad_evals: final_state.g_evals + } + end + + # Check convergence: infinity norm of gradient < gtol + defnp converged?(state, gtol) do + grad_norm = Nx.reduce_max(Nx.abs(state.g)) + Nx.less(grad_norm, gtol) + end + + # Perform one BFGS iteration + defnp bfgs_step(state, fun) do + x = state.x + f = state.f + g = state.g + h_inv = state.h_inv + + # Compute search direction: p = -H_inv * g + p = Nx.negate(Nx.dot(h_inv, g)) + + # Line search to find step length alpha + {alpha, f_new, g_new, ls_f_evals, ls_g_evals} = + line_search(x, f, g, p, fun) + + # Compute step and gradient change + s = Nx.multiply(alpha, p) + x_new = Nx.add(x, s) + y = Nx.subtract(g_new, g) + + # Update inverse Hessian using BFGS formula + h_inv_new = update_inverse_hessian(h_inv, s, y) + + %{ + state + | x: x_new, + f: f_new, + g: g_new, + h_inv: h_inv_new, + iter: Nx.add(state.iter, 1), + f_evals: Nx.add(state.f_evals, ls_f_evals), + g_evals: Nx.add(state.g_evals, ls_g_evals) + } + end + + # Backtracking line search with Armijo condition (unrolled for defn compatibility) + defnp line_search(x, f, g, p, fun) do + # Directional derivative at alpha=0 + slope = Nx.dot(g, p) + + # Try step sizes: 1, 0.5, 0.25, 0.125, ... + # Unroll 10 iterations of backtracking + alpha = backtrack_step(x, f, p, slope, fun, 1.0) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + alpha = backtrack_step(x, f, p, slope, fun, alpha) + + # Evaluate function and gradient at final alpha + x_final = Nx.add(x, Nx.multiply(alpha, p)) + {f_final, g_final} = value_and_grad(x_final, fun) + + {alpha, f_final, g_final, Nx.u32(11), Nx.u32(1)} + end + + # Single backtracking step + defnp backtrack_step(x, f, p, slope, fun, alpha) do + x_trial = Nx.add(x, Nx.multiply(alpha, p)) + f_trial = fun.(x_trial) + + # Check Armijo condition + armijo_ok = + Nx.less_equal(f_trial, Nx.add(f, Nx.multiply(@c1, Nx.multiply(alpha, slope)))) + + # If satisfied, keep alpha; otherwise halve it + Nx.select(armijo_ok, alpha, Nx.multiply(0.5, alpha)) + end + + # BFGS inverse Hessian update + # H_new = (I - rho*s*y') * H * (I - rho*y*s') + rho*s*s' + defnp update_inverse_hessian(h_inv, s, y) do + {n} = Nx.shape(s) + + # rho = 1 / (y' * s) + ys = Nx.dot(y, s) + + # Skip update if ys is too small (would cause numerical issues) + skip_update = Nx.less(Nx.abs(ys), 1.0e-10) + + rho = Nx.select(skip_update, 0.0, Nx.divide(1.0, ys)) + + # Compute update terms + # I - rho*s*y' and I - rho*y*s' + eye = Nx.eye(n, type: Nx.type(s)) + + # s and y as column vectors for outer products + s_col = Nx.reshape(s, {n, 1}) + y_col = Nx.reshape(y, {n, 1}) + s_row = Nx.reshape(s, {1, n}) + y_row = Nx.reshape(y, {1, n}) + + # (I - rho*s*y') + left = Nx.subtract(eye, Nx.multiply(rho, Nx.dot(s_col, y_row))) + + # (I - rho*y*s') + right = Nx.subtract(eye, Nx.multiply(rho, Nx.dot(y_col, s_row))) + + # rho*s*s' + ss_term = Nx.multiply(rho, Nx.dot(s_col, s_row)) + + # H_new = left * H * right + ss_term + h_new = Nx.add(Nx.dot(Nx.dot(left, h_inv), right), ss_term) + + # Use old H if update skipped + Nx.select(skip_update, h_inv, h_new) + end +end diff --git a/lib/scholar/optimize/nelder_mead.ex b/lib/scholar/optimize/nelder_mead.ex new file mode 100644 index 00000000..173d0af6 --- /dev/null +++ b/lib/scholar/optimize/nelder_mead.ex @@ -0,0 +1,437 @@ +defmodule Scholar.Optimize.NelderMead do + @moduledoc """ + Nelder-Mead simplex algorithm for derivative-free multivariate function minimization. + + The Nelder-Mead algorithm (also known as the downhill simplex method) is a + derivative-free optimization technique for finding the minimum of a function + of multiple variables. It maintains a simplex of n+1 points in n-dimensional + space and iteratively updates the simplex by reflecting, expanding, contracting, + or shrinking based on function values. + + ## Algorithm + + At each iteration, the algorithm: + 1. Orders the simplex vertices by function value (best to worst) + 2. Computes the centroid of all vertices except the worst + 3. Attempts reflection of the worst point through the centroid + 4. Based on the reflected point's value: + - If best so far: try expansion + - If better than second-worst: accept reflection + - If worse: try contraction + - If contraction fails: shrink simplex toward best point + + ## Convergence + + The algorithm converges when the standard deviation of function values + at simplex vertices falls below the specified tolerance. + + ## References + + * Nelder, J. A. and Mead, R. (1965). "A simplex method for function minimization" + * Press, W. H., et al. "Numerical Recipes: The Art of Scientific Computing" + """ + + import Nx.Defn + + @derive {Nx.Container, containers: [:x, :fun, :converged, :iterations, :fun_evals]} + defstruct [:x, :fun, :converged, :iterations, :fun_evals] + + @type t :: %__MODULE__{ + x: Nx.Tensor.t(), + fun: Nx.Tensor.t(), + converged: Nx.Tensor.t(), + iterations: Nx.Tensor.t(), + fun_evals: Nx.Tensor.t() + } + + # Standard Nelder-Mead coefficients + @rho 1.0 + @chi 2.0 + @gamma 0.5 + @sigma 0.5 + + opts = [ + tol: [ + type: {:custom, Scholar.Options, :positive_number, []}, + default: 1.0e-5, + doc: """ + Tolerance for convergence. The algorithm stops when the standard deviation + of function values at simplex vertices is below this threshold. + """ + ], + maxiter: [ + type: :pos_integer, + default: 500, + doc: "Maximum number of iterations." + ], + initial_simplex_step: [ + type: {:custom, Scholar.Options, :positive_number, []}, + default: 0.05, + doc: """ + Step size for constructing the initial simplex. Each vertex (except the first) + is created by moving along one coordinate axis by this factor times the + corresponding coordinate value (or by this value if the coordinate is zero). + """ + ] + ] + + @opts_schema NimbleOptions.new!(opts) + + @doc """ + Minimizes a multivariate function using the Nelder-Mead simplex algorithm. + + This is a derivative-free method suitable for optimizing functions where + gradients are unavailable or expensive to compute. + + ## Arguments + + * `x0` - Initial guess as a 1-D tensor of shape `{n}`. + * `fun` - The objective function to minimize. Must be a defn-compatible function + that takes a 1-D tensor of shape `{n}` and returns a scalar tensor. + * `opts` - Options (see below). + + ## Options + + #{NimbleOptions.docs(@opts_schema)} + + ## Returns + + A `Scholar.Optimize.NelderMead` struct with the optimization result: + + * `:x` - The optimal point found (shape `{n}`) + * `:fun` - The function value at the optimal point + * `:converged` - Whether the optimization converged (1 if true, 0 if false) + * `:iterations` - Number of iterations performed + * `:fun_evals` - Number of function evaluations + + ## Examples + + iex> # Minimize a simple quadratic: f(x) = x1^2 + x2^2 + iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + iex> x0 = Nx.tensor([1.0, 2.0]) + iex> result = Scholar.Optimize.NelderMead.minimize(x0, fun) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2) |> Nx.to_number() + 1 + + For higher precision, use f64 tensors: + + iex> fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + iex> x0 = Nx.tensor([1.0, 2.0], type: :f64) + iex> result = Scholar.Optimize.NelderMead.minimize(x0, fun, tol: 1.0e-12) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) |> Nx.to_number() + 1 + + Minimizing the Rosenbrock function: + + iex> # Rosenbrock: f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1) + iex> rosenbrock = fn x -> + ...> x0 = x[0] + ...> x1 = x[1] + ...> term1 = Nx.pow(Nx.subtract(1, x0), 2) + ...> term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2)) + ...> Nx.add(term1, term2) + ...> end + iex> x0 = Nx.tensor([0.0, 0.0], type: :f64) + iex> result = Scholar.Optimize.NelderMead.minimize(x0, rosenbrock, tol: 1.0e-8, maxiter: 1000) + iex> Nx.to_number(result.converged) + 1 + iex> Nx.all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4) |> Nx.to_number() + 1 + """ + defn minimize(x0, fun, opts \\ []) do + {tol, maxiter, initial_simplex_step} = transform_opts(opts) + minimize_n(x0, fun, tol, maxiter, initial_simplex_step) + end + + deftransformp transform_opts(opts) do + opts = NimbleOptions.validate!(opts, @opts_schema) + {opts[:tol], opts[:maxiter], opts[:initial_simplex_step]} + end + + defnp minimize_n(x0, fun, tol, maxiter, initial_simplex_step) do + x0 = Nx.flatten(x0) + {n} = Nx.shape(x0) + + # Build initial simplex: n+1 vertices + simplex = build_initial_simplex(x0, initial_simplex_step) + + # Evaluate function at all simplex vertices + f_values = evaluate_all_vertices(simplex, fun) + + # Initial state + initial_state = %{ + simplex: simplex, + f_values: f_values, + iter: Nx.u32(0), + f_evals: Nx.u32(n + 1) + } + + # Main optimization loop + {final_state, _} = + while {state = initial_state, {tol, maxiter}}, + not converged?(state, tol) and state.iter < maxiter do + new_state = nelder_mead_step(state, fun) + {new_state, {tol, maxiter}} + end + + # Get best point + order = Nx.argsort(final_state.f_values) + best_idx = order[0] + x_opt = final_state.simplex[best_idx] + f_opt = final_state.f_values[best_idx] + + # Check convergence + converged = converged?(final_state, tol) + + %__MODULE__{ + x: x_opt, + fun: f_opt, + converged: converged, + iterations: final_state.iter, + fun_evals: final_state.f_evals + } + end + + # Build initial simplex with n+1 vertices + defnp build_initial_simplex(x0, step) do + {n} = Nx.shape(x0) + simplex_shape = {n + 1, n} + + # Broadcast x0 to all rows + base = Nx.broadcast(x0, simplex_shape) + + # Create identity-like steps for vertices 1 to n + indices = Nx.iota(simplex_shape, axis: 0) + col_indices = Nx.iota(simplex_shape, axis: 1) + + # For row i (i > 0), add step to column i-1 + should_add = Nx.equal(Nx.subtract(indices, 1), col_indices) + + # Step size: use step * |x0[j]| if x0[j] != 0, else step + x0_abs = Nx.abs(x0) + step_sizes = Nx.select(Nx.greater(x0_abs, 1.0e-10), Nx.multiply(step, x0_abs), step) + + # Broadcast step_sizes to simplex shape + step_matrix = Nx.broadcast(step_sizes, simplex_shape) + + # Add steps where appropriate + Nx.select(should_add, Nx.add(base, step_matrix), base) + end + + # Evaluate function at all simplex vertices + defnp evaluate_all_vertices(simplex, fun) do + {n_plus_1, _n} = Nx.shape(simplex) + + # Evaluate first vertex to get type + f0 = fun.(simplex[0]) + init_f = Nx.broadcast(f0, {n_plus_1}) + + {f_values, _} = + while {f_values = init_f, {simplex, i = Nx.u32(0)}}, i < n_plus_1 do + vertex = simplex[i] + f_val = fun.(vertex) + f_values = Nx.indexed_put(f_values, Nx.new_axis(i, 0), f_val) + {f_values, {simplex, i + 1}} + end + + f_values + end + + # Check convergence: std of function values < tol + defnp converged?(state, tol) do + f_values = state.f_values + mean = Nx.mean(f_values) + variance = Nx.mean(Nx.pow(Nx.subtract(f_values, mean), 2)) + std = Nx.sqrt(variance) + Nx.less(std, tol) + end + + # Perform one Nelder-Mead iteration + defnp nelder_mead_step(state, fun) do + simplex = state.simplex + f_values = state.f_values + {n_plus_1, _n} = Nx.shape(simplex) + + # Sort vertices by function value + order = Nx.argsort(f_values) + sorted_simplex = Nx.take(simplex, order, axis: 0) + sorted_f = Nx.take(f_values, order) + + # Best, second-worst, and worst points + f_best = sorted_f[0] + f_second_worst = sorted_f[n_plus_1 - 2] + x_worst = sorted_simplex[n_plus_1 - 1] + f_worst = sorted_f[n_plus_1 - 1] + + # Compute centroid of all points except worst + centroid = Nx.mean(sorted_simplex[0..-2//1], axes: [0]) + + # Reflection: x_r = centroid + rho * (centroid - x_worst) + x_r = Nx.add(centroid, Nx.multiply(@rho, Nx.subtract(centroid, x_worst))) + f_r = fun.(x_r) + + # Expansion: x_e = centroid + chi * (x_r - centroid) + x_e = Nx.add(centroid, Nx.multiply(@chi, Nx.subtract(x_r, centroid))) + f_e = fun.(x_e) + + # Outside contraction: x_oc = centroid + gamma * (x_r - centroid) + x_oc = Nx.add(centroid, Nx.multiply(@gamma, Nx.subtract(x_r, centroid))) + f_oc = fun.(x_oc) + + # Inside contraction: x_ic = centroid - gamma * (centroid - x_worst) + x_ic = Nx.subtract(centroid, Nx.multiply(@gamma, Nx.subtract(centroid, x_worst))) + f_ic = fun.(x_ic) + + # Decide action based on f_r + {new_simplex, new_f_values, extra_evals} = + nelder_mead_update( + sorted_simplex, + sorted_f, + x_r, + f_r, + x_e, + f_e, + x_oc, + f_oc, + x_ic, + f_ic, + f_best, + f_second_worst, + f_worst, + fun + ) + + %{ + state + | simplex: new_simplex, + f_values: new_f_values, + iter: Nx.add(state.iter, 1), + # 4 base evals (f_r, f_e, f_oc, f_ic) + extra for shrink + f_evals: Nx.add(state.f_evals, Nx.add(4, extra_evals)) + } + end + + # Update simplex based on reflection result + defnp nelder_mead_update( + sorted_simplex, + sorted_f, + x_r, + f_r, + x_e, + f_e, + x_oc, + f_oc, + x_ic, + f_ic, + f_best, + f_second_worst, + f_worst, + fun + ) do + {n_plus_1, n} = Nx.shape(sorted_simplex) + + # Case 1: f_r < f_best -> try expansion + # Case 2: f_best <= f_r < f_second_worst -> accept reflection + # Case 3: f_second_worst <= f_r < f_worst -> outside contraction + # Case 4: f_r >= f_worst -> inside contraction + # If contraction fails -> shrink + + # Determine which case applies + case1 = Nx.less(f_r, f_best) + case2 = Nx.logical_and(Nx.greater_equal(f_r, f_best), Nx.less(f_r, f_second_worst)) + case3 = Nx.logical_and(Nx.greater_equal(f_r, f_second_worst), Nx.less(f_r, f_worst)) + case4 = Nx.greater_equal(f_r, f_worst) + + # Case 1: expansion - use x_e if better than x_r, else use x_r + use_expansion = Nx.logical_and(case1, Nx.less(f_e, f_r)) + x_case1 = Nx.select(use_expansion, x_e, x_r) + f_case1 = Nx.select(use_expansion, f_e, f_r) + + # Case 2: accept reflection + x_case2 = x_r + f_case2 = f_r + + # Case 3: outside contraction - use if better than x_r + contraction_success_3 = Nx.less_equal(f_oc, f_r) + + # Case 4: inside contraction - use if better than x_worst + contraction_success_4 = Nx.less(f_ic, f_worst) + + # Determine if we need to shrink + need_shrink = + Nx.logical_or( + Nx.logical_and(case3, Nx.logical_not(contraction_success_3)), + Nx.logical_and(case4, Nx.logical_not(contraction_success_4)) + ) + + # Select point for non-shrink cases + x_new_non_shrink = + Nx.select( + case1, + x_case1, + Nx.select( + case2, + x_case2, + Nx.select(case3, x_oc, x_ic) + ) + ) + + f_new_non_shrink = + Nx.select( + case1, + f_case1, + Nx.select( + case2, + f_case2, + Nx.select(case3, f_oc, f_ic) + ) + ) + + # Replace worst point (last row) with new point + # Use indexed_put with proper indices + worst_idx = Nx.u32(n_plus_1 - 1) + row_indices = Nx.broadcast(worst_idx, {n}) + col_indices = Nx.iota({n}, type: :u32) + indices = Nx.stack([row_indices, col_indices], axis: 1) + + new_simplex_replace = Nx.indexed_put(sorted_simplex, indices, x_new_non_shrink) + new_f_replace = Nx.indexed_put(sorted_f, Nx.new_axis(worst_idx, 0), f_new_non_shrink) + + # Shrink simplex: x_i = x_best + sigma * (x_i - x_best) + x_best = sorted_simplex[0] + + shrunk_simplex = + Nx.add(x_best, Nx.multiply(@sigma, Nx.subtract(sorted_simplex, x_best))) + + # Evaluate shrunk simplex (except x_best which stays the same) + shrunk_f = evaluate_shrunk_vertices(shrunk_simplex, sorted_f, fun) + + # Select between shrink and non-shrink + final_simplex = Nx.select(need_shrink, shrunk_simplex, new_simplex_replace) + final_f = Nx.select(need_shrink, shrunk_f, new_f_replace) + + # Extra function evaluations for shrink: n evals (all except best) + extra_evals = Nx.select(need_shrink, Nx.u32(n), Nx.u32(0)) + + {final_simplex, final_f, extra_evals} + end + + # Evaluate shrunk simplex vertices (skip first which is unchanged) + defnp evaluate_shrunk_vertices(shrunk_simplex, original_f, fun) do + {n_plus_1, _n} = Nx.shape(shrunk_simplex) + + {shrunk_f, _} = + while {shrunk_f = original_f, {shrunk_simplex, i = Nx.u32(1)}}, i < n_plus_1 do + vertex = shrunk_simplex[i] + f_val = fun.(vertex) + shrunk_f = Nx.indexed_put(shrunk_f, Nx.new_axis(i, 0), f_val) + {shrunk_f, {shrunk_simplex, i + 1}} + end + + shrunk_f + end +end diff --git a/notebooks/efficient_frontier.livemd b/notebooks/efficient_frontier.livemd new file mode 100644 index 00000000..9cfe117b --- /dev/null +++ b/notebooks/efficient_frontier.livemd @@ -0,0 +1,298 @@ +# Portfolio Optimization: Efficient Frontier + +```elixir +Mix.install([ + {:scholar, path: "."}, + {:nx, "~> 0.9"}, + {:kino_vega_lite, "~> 0.1"}, + {:req, "~> 0.5"} +]) +``` + +## Introduction + +This notebook demonstrates portfolio optimization using Scholar's multivariate optimization algorithms. We'll compute the efficient frontier for a portfolio of stocks using the Markowitz mean-variance framework. + +The efficient frontier represents the set of portfolios that offer the highest expected return for a given level of risk, or equivalently, the lowest risk for a given expected return. + +## Fetch Stock Data + +We'll use historical stock data for a few major tech companies. For simplicity, we'll use pre-computed monthly returns. + +```elixir +# Sample monthly returns data for 5 stocks (12 months) +# In practice, you would fetch this from Yahoo Finance or another data source +stock_names = ["AAPL", "GOOGL", "MSFT", "AMZN", "META"] + +# Monthly returns (12 months x 5 stocks) - simulated realistic data +returns = Nx.tensor([ + [0.05, 0.03, 0.04, 0.06, 0.02], + [-0.02, -0.01, 0.01, -0.03, -0.04], + [0.08, 0.06, 0.07, 0.09, 0.05], + [0.03, 0.02, 0.03, 0.04, 0.01], + [-0.01, 0.02, 0.01, -0.02, 0.03], + [0.04, 0.03, 0.05, 0.03, 0.04], + [0.02, 0.01, 0.02, 0.05, 0.02], + [-0.03, -0.02, -0.01, -0.04, -0.01], + [0.06, 0.04, 0.05, 0.07, 0.03], + [0.01, 0.02, 0.01, 0.02, 0.01], + [0.04, 0.03, 0.04, 0.05, 0.02], + [-0.01, 0.01, 0.00, -0.01, 0.02] +], type: :f64) + +IO.puts("Returns matrix shape: #{inspect(Nx.shape(returns))}") +IO.puts("Stocks: #{inspect(stock_names)}") +``` + +## Compute Expected Returns and Covariance + +```elixir +# Expected returns (mean of historical returns) +expected_returns = Nx.mean(returns, axes: [0]) + +# Covariance matrix +n_samples = Nx.axis_size(returns, 0) +centered = Nx.subtract(returns, expected_returns) +covariance = Nx.divide(Nx.dot(Nx.transpose(centered), centered), n_samples - 1) + +IO.puts("Expected Monthly Returns:") +Enum.zip(stock_names, Nx.to_flat_list(expected_returns)) +|> Enum.each(fn {name, ret} -> + IO.puts(" #{name}: #{Float.round(ret * 100, 2)}%") +end) + +IO.puts("\nCovariance Matrix:") +covariance +``` + +## Portfolio Optimization Problem + +We want to find portfolio weights $w$ that minimize risk (variance) for a given target return, or equivalently, maximize the Sharpe ratio. + +The optimization problem is: +- **Minimize:** $w^T \Sigma w$ (portfolio variance) +- **Subject to:** $w^T \mu = r_{target}$ (target return) and $\sum w_i = 1$ (weights sum to 1) + +For the efficient frontier, we solve this for multiple target returns. + +### Unconstrained Formulation + +To handle constraints with unconstrained optimizers, we use a penalty method: + +$$\text{minimize } w^T \Sigma w - \lambda \cdot w^T \mu + \rho \cdot (1 - \sum w_i)^2$$ + +where $\lambda$ controls the risk-return tradeoff and $\rho$ is a penalty parameter. + +```elixir +alias Scholar.Optimize.BFGS + +defmodule Portfolio do + import Nx.Defn + + @doc """ + Portfolio objective function. + + Minimizes variance while penalizing deviation from target return + and ensuring weights sum to 1. + """ + def objective(cov, mu, lambda, penalty \\ 1000.0) do + fn w -> + # Portfolio variance: w' * Sigma * w + variance = Nx.dot(w, Nx.dot(cov, w)) + + # Portfolio return: w' * mu + portfolio_return = Nx.dot(w, mu) + + # Constraint: weights sum to 1 + sum_constraint = Nx.pow(Nx.subtract(Nx.sum(w), 1), 2) + + # Objective: variance - lambda * return + penalty * constraint + Nx.add( + Nx.subtract(variance, Nx.multiply(lambda, portfolio_return)), + Nx.multiply(penalty, sum_constraint) + ) + end + end + + @doc """ + Compute portfolio variance and return from weights. + """ + def portfolio_stats(w, cov, mu) do + variance = Nx.dot(w, Nx.dot(cov, w)) |> Nx.to_number() + portfolio_return = Nx.dot(w, mu) |> Nx.to_number() + {portfolio_return, :math.sqrt(variance)} + end +end + +IO.puts("Portfolio module defined.") +``` + +## Compute the Efficient Frontier + +We'll compute optimal portfolios for different values of $\lambda$ (risk-return tradeoff): + +```elixir +# Range of lambda values (higher = more return-seeking) +lambdas = [0.0, 0.5, 1.0, 2.0, 3.0, 5.0, 8.0, 12.0, 18.0, 25.0, 35.0, 50.0] + +# Initial weights (equal weighted) +n_assets = Nx.axis_size(expected_returns, 0) +w0 = Nx.broadcast(Nx.divide(1.0, n_assets), {n_assets}) |> Nx.as_type(:f64) + +# Compute optimal portfolios for each lambda +efficient_portfolios = Enum.map(lambdas, fn lambda -> + objective = Portfolio.objective(covariance, expected_returns, lambda) + + result = BFGS.minimize(w0, objective, gtol: 1.0e-8, maxiter: 500) + + # Normalize weights to sum to 1 + weights = Nx.divide(result.x, Nx.sum(result.x)) + + {ret, risk} = Portfolio.portfolio_stats(weights, covariance, expected_returns) + + %{ + lambda: lambda, + weights: Nx.to_list(weights), + return: ret * 12, # Annualized + risk: risk * :math.sqrt(12), # Annualized + converged: Nx.to_number(result.converged) == 1 + } +end) + +IO.puts("Efficient Frontier Portfolios:") +IO.puts("") +IO.puts("Lambda | Ann. Return | Ann. Risk | Converged") +IO.puts("-------|-------------|-----------|----------") + +Enum.each(efficient_portfolios, fn p -> + ret_pct = Float.round(p.return * 100, 2) + risk_pct = Float.round(p.risk * 100, 2) + IO.puts("#{String.pad_leading("#{p.lambda}", 6)} | #{String.pad_leading("#{ret_pct}%", 11)} | #{String.pad_leading("#{risk_pct}%", 9)} | #{p.converged}") +end) +``` + +## Visualize the Efficient Frontier + +```elixir +alias VegaLite, as: Vl + +# Prepare data for plotting +frontier_data = Enum.map(efficient_portfolios, fn p -> + %{ + "Risk (%)" => Float.round(p.risk * 100, 2), + "Return (%)" => Float.round(p.return * 100, 2), + "Lambda" => p.lambda + } +end) + +Vl.new(width: 600, height: 400, title: "Efficient Frontier") +|> Vl.data_from_values(frontier_data) +|> Vl.mark(:line, point: true) +|> Vl.encode_field(:x, "Risk (%)", type: :quantitative, title: "Annualized Risk (%)") +|> Vl.encode_field(:y, "Return (%)", type: :quantitative, title: "Annualized Return (%)") +|> Vl.encode_field(:tooltip, "Lambda", type: :quantitative) +``` + +## Optimal Portfolio Weights + +Let's examine the weights for a balanced portfolio (middle of the frontier): + +```elixir +# Select a balanced portfolio (lambda = 5.0) +balanced = Enum.find(efficient_portfolios, fn p -> p.lambda == 5.0 end) + +IO.puts("Balanced Portfolio (λ = 5.0):") +IO.puts(" Annualized Return: #{Float.round(balanced.return * 100, 2)}%") +IO.puts(" Annualized Risk: #{Float.round(balanced.risk * 100, 2)}%") +IO.puts("") +IO.puts("Weights:") + +Enum.zip(stock_names, balanced.weights) +|> Enum.each(fn {name, weight} -> + pct = Float.round(weight * 100, 1) + bar = String.duplicate("█", round(pct / 2)) + IO.puts(" #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)} #{bar}") +end) +``` + +## Compare BFGS vs Nelder-Mead + +Let's compare the two optimization methods on portfolio optimization: + +```elixir +alias Scholar.Optimize.{BFGS, NelderMead} + +lambda = 5.0 +objective = Portfolio.objective(covariance, expected_returns, lambda) + +# BFGS +bfgs_result = BFGS.minimize(w0, objective, gtol: 1.0e-8, maxiter: 500) +bfgs_weights = Nx.divide(bfgs_result.x, Nx.sum(bfgs_result.x)) +{bfgs_ret, bfgs_risk} = Portfolio.portfolio_stats(bfgs_weights, covariance, expected_returns) + +# Nelder-Mead +nm_result = NelderMead.minimize(w0, objective, tol: 1.0e-10, maxiter: 1000) +nm_weights = Nx.divide(nm_result.x, Nx.sum(nm_result.x)) +{nm_ret, nm_risk} = Portfolio.portfolio_stats(nm_weights, covariance, expected_returns) + +IO.puts("Portfolio Optimization Comparison (λ = #{lambda}):") +IO.puts("") +IO.puts("Method | Return (Ann) | Risk (Ann) | Fun Evals | Converged") +IO.puts("-------------|--------------|------------|-----------|----------") +IO.puts("BFGS | #{String.pad_leading("#{Float.round(bfgs_ret * 1200, 2)}%", 12)} | #{String.pad_leading("#{Float.round(bfgs_risk * 100 * :math.sqrt(12), 2)}%", 10)} | #{String.pad_leading("#{Nx.to_number(bfgs_result.fun_evals)}", 9)} | #{Nx.to_number(bfgs_result.converged) == 1}") +IO.puts("Nelder-Mead | #{String.pad_leading("#{Float.round(nm_ret * 1200, 2)}%", 12)} | #{String.pad_leading("#{Float.round(nm_risk * 100 * :math.sqrt(12), 2)}%", 10)} | #{String.pad_leading("#{Nx.to_number(nm_result.fun_evals)}", 9)} | #{Nx.to_number(nm_result.converged) == 1}") +``` + +## Minimum Variance Portfolio + +The minimum variance portfolio is found by setting $\lambda = 0$: + +```elixir +min_var = Enum.find(efficient_portfolios, fn p -> p.lambda == 0.0 end) + +IO.puts("Minimum Variance Portfolio:") +IO.puts(" Annualized Return: #{Float.round(min_var.return * 100, 2)}%") +IO.puts(" Annualized Risk: #{Float.round(min_var.risk * 100, 2)}%") +IO.puts("") +IO.puts("Weights:") + +Enum.zip(stock_names, min_var.weights) +|> Enum.each(fn {name, weight} -> + pct = Float.round(weight * 100, 1) + IO.puts(" #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)}") +end) +``` + +## Maximum Return Portfolio + +The maximum return portfolio is found with a high $\lambda$ value: + +```elixir +max_ret = Enum.max_by(efficient_portfolios, fn p -> p.return end) + +IO.puts("Maximum Return Portfolio (λ = #{max_ret.lambda}):") +IO.puts(" Annualized Return: #{Float.round(max_ret.return * 100, 2)}%") +IO.puts(" Annualized Risk: #{Float.round(max_ret.risk * 100, 2)}%") +IO.puts("") +IO.puts("Weights:") + +Enum.zip(stock_names, max_ret.weights) +|> Enum.each(fn {name, weight} -> + pct = Float.round(weight * 100, 1) + IO.puts(" #{String.pad_trailing(name, 5)}: #{String.pad_leading("#{pct}%", 6)}") +end) +``` + +## Summary + +This notebook demonstrated: + +1. **Portfolio optimization** using Markowitz mean-variance framework +2. **Efficient frontier** computation using BFGS optimization +3. **Penalty methods** to handle equality constraints with unconstrained optimizers +4. **Comparison** between BFGS and Nelder-Mead for portfolio optimization + +Scholar's optimization algorithms are well-suited for financial applications: +- **BFGS** provides fast convergence for smooth objectives +- **Nelder-Mead** works when gradients are unavailable or noisy +- Both are JIT-compatible for high performance diff --git a/notebooks/optimize.livemd b/notebooks/optimize.livemd index 167053b8..52f5b0d9 100644 --- a/notebooks/optimize.livemd +++ b/notebooks/optimize.livemd @@ -1,4 +1,4 @@ -# Scalar Optimization with Scholar.Optimize +# Optimization with Scholar.Optimize ```elixir Mix.install([ @@ -10,6 +10,18 @@ Mix.install([ ## Introduction +Scholar provides optimization algorithms for both scalar (univariate) and multivariate functions: + +**Scalar Optimization:** +- **`Scholar.Optimize.Brent`** - Brent's method (recommended for 1D) +- **`Scholar.Optimize.GoldenSection`** - Golden Section search + +**Multivariate Optimization:** +- **`Scholar.Optimize.BFGS`** - Quasi-Newton method with gradients (recommended) +- **`Scholar.Optimize.NelderMead`** - Derivative-free simplex method + +## Scalar Optimization + Scholar provides two algorithms for scalar (univariate) function minimization: - **`Scholar.Optimize.Brent`** - Brent's method (recommended) @@ -283,3 +295,120 @@ Both methods: - Require no derivatives - Are JIT/GPU compatible - Guarantee convergence within the specified bracket + +## Multivariate Optimization + +For functions of multiple variables, Scholar provides two algorithms: + +### BFGS (Recommended) + +BFGS (Broyden-Fletcher-Goldfarb-Shanno) is a quasi-Newton method that uses automatic differentiation to compute gradients. It's the recommended choice for smooth, differentiable functions. + +```elixir +alias Scholar.Optimize.BFGS + +# Minimize a simple quadratic: f(x) = x1^2 + x2^2 +fun = fn x -> Nx.sum(Nx.pow(x, 2)) end +x0 = Nx.tensor([1.0, 2.0]) + +result = BFGS.minimize(x0, fun) + +IO.puts("BFGS Result:") +IO.puts(" x = #{inspect(Nx.to_list(result.x))}") +IO.puts(" f(x) = #{Nx.to_number(result.fun)}") +IO.puts(" Converged: #{Nx.to_number(result.converged) == 1}") +IO.puts(" Iterations: #{Nx.to_number(result.iterations)}") +IO.puts(" Function evals: #{Nx.to_number(result.fun_evals)}") +IO.puts(" Gradient evals: #{Nx.to_number(result.grad_evals)}") +``` + +### Nelder-Mead + +Nelder-Mead is a derivative-free method (simplex algorithm). Use it when gradients are unavailable or expensive to compute. + +```elixir +alias Scholar.Optimize.NelderMead + +# Same function, but without using gradients +fun = fn x -> Nx.sum(Nx.pow(x, 2)) end +x0 = Nx.tensor([1.0, 2.0]) + +result = NelderMead.minimize(x0, fun) + +IO.puts("Nelder-Mead Result:") +IO.puts(" x = #{inspect(Nx.to_list(result.x))}") +IO.puts(" f(x) = #{Nx.to_number(result.fun)}") +IO.puts(" Converged: #{Nx.to_number(result.converged) == 1}") +IO.puts(" Iterations: #{Nx.to_number(result.iterations)}") +IO.puts(" Function evals: #{Nx.to_number(result.fun_evals)}") +``` + +### The Rosenbrock Function + +The Rosenbrock function is a classic optimization test case. It has a global minimum at $(1, 1)$ inside a narrow, curved valley: + +$$f(x, y) = (1 - x)^2 + 100(y - x^2)^2$$ + +```elixir +alias Scholar.Optimize.{BFGS, NelderMead} + +# Rosenbrock function +rosenbrock = fn x -> + x0 = x[0] + x1 = x[1] + term1 = Nx.pow(Nx.subtract(1, x0), 2) + term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2)) + Nx.add(term1, term2) +end + +x0 = Nx.tensor([0.0, 0.0], type: :f64) + +# Solve with BFGS +bfgs_result = BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000) + +# Solve with Nelder-Mead +nm_result = NelderMead.minimize(x0, rosenbrock, tol: 1.0e-10, maxiter: 2000) + +IO.puts("Rosenbrock minimum at (1, 1):") +IO.puts("") +IO.puts("BFGS:") +IO.puts(" x = #{inspect(Nx.to_list(bfgs_result.x))}") +IO.puts(" Function evals: #{Nx.to_number(bfgs_result.fun_evals)}") +IO.puts("") +IO.puts("Nelder-Mead:") +IO.puts(" x = #{inspect(Nx.to_list(nm_result.x))}") +IO.puts(" Function evals: #{Nx.to_number(nm_result.fun_evals)}") +``` + +### BFGS vs Nelder-Mead + +BFGS typically converges much faster for smooth functions because it uses gradient information: + +```elixir +alias Scholar.Optimize.{BFGS, NelderMead} + +# 3D sphere function +fun = fn x -> Nx.sum(Nx.pow(x, 2)) end +x0 = Nx.tensor([1.0, 2.0, 3.0]) + +bfgs_result = BFGS.minimize(x0, fun, gtol: 1.0e-6) +nm_result = NelderMead.minimize(x0, fun, tol: 1.0e-6) + +IO.puts("3D Sphere function minimization:") +IO.puts(" BFGS: #{Nx.to_number(bfgs_result.fun_evals)} function evals") +IO.puts(" Nelder-Mead: #{Nx.to_number(nm_result.fun_evals)} function evals") +IO.puts("") +IO.puts("BFGS speedup: #{Float.round(Nx.to_number(nm_result.fun_evals) / Nx.to_number(bfgs_result.fun_evals), 1)}x fewer evaluations") +``` + +### When to Use Which Method + +| Method | Use Case | +|--------|----------| +| **BFGS** | Default choice for smooth, differentiable functions | +| **Nelder-Mead** | Non-differentiable functions, noisy functions, or when gradients are expensive | + +Both multivariate methods: +- Work with functions of any number of variables +- Are JIT/GPU compatible +- Return similar result structs with `x`, `fun`, `converged`, `iterations`, `fun_evals` diff --git a/test/scholar/optimize/bfgs_test.exs b/test/scholar/optimize/bfgs_test.exs new file mode 100644 index 00000000..98b90cb6 --- /dev/null +++ b/test/scholar/optimize/bfgs_test.exs @@ -0,0 +1,178 @@ +defmodule Scholar.Optimize.BFGSTest do + use Scholar.Case, async: true + alias Scholar.Optimize.BFGS + doctest BFGS + + describe "minimize/3" do + test "minimizes sphere function" do + # f(x) = sum(x^2), minimum at origin + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0, 3.0]) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-6, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0]), atol: 1.0e-4) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8) + end + + test "minimizes Rosenbrock function" do + # f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1) + rosenbrock = fn x -> + x0 = x[0] + x1 = x[1] + term1 = Nx.pow(Nx.subtract(1, x0), 2) + term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2)) + Nx.add(term1, term2) + end + + x0 = Nx.tensor([0.0, 0.0], type: :f64) + + result = BFGS.minimize(x0, rosenbrock, gtol: 1.0e-8, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-4) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-6) + end + + test "minimizes Booth function" do + # f(x,y) = (x + 2y - 7)^2 + (2x + y - 5)^2, minimum at (1, 3) + booth = fn x -> + x0 = x[0] + x1 = x[1] + # (x + 2y - 7)^2 + term1 = Nx.pow(Nx.subtract(Nx.add(x0, Nx.multiply(2, x1)), 7), 2) + # (2x + y - 5)^2 + term2 = Nx.pow(Nx.subtract(Nx.add(Nx.multiply(2, x0), x1), 5), 2) + Nx.add(term1, term2) + end + + x0 = Nx.tensor([0.0, 0.0]) + + result = BFGS.minimize(x0, booth, gtol: 1.0e-6, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([1.0, 3.0]), atol: 1.0e-4) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8) + end + + test "minimizes Beale function" do + # f(x,y) = (1.5 - x + xy)^2 + (2.25 - x + xy^2)^2 + (2.625 - x + xy^3)^2 + # minimum at (3, 0.5) + beale = fn x -> + x0 = x[0] + x1 = x[1] + + # 1.5 - x + xy + t1 = Nx.add(Nx.subtract(1.5, x0), Nx.multiply(x0, x1)) + term1 = Nx.pow(t1, 2) + + # 2.25 - x + xy^2 + t2 = Nx.add(Nx.subtract(2.25, x0), Nx.multiply(x0, Nx.pow(x1, 2))) + term2 = Nx.pow(t2, 2) + + # 2.625 - x + xy^3 + t3 = Nx.add(Nx.subtract(2.625, x0), Nx.multiply(x0, Nx.pow(x1, 3))) + term3 = Nx.pow(t3, 2) + + Nx.add(Nx.add(term1, term2), term3) + end + + x0 = Nx.tensor([0.0, 0.0], type: :f64) + + result = BFGS.minimize(x0, beale, gtol: 1.0e-8, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([3.0, 0.5]), atol: 1.0e-3) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-6) + end + + test "minimizes shifted quadratic" do + # f(x) = (x1 - 2)^2 + (x2 + 3)^2 + (x3 - 1)^2, minimum at (2, -3, 1) + fun = fn x -> + x0 = x[0] + x1 = x[1] + x2 = x[2] + term1 = Nx.pow(Nx.subtract(x0, 2), 2) + term2 = Nx.pow(Nx.add(x1, 3), 2) + term3 = Nx.pow(Nx.subtract(x2, 1), 2) + Nx.add(Nx.add(term1, term2), term3) + end + + x0 = Nx.tensor([0.0, 0.0, 0.0]) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-6, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([2.0, -3.0, 1.0]), atol: 1.0e-4) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-8) + end + + test "respects maxiter limit" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([100.0, 100.0]) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-15, maxiter: 5) + + assert Nx.to_number(result.iterations) <= 5 + end + + test "works with jit_apply" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0]) + opts = [gtol: 1.0e-6, maxiter: 500] + + result = Nx.Defn.jit_apply(&BFGS.minimize/3, [x0, fun, opts]) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) + end + + test "returns correct struct" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0]) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-5, maxiter: 500) + + assert %Scholar.Optimize.BFGS{} = result + assert Map.has_key?(result, :grad_evals) + end + + test "achieves higher precision with f64" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0], type: :f64) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-12, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-10) + end + + test "handles higher dimensions" do + # 5-dimensional sphere function + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0, 3.0, 4.0, 5.0]) + + result = BFGS.minimize(x0, fun, gtol: 1.0e-5, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0, 0.0, 0.0]), atol: 1.0e-3) + end + + test "converges faster than Nelder-Mead on smooth functions" do + # BFGS should use significantly fewer function evaluations + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0, 3.0]) + + bfgs_result = BFGS.minimize(x0, fun, gtol: 1.0e-6) + nm_result = Scholar.Optimize.NelderMead.minimize(x0, fun, tol: 1.0e-6) + + # Both should converge + assert Nx.to_number(bfgs_result.converged) == 1 + assert Nx.to_number(nm_result.converged) == 1 + + # BFGS should use fewer function evaluations for this smooth problem + assert Nx.to_number(bfgs_result.fun_evals) < Nx.to_number(nm_result.fun_evals) + end + end +end diff --git a/test/scholar/optimize/nelder_mead_test.exs b/test/scholar/optimize/nelder_mead_test.exs new file mode 100644 index 00000000..f3b8cd6e --- /dev/null +++ b/test/scholar/optimize/nelder_mead_test.exs @@ -0,0 +1,171 @@ +defmodule Scholar.Optimize.NelderMeadTest do + use Scholar.Case, async: true + alias Scholar.Optimize.NelderMead + doctest NelderMead + + describe "minimize/3" do + test "minimizes sphere function" do + # f(x) = sum(x^2), minimum at origin + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0, 3.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-6, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0]), atol: 1.0e-2) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4) + end + + test "minimizes Rosenbrock function" do + # f(x,y) = (1-x)^2 + 100*(y-x^2)^2, minimum at (1, 1) + rosenbrock = fn x -> + x0 = x[0] + x1 = x[1] + term1 = Nx.pow(Nx.subtract(1, x0), 2) + term2 = Nx.multiply(100, Nx.pow(Nx.subtract(x1, Nx.pow(x0, 2)), 2)) + Nx.add(term1, term2) + end + + x0 = Nx.tensor([0.0, 0.0], type: :f64) + + result = NelderMead.minimize(x0, rosenbrock, tol: 1.0e-10, maxiter: 2000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([1.0, 1.0]), atol: 1.0e-3) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4) + end + + test "minimizes Booth function" do + # f(x,y) = (x + 2y - 7)^2 + (2x + y - 5)^2, minimum at (1, 3) + booth = fn x -> + x0 = x[0] + x1 = x[1] + # (x + 2y - 7)^2 + term1 = Nx.pow(Nx.subtract(Nx.add(x0, Nx.multiply(2, x1)), 7), 2) + # (2x + y - 5)^2 + term2 = Nx.pow(Nx.subtract(Nx.add(Nx.multiply(2, x0), x1), 5), 2) + Nx.add(term1, term2) + end + + x0 = Nx.tensor([0.0, 0.0]) + + result = NelderMead.minimize(x0, booth, tol: 1.0e-8, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([1.0, 3.0]), atol: 1.0e-2) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4) + end + + test "minimizes Beale function" do + # f(x,y) = (1.5 - x + xy)^2 + (2.25 - x + xy^2)^2 + (2.625 - x + xy^3)^2 + # minimum at (3, 0.5) + beale = fn x -> + x0 = x[0] + x1 = x[1] + + # 1.5 - x + xy + t1 = Nx.add(Nx.subtract(1.5, x0), Nx.multiply(x0, x1)) + term1 = Nx.pow(t1, 2) + + # 2.25 - x + xy^2 + t2 = Nx.add(Nx.subtract(2.25, x0), Nx.multiply(x0, Nx.pow(x1, 2))) + term2 = Nx.pow(t2, 2) + + # 2.625 - x + xy^3 + t3 = Nx.add(Nx.subtract(2.625, x0), Nx.multiply(x0, Nx.pow(x1, 3))) + term3 = Nx.pow(t3, 2) + + Nx.add(Nx.add(term1, term2), term3) + end + + x0 = Nx.tensor([0.0, 0.0], type: :f64) + + result = NelderMead.minimize(x0, beale, tol: 1.0e-10, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([3.0, 0.5]), atol: 1.0e-2) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4) + end + + test "minimizes shifted quadratic" do + # f(x) = (x1 - 2)^2 + (x2 + 3)^2 + (x3 - 1)^2, minimum at (2, -3, 1) + fun = fn x -> + x0 = x[0] + x1 = x[1] + x2 = x[2] + term1 = Nx.pow(Nx.subtract(x0, 2), 2) + term2 = Nx.pow(Nx.add(x1, 3), 2) + term3 = Nx.pow(Nx.subtract(x2, 1), 2) + Nx.add(Nx.add(term1, term2), term3) + end + + x0 = Nx.tensor([0.0, 0.0, 0.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-6, maxiter: 500) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([2.0, -3.0, 1.0]), atol: 1.0e-2) + assert_all_close(result.fun, Nx.tensor(0.0), atol: 1.0e-4) + end + + test "respects maxiter limit" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([100.0, 100.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-10, maxiter: 5) + + assert Nx.to_number(result.iterations) <= 5 + end + + test "works with jit_apply" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0]) + opts = [tol: 1.0e-6, maxiter: 500] + + result = Nx.Defn.jit_apply(&NelderMead.minimize/3, [x0, fun, opts]) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2) + end + + test "returns correct struct" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-5, maxiter: 500) + + assert %Scholar.Optimize.NelderMead{} = result + end + + test "achieves higher precision with f64" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0], type: :f64) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-12, maxiter: 1000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-4) + end + + test "handles higher dimensions" do + # 5-dimensional sphere function + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0, 3.0, 4.0, 5.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-5, maxiter: 2000) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0, 0.0, 0.0, 0.0]), atol: 0.1) + end + + test "custom initial_simplex_step" do + fun = fn x -> Nx.sum(Nx.pow(x, 2)) end + x0 = Nx.tensor([1.0, 2.0]) + + result = NelderMead.minimize(x0, fun, tol: 1.0e-6, initial_simplex_step: 0.1) + + assert Nx.to_number(result.converged) == 1 + assert_all_close(result.x, Nx.tensor([0.0, 0.0]), atol: 1.0e-2) + end + end +end