Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 161 additions & 82 deletions lib/scholar/linear/logistic_regression.ex
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
defmodule Scholar.Linear.LogisticRegression do
@moduledoc """
Logistic regression in both binary and multinomial variants.
Multiclass logistic regression.

Time complexity is $O(N * K * I)$ where $N$ is the number of samples, $K$ is the number of features, and $I$ is the number of iterations.
"""
import Nx.Defn
import Scholar.Shared
alias Scholar.Linear.LinearHelpers

@derive {Nx.Container, containers: [:coefficients, :bias]}
defstruct [:coefficients, :bias]
Expand All @@ -15,35 +14,44 @@ defmodule Scholar.Linear.LogisticRegression do
num_classes: [
required: true,
type: :pos_integer,
doc: "number of classes contained in the input tensors."
doc: "Number of output classes."
],
iterations: [
max_iterations: [
type: :pos_integer,
default: 1000,
doc: "Maximum number of gradient descent iterations to perform."
],
optimizer: [
type: {:custom, Scholar.Options, :optimizer, []},
default: :sgd,
doc: """
number of iterations of gradient descent performed inside logistic
regression.
Optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details).
"""
],
learning_loop_unroll: [
type: :boolean,
default: false,
doc: ~S"""
If `true`, the learning loop is unrolled.
alpha: [
type: {:custom, Scholar.Options, :non_negative_number, []},
default: 1.0,
doc: """
Constant that multiplies the regularization term, controlling regularization strength.
If 0, no regularization is applied.
"""
],
optimizer: [
type: {:custom, Scholar.Options, :optimizer, []},
default: :sgd,
l1_ratio: [
type: {:custom, Scholar.Options, :non_negative_number, []},
default: 0.0,
doc: """
The optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details).
The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`.
Setting `l1_ratio` to 0 gives pure L2 regularization, and setting it to 1 gives pure L1 regularization.
For values between 0 and 1, a penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2` is used.
"""
],
eps: [
type: :float,
default: 1.0e-8,
doc:
"The convergence tolerance. If the `abs(loss) < size(x) * :eps`, the algorithm is considered to have converged."
tol: [
type: {:custom, Scholar.Options, :non_negative_number, []},
default: 1.0e-4,
doc: """
Convergence tolerance. If the infinity norm of the gradient is less than `:tol`,
the algorithm is considered to have converged.
"""
]
]

Expand All @@ -53,9 +61,6 @@ defmodule Scholar.Linear.LogisticRegression do
Fits a logistic regression model for sample inputs `x` and sample
targets `y`.

Depending on number of classes the function chooses either binary
or multinomial logistic regression.

## Options

#{NimbleOptions.docs(@opts_schema)}
Expand All @@ -68,10 +73,6 @@ defmodule Scholar.Linear.LogisticRegression do

* `:bias` - Bias added to the decision function.

* `:mode` - Indicates whether the problem is binary classification (`:num_classes` set to 2)
or multinomial (`:num_classes` is bigger than 2). For binary classification set to `:binary`, otherwise
set to `:multinomial`.

## Examples

iex> x = Nx.tensor([[1.0, 2.0], [3.0, 2.0], [4.0, 7.0]])
Expand All @@ -80,26 +81,41 @@ defmodule Scholar.Linear.LogisticRegression do
%Scholar.Linear.LogisticRegression{
coefficients: Nx.tensor(
[
[2.5531527996063232, -0.5531544089317322],
[-0.35652396082878113, 2.3565237522125244]
[0.09002052247524261, -0.09002052992582321],
[-0.1521512120962143, 0.1521512120962143]
]
),
bias: Nx.tensor(
[-0.28847914934158325, 0.28847917914390564]
)
bias: Nx.tensor([-0.05300388112664223, 0.053003907203674316])
}
"""
deftransform fit(x, y, opts \\ []) do
if Nx.rank(x) != 2 do
raise ArgumentError,
"expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}"
"expected x to have shape {num_samples, num_features}, got tensor with shape: #{inspect(Nx.shape(x))}"
end

if Nx.rank(y) != 1 do
raise ArgumentError,
"expected y to have shape {num_samples}, got tensor with shape: #{inspect(Nx.shape(y))}"
end

{n_samples, _} = Nx.shape(x)
y = LinearHelpers.validate_y_shape(y, n_samples, __MODULE__)
{num_samples, num_features} = Nx.shape(x)

if Nx.axis_size(y, 0) != num_samples do
raise ArgumentError,
"expected x and y to have the same number of samples, got #{num_samples} and #{Nx.axis_size(y, 0)}"
end

opts = NimbleOptions.validate!(opts, @opts_schema)

{l1_ratio, opts} = Keyword.pop!(opts, :l1_ratio)

unless l1_ratio >= 0.0 and l1_ratio <= 1.0 do
raise ArgumentError,
"expected l1_ratio to be between 0 and 1, got: #{inspect(l1_ratio)}"
end

type = to_float_type(x)
{optimizer, opts} = Keyword.pop!(opts, :optimizer)

{optimizer_init_fn, optimizer_update_fn} =
Expand All @@ -108,23 +124,39 @@ defmodule Scholar.Linear.LogisticRegression do
{f1, f2} -> {f1, f2}
end

n = Nx.axis_size(x, -1)
num_classes = opts[:num_classes]

coef =
w =
Nx.broadcast(
Nx.tensor(1.0, type: to_float_type(x)),
{n, num_classes}
Nx.tensor(0.0, type: type),
{num_features, num_classes}
)

bias = Nx.broadcast(Nx.tensor(0, type: to_float_type(x)), {num_classes})
b = Nx.broadcast(Nx.tensor(0.0, type: type), {num_classes})

w_optimizer_state = optimizer_init_fn.(w) |> as_type(type)
b_optimizer_state = optimizer_init_fn.(b) |> as_type(type)

coef_optimizer_state = optimizer_init_fn.(coef) |> as_type(to_float_type(x))
bias_optimizer_state = optimizer_init_fn.(bias) |> as_type(to_float_type(x))
{alpha, opts} = Keyword.pop!(opts, :alpha)
{tol, opts} = Keyword.pop!(opts, :tol)
alpha = Nx.tensor(alpha, type: type)
l1_ratio = Nx.tensor(l1_ratio, type: type)
tol = Nx.tensor(tol, type: type)

opts = Keyword.put(opts, :optimizer_update_fn, optimizer_update_fn)

fit_n(x, y, coef, bias, coef_optimizer_state, bias_optimizer_state, opts)
fit_n(
x,
y,
w,
b,
alpha,
l1_ratio,
tol,
w_optimizer_state,
b_optimizer_state,
opts
)
end

deftransformp as_type(container, target_type) do
Expand All @@ -139,11 +171,20 @@ defmodule Scholar.Linear.LogisticRegression do
end)
end

# Logistic Regression training loop

defnp fit_n(x, y, coef, bias, coef_optimizer_state, bias_optimizer_state, opts) do
defnp fit_n(
x,
y,
w,
b,
alpha,
l1_ratio,
tol,
w_optimizer_state,
b_optimizer_state,
opts
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, when you have too many arguments like this, you can wrap the arguments that don't change (are fixed in the loop) in a map and pass it down. Helps clean up while loops and Elixir signatures!

Copy link
Copy Markdown
Member Author

@krstopro krstopro Jan 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works for tensors as well, right?
Apart from that, CI is failing. I am working on it.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I am not sure I understood. 😅

Would having a while loop as {result, state} = while ... where state is a map work? Is this something new in Nx?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it works. It has been supported for quite a while, I think. :)

) do
num_samples = Nx.axis_size(x, 0)
iterations = opts[:iterations]
max_iterations = opts[:max_iterations]
num_classes = opts[:num_classes]
optimizer_update_fn = opts[:optimizer_update_fn]

Expand All @@ -153,40 +194,76 @@ defmodule Scholar.Linear.LogisticRegression do
|> Nx.broadcast({num_samples, num_classes})
|> Nx.equal(Nx.iota({num_samples, num_classes}, axis: 1))

{{final_coef, final_bias}, _} =
while {{coef, bias},
{x, iterations, y_one_hot, coef_optimizer_state, bias_optimizer_state,
has_converged = Nx.u8(0), iter = 0}},
iter < iterations and not has_converged do
{loss, {coef_grad, bias_grad}} = loss_and_grad(coef, bias, x, y_one_hot)
{coef, bias, _} =
while {w, b,
{x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state,
b_optimizer_state, converged? = Nx.u8(0), iter = Nx.u32(0)}},
iter < max_iterations and not converged? do
{w_grad, b_grad} =
grad({w, b}, fn {w, b} ->
compute_loss(w, b, alpha, l1_ratio, x, y_one_hot)
end)

{coef_updates, coef_optimizer_state} =
optimizer_update_fn.(coef_grad, coef_optimizer_state, coef)
{w_updates, w_optimizer_state} =
optimizer_update_fn.(w_grad, w_optimizer_state, w)

coef = Polaris.Updates.apply_updates(coef, coef_updates)
w = Polaris.Updates.apply_updates(w, w_updates)

{bias_updates, bias_optimizer_state} =
optimizer_update_fn.(bias_grad, bias_optimizer_state, bias)
{b_updates, b_optimizer_state} =
optimizer_update_fn.(b_grad, b_optimizer_state, b)

bias = Polaris.Updates.apply_updates(bias, bias_updates)
b = Polaris.Updates.apply_updates(b, b_updates)

has_converged = Nx.sum(Nx.abs(loss)) < Nx.size(x) * opts[:eps]
converged? =
Nx.reduce_max(Nx.abs(w_grad)) < tol and Nx.reduce_max(Nx.abs(b_grad)) < tol

{{coef, bias},
{x, iterations, y_one_hot, coef_optimizer_state, bias_optimizer_state, has_converged,
iter + 1}}
{w, b,
{x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state,
b_optimizer_state, converged?, iter + 1}}
end

%__MODULE__{
coefficients: final_coef,
bias: final_bias
coefficients: coef,
bias: bias
}
end

defnp loss_and_grad(coeff, bias, xs, ys) do
value_and_grad({coeff, bias}, fn {coeff, bias} ->
-Nx.sum(ys * log_softmax(Nx.dot(xs, coeff) + bias), axes: [-1])
end)
  # Computes the regularization penalty `alpha * reg(w)` applied to the
  # coefficient matrix `w` (the bias is intentionally not penalized here).
  #
  # `l1_ratio` selects the penalty form:
  #   * 0.0 -> pure L2: sum(w^2)
  #   * 1.0 -> pure L1: sum(|w|)
  #   * otherwise -> Elastic-Net mix: l1_ratio * L1 + (1 - l1_ratio) * L2
  #
  # Returns a scalar; when `alpha == 0` no regularization is applied and the
  # result is 0.0. NOTE(review): unlike scikit-learn, the L2 term carries no
  # 1/2 factor — confirm this matches the intended convention.
  defnp compute_regularization(w, alpha, l1_ratio) do
    if alpha > 0.0 do
      reg =
        cond do
          l1_ratio == 0.0 ->
            # L2 regularization
            Nx.sum(w * w)

          l1_ratio == 1.0 ->
            # L1 regularization
            Nx.sum(Nx.abs(w))

          # Elastic-Net regularization
          true ->
            l1_ratio * Nx.sum(Nx.abs(w)) +
              (1 - l1_ratio) * Nx.sum(w * w)
        end

      alpha * reg
    else
      0.0
    end
  end

defnp compute_loss(w, b, alpha, l1_ratio, xs, ys) do
reg = compute_regularization(w, alpha, l1_ratio)

xs
|> Nx.dot(w)
|> Nx.add(b)
|> log_softmax()
|> Nx.multiply(ys)
|> Nx.sum(axes: [1])
|> Nx.negate()
|> Nx.mean()
|> Nx.add(reg)
end

defnp log_softmax(x) do
Expand Down Expand Up @@ -219,14 +296,16 @@ defmodule Scholar.Linear.LogisticRegression do
iex> y = Nx.tensor([1, 0, 1])
iex> model = Scholar.Linear.LogisticRegression.fit(x, y, num_classes: 2)
iex> Scholar.Linear.LogisticRegression.predict(model, Nx.tensor([[-3.0, 5.0]]))
#Nx.Tensor<
s32[1]
[1]
>
Nx.tensor([1])
"""
defn predict(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do
inter = Nx.dot(x, [1], coeff, [0]) + bias
Nx.argmax(inter, axis: 1)
if Nx.rank(x) != 2 do
raise ArgumentError,
"expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}"
end

logits = Nx.dot(x, coeff) + bias
Nx.argmax(logits, axis: 1)
end

@doc """
Expand All @@ -238,14 +317,14 @@ defmodule Scholar.Linear.LogisticRegression do
iex> y = Nx.tensor([1, 0, 1])
iex> model = Scholar.Linear.LogisticRegression.fit(x, y, num_classes: 2)
iex> Scholar.Linear.LogisticRegression.predict_probability(model, Nx.tensor([[-3.0, 5.0]]))
#Nx.Tensor<
f32[1][2]
[
[6.470913388456623e-11, 1.0]
]
>
Nx.tensor([[0.10269401967525482, 0.8973060250282288]])
"""
defn predict_probability(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do
softmax(Nx.dot(x, [1], coeff, [0]) + bias)
if Nx.rank(x) != 2 do
raise ArgumentError,
"expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}"
end

softmax(Nx.dot(x, coeff) + bias)
end
end
Loading
Loading