diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a160922 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,187 @@ +name: CI + +on: + push: + branches: [master] + tags: ["v*"] + pull_request: + branches: [master] + +permissions: + contents: read + +env: + MIX_ENV: test + ELIXIR_VERSION: "1.17.3" + OTP_VERSION: "27.2" + +jobs: + compile: + name: Compile & Warnings + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Cache deps & build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - run: mix deps.get + - run: mix deps.compile + + - name: Compile with warnings as errors + run: mix compile --warnings-as-errors + + format: + name: Formatting + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Cache deps + uses: actions/cache@v4 + with: + path: deps + key: ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - run: mix deps.get + - run: mix format --check-formatted + + credo: + name: Credo + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Cache deps & build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - run: mix deps.get + - run: mix credo --strict + + test: + name: Tests (OTP ${{ matrix.otp }} / Elixir ${{ matrix.elixir }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - elixir: "1.17.3" + otp: "27.2" + - elixir: "1.18.3" + otp: "27.2" + - elixir: "1.20.0-rc.1" + otp: "28.3.3" + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ matrix.elixir }} + otp-version: ${{ matrix.otp }} + + - name: Cache deps & build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-${{ matrix.elixir }}-${{ matrix.otp }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-${{ matrix.elixir }}-${{ matrix.otp }}- + + - run: mix deps.get + - run: mix deps.compile + - run: mix compile + - run: mix test + + dialyzer: + name: Dialyzer + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Cache deps & build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - name: Cache PLTs + uses: actions/cache@v4 + with: + path: priv/plts + key: ${{ runner.os }}-plt-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + restore-keys: | + ${{ runner.os }}-plt-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - run: mix deps.get + - run: mix deps.compile + - run: mix compile + - run: mix dialyzer + + publish: + name: Publish to Hex + runs-on: ubuntu-latest + needs: [compile, format, credo, test, dialyzer] + if: startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/checkout@v4 + + - uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ env.ELIXIR_VERSION }} + otp-version: ${{ env.OTP_VERSION }} + + - name: Set version from tag + run: | + TAG_VERSION="${GITHUB_REF#refs/tags/v}" + sed -i "s/@version \".*\"/@version \"${TAG_VERSION}\"/" mix.exs + grep '@version' mix.exs + + - run: mix deps.get + - run: mix compile + + - name: Publish to Hex + run: mix hex.publish --yes + env: + HEX_API_KEY: ${{ secrets.HEX_API_KEY }} diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a3e88e5 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,77 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build & Test Commands + +```bash +mix deps.get # Fetch dependencies +mix compile # Compile Elixir + C code (via elixir_make) +mix test # Run full test suite (47 tests) +mix test test/process_test.exs # Run a single test file +mix test test/process_test.exs:10 # Run a single test at line +mix format # Auto-format code +mix format --check-formatted # Check formatting (CI) +mix compile --warnings-as-errors # Compile with strict warnings +mix credo --strict # Lint with credo +mix dialyzer # Static type analysis +make clean && make all # Rebuild C code only +``` + +## Architecture + +NetRunner is a safe OS process execution library for Elixir with NIF-based backpressure, zero zombie guarantees, PTY support, and cgroup isolation. + +### Three-Tier Design + +**Elixir Layer** (`lib/`) — GenServer-based process management, stream API, daemon mode. + +**NIF Layer** (`c_src/net_runner_nif.c`) — Wraps FDs in NIF resources with `enif_select` for async I/O on dirty IO schedulers. On EAGAIN, registers with BEAM's epoll/kqueue; the GenServer parks callers in an operations queue and retries when `{:select, _, _, :ready_input/:ready_output}` arrives. + +**Shepherd Layer** (`c_src/shepherd.c`) — Persistent C binary spawned per command via `Port.open`. Forks the child, passes pipe FDs to BEAM via SCM_RIGHTS over a UDS socket, then enters a `poll()` loop. Detects BEAM death via POLLHUP and escalates SIGTERM→SIGKILL on the child's process group. + +### Zero Zombie Prevention (3 layers) + +1. **Shepherd** — detects BEAM death (POLLHUP on UDS), kills child process group +2. **Watcher GenServer** — monitors Process GenServer, kills OS process on DOWN +3. **NIF destructor** — closes FDs on garbage collection + +### Spawn Sequence + +1. BEAM creates UDS listener at random temp path +2. `Port.open` launches shepherd with UDS path as argv[1] +3. Shepherd connects, forks child, sends pipe FDs via SCM_RIGHTS +4. Shepherd sends `MSG_CHILD_STARTED(pid)` over UDS +5. GenServer wraps FDs in NIF resources, registers with Watcher + +### Key Module Relationships + +- `NetRunner` — top-level API (`run/2`, `stream!/2`, `stream/2`) +- `NetRunner.Process` — GenServer owning the OS process lifecycle +- `NetRunner.Process.Exec` — spawn logic (UDS, Port, SCM_RIGHTS, Pipe creation) +- `NetRunner.Process.Nif` — NIF stubs (`nif_read`, `nif_write`, `nif_close`, `nif_create_fd`, `nif_kill`) +- `NetRunner.Process.Pipe` — struct wrapping a NIF resource with owner/type metadata +- `NetRunner.Process.Operations` — pending operation queue (park on EAGAIN, retry on select) +- `NetRunner.Stream` — `Stream.resource` wrapper with concurrent input writer Task +- `NetRunner.Daemon` — supervised long-running process with output draining +- `NetRunner.Watcher` — belt-and-suspenders process monitor +- `NetRunner.Signal` — signal atom to platform number resolution via NIF + +### Shepherd Protocol (`c_src/protocol.h`) + +BEAM→Shepherd: `CMD_KILL(signal)`, `CMD_CLOSE_STDIN`, `CMD_SET_WINSIZE(rows,cols)` +Shepherd→BEAM: `MSG_CHILD_STARTED(pid)`, `MSG_CHILD_EXITED(status)`, `MSG_ERROR(msg)` + +## C Code + +- C99, compiled with `-Wall -Wextra -Werror` +- Platform detection in Makefile: `-D_GNU_SOURCE` (Linux) or `-D_DARWIN_C_SOURCE` (macOS) +- Two build artifacts: `priv/shepherd` (executable) and `priv/net_runner_nif.so` (shared lib) +- NIF functions run on dirty IO schedulers to avoid blocking BEAM + +## Conventions + +- Elixir ~> 1.17, CI tests against 1.17 and 1.18 on OTP 27 +- All public API functions have `@doc` and `@spec` +- Tests are async where possible (`async: true`) +- Credo strict mode enforced: max cyclomatic complexity 9, max nesting depth 2 diff --git a/c_src/shepherd.c b/c_src/shepherd.c index 71e2304..4416938 100644 --- a/c_src/shepherd.c +++ b/c_src/shepherd.c @@ -25,7 +25,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -46,8 +48,8 @@ static int signal_pipe[2] = {-1, -1}; static void sigchld_handler(int sig) { (void)sig; int saved_errno = errno; - /* Write a single byte to wake up poll() */ - (void)write(signal_pipe[1], "C", 1); + /* Write a single byte to wake up poll() — ignore failure in signal handler */ + if (write(signal_pipe[1], "C", 1) < 0) { /* nothing to do */ } errno = saved_errno; } @@ -189,7 +191,6 @@ static void cgroup_cleanup(void) { if (cgroup_path[0] == '\0') return; char full_path[512]; - char procs_path[576]; char kill_path[576]; snprintf(full_path, sizeof(full_path), "/sys/fs/cgroup/%s", cgroup_path); diff --git a/lib/net_runner.ex b/lib/net_runner.ex index 1827b3b..b92b8a0 100644 --- a/lib/net_runner.ex +++ b/lib/net_runner.ex @@ -59,24 +59,7 @@ defmodule NetRunner do {:ok, pid} = Proc.start(cmd, args, process_opts) - # Run I/O in a task so we can enforce timeout via Task.yield - task = - Task.async(fn -> - if input do - write_all_input(pid, input) - else - Proc.close_stdin(pid) - end - - case read_all_with_limits(pid, max_output_size) do - {:ok, output} -> - {:ok, exit_status} = Proc.await_exit(pid) - {output, exit_status} - - {:error, _} = error -> - error - end - end) + task = Task.async(fn -> run_io(pid, input, max_output_size) end) effective_timeout = timeout || :infinity @@ -170,6 +153,23 @@ defmodule NetRunner do end end + defp run_io(pid, input, max_output_size) do + if input do + write_all_input(pid, input) + else + Proc.close_stdin(pid) + end + + case read_all_with_limits(pid, max_output_size) do + {:ok, output} -> + {:ok, exit_status} = Proc.await_exit(pid) + {output, exit_status} + + {:error, _} = error -> + error + end + end + defp kill_and_cleanup(pid) do Proc.kill(pid, :sigterm) diff --git a/lib/net_runner/process.ex b/lib/net_runner/process.ex index cc2a979..e02a5ba 100644 --- a/lib/net_runner/process.ex +++ b/lib/net_runner/process.ex @@ -296,39 +296,8 @@ defmodule NetRunner.Process do state = Enum.reduce(pending ++ stderr_pending, state, fn {ref, {type, from, max_bytes}}, acc -> - {_pipe_name, pipe} = - case type do - {:read, :stdout} -> {:stdout, acc.stdout} - {:read, :stderr} -> {:stderr, acc.stderr} - end - - if is_nil(pipe) do - GenServer.reply(from, {:error, :closed}) - {_, ops} = Operations.pop(acc.operations, ref) - %{acc | operations: ops} - else - case Pipe.read(pipe, max_bytes) do - {:ok, data} -> - GenServer.reply(from, {:ok, data}) - {_, ops} = Operations.pop(acc.operations, ref) - stats = Stats.record_read(acc.stats, byte_size(data)) - %{acc | operations: ops, stats: stats} - - :eof -> - GenServer.reply(from, :eof) - {_, ops} = Operations.pop(acc.operations, ref) - %{acc | operations: ops} - - {:error, :eagain} -> - # Still not ready, keep parked - acc - - {:error, _} = error -> - GenServer.reply(from, error) - {_, ops} = Operations.pop(acc.operations, ref) - %{acc | operations: ops} - end - end + pipe = pipe_for_type(acc, type) + retry_single_read(acc, ref, pipe, from, max_bytes) end) # Also handle internal stderr consumption @@ -339,6 +308,38 @@ defmodule NetRunner.Process do end end + defp pipe_for_type(state, {:read, :stdout}), do: state.stdout + defp pipe_for_type(state, {:read, :stderr}), do: state.stderr + + defp retry_single_read(state, ref, nil, from, _max_bytes) do + GenServer.reply(from, {:error, :closed}) + {_, ops} = Operations.pop(state.operations, ref) + %{state | operations: ops} + end + + defp retry_single_read(state, ref, pipe, from, max_bytes) do + case Pipe.read(pipe, max_bytes) do + {:ok, data} -> + GenServer.reply(from, {:ok, data}) + {_, ops} = Operations.pop(state.operations, ref) + stats = Stats.record_read(state.stats, byte_size(data)) + %{state | operations: ops, stats: stats} + + :eof -> + GenServer.reply(from, :eof) + {_, ops} = Operations.pop(state.operations, ref) + %{state | operations: ops} + + {:error, :eagain} -> + state + + {:error, _} = error -> + GenServer.reply(from, error) + {_, ops} = Operations.pop(state.operations, ref) + %{state | operations: ops} + end + end + defp retry_pending_writes(state) do pending = Operations.pending_by_type(state.operations, :write) diff --git a/lib/net_runner/process/exec.ex b/lib/net_runner/process/exec.ex index 7dc2dbf..9c0b77a 100644 --- a/lib/net_runner/process/exec.ex +++ b/lib/net_runner/process/exec.ex @@ -58,24 +58,15 @@ defmodule NetRunner.Process.Exec do end defp create_uds_listener(path) do - with {:ok, socket} <- :socket.open(:local, :stream, :default) do - addr = %{family: :local, path: path} - - case :socket.bind(socket, addr) do - :ok -> - case :socket.listen(socket) do - :ok -> - {:ok, socket} - - error -> - :socket.close(socket) - error - end - - error -> - :socket.close(socket) - error - end + addr = %{family: :local, path: path} + + with {:ok, socket} <- :socket.open(:local, :stream, :default), + :ok <- :socket.bind(socket, addr), + :ok <- :socket.listen(socket) do + {:ok, socket} + else + {:error, _} = error -> + error end end diff --git a/mix.exs b/mix.exs index 66d8e41..da7c22c 100644 --- a/mix.exs +++ b/mix.exs @@ -36,7 +36,8 @@ defmodule NetRunner.MixProject do # Development dependencies {:ex_doc, "~> 0.35", only: :dev, runtime: false}, - {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false} + {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, + {:credo, "~> 1.7", only: [:dev, :test], runtime: false} ] end diff --git a/mix.lock b/mix.lock index c68a477..651c3b6 100644 --- a/mix.lock +++ b/mix.lock @@ -1,9 +1,13 @@ %{ + "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, + "credo": {:hex, :credo, "1.7.16", "a9f1389d13d19c631cb123c77a813dbf16449a2aebf602f590defa08953309d4", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "d0562af33756b21f248f066a9119e3890722031b6d199f22e3cf95550e4f1579"}, "dialyxir": {:hex, :dialyxir, "1.4.7", "dda948fcee52962e4b6c5b4b16b2d8fa7d50d8645bbae8b8685c3f9ecb7f5f4d", [:mix], [{:erlex, ">= 0.2.8", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "b34527202e6eb8cee198efec110996c25c5898f43a4094df157f8d28f27d9efe"}, "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, "elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"}, "erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"}, "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"}, + "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, + "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, "makeup_erlang": {:hex, :makeup_erlang, "1.0.3", "4252d5d4098da7415c390e847c814bad3764c94a814a0b4245176215615e1035", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "953297c02582a33411ac6208f2c6e55f0e870df7f80da724ed613f10e6706afd"}, diff --git a/test/phase2_test.exs b/test/phase2_test.exs index 6a49118..add56be 100644 --- a/test/phase2_test.exs +++ b/test/phase2_test.exs @@ -1,6 +1,8 @@ defmodule NetRunner.Phase2Test do use ExUnit.Case, async: true + alias NetRunner.Process.Nif + describe "run/2 timeout" do test "kills process on timeout" do result = NetRunner.run(~w(sleep 100), timeout: 200) @@ -41,7 +43,7 @@ defmodule NetRunner.Phase2Test do Process.sleep(500) # The process group leader should be dead - assert NetRunner.Process.Nif.nif_is_os_pid_alive(os_pid) == false + assert Nif.nif_is_os_pid_alive(os_pid) == false end end diff --git a/test/pty_test.exs b/test/pty_test.exs index 7f3cab8..1e92048 100644 --- a/test/pty_test.exs +++ b/test/pty_test.exs @@ -18,13 +18,12 @@ defmodule NetRunner.PtyTest do end test "PTY provides terminal-like behavior" do - # tty should report a device path, not "not a tty" + # tty prints device path and exits immediately {:ok, pid} = Proc.start("tty", [], pty: true) {:ok, data} = Proc.read(pid) refute data =~ "not a tty" assert data =~ "/dev/" - Proc.kill(pid, :sigkill) - Proc.await_exit(pid, 5_000) + {:ok, _status} = Proc.await_exit(pid, 5_000) end test "set_window_size does not crash" do