Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .docker.env

This file was deleted.

137 changes: 127 additions & 10 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,135 @@ name: Docker Image CI

on:
push:
branches: [ "main" ]
branches: ["docker"]
release:
types: [published]

permissions:
contents: read
packages: write

jobs:
build:
build-base:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: recursive

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Build and push dev-base
uses: docker/build-push-action@v6
with:
context: .
file: docker/Dockerfile.dev
target: dev-base
platforms: linux/amd64
push: true
tags: ghcr.io/collaborativebioinformatics/oncolearn:dev-base
cache-from: type=gha,scope=dev-base
cache-to: type=gha,mode=max,scope=dev-base

- name: Build and push prod-base
uses: docker/build-push-action@v6
with:
context: .
file: docker/Dockerfile.prod
target: pytorch-prod-base
platforms: linux/amd64
push: true
tags: ghcr.io/collaborativebioinformatics/oncolearn:prod-base
cache-from: type=gha,scope=prod-base
cache-to: type=gha,mode=max,scope=prod-base

build-gpu:
needs: build-base
runs-on: ubuntu-latest
strategy:
matrix:
include:
- stage: dev
gpu_extra: cpu
image_tag: dev-cpu
rocm_wsl: "0"
- stage: dev
gpu_extra: cu130
image_tag: dev-cuda
rocm_wsl: "0"
- stage: dev
gpu_extra: rocm
image_tag: dev-rocm
rocm_wsl: "0"
- stage: prod
gpu_extra: cpu
image_tag: prod-cpu
rocm_wsl: "0"
- stage: prod
gpu_extra: cu130
image_tag: prod-cuda
rocm_wsl: "0"
- stage: prod
gpu_extra: rocm
image_tag: prod-rocm
rocm_wsl: "0"
- stage: prod
gpu_extra: rocm
image_tag: prod-rocm-wsl
rocm_wsl: "1"

steps:
- uses: actions/checkout@v4

- name: Build the Docker image (amd-wsl)
run: |
docker build . \
--file Dockerfile \
--build-arg GPU_EXTRA=rocm62 \
--tag my-image-name:amd-wsl-$(date +%s)
- uses: actions/checkout@v4
with:
submodules: recursive

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Set image variables
id: vars
run: |
if [ "${{ matrix.stage }}" = "dev" ]; then
echo "dockerfile=docker/Dockerfile.dev" >> "$GITHUB_OUTPUT"
echo "target=dev" >> "$GITHUB_OUTPUT"
echo "base_scope=dev-base" >> "$GITHUB_OUTPUT"
echo "base_image=ghcr.io/collaborativebioinformatics/oncolearn:dev-base" >> "$GITHUB_OUTPUT"
else
echo "dockerfile=docker/Dockerfile.prod" >> "$GITHUB_OUTPUT"
echo "target=pytorch-prod" >> "$GITHUB_OUTPUT"
echo "base_scope=prod-base" >> "$GITHUB_OUTPUT"
echo "base_image=ghcr.io/collaborativebioinformatics/oncolearn:prod-base" >> "$GITHUB_OUTPUT"
fi

- name: Build and push GPU variant
uses: docker/build-push-action@v6
with:
context: .
file: ${{ steps.vars.outputs.dockerfile }}
target: ${{ steps.vars.outputs.target }}
platforms: linux/amd64
push: true
tags: ghcr.io/collaborativebioinformatics/oncolearn:${{ matrix.image_tag }}
build-args: |
GPU_EXTRA=${{ matrix.gpu_extra }}
ROCM_WSL=${{ matrix.rocm_wsl }}
cache-from: |
type=gha,scope=${{ matrix.image_tag }}
type=gha,scope=${{ steps.vars.outputs.base_scope }}
type=registry,ref=${{ steps.vars.outputs.base_image }}
cache-to: type=gha,mode=max,scope=${{ matrix.image_tag }}
98 changes: 68 additions & 30 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Shared configuration using YAML anchors
x-common-config: &common-config
env_file: .docker.env
volumes:
- .:/workspace
- uv-cache:/root/.cache/uv
Expand All @@ -21,9 +20,10 @@ x-prod-build: &prod-build
dockerfile: docker/Dockerfile.prod

services:
# Main development environment with Python + R (CPU only)
# Development environment CPU only
dev-cpu:
<<: *common-config
image: ghcr.io/collaborativebioinformatics/oncolearn:dev-cpu
profiles: ["dev-cpu"]
build:
<<: *dev-build
Expand All @@ -32,50 +32,54 @@ services:
target: dev
container_name: oncolearn-dev-cpu

# Main development environment with Python + R (NVIDIA GPU)
dev-nvidia:
# Development environment NVIDIA GPU
dev-cuda:
<<: *common-config
profiles: ["dev-nvidia"]
image: ghcr.io/collaborativebioinformatics/oncolearn:dev-cuda
profiles: ["dev-cuda"]
build:
<<: *dev-build
args:
GPU_EXTRA: cu130
target: dev
container_name: oncolearn-dev-nvidia
container_name: oncolearn-dev-cuda
deploy:
resources:
reservations:
devices:
- driver: nvidia
- driver: nvidia  # Compose GPU reservations only accept the "nvidia" driver; "cuda" is not a valid device driver name
count: all
capabilities: [gpu]

# Main development environment with Python + R (AMD GPU - Native Linux)
dev-amd:
# Development environment AMD GPU (native Linux)
dev-rocm:
<<: *common-config
profiles: ["dev-amd"]
image: ghcr.io/collaborativebioinformatics/oncolearn:dev-rocm
profiles: ["dev-rocm"]
build:
<<: *dev-build
args:
GPU_EXTRA: rocm
target: dev
container_name: oncolearn-dev-amd
container_name: oncolearn-dev-rocm
devices:
- /dev/kfd
- /dev/dri
group_add:
- video

# Main development environment with Python + R (AMD GPU - WSL2)
dev-amd-wsl:
# Development environment — AMD GPU (WSL2)
# Uses the same image as dev-rocm; WSL2 differences are runtime-only.
dev-rocm-wsl:
<<: *common-config
profiles: ["dev-amd-wsl"]
image: ghcr.io/collaborativebioinformatics/oncolearn:dev-rocm
profiles: ["dev-rocm-wsl"]
build:
<<: *dev-build
args:
GPU_EXTRA: rocm
target: dev
container_name: oncolearn-dev-amd-wsl
container_name: oncolearn-dev-rocm-wsl
volumes:
- .:/workspace
- uv-cache:/root/.cache/uv
Expand All @@ -99,52 +103,86 @@ services:
ipc: host # required for ROCm shared memory
shm_size: '2gb'

# Production PyTorch environment (CPU only)
pytorch-prod-cpu:
env_file: .docker.env
# Production image — CPU only
prod-cpu:
image: ghcr.io/collaborativebioinformatics/oncolearn:prod-cpu
profiles: ["prod-cpu"]
build:
<<: *prod-build
args:
GPU_EXTRA: cpu
target: pytorch-prod
container_name: oncolearn-pytorch-prod-cpu
container_name: oncolearn-prod-cpu
volumes:
- ./data:/workspace/data:ro
- ./data/configs:/workspace/data/configs

# Production PyTorch environment (NVIDIA GPU - Optimized Size)
pytorch-prod-nvidia:
env_file: .docker.env
profiles: ["prod-nvidia"]
# Production image — NVIDIA GPU
prod-cuda:
image: ghcr.io/collaborativebioinformatics/oncolearn:prod-cuda
profiles: ["prod-cuda"]
build:
<<: *prod-build
args:
GPU_EXTRA: cu130
target: pytorch-prod
container_name: oncolearn-pytorch-prod-nvidia
container_name: oncolearn-prod-cuda
volumes:
- ./data:/workspace/data:ro
- ./data/configs:/workspace/data/configs
deploy:
resources:
reservations:
devices:
- driver: nvidia
- driver: nvidia  # Compose GPU reservations only accept the "nvidia" driver; "cuda" is not a valid device driver name
count: all
capabilities: [gpu]

# Production PyTorch environment (AMD GPU - WSL2)
pytorch-prod-amd-wsl:
env_file: .docker.env
profiles: ["prod-amd"]
# Production image — AMD GPU (native Linux)
prod-rocm:
image: ghcr.io/collaborativebioinformatics/oncolearn:prod-rocm
profiles: ["prod-rocm"]
build:
<<: *prod-build
args:
GPU_EXTRA: rocm
ROCM_WSL: "0"
target: pytorch-prod
container_name: oncolearn-prod-rocm
volumes:
- ./data:/workspace/data:ro
- ./data/configs:/workspace/data/configs
- ./models:/workspace/models
- ./outputs:/workspace/outputs
- ./.hf-cache:/root/.cache/huggingface
- ./src:/workspace/src
- /opt/rocm/lib:/opt/rocm/lib:ro # ROCm runtime libs
environment:
- LD_LIBRARY_PATH=/opt/rocm/lib
devices:
- /dev/kfd
- /dev/dri
group_add:
- video
cap_add:
- SYS_PTRACE
security_opt:
- seccomp:unconfined
ipc: host
shm_size: '2gb'

# Production image — AMD GPU (WSL2)
# Uses a separate image with ROCm 7.2 LW wheels built for /dev/dxg.
prod-rocm-wsl:
image: ghcr.io/collaborativebioinformatics/oncolearn:prod-rocm-wsl
profiles: ["prod-rocm-wsl"]
build:
<<: *prod-build
args:
GPU_EXTRA: rocm
ROCM_WSL: "1"
target: pytorch-prod
container_name: oncolearn-pytorch-prod-amd
container_name: oncolearn-prod-rocm-wsl
mem_limit: 20g # cap below WSL limit so OOM kills container, not WSL
memswap_limit: 22g # allow some swap headroom during model loading
volumes:
Expand Down
Loading
Loading