diff --git a/.claude/commands/blog-meta-image/assets/logos/alibaba.svg b/.claude/commands/blog-meta-image/assets/logos/alibaba.svg new file mode 100644 index 000000000000..5a9d83e065f8 --- /dev/null +++ b/.claude/commands/blog-meta-image/assets/logos/alibaba.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/.claude/commands/blog-meta-image/assets/logos/tailscale.svg b/.claude/commands/blog-meta-image/assets/logos/tailscale.svg new file mode 100644 index 000000000000..757c85e64c6e --- /dev/null +++ b/.claude/commands/blog-meta-image/assets/logos/tailscale.svg @@ -0,0 +1 @@ +Tailscale diff --git a/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/conduit.png b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/conduit.png new file mode 100644 index 000000000000..b0afb3db2a68 Binary files /dev/null and b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/conduit.png differ diff --git a/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/index.md b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/index.md new file mode 100644 index 000000000000..3b4f0a6325a8 --- /dev/null +++ b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/index.md @@ -0,0 +1,390 @@ +--- +title: "Use Your GPU For Your Agents: Self-Host Qwen 3.5 with Pulumi and Tailscale" +allow_long_title: true +date: 2026-03-20 +draft: false +meta_desc: | + Self-host Qwen 3.5 on your GPU with Pulumi, llama.cpp, and Tailscale. One pulumi up gives you a private OpenAI-compatible API on your tailnet. + +meta_image: meta.png + +authors: +- pablo-seibelt + +tags: +- ai +- kubernetes +- tailscale +- python + +social: + twitter: | + Self-host Qwen 3.5 on your own GPU with Pulumi, llama.cpp, and Tailscale. One pulumi up gives you a private OpenAI-compatible API on your tailnet. No cloud costs, no data leaving your network. + linkedin: | + Self-host Qwen 3.5 on your own GPU with Pulumi, llama.cpp, and Tailscale. One pulumi up gives you a private OpenAI-compatible API on your tailnet, accessible from any device. No cloud costs, no data leaving your network. The post walks through the full stack: llama.cpp for inference, Open WebUI for a chat interface, and Tailscale for secure access from anywhere. +--- + +If you run any kind of AI tools and agents, you have probably accepted three tradeoffs: your data leaves your network on every request, you cannot work if your connection drops, and your bill scales with usage no matter how much hardware you already own. + +Many open-weight models now run well on consumer GPUs. Once the model is on your machine, your data stays local, inference works offline, and tokens cost nothing. If you already own a compatible machine, you can run a model yourself. + + + +This post walks through a Kubernetes deployment on a Linux home server. It was tested on a Ryzen 9 5950x with 32 GB DDR4 and an RTX 3080 10 GB, which is high-end 2020 consumer hardware comparable to a mid-range build today. If your rig is in the same ballpark, this setup will likely work for you. If you are on a Mac with an M-series chip, you can run the same model locally with [mlx-lm](https://github.com/ml-explore/mlx-lm) instead. + +[Qwen 3.5](https://qwen.ai/blog?id=qwen3.5) is an Apache 2.0-licensed model family from Alibaba. The 35B-A3B variant uses a Mixture-of-Experts (MoE) architecture that activates only 3 billion parameters per token. 
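That split between total and active parameters is what makes the model fast on modest hardware: per-token compute scales with the active 3 billion, while memory scales with the full 35 billion. A rough back-of-envelope sketch, assuming the common 2 × active-parameters FLOPs rule of thumb (an approximation, not a benchmark):

```python
# Why MoE helps: compute follows *active* params, memory follows *total* params.
total_params = 35e9   # all experts; determines the memory footprint
active_params = 3e9   # experts used per token; determines generation speed

print(f"dense 35B:   ~{2 * total_params / 1e9:.0f} GFLOPs/token")   # ~70
print(f"MoE 35B-A3B: ~{2 * active_params / 1e9:.0f} GFLOPs/token")  # ~6
```

Generation runs at roughly the speed of a 3B model, but the full 35B parameters still have to fit somewhere in memory, which is where quantization comes in.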
Thanks to quantized models distributed in the [GGUF](https://huggingface.co/docs/hub/en/gguf) format, models that would normally require datacenter hardware fit on consumer GPUs with acceptable quality loss. GGUF is the file format; quantization (e.g., Q4_K_M) is the compression that shrinks the model by reducing numerical precision.

The full 35B-parameter model fits in around 22 GB at Q4_K_M quantization, and llama.cpp can split layers between GPU VRAM and system RAM, so you do not need all of that in VRAM.

In this post we will set up a complete self-hosted inference stack with a single `pulumi up`: [llama.cpp](https://github.com/ggerganov/llama.cpp) serving an OpenAI-compatible API, [Open WebUI](https://github.com/open-webui/open-webui) for a browser chat interface, and [Tailscale](https://tailscale.com/) for secure access from any device on your tailnet, all orchestrated on a local [k3s](https://k3s.io/) Kubernetes cluster.

## Architecture overview

```mermaid
graph LR
    subgraph K8s["Kubernetes cluster (k3s)"]
        LLM["llama-server pod<br/>OpenAI-compatible API<br/>:30080"]
        WebUI["Open WebUI<br/>:30000"]
        TS["Tailscale pod"]
        PVC["PVC<br/>model weights"]
    end
    PVC -->|"mounted in"| LLM
    WebUI -->|"connects to"| LLM
    TS -->|"exposes on tailnet"| WebUI
    Agents["Local agents"] -->|"localhost"| LLM
    Phone["Phone / laptop"] -->|"tailnet"| TS
```

## GPU and model sizing

The table below shows sizes for the [unsloth/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF) quantizations. The total memory column is the combined VRAM + system RAM needed to run the model. llama.cpp splits model layers between GPU VRAM and system RAM automatically, so any GPU that supports CUDA or ROCm can accelerate inference even if the whole model does not fit in VRAM.

| Quantization | File size | Total memory needed |
|---|---|---|
| Q3_K_S | 15.3 GB | ~17 GB |
| Q4_K_M | 22 GB | ~22 GB |
| Q6_K | 28.9 GB | ~30 GB |

This walkthrough defaults to **Q4_K_M** because it delivers strong quality while fitting on widely available consumer hardware. Both NVIDIA and AMD GPUs work; adjust the `gpuVendor` config value for your hardware.

With [community-recommended llama.cpp parameters](https://www.reddit.com/r/LocalLLaMA/comments/1rg4zqv/followup_qwen3535ba3b_7_communityrequested/) (`--fit-target`, `-fa on`, `--no-mmap`, `-ctk q8_0`, `-ctv q8_0`), the reference hardware (RTX 3080 10 GB) achieves around 600 tok/s prompt processing and 45 tok/s generation. These flags are already configured in the Pulumi program.

If your machine has less RAM or a smaller GPU, you can try a smaller quantization of the same model (for example, Q3_K_S at 15.3 GB), or switch to a smaller model like the 7B or 14B variants. You can swap the `model` and `modelFile` config values to try a different variant without changing any code.

[llmfit](https://github.com/AlexsJones/llmfit) detects your CPU, RAM, and GPU, then tells you exactly which models and quantizations will run on your machine before you download anything:

```bash
curl -fsSL https://llmfit.axjns.dev/install.sh | sh
llmfit
```

## Prerequisites

{{< notes type="info" >}}
If this is your first time setting up GPU drivers and k3s, budget around 15 minutes for the prerequisites below. The Pulumi program itself deploys in under 5 minutes.
{{< /notes >}}

Before you start, make sure you have:

- An NVIDIA or AMD GPU with drivers installed
  - **NVIDIA**: `nvidia-smi` should work
  - **AMD**: `amd-smi` should work, plus ROCm drivers with `/dev/kfd` and `/dev/dri` present
- A local Kubernetes cluster. We will use [k3s](https://k3s.io/):

  ```bash
  curl -sfL https://get.k3s.io | sh -

  # k3s writes its kubeconfig to a root-owned path; copy it so
  # kubectl and pulumi can access the cluster without sudo
  mkdir -p ~/.kube
  sudo cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
  sudo chown $USER ~/.kube/config
  ```

- GPU support in k3s. For **NVIDIA**, install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), then configure the runtime for k3s.
  Create `/etc/rancher/k3s/config.yaml`:

  ```yaml
  nvidia-container-runtime-path: /usr/bin/nvidia-container-runtime
  default-runtime: nvidia
  ```

  Then configure containerd, restart k3s, and install the device plugin:

  ```bash
  # Configure the NVIDIA runtime for k3s's embedded containerd
  sudo nvidia-ctk runtime configure --runtime=containerd \
    --config=/var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

  # Restart k3s to pick up the new runtime
  sudo systemctl restart k3s

  # Install the NVIDIA device plugin
  kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml
  ```

  For **AMD**, install [ROCm drivers](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/) first. Verify with `rocminfo` and confirm `/dev/kfd` and `/dev/dri` are present. Then apply the [device plugin](https://github.com/ROCm/k8s-device-plugin):

  ```bash
  kubectl apply -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml
  ```

  Verify your GPU is visible to Kubernetes:

  ```bash
  # NVIDIA
  kubectl get nodes -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}'

  # AMD
  kubectl get nodes -o jsonpath='{.items[0].status.capacity.amd\.com/gpu}'
  ```

  Either command should output `1` (or the number of GPUs you have). If it is empty, check that your GPU device plugin pod is running.

- [Pulumi CLI](/docs/iac/download-install/) and Python 3.9+:

  ```bash
  curl -fsSL https://get.pulumi.com | sh
  ```

- A [Tailscale account](https://login.tailscale.com/start) (free tier works)

## The Pulumi program

{{< notes type="tip" >}}
You could deploy these manifests with `kubectl apply`, but Pulumi buys you a few things: the Tailscale ACL, Kubernetes resources, and config all live in one stack, so `pulumi destroy` cleans up everything. The `ComponentResource` lets you swap models or GPU vendors by changing config instead of editing YAML. And the Tailscale auth key is encrypted in state, not sitting in a plaintext file.
{{< /notes >}}

Create a new project:

```bash
mkdir self-host-llm && cd self-host-llm
pulumi new python --name self-host-qwen-llm
```

Copy the [example program](https://github.com/pulumi/docs/tree/master/static/programs/self-host-qwen-llm-python) into the project directory. Git does not natively support cloning a single folder, so the commands below use a sparse checkout to avoid downloading the entire repository:

```bash
git clone --depth 1 --filter=blob:none --sparse \
    https://github.com/pulumi/docs.git /tmp/pulumi-docs
git -C /tmp/pulumi-docs sparse-checkout set static/programs/self-host-qwen-llm-python
cp /tmp/pulumi-docs/static/programs/self-host-qwen-llm-python/* .
rm -rf /tmp/pulumi-docs
```

{{< github-card repo="pulumi/docs" branch="master" path="static/programs/self-host-qwen-llm-python" >}}

### How it works

The program is split into two files: `__main__.py` orchestrates the full stack, and `llm_server.py` defines a reusable [ComponentResource](/docs/iac/concepts/resources/components/) that encapsulates the LLM inference server.

#### The LlmServer component

`LlmServer` bundles a PVC, an init container that downloads model weights, the llama-server deployment, and a service into a single reusable component.
GPU vendor maps to the right resource key and container image, so switching between NVIDIA and AMD is one config change: + +```python +GPU_RESOURCE_KEYS = { + "nvidia": "nvidia.com/gpu", + "amd": "amd.com/gpu", +} + +LLAMA_SERVER_IMAGES = { + "nvidia": "ghcr.io/ggml-org/llama.cpp:server-cuda", + "amd": "ghcr.io/ggml-org/llama.cpp:server-rocm", +} +``` + +The init container uses `uvx` to run `huggingface_hub` without baking it into a custom image. The download is idempotent, so it skips files already on the PVC: + +```python +init_containers = [ + k8s.core.v1.ContainerArgs( + name="download-model", + image="ghcr.io/astral-sh/uv:python3.12-bookworm-slim", + command=["sh", "-c", + f"uvx --from huggingface_hub hf download {model} {download_files} " + + f"--local-dir {model_dir}", + ], + volume_mounts=models_mount, + ), +] +``` + +All llama.cpp flags are assembled from config values passed to the constructor, so you can override context size, thread count, or sampling parameters per stack without editing the component: + +```python +config = pulumi.Config() +model = config.get("model") or "unsloth/Qwen3.5-35B-A3B-GGUF" +model_file = config.get("modelFile") or "Qwen3.5-35B-A3B-Q4_K_M.gguf" +context_size = config.get_int("contextSize") or 65536 + +llm = LlmServer( + "llm", + model=model, + model_file=model_file, + port=llm_port, + gpu_vendor=gpu_vendor, + context_size=context_size, + # ... +) +``` + +#### Adopting the Tailscale ACL + +The Tailscale ACL is a global singleton per tailnet. It cannot be created or deleted, only updated. The program uses [`import_`](/docs/iac/concepts/options/import/) to adopt the existing ACL into state on first `pulumi up`, and [`retain_on_delete`](/docs/iac/concepts/options/retainondelete/) to prevent `pulumi destroy` from trying to delete it: + +```python +ts_acl = tailscale.Acl( + "tailnet-acl", + acl=pulumi.Output.json_dumps({ + "tagOwners": { + "tag:llm-server": ["autogroup:admin"], + }, + "acls": [ + { + "action": "accept", + "src": ["autogroup:member"], + "dst": ["*:*"], + }, + ], + }), + opts=pulumi.ResourceOptions( + import_="acl", + retain_on_delete=True, + ), +) +``` + +Without these options, destroy+up cycles would fail with a "precondition failed" 412 error. + +{{< notes type="info" >}} +This ACL grants all tailnet members (`autogroup:member`) access to all devices on all ports (`*:*`). This is fine if you are the only user on your tailnet. If you share your tailnet with other people, scope the `dst` field to specific tags and ports (e.g., `tag:llm-server:30000`). Also note that `import_` will **replace your existing tailnet ACL** on first deploy, so export your current rules first if you have custom ones. +{{< /notes >}} + +#### Open WebUI and Tailscale networking + +Open WebUI connects to the LLM server via its cluster-internal URL and disables authentication since it is only reachable through the tailnet. + +The Tailscale deployment runs as a separate pod that joins your tailnet and forwards traffic to Open WebUI's ClusterIP. An init container enables IP forwarding, and the main container authenticates using a Pulumi-managed auth key. 
`TS_DEST_IP` is wired directly to the Open WebUI service's cluster IP using a Pulumi output, so the value stays correct even if Kubernetes reassigns it:

```python
k8s.core.v1.ContainerArgs(
    name="tailscale",
    image="ghcr.io/tailscale/tailscale:latest",
    env=[
        k8s.core.v1.EnvVarArgs(
            name="TS_AUTHKEY",
            value_from=k8s.core.v1.EnvVarSourceArgs(
                secret_key_ref=k8s.core.v1.SecretKeySelectorArgs(
                    name="tailscale-auth",
                    key="TS_AUTHKEY",
                ),
            ),
        ),
        k8s.core.v1.EnvVarArgs(name="TS_HOSTNAME", value=hostname),
        k8s.core.v1.EnvVarArgs(
            name="TS_DEST_IP",
            value=webui_service.spec.cluster_ip,
        ),
        # ...
    ],
)
```

Any device on your tailnet can reach the chat interface at `http://llm-server:30000` (substituting your configured `hostname`) without exposing anything to the public internet.

{{< notes type="warning" >}}
By default, k3s NodePort services bind to `0.0.0.0`, which means devices on your LAN can also reach port 30000. To restrict access to Tailscale only, pass `nodeport-addresses` through to kube-proxy in `/etc/rancher/k3s/config.yaml` and restart k3s with `sudo systemctl restart k3s`:

```yaml
kube-proxy-arg:
  - "nodeport-addresses=100.64.0.0/10"
```

{{< /notes >}}

Configure the Tailscale provider:

- Generate an API key from [Settings > Keys](https://login.tailscale.com/admin/settings/keys)
- Find your tailnet name under [Settings > General](https://login.tailscale.com/admin/settings/general)

```bash
pulumi config set tailscale:apiKey tskey-api-XXXXX --secret
pulumi config set tailscale:tailnet your-tailnet-name
```

## Deploy

If you are using an AMD GPU, set the vendor before deploying:

```bash
pulumi config set gpuVendor amd
```

The program defaults to `Qwen3.5-35B-A3B-Q4_K_M.gguf`. To use a different quantization, override `modelFile`:

```bash
pulumi config set modelFile Qwen3.5-35B-A3B-Q6_K.gguf
```

Run the deployment:

```bash
pulumi up
```

Pulumi shows a preview of all resources it will create. Confirm with `yes`. The first run takes several minutes while the init container downloads the model weights into the PVC.

Once the stack is up, verify llama-server is running:

```bash
curl http://localhost:30080/v1/models
```

You should see your model listed. Try a completion:

```bash
curl http://localhost:30080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "unsloth/Qwen3.5-35B-A3B-GGUF",
    "messages": [{"role": "user", "content": "What is infrastructure as code?"}],
    "max_tokens": 200
  }'
```

Then open `http://localhost:30000` in your browser to access Open WebUI. Select the Qwen model from the model dropdown and start chatting.

## Connect your agents

Any tool that supports the OpenAI API format works out of the box. Point it at your llama-server endpoint:

```bash
export OPENAI_BASE_URL=http://localhost:30080/v1
export OPENAI_API_KEY=not-needed
```

Some examples:

- **[OpenClaw](https://github.com/openclaw/openclaw)**: connect your WhatsApp, Telegram, or Discord to your self-hosted model
- **[OpenCode](https://opencode.ai/)**: terminal-based coding agent with local LLM support

![OpenCode connected to self-hosted Qwen](opencode.png)

## Access from your phone

Install the [Tailscale app](https://tailscale.com/download) on your phone. Once connected to your tailnet, open the URL from `pulumi stack output tailscale_webui_url` in your mobile browser. Open WebUI works well on mobile and gives you a ChatGPT-like interface backed by your own hardware.
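If you would rather call the model from code than from a browser, the same endpoint your agents use works with any OpenAI SDK. A minimal sketch, assuming the official `openai` Python package and the default ports from this deployment, run from the machine hosting the cluster:

```python
from openai import OpenAI

# llama-server does not check credentials by default, but the client
# requires a non-empty key; the base URL is the default llmNodePort.
client = OpenAI(base_url="http://localhost:30080/v1", api_key="not-needed")

response = client.chat.completions.create(
    # llama-server serves whichever model it loaded; this name is informational
    model="unsloth/Qwen3.5-35B-A3B-GGUF",
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
    max_tokens=100,
)
print(response.choices[0].message.content)
```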
For a native app experience, try [Conduit](https://apps.apple.com/us/app/conduit-openwebui-client/id6749840287) ([Android](https://play.google.com/store/apps/details?id=app.cogwheel.conduit)), an Open WebUI client for iOS and Android. + +{{< figure src="conduit.png" alt="Conduit app connected to self-hosted Qwen via Tailscale" width="300" >}} + +## Conclusion + +After following this guide you have: + +- An OpenAI-compatible API running on your own GPU via llama.cpp +- A browser-based chat UI accessible from any device on your tailnet +- Tailscale ACLs scoping access to your tailnet members +- Persistent model storage that survives pod restarts +- Everything running on a local Kubernetes cluster you control + +To swap in a new model or quantization, change the `model` and `modelFile` config values and run `pulumi up`. The pod restarts and pulls the new GGUF file. + +If you outgrow your local GPU, the same Pulumi program can be adapted to target a cloud Kubernetes cluster. Swap your kubeconfig for a managed K8s service with GPU nodes and `pulumi up` again. + +{{< related-posts >}} diff --git a/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/meta.png b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/meta.png new file mode 100644 index 000000000000..2b14409196e0 Binary files /dev/null and b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/meta.png differ diff --git a/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/opencode.png b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/opencode.png new file mode 100644 index 000000000000..cf995b4eb7d0 Binary files /dev/null and b/content/blog/self-host-qwen-llama-cpp-k8s-tailscale-pulumi/opencode.png differ diff --git a/data/related.yaml b/data/related.yaml index dbebfe512118..ab975317aedd 100644 --- a/data/related.yaml +++ b/data/related.yaml @@ -368,10 +368,10 @@ tags: - 2022-03-10-hierarchical-config ai: - - codegen-learnings - - pulumi-copilot-rest - - copilot-lessons - - future-cloud-infrastructure-10-trends-shaping-2024-and-beyond + - run-deepseek-on-aws-ec2-using-pulumi + - policy-next-gen + - low-code-llm-apps-with-local-ai-flowise-and-pulumi + - easy-ai-apps-with-langserve-and-pulumi ml: - devops-ai-developer-future--pulumi-user-group-tech-talks @@ -1266,6 +1266,11 @@ fargate-vs-ec2: - easy-ai-apps-with-langserve-and-pulumi - advanced-aws-networking-part-2 +self-host-qwen-llama-cpp-k8s-tailscale-pulumi: + - deploy-openclaw-aws-hetzner + - low-code-llm-apps-with-local-ai-flowise-and-pulumi + - mlops-huggingface-llm-aws-sagemaker-python + when-to-use-azure-cosmos-db: - azure-deployment-environments - sam-cogan-testing-best-practices diff --git a/scripts/programs/ignore.txt b/scripts/programs/ignore.txt index ea9d87908795..b7afba348122 100644 --- a/scripts/programs/ignore.txt +++ b/scripts/programs/ignore.txt @@ -33,6 +33,7 @@ awsx-apigateway-custom-domain-.* kubernetes-.* k8s-.* helm-.* +self-host-qwen-llm-python # Skip broken programs to get back to green # https://github.com/pulumi/docs/issues/14505 diff --git a/static/programs/self-host-qwen-llm-python/Pulumi.yaml b/static/programs/self-host-qwen-llm-python/Pulumi.yaml new file mode 100644 index 000000000000..d414f47d5d5b --- /dev/null +++ b/static/programs/self-host-qwen-llm-python/Pulumi.yaml @@ -0,0 +1,62 @@ +name: self-host-qwen-llm-python +description: Self-hosted llama-server (llama.cpp) with Open WebUI and Tailscale +runtime: + name: python + options: + toolchain: pip +config: + pulumi:tags: + value: + pulumi:template: python + 
model: + type: string + default: unsloth/Qwen3.5-35B-A3B-GGUF + description: HuggingFace model repository + modelFile: + type: string + default: Qwen3.5-35B-A3B-Q4_K_M.gguf + description: GGUF filename to download from the model repo + gpuVendor: + type: string + default: nvidia + description: GPU vendor ("nvidia" or "amd") + gpuCount: + type: integer + default: 1 + description: Number of GPUs to allocate + contextSize: + type: integer + default: 65536 + description: Context window size in tokens + fitTarget: + type: integer + default: 2048 + description: VRAM fit target in MB for llama.cpp layer placement + threads: + type: integer + default: 5 + description: Number of CPU threads for inference + jinja: + type: boolean + default: true + description: Enable Jinja template processing for chat templates + parallel: + type: integer + default: 1 + description: Number of parallel request slots + llmPort: + type: integer + default: 8080 + description: LLM service port + llmNodePort: + type: integer + default: 30080 + description: NodePort for external LLM access + webuiPort: + type: integer + default: 30000 + description: Open WebUI NodePort + hostname: + type: string + default: llm-server + description: Tailscale hostname diff --git a/static/programs/self-host-qwen-llm-python/__main__.py b/static/programs/self-host-qwen-llm-python/__main__.py new file mode 100644 index 000000000000..7660d1a34688 --- /dev/null +++ b/static/programs/self-host-qwen-llm-python/__main__.py @@ -0,0 +1,298 @@ +import pulumi +import pulumi_kubernetes as k8s +import pulumi_tailscale as tailscale + +from llm_server import LlmServer + +config = pulumi.Config() +gpu_vendor = config.get("gpuVendor") or "nvidia" +webui_port = config.get_int("webuiPort") or 30000 +llm_port = config.get_int("llmPort") or 8080 +llm_node_port = config.get_int("llmNodePort") or 30080 +hostname = config.get("hostname") or "llm-server" +model = config.get("model") or "unsloth/Qwen3.5-35B-A3B-GGUF" +model_file = config.get("modelFile") or "Qwen3.5-35B-A3B-Q4_K_M.gguf" +context_size = config.get_int("contextSize") or 65536 +fit_target = config.get_int("fitTarget") or 2048 +parallel = config.get_int("parallel") or 1 +threads = config.get_int("threads") or 5 +jinja = config.get_bool("jinja") +if jinja is None: + jinja = True + +NAMESPACE = "llm" + +ns = k8s.core.v1.Namespace( + NAMESPACE, + metadata=k8s.meta.v1.ObjectMetaArgs(name=NAMESPACE), +) +ns_opts = pulumi.ResourceOptions(depends_on=[ns]) + +llm = LlmServer( + "llm", + model=model, + model_file=model_file, + port=llm_port, + gpu_vendor=gpu_vendor, + gpu_count=config.get_int("gpuCount") or 1, + node_port=llm_node_port, + namespace=NAMESPACE, + context_size=context_size, + fit_target=fit_target, + parallel=parallel, + threads=threads, + jinja=jinja, + mmproj=config.get("mmproj"), + opts=ns_opts, +) + +# --- Tailscale RBAC (must be created before the Tailscale deployment that +# references service_account_name="tailscale") --- + +ts_sa = k8s.core.v1.ServiceAccount( + "tailscale", + metadata=k8s.meta.v1.ObjectMetaArgs(name="tailscale", namespace=NAMESPACE), + opts=ns_opts, +) + +ts_role = k8s.rbac.v1.Role( + "tailscale", + metadata=k8s.meta.v1.ObjectMetaArgs(name="tailscale", namespace=NAMESPACE), + rules=[ + k8s.rbac.v1.PolicyRuleArgs( + api_groups=[""], + resources=["secrets"], + verbs=["create", "get", "update", "patch"], + ), + ], + opts=ns_opts, +) + +ts_role_binding = k8s.rbac.v1.RoleBinding( + "tailscale", + metadata=k8s.meta.v1.ObjectMetaArgs(name="tailscale", namespace=NAMESPACE), + 
subjects=[ + k8s.rbac.v1.SubjectArgs( + kind="ServiceAccount", + name="tailscale", + namespace=NAMESPACE, + ), + ], + role_ref=k8s.rbac.v1.RoleRefArgs( + api_group="rbac.authorization.k8s.io", + kind="Role", + name="tailscale", + ), + opts=ns_opts, +) + +# --- Open WebUI --- + +webui_labels = {"app": "open-webui"} + +webui_pvc = k8s.core.v1.PersistentVolumeClaim( + "open-webui-data", + metadata=k8s.meta.v1.ObjectMetaArgs(name="open-webui-data", namespace=NAMESPACE), + spec=k8s.core.v1.PersistentVolumeClaimSpecArgs( + access_modes=["ReadWriteOnce"], + resources=k8s.core.v1.VolumeResourceRequirementsArgs( + requests={"storage": "5Gi"}, + ), + ), + opts=ns_opts, +) + +webui_deployment = k8s.apps.v1.Deployment( + "open-webui", + metadata=k8s.meta.v1.ObjectMetaArgs(name="open-webui", namespace=NAMESPACE, labels=webui_labels), + spec=k8s.apps.v1.DeploymentSpecArgs( + replicas=1, + selector=k8s.meta.v1.LabelSelectorArgs(match_labels=webui_labels), + template=k8s.core.v1.PodTemplateSpecArgs( + metadata=k8s.meta.v1.ObjectMetaArgs(labels=webui_labels), + spec=k8s.core.v1.PodSpecArgs( + containers=[ + k8s.core.v1.ContainerArgs( + name="open-webui", + image="ghcr.io/open-webui/open-webui:main", + ports=[k8s.core.v1.ContainerPortArgs(container_port=8080)], + env=[ + k8s.core.v1.EnvVarArgs( + name="OPENAI_API_BASE_URLS", + value=llm.url, + ), + k8s.core.v1.EnvVarArgs(name="OPENAI_API_KEYS", value="not-needed"), + k8s.core.v1.EnvVarArgs(name="WEBUI_AUTH", value="false"), + ], + volume_mounts=[ + k8s.core.v1.VolumeMountArgs( + name="data", + mount_path="/app/backend/data", + ), + ], + ), + ], + volumes=[ + k8s.core.v1.VolumeArgs( + name="data", + persistent_volume_claim=k8s.core.v1.PersistentVolumeClaimVolumeSourceArgs( + claim_name=webui_pvc.metadata.name, + ), + ), + ], + ), + ), + ), + opts=pulumi.ResourceOptions(depends_on=[ns, webui_pvc]), +) + +webui_service = k8s.core.v1.Service( + "open-webui", + metadata=k8s.meta.v1.ObjectMetaArgs(name="open-webui", namespace=NAMESPACE), + spec=k8s.core.v1.ServiceSpecArgs( + selector=webui_labels, + type="NodePort", + ports=[ + k8s.core.v1.ServicePortArgs( + port=webui_port, + target_port=8080, + node_port=webui_port, + ), + ], + ), + opts=ns_opts, +) + +# --- Tailscale --- + +ts_acl = tailscale.Acl( + "tailnet-acl", + acl=pulumi.Output.json_dumps({ + "tagOwners": { + "tag:llm-server": ["autogroup:admin"], + }, + "acls": [ + { + "action": "accept", + "src": ["autogroup:member"], + "dst": ["*:*"], + }, + ], + }), + # The Tailscale ACL is a global singleton per tailnet — it can't be truly + # created or deleted, only updated. import_ adopts the existing ACL into + # state on first `pulumi up`, and retain_on_delete prevents `pulumi destroy` + # from trying to delete it (which would fail or leave the tailnet broken). + # Without these, destroy+up cycles fail with a "precondition failed" 412 error. 
+ opts=pulumi.ResourceOptions( + import_="acl", + retain_on_delete=True, + ), +) + +ts_key = tailscale.TailnetKey( + "llm-server-key", + reusable=True, + ephemeral=True, + preauthorized=True, + tags=["tag:llm-server"], + description="Pulumi-managed key for LLM server", + opts=pulumi.ResourceOptions(depends_on=[ts_acl]), +) + +ts_secret = k8s.core.v1.Secret( + "tailscale-auth", + metadata=k8s.meta.v1.ObjectMetaArgs(name="tailscale-auth", namespace=NAMESPACE), + string_data={ + "TS_AUTHKEY": ts_key.key, + }, + opts=ns_opts, +) + +ts_labels = {"app": "tailscale"} + +ts_deployment = k8s.apps.v1.Deployment( + "tailscale", + metadata=k8s.meta.v1.ObjectMetaArgs(name="tailscale", namespace=NAMESPACE, labels=ts_labels), + spec=k8s.apps.v1.DeploymentSpecArgs( + replicas=1, + selector=k8s.meta.v1.LabelSelectorArgs(match_labels=ts_labels), + template=k8s.core.v1.PodTemplateSpecArgs( + metadata=k8s.meta.v1.ObjectMetaArgs(labels=ts_labels), + spec=k8s.core.v1.PodSpecArgs( + service_account_name="tailscale", + init_containers=[ + k8s.core.v1.ContainerArgs( + name="sysctler", + image="busybox", + command=["/bin/sh", "-c"], + args=["sysctl -w net.ipv4.ip_forward=1 net.ipv6.conf.all.forwarding=1"], + security_context=k8s.core.v1.SecurityContextArgs( + privileged=True, + ), + ), + ], + containers=[ + k8s.core.v1.ContainerArgs( + name="tailscale", + image="ghcr.io/tailscale/tailscale:latest", + env=[ + k8s.core.v1.EnvVarArgs( + name="TS_AUTHKEY", + value_from=k8s.core.v1.EnvVarSourceArgs( + secret_key_ref=k8s.core.v1.SecretKeySelectorArgs( + name="tailscale-auth", + key="TS_AUTHKEY", + ), + ), + ), + k8s.core.v1.EnvVarArgs(name="TS_HOSTNAME", value=hostname), + k8s.core.v1.EnvVarArgs(name="TS_STATE_DIR", value="/var/lib/tailscale"), + k8s.core.v1.EnvVarArgs(name="TS_USERSPACE", value="false"), + k8s.core.v1.EnvVarArgs( + name="TS_DEST_IP", + value=webui_service.spec.cluster_ip, + ), + k8s.core.v1.EnvVarArgs(name="TS_KUBE_SECRET", value="tailscale-state"), + ], + volume_mounts=[ + k8s.core.v1.VolumeMountArgs( + name="tailscale-state", + mount_path="/var/lib/tailscale", + ), + k8s.core.v1.VolumeMountArgs( + name="dev-tun", + mount_path="/dev/net/tun", + ), + ], + security_context=k8s.core.v1.SecurityContextArgs( + privileged=True, + ), + ), + ], + volumes=[ + k8s.core.v1.VolumeArgs( + name="tailscale-state", + empty_dir=k8s.core.v1.EmptyDirVolumeSourceArgs(), + ), + k8s.core.v1.VolumeArgs( + name="dev-tun", + host_path=k8s.core.v1.HostPathVolumeSourceArgs( + path="/dev/net/tun", + type="CharDevice", + ), + ), + ], + ), + ), + ), + # Needs the secret (for TS_AUTHKEY), SA and RBAC (for kube secret access) + opts=pulumi.ResourceOptions(depends_on=[ns, ts_secret, ts_sa, ts_role_binding]), +) + +# --- Outputs --- + +pulumi.export("local_webui_url", f"http://localhost:{webui_port}") +pulumi.export("local_api_url", f"http://localhost:{llm_node_port}/v1") +pulumi.export("tailscale_webui_url", f"http://{hostname}:{webui_port}") +pulumi.export("model", model) diff --git a/static/programs/self-host-qwen-llm-python/llm_server.py b/static/programs/self-host-qwen-llm-python/llm_server.py new file mode 100644 index 000000000000..4a913b7a90ba --- /dev/null +++ b/static/programs/self-host-qwen-llm-python/llm_server.py @@ -0,0 +1,165 @@ +import pulumi +import pulumi_kubernetes as k8s + +GPU_RESOURCE_KEYS = { + "nvidia": "nvidia.com/gpu", + "amd": "amd.com/gpu", +} + +LLAMA_SERVER_IMAGES = { + "nvidia": "ghcr.io/ggml-org/llama.cpp:server-cuda", + "amd": "ghcr.io/ggml-org/llama.cpp:server-rocm", +} + +_INTERNAL_PORT = 8080 + + 
+class LlmServer(pulumi.ComponentResource): + url: pulumi.Output[str] + service: k8s.core.v1.Service + + def __init__(self, name, model, model_file, port, gpu_vendor="nvidia", + gpu_count=1, node_port=None, namespace="default", + context_size=65536, fit_target=2048, parallel=1, + mmproj=None, threads=5, jinja=True, server_args=None, + opts=None): + super().__init__("selfhost:llm:LlmServer", name, None, opts) + + if gpu_vendor not in GPU_RESOURCE_KEYS: + raise ValueError(f"Unsupported gpu_vendor '{gpu_vendor}', must be one of: {', '.join(GPU_RESOURCE_KEYS)}") + + labels = {"app": name} + model_dir = "/models" + model_path = f"{model_dir}/{model_file}" + gpu_resource = GPU_RESOURCE_KEYS[gpu_vendor] + image = LLAMA_SERVER_IMAGES[gpu_vendor] + + args = [ + "-m", model_path, + "-c", str(context_size), + "--fit-target", str(fit_target), + "-fa", "on", + "--no-mmap", + *(["--jinja"] if jinja else []), + "-ctk", "q8_0", + "-ctv", "q8_0", + "-t", str(threads), + "--temp", "1.0", + "--top-p", "0.95", + "--top-k", "20", + "--min-p", "0.00", + "--presence-penalty", "1.5", + "--repeat-penalty", "1.0", + "--port", str(_INTERNAL_PORT), + "--host", "0.0.0.0", + "--parallel", str(parallel), + ] + if mmproj: + args += ["--mmproj", f"{model_dir}/{mmproj}"] + # Escape hatch: pass arbitrary llama.cpp flags without adding constructor params + for k, v in (server_args or {}).items(): + args += [f"--{k}", str(v)] + + download_files = f"{model_file} {mmproj}" if mmproj else model_file + models_mount = [k8s.core.v1.VolumeMountArgs(name="models", mount_path=model_dir)] + + init_containers = [ + k8s.core.v1.ContainerArgs( + name="download-model", + # Uses uvx to run hf download without baking huggingface-cli into a custom image. + # hf download is idempotent — skips files already on the PVC. 
+ image="ghcr.io/astral-sh/uv:python3.12-bookworm-slim", + command=["sh", "-c", + f"uvx --from huggingface_hub hf download {model} {download_files} " + + f"--local-dir {model_dir}", + ], + volume_mounts=models_mount, + ), + ] + + self.models_pvc = k8s.core.v1.PersistentVolumeClaim( + f"{name}-models", + metadata=k8s.meta.v1.ObjectMetaArgs( + name=f"{name}-models", + namespace=namespace, + ), + spec=k8s.core.v1.PersistentVolumeClaimSpecArgs( + access_modes=["ReadWriteOnce"], + resources=k8s.core.v1.VolumeResourceRequirementsArgs( + requests={"storage": "50Gi"}, + ), + ), + opts=pulumi.ResourceOptions(parent=self), + ) + + self.deployment = k8s.apps.v1.Deployment( + name, + metadata=k8s.meta.v1.ObjectMetaArgs( + name=name, + namespace=namespace, + labels=labels, + ), + spec=k8s.apps.v1.DeploymentSpecArgs( + replicas=1, + progress_deadline_seconds=1800, + selector=k8s.meta.v1.LabelSelectorArgs(match_labels=labels), + # Recreate strategy: only one pod can hold the GPU at a time + strategy=k8s.apps.v1.DeploymentStrategyArgs(type="Recreate"), + template=k8s.core.v1.PodTemplateSpecArgs( + metadata=k8s.meta.v1.ObjectMetaArgs(labels=labels), + spec=k8s.core.v1.PodSpecArgs( + init_containers=init_containers, + containers=[ + k8s.core.v1.ContainerArgs( + name=name, + image=image, + args=args, + ports=[k8s.core.v1.ContainerPortArgs(container_port=_INTERNAL_PORT)], + resources=k8s.core.v1.ResourceRequirementsArgs( + limits={gpu_resource: str(gpu_count)}, + ), + volume_mounts=models_mount, + ), + ], + volumes=[ + k8s.core.v1.VolumeArgs( + name="models", + persistent_volume_claim=k8s.core.v1.PersistentVolumeClaimVolumeSourceArgs( + claim_name=self.models_pvc.metadata.name, + ), + ), + ], + ), + ), + ), + opts=pulumi.ResourceOptions( + parent=self, + depends_on=[self.models_pvc], + ), + ) + + service_spec_args = dict( + selector=labels, + ports=[ + k8s.core.v1.ServicePortArgs( + port=port, + target_port=_INTERNAL_PORT, + **({"node_port": node_port} if node_port else {}), + ), + ], + ) + if node_port: + service_spec_args["type"] = "NodePort" + + self.service = k8s.core.v1.Service( + name, + metadata=k8s.meta.v1.ObjectMetaArgs( + name=name, + namespace=namespace, + ), + spec=k8s.core.v1.ServiceSpecArgs(**service_spec_args), + opts=pulumi.ResourceOptions(parent=self), + ) + + self.url = pulumi.Output.concat("http://", name, ":", str(port), "/v1") + self.register_outputs({"url": self.url}) diff --git a/static/programs/self-host-qwen-llm-python/requirements.txt b/static/programs/self-host-qwen-llm-python/requirements.txt new file mode 100644 index 000000000000..01a39d1fc295 --- /dev/null +++ b/static/programs/self-host-qwen-llm-python/requirements.txt @@ -0,0 +1,3 @@ +pulumi>=3.0.0,<4.0.0 +pulumi-kubernetes>=4.0.0,<5.0.0 +pulumi-tailscale>=0.17.0,<1.0.0