diff --git a/config/examples/train_lora_zimage_base_32gb.yaml b/config/examples/train_lora_zimage_base_32gb.yaml new file mode 100644 index 000000000..cbf5e0601 --- /dev/null +++ b/config/examples/train_lora_zimage_base_32gb.yaml @@ -0,0 +1,87 @@ +--- +# Z-Image Base - Character/Person LoRA training (32GB VRAM, e.g. RTX 5090) +# Best practices: Prodigy or Prodigy Schedule Free optimizer, batch_size 2 (or 4 with gradient_accumulation), +# DOP for identity preservation, 1024 resolution, linear rank 128. Replace folder_path and trigger/sample prompts. +job: extension +config: + name: "my_zimage_base_character_lora_v1" + process: + - type: 'diffusion_trainer' + training_folder: "output" + device: cuda:0 + # LoRA: rank 128 good for character identity; use 64 if VRAM tight. Z-Image typically no conv training. + network: + type: "lora" + linear: 128 + linear_alpha: 128 + save: + dtype: bf16 + save_every: 500 + max_step_saves_to_keep: 6 + save_format: safetensors + datasets: + - folder_path: "/path/to/images/folder" + caption_ext: "txt" + caption_dropout_rate: 0.05 + cache_latents_to_disk: true + # 1024 matches Z-Image native; use [512, 768, 1024] for multi-res if preferred + resolution: [ 1024, 1024 ] + train: + batch_size: 2 # 32GB allows 2; Prodigy works well with larger batch. Try 4 or gradient_accumulation: 2 if headroom + gradient_accumulation: 1 + steps: 3000 # 2500-3000 typical for character identity + train_unet: true + train_text_encoder: false + gradient_checkpointing: true + noise_scheduler: "flowmatch" + timestep_type: "weighted" + content_or_style: "balanced" + loss_type: "mse" + dtype: bf16 + # Prodigy: nominal lr 1.0 (adaptive); use prodigy_schedulefree for schedule-free variant + optimizer: "prodigy" + lr: 1.0 + optimizer_params: + weight_decay: 0.01 + lr_scheduler: "constant" + # DOP: preserves model output without trigger, reduces overfitting for character LoRA + diff_output_preservation: true + diff_output_preservation_multiplier: 1.0 + diff_output_preservation_class: "person" + switch_boundary_every: 1 + unload_text_encoder: false + # cache_text_embeddings: true # optional, saves VRAM if using captions + ema_config: + use_ema: false + ema_decay: 0.99 + skip_first_sample: false + disable_sampling: false + logging: + log_every: 1 + use_ui_logger: true + model: + name_or_path: "Tongyi-MAI/Z-Image" + arch: "zimage" + quantize: true + qtype: "qfloat8" + quantize_te: true + qtype_te: "qfloat8" + low_vram: false # set true if OOM on 32GB + model_kwargs: {} + sample: + sampler: "flowmatch" + sample_every: 250 + width: 1024 + height: 1024 + samples: + - prompt: "[trigger], studio portrait, soft lighting" + - prompt: "[trigger] on a beach, golden hour" + - prompt: "[trigger], casual outfit, urban background" + neg: "" + seed: 42 + walk_seed: true + guidance_scale: 4 # Base uses CFG + sample_steps: 30 +meta: + name: "[name]" + version: '1.0' diff --git a/config/examples/train_lora_zimage_turbo_32gb.yaml b/config/examples/train_lora_zimage_turbo_32gb.yaml new file mode 100644 index 000000000..ebe0f0d64 --- /dev/null +++ b/config/examples/train_lora_zimage_turbo_32gb.yaml @@ -0,0 +1,84 @@ +--- +# Z-Image Turbo - Character/Person LoRA training (32GB VRAM, e.g. RTX 5090) +# Requires training adapter (assistant_lora_path) to avoid distilled quality loss. Use v2 adapter. +# Best practices: Prodigy or Prodigy Schedule Free, batch_size 2, DOP for identity, 1024 resolution, rank 128. +job: extension +config: + name: "my_zimage_turbo_character_lora_v1" + process: + - type: 'diffusion_trainer' + training_folder: "output" + device: cuda:0 + network: + type: "lora" + linear: 128 + linear_alpha: 128 + save: + dtype: bf16 + save_every: 500 + max_step_saves_to_keep: 6 + save_format: safetensors + datasets: + - folder_path: "/path/to/images/folder" + caption_ext: "txt" + caption_dropout_rate: 0.05 + cache_latents_to_disk: true + resolution: [ 1024, 1024 ] + train: + batch_size: 2 # Prodigy works well with batch 2-4 on 32GB + gradient_accumulation: 1 + steps: 3000 + train_unet: true + train_text_encoder: false + gradient_checkpointing: true + noise_scheduler: "flowmatch" + timestep_type: "weighted" + content_or_style: "balanced" + loss_type: "mse" + dtype: bf16 + optimizer: "prodigy" # or prodigy_schedulefree + lr: 1.0 + optimizer_params: + weight_decay: 0.01 + lr_scheduler: "constant" + diff_output_preservation: true # DOP for character identity + diff_output_preservation_multiplier: 1.0 + diff_output_preservation_class: "person" + switch_boundary_every: 1 + unload_text_encoder: false + ema_config: + use_ema: false + ema_decay: 0.99 + skip_first_sample: false + disable_sampling: false + logging: + log_every: 1 + use_ui_logger: true + model: + name_or_path: "Tongyi-MAI/Z-Image-Turbo" + arch: "zimage" + # Required for Turbo: training adapter prevents quality degradation from distilled model + assistant_lora_path: "ostris/zimage_turbo_training_adapter/zimage_turbo_training_adapter_v2.safetensors" + quantize: true + qtype: "qfloat8" + quantize_te: true + qtype_te: "qfloat8" + low_vram: false + model_kwargs: {} + sample: + sampler: "flowmatch" + sample_every: 250 + width: 1024 + height: 1024 + samples: + - prompt: "[trigger], studio portrait, soft lighting" + - prompt: "[trigger] on a beach, golden hour" + - prompt: "[trigger], casual outfit, urban background" + neg: "" + seed: 42 + walk_seed: true + guidance_scale: 1 # Turbo distilled: use 1 + sample_steps: 8 # Turbo: fewer steps +meta: + name: "[name]" + version: '1.0' diff --git a/dgx_requirements.txt b/dgx_requirements.txt index d540b7902..41c0c960f 100644 --- a/dgx_requirements.txt +++ b/dgx_requirements.txt @@ -33,6 +33,7 @@ k-diffusion open_clip_torch timm prodigyopt +prodigy-plus-schedule-free controlnet_aux==0.0.10 python-dotenv bitsandbytes diff --git a/jobs/process/BaseSDTrainProcess.py b/jobs/process/BaseSDTrainProcess.py index 925d34daa..82ab57d44 100644 --- a/jobs/process/BaseSDTrainProcess.py +++ b/jobs/process/BaseSDTrainProcess.py @@ -2180,6 +2180,9 @@ def run(self): ### HOOK ### if self.torch_profiler is not None: self.torch_profiler.start() + # Schedule-Free optimizers (e.g. Prodigy Schedule Free) need train() during training step + if hasattr(optimizer, 'train') and callable(optimizer.train): + optimizer.train() did_oom = False loss_dict = None try: @@ -2262,8 +2265,13 @@ def run(self): # print above the progress bar if self.progress_bar is not None: self.progress_bar.pause() + # Schedule-Free: use averaged params for checkpoint + if hasattr(optimizer, 'eval') and callable(optimizer.eval): + optimizer.eval() print_acc(f"\nSaving at step {self.step_num}") self.save(self.step_num) + if hasattr(optimizer, 'train') and callable(optimizer.train): + optimizer.train() self.ensure_params_requires_grad() # clear any grads optimizer.zero_grad() @@ -2276,10 +2284,15 @@ def run(self): if self.progress_bar is not None: self.progress_bar.pause() flush() + # Schedule-Free: use averaged params for sampling + if hasattr(optimizer, 'eval') and callable(optimizer.eval): + optimizer.eval() # print above the progress bar if self.train_config.free_u: self.sd.pipeline.disable_freeu() self.sample(self.step_num) + if hasattr(optimizer, 'train') and callable(optimizer.train): + optimizer.train() if self.train_config.unload_text_encoder: # make sure the text encoder is unloaded self.sd.text_encoder_to('cpu') diff --git a/requirements.txt b/requirements.txt index 2ab3621d4..7ec85ebae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ k-diffusion open_clip_torch timm prodigyopt +prodigy-plus-schedule-free controlnet_aux==0.0.10 python-dotenv bitsandbytes diff --git a/toolkit/optimizer.py b/toolkit/optimizer.py index 355512e9b..68bba81c2 100644 --- a/toolkit/optimizer.py +++ b/toolkit/optimizer.py @@ -39,6 +39,20 @@ def get_optimizer( # let net be the neural network you want to train # you can choose weight decay value based on your problem, 0 by default optimizer = Prodigy8bit(params, lr=use_lr, eps=1e-6, **optimizer_params) + elif (lower_type.startswith("prodigy_schedulefree") or + lower_type.replace("-", "_") == "prodigy_schedule_free"): + try: + from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree + except ImportError: + raise ImportError( + "Prodigy Schedule Free requires: pip install prodigy-plus-schedule-free" + ) + print("Using Prodigy + Schedule-Free optimizer") + use_lr = learning_rate + if use_lr < 0.1: + use_lr = 1.0 + print(f"Using lr {use_lr}") + optimizer = ProdigyPlusScheduleFree(params, lr=use_lr, **optimizer_params) elif lower_type.startswith("prodigy"): from prodigyopt import Prodigy diff --git a/ui/src/app/jobs/new/SimpleJob.tsx b/ui/src/app/jobs/new/SimpleJob.tsx index 5db650c30..8dfbb5986 100644 --- a/ui/src/app/jobs/new/SimpleJob.tsx +++ b/ui/src/app/jobs/new/SimpleJob.tsx @@ -491,7 +491,11 @@ export default function SimpleJob({ onChange={value => setJobConfig(value, 'config.process[0].train.optimizer')} options={[ { value: 'adamw8bit', label: 'AdamW8Bit' }, + { value: 'adamw', label: 'AdamW' }, { value: 'adafactor', label: 'Adafactor' }, + { value: 'prodigy', label: 'Prodigy' }, + { value: 'prodigy8bit', label: 'Prodigy 8-bit' }, + { value: 'prodigy_schedulefree', label: 'Prodigy Schedule Free' }, ]} />