Changes from all commits (31 commits)
13a17ae
[SW-240730] Support Compressed Tensors quantization method with fp8 w…
Yantom1 Oct 15, 2025
c49b792
[SW-240400] Fix MoE weights handling in measure (#315)
ulivne Oct 27, 2025
d91dcf3
[SW-240869] add support for str padding - same and valid (#311)
linoybu Oct 27, 2025
b537864
[PERFC-756] skip xpu qunaitzed func wrapper test (#316)
ulivne Oct 28, 2025
dbc18e7
corret dequant func check (#318)
yiliu30 Oct 30, 2025
996f2c5
[PERFC-756] fix skipping test (#320)
ulivne Oct 30, 2025
432f933
[SW-240800] Add option to specify output tensor in torch.matmul (#306)
kdamaszk Oct 31, 2025
0dd3853
[SW-233758] Support dynamic quantization for Matmul (#317)
nirda7 Nov 5, 2025
4302144
[SW-233758] Adjust Matmul axis for scale calculation according to inp…
nirda7 Nov 13, 2025
c56603a
[GAUDISW-5809] - Distinguish runtime scale patching from dynamic quan…
nirda7 Dec 2, 2025
8f99ece
[GAUDISW-244631] dispatch quantized hidden_states (#337)
xinyu-intel Dec 12, 2025
d8c8588
[GAUDISW-228042] Add support for dynamic vLLM kv-cache quantization (…
dudilester Dec 14, 2025
8a5862a
[GAUDISW-244192] - Set whether using dynamic quantization from Inc (#…
nirda7 Dec 21, 2025
e1ea559
disable autoround tests [GAUDISW-245272] (#342)
linoybu Jan 7, 2026
805222f
[GAUDISW-245117] add b2b op (#341)
linoybu Jan 8, 2026
4047e5a
[GAUDISW-244752] add dynamic scale for V-Cache on Hiddden dim (#339)
dudilester Jan 15, 2026
cf711d6
[GAUDISW-245131] Skip test for load/save model checkpoint (#345)
Silv3S Jan 18, 2026
46533c0
[GAUDISW-244752] Fix torch.compile graph break in v-cache hidden scal…
dudilester Jan 22, 2026
ff5c096
[GAUDISW-245917] Added WA for weights in 3d - Granite 4.0 (#353)
HolyFalafel Feb 1, 2026
b8bb206
[GAUDISW-245950] disable test fp8_aware_gptq (#352)
linoybu Feb 2, 2026
3e95f25
[GAUDISW-245131] Skip test for load/save model checkpoint part 2 (#351)
Silv3S Feb 2, 2026
89a8090
[GAUDISW-224538] Calling init_linear in __init__ (#357)
HolyFalafel Feb 11, 2026
96d09c1
[GAUDISW-246337] Added dynamic quant with weight PCS POW2 (#354)
HolyFalafel Feb 16, 2026
48db385
[GAUDISW-246352] assign value of 1 to scales of non-active blocks (#358)
dudilester Feb 17, 2026
7df0fdc
[GAUDISW-246550] Remove spaces before equals in scale_method_config p…
HolyFalafel Feb 18, 2026
58ae88e
[GAUDISW-246083] - adjust load mode to pcs for new pytorch version (#…
nirda7 Feb 26, 2026
736f07c
[GAUDISW-247053] Fix coverity issues take2 (#369)
linoybu Mar 3, 2026
acf8638
add missing changes
xinhe3 Apr 20, 2026
e0fb910
merge branch master and fix bug
xinhe3 Apr 29, 2026
5e99707
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 29, 2026
fed594e
Merge branch 'master' into cherry_pick_v1.24.0
xin3he May 8, 2026
2 changes: 2 additions & 0 deletions .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
@@ -14,6 +14,8 @@ echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
sed -i '/^auto-round/d;/^torchvision/d' /neural-compressor/test/torch/requirements.txt
pip install -r /neural-compressor/test/torch/requirements.txt
pip install deepspeed@git+https://github.com/HabanaAI/DeepSpeed.git@main --no-deps
pip install msgpack hjson ninja # deepspeed dependency
pip install auto-round-hpu
pip install pytest-cov pytest-html pytest-html-merger beautifulsoup4==4.13.5
echo "##[endgroup]"
94 changes: 94 additions & 0 deletions examples/helloworld/fp8_example/b2b_unitest_2_steps.py
@@ -0,0 +1,94 @@

import argparse

import torch
import habana_frameworks.torch.core as htcore

# Initialize HPU environment (must be called before HPU operations)
htcore.hpu_set_env()

from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare

torch.manual_seed(1)


class B2BMatmul(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y, **kwargs):
        return torch.matmul(x, y, **kwargs)


class M(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.matmul = B2BMatmul()

    def forward(self, inp0, inp1):
        res = self.matmul(inp0, inp1)
        return res


def main():
    parser = argparse.ArgumentParser(
        description="Habana FP8 sample code with B2BMatmul.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--quant_config", type=str, help="JSON file of quantization config")
    args = parser.parse_args()

    # Build model & load config
    model = M().eval()
    config = FP8Config.from_json_file(args.quant_config)

    # Optional calibration preparation
    if config.measure:
        model = prepare(model, config)

    # Optional quantization
    if config.quantize:
        htcore.hpu_initialize()
        model = convert(model, config)
        print(model)

    # Create inputs and run
    with torch.no_grad():
        model.to("hpu")

        B = 6
        N = 100

        # Selector matrix: each one-hot row picks the matching row of inp1;
        # all-zero rows select nothing, so the outlier rows below stay inactive.
        inp0 = torch.tensor([
            [1, 0, 0, 0, 0, 0],  # row 0 <- inp1[0]
            [0, 0, 0, 1, 0, 0],  # row 1 <- inp1[3]
            [0, 1, 0, 0, 0, 0],  # row 2 <- inp1[1]
            [0, 0, 0, 0, 1, 0],  # row 3 <- inp1[4]
            [0, 0, 0, 0, 0, 0],  # row 4: all zeros (inactive)
            [0, 0, 0, 0, 0, 0],  # row 5: all zeros (inactive)
        ], dtype=torch.float32).to("hpu")

        # Input for Matmul: [B, N] -> here [6, 100]; rows 2 and 5 carry outliers
        inp1 = torch.randn(B, N).to("hpu")
        inp1[2, :] = 1000
        inp1[5, :] = 1000

        # Run the model
        output = model(inp0, inp1)
        print("Output shape:", output.shape)
        print(output)

    # Finalize calibration if measuring
    if config.measure:
        finalize_calibration(model)


if __name__ == "__main__":
    main()
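The script reads its quantization settings from the JSON file passed as --quant_config. A hedged sketch of a measure/quantize config pair it could be run with (key names follow the FP8Config JSON format from the INC FP8 docs; they are assumptions here, not files added by this PR):

import json

measure_cfg = {"mode": "MEASURE", "observer": "maxabs", "dump_stats_path": "./hqt_output/measure"}
quant_cfg = {
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "dump_stats_path": "./hqt_output/measure",
}
for name, cfg in (("maxabs_measure.json", measure_cfg), ("maxabs_quant.json", quant_cfg)):
    with open(name, "w") as f:
        json.dump(cfg, f, indent=2)

Running the script once with each file gives the two steps in the filename: the first pass dumps measurements through finalize_calibration, the second converts the model to FP8.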
@@ -1,3 +1,5 @@
loguru
hf_transfer
transformers==4.57.3
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
@@ -77,4 +77,4 @@ else
echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu."
usage
exit 1
fi
fi
9 changes: 6 additions & 3 deletions neural_compressor/torch/algorithms/autoround/autoround.py
@@ -218,7 +218,10 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
            model.autoround_config = weight_config
            return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
        else:  # pragma: no cover
            rounder.quantize_and_save(output_dir=self.output_dir, format=self.export_format, inplace=True)
            _, quantized_model_path = rounder.quantize_and_save(
                output_dir=self.output_dir, format=self.export_format, inplace=True
            )
            self.output_dir = quantized_model_path
            model = rounder.model
            model.autoround_config = rounder.layer_config

@@ -236,8 +239,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
                import transformers  # pylint: disable=E0401

                model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir)
            except:
                pass
            except Exception as e:
                logger.error(f"Error reloading model: {e}")

        return model

29 changes: 21 additions & 8 deletions neural_compressor/torch/algorithms/fp8_quant/_core/common.py
@@ -48,7 +48,7 @@ def dequant_original_fp8_weight_if_needed(mod: torch.nn.Module, param: torch.Ten
        else:
            raise RuntimeError(f"Got fp8 weight for {mod}, but dequant function is None, please check.")
    else:
        RuntimeError(f"Got fp8 weight for {mod}, but dequant function is not found, please check.")
        raise RuntimeError(f"Got fp8 weight for {mod}, but dequant function is not found, please check.")

    return param

@@ -326,14 +326,27 @@ def get_device_type_for_scales(mod):
    return config["device_for_scales"]


@lru_cache
def is_runtime_scale_patching():
    """Check whether runtime scale patching is enabled via environment variable.

    Returns:
        bool: True when runtime patching is enabled.
    """
    return os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]


class RuntimeState(Enum):
    STATIC = 0
    RUNTIME_SCALE_PATCHING = 1
    DYNAMIC_QUANTIZATION = 2


_runtime_state = RuntimeState.STATIC


@lru_cache()
def set_runtime_state(is_dynamic_quantization):
    global _runtime_state
    if is_dynamic_quantization:
        _runtime_state = RuntimeState.DYNAMIC_QUANTIZATION
    elif (os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]):
        _runtime_state = RuntimeState.RUNTIME_SCALE_PATCHING
    else:
        _runtime_state = RuntimeState.STATIC


def is_runtime_scale_patching():
    return _runtime_state == RuntimeState.RUNTIME_SCALE_PATCHING


#TODO [SW-224612]: Use cguid to calc scales and remove the check
@lru_cache
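A minimal, self-contained sketch of the precedence the new RuntimeState machinery encodes (names mirror the hunk above; this is an illustration, not the module itself): an explicit dynamic-quantization request wins over the RUNTIME_SCALE_PATCHING environment variable, which only makes sense for static, pre-computed scales.

import os
from enum import Enum

class RuntimeState(Enum):
    STATIC = 0
    RUNTIME_SCALE_PATCHING = 1
    DYNAMIC_QUANTIZATION = 2

_state = RuntimeState.STATIC

def set_runtime_state(is_dynamic_quantization):
    # Dynamic quantization overrides the env var: there are no static scales to patch.
    global _state
    if is_dynamic_quantization:
        _state = RuntimeState.DYNAMIC_QUANTIZATION
    elif os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ("true", "1"):
        _state = RuntimeState.RUNTIME_SCALE_PATCHING
    else:
        _state = RuntimeState.STATIC

os.environ["RUNTIME_SCALE_PATCHING"] = "1"
set_runtime_state(True)
assert _state is RuntimeState.DYNAMIC_QUANTIZATION  # env var is ignored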
2 changes: 2 additions & 0 deletions neural_compressor/torch/algorithms/fp8_quant/_core/measure.py
@@ -131,6 +131,8 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N
        d_shapes (dict, optional): Defaults to None.
    """
    top_level_config = get_hqt_config(model)
    if top_level_config is None:
        raise ValueError("HQT config is not initialized on the model.")
    config = top_level_config.cfg
    setup_calibration_counter(model, config)
    skip_outputs_measurements = config["measure_exclude"] & (MeasureExclude.OUTPUT | MeasureExclude.ALL)
@@ -57,6 +57,7 @@ def create_mod_info_recursion(parent):
"linear": ModuleType(1, ["weight"], 1, False),
"row_parallel_linear": ModuleType(1, ["weight"], 2, True),
"matmul": ModuleType(2, [], 1, False),
"b2b_matmul": ModuleType(2, [], 1, True),
"kv_cache": ModuleType(1, [], 1, False),
"softmax": ModuleType(1, [], 1, True),
"fused_sdpa": ModuleType(3, [], 2, True),
@@ -66,7 +67,8 @@


_mod_default_dict = {
    "Matmul": ModuleInfo("matmul", PatchedMatmul),
    "Matmul": ModuleInfo("matmul", PatchedMatmul, supports_dynamic_quantization=True),
    "B2BMatmul": ModuleInfo("b2b_matmul", PatchedMatmul, supports_dynamic_quantization=True),
    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
@@ -75,7 +77,7 @@
"QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
"FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
"KVCache": ModuleInfo("kv_cache", PatchedKVCache),
"VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
"VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache, supports_dynamic_quantization=True),
"Conv2d": ModuleInfo("linear", PatchedConv2d),
"LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
"LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
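For context on the two dictionaries above: each entry pairs a ModuleType (how many inputs, weight params, and outputs to measure) with a ModuleInfo (which patched class to install, and whether dynamic quantization is supported). A self-contained sketch of that registry shape, with field names guessed for illustration only (the real ModuleType/ModuleInfo signatures live in this file and may differ):

from collections import namedtuple

# Field names are assumptions; positional values copied from the hunk above.
ModuleType = namedtuple("ModuleType", ["num_inputs", "param_names", "num_outputs", "measure_outputs"])
ModuleInfo = namedtuple("ModuleInfo", ["type_name", "patched_module", "supports_dynamic_quantization"])

class PatchedMatmul:  # stand-in for the real patched op
    pass

mod_types = {
    "matmul": ModuleType(2, [], 1, False),
    "b2b_matmul": ModuleType(2, [], 1, True),  # new type added by this PR
}
mod_default_dict = {
    "Matmul": ModuleInfo("matmul", PatchedMatmul, True),
    "B2BMatmul": ModuleInfo("b2b_matmul", PatchedMatmul, True),  # B2BMatmul as in the example script
}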
48 changes: 29 additions & 19 deletions neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py
@@ -18,6 +18,7 @@
from abc import abstractmethod
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
from .quantized_func_wrappers import get_quantized_func_wrapper, OP_TYPE
from .fp_utils import invert_scale


cur_accelerator = auto_detect_accelerator()
@@ -69,23 +70,23 @@ def extra_repr(self) -> str:

class QuantDequantNone(QuantDequantBase):
    def __init__(self, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDequantNone, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)

    def forward(self, *args, **kwargs):
        return args[0]

    def extra_repr(self) -> str:
        repr = super(QuantDequantNone, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr}, doesn't quantize nor dequantize"


class QuantInput(QuantDequantBase):
    def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantInput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        scale_inv = scale_inv.unsqueeze(1) if (scale_inv.numel() > 1 and not self.use_qdq) else scale_inv
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        if self.use_qdq:
            self.register_scale("scale", 1 / self.scale_inv, self.scale_format)
            self.register_scale("scale", invert_scale(self.scale_inv), self.scale_format)
            op_type = OP_TYPE.QUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.QUANT
        else:
            op_type = OP_TYPE.CAST_TO_FP8
@@ -106,40 +107,40 @@ def forward_qdq(self, x):
        )

    def extra_repr(self) -> str:
        repr = super(QuantInput, self).extra_repr()
        repr = super().extra_repr()
        dtype = get_scale_dtype(self.scale_inv)
        return f"{repr}, scale_inv dtype={dtype}"


class QuantDynamicInput(QuantDequantBase):
    def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDynamicInput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.input_scales_creator = input_scales_creator
        self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)

    def calculate_scales(self, x):
        scale = self.input_scales_creator.calc_scales(x, QuantTensorType.DYNAMIC)
    def calculate_scales(self, x, in_scale=None):
        if in_scale is None:
            scale = self.input_scales_creator.calc_scales(x, QuantTensorType.DYNAMIC)
        else:
            scale = in_scale
        scale_inv = self.input_scales_creator.invert_scales(scale)
        return scale, scale_inv

    def forward(self, x):
        scale, scale_inv = self.calculate_scales(x)
    def forward(self, x, in_scale=None):
        scale, scale_inv = self.calculate_scales(x, in_scale)
        ret = self.cast_to_op(x, scale_inv, False, False, self.lp_dtype)
        return ret, scale

    #TODO [SW-224609]: implement forward qdq

    def extra_repr(self) -> str:
        repr = super(QuantDynamicInput, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr} input_scales_creator={self.input_scales_creator}"


class DequantOutput(QuantDequantBase):
    def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs):
        super(DequantOutput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale", scale, self.scale_format)
        if self.use_qdq:
            op_type = OP_TYPE.DEQUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.DEQUANT
@@ -163,16 +164,25 @@ def forward_qdq(self, x):
        )

    def extra_repr(self) -> str:
        repr = super(DequantOutput, self).extra_repr()
        repr = super().extra_repr()
        dtype = get_scale_dtype(self.scale)
        return f"{repr}, scale dtype={dtype}"


class DequantDynamicOutput(QuantDequantBase):
    def __init__(self, lp_dtype, hp_dtype, *args, **kwargs):
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)

    def forward(self, x, scale):
        return self.cast_from_op(x, scale, self.hp_dtype)


class QuantDequant(QuantDequantBase):
    def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        self.register_scale("scale", 1 / scale_inv, self.scale_format)
        self.register_scale("scale", invert_scale(scale_inv), self.scale_format)
        self.quantize_op = (
            get_quantized_func_wrapper(OP_TYPE.QUANT, self.scale_format)
            if self.use_qdq
@@ -215,5 +225,5 @@ def forward_qdq(self, x, *args, **kwargs):
        return z

    def extra_repr(self) -> str:
        repr = super(QuantDequant, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr}, Quantize, and then dequantize"
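To ground the dynamic pair above (QuantDynamicInput computes a scale from the live tensor; DequantDynamicOutput casts back with that scale), here is a per-tensor maxabs roundtrip sketch. The FP8 constant and scaling rule are standard, but this illustrates the math only, not the fused HPU kernels behind cast_to_op/cast_from_op:

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3

def quant_dynamic(x):
    # Scale derived from the live input -- the "dynamic" part.
    scale = x.abs().amax().clamp(min=1e-12) / FP8_MAX
    x_fp8 = (x / scale).to(torch.float8_e4m3fn)  # multiply by scale_inv, cast down
    return x_fp8, scale

def dequant_dynamic(x_fp8, scale, hp_dtype=torch.float32):
    # Cast back up using the scale returned by the quant step.
    return x_fp8.to(hp_dtype) * scale

x = torch.randn(4, 8) * 100
x_fp8, scale = quant_dynamic(x)
err = (x - dequant_dynamic(x_fp8, scale)).abs().max()
print(f"max abs roundtrip error: {err:.3f}")  # small relative to x's magnitude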
@@ -268,7 +268,8 @@ def quantize(model, mod_list):
    elif config.cfg["mode"] == QuantMode.LOAD:
        # no measurement and scale file
        scale_method_config = {CfgStr.ACTIVATION: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES),
                               CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES)}
                               CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES,
                                                                granularity=scale_method_config[CfgStr.DEFAULT][CfgStr.WEIGHT].granularity)}
        prepare_model_with_dummy_measurement(model, mod_list, scale_method_config, scale_config)
    else:
        raise Exception("unexpected mode, expected QuantMode.QUANTIZE or QuantMode.LOAD")
@@ -43,6 +43,8 @@ def get_default_quantized_func(self):
        raise NotImplementedError()

    def get_scalar_quantized_func(self):
        if is_runtime_scale_patching():
            return self.get_default_quantized_func()
        return self.get_default_quantized_func().scalar

    def get_dynamic_scalar_quantized_func(self):
@@ -64,8 +66,10 @@ def get_quantized_func(self, scale_format, is_dynamic=False):
        else:
            if is_runtime_scale_patching() or scale_format == ScaleFormat.CONST:
                return self.get_default_quantized_func()
            else:
            elif scale_format == ScaleFormat.SCALAR:
                return self.get_scalar_quantized_func()
            else:
                return self.get_default_quantized_func()

    def __call__(self, *args, **kwargs):
        return self._quantized_func_(*args, **kwargs)
@@ -23,7 +23,7 @@

class QuantizedXPUFuncWrapperBase(QuantizedFuncWrapperBase, metaclass=ABCMeta):
    """
    Placeholder for base class for XPU quantized func wrapper.
    Placeholder for base class for XPU (Falcon/Jaguar Shores) quantized func wrapper.
    """
    def __init__(self, scale_format, is_dynamic=False):
        self._quantized_func_ = self.get_default_quantized_func()