Changes from all commits (31 commits)
13a17ae
[SW-240730] Support Compressed Tensors quantization method with fp8 w…
Yantom1 Oct 15, 2025
c49b792
[SW-240400] Fix MoE weights handling in measure (#315)
ulivne Oct 27, 2025
d91dcf3
[SW-240869] add support for str padding - same and valid (#311)
linoybu Oct 27, 2025
b537864
[PERFC-756] skip xpu qunaitzed func wrapper test (#316)
ulivne Oct 28, 2025
dbc18e7
corret dequant func check (#318)
yiliu30 Oct 30, 2025
996f2c5
[PERFC-756] fix skipping test (#320)
ulivne Oct 30, 2025
432f933
[SW-240800] Add option to specify output tensor in torch.matmul (#306)
kdamaszk Oct 31, 2025
0dd3853
[SW-233758] Support dynamic quantization for Matmul (#317)
nirda7 Nov 5, 2025
4302144
[SW-233758] Adjust Matmul axis for scale calculation according to inp…
nirda7 Nov 13, 2025
c56603a
[GAUDISW-5809] - Distinguish runtime scale patching from dynamic quan…
nirda7 Dec 2, 2025
8f99ece
[GAUDISW-244631] dispatch quantized hidden_states (#337)
xinyu-intel Dec 12, 2025
d8c8588
[GAUDISW-228042] Add support for dynamic vLLM kv-cache quantization (…
dudilester Dec 14, 2025
8a5862a
[GAUDISW-244192] - Set whether using dynamic quantization from Inc (#…
nirda7 Dec 21, 2025
e1ea559
disable autoround tests [GAUDISW-245272] (#342)
linoybu Jan 7, 2026
805222f
[GAUDISW-245117] add b2b op (#341)
linoybu Jan 8, 2026
4047e5a
[GAUDISW-244752] add dynamic scale for V-Cache on Hiddden dim (#339)
dudilester Jan 15, 2026
cf711d6
[GAUDISW-245131] Skip test for load/save model checkpoint (#345)
Silv3S Jan 18, 2026
46533c0
[GAUDISW-244752] Fix torch.compile graph break in v-cache hidden scal…
dudilester Jan 22, 2026
ff5c096
[GAUDISW-245917] Added WA for weights in 3d - Granite 4.0 (#353)
HolyFalafel Feb 1, 2026
b8bb206
[GAUDISW-245950] disable test fp8_aware_gptq (#352)
linoybu Feb 2, 2026
3e95f25
[GAUDISW-245131] Skip test for load/save model checkpoint part 2 (#351)
Silv3S Feb 2, 2026
89a8090
[GAUDISW-224538] Calling init_linear in __init__ (#357)
HolyFalafel Feb 11, 2026
96d09c1
[GAUDISW-246337] Added dynamic quant with weight PCS POW2 (#354)
HolyFalafel Feb 16, 2026
48db385
[GAUDISW-246352] assign value of 1 to scales of non-active blocks (#358)
dudilester Feb 17, 2026
7df0fdc
[GAUDISW-246550] Remove spaces before equals in scale_method_config p…
HolyFalafel Feb 18, 2026
58ae88e
[GAUDISW-246083] - adjust load mode to pcs for new pytorch version (#…
nirda7 Feb 26, 2026
736f07c
[GAUDISW-247053] Fix coverity issues take2 (#369)
linoybu Mar 3, 2026
acf8638
add missing changes
xinhe3 Apr 20, 2026
e0fb910
merge branch master and fix bug
xinhe3 Apr 29, 2026
5e99707
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 29, 2026
fed594e
Merge branch 'master' into cherry_pick_v1.24.0
xin3he May 8, 2026
2 changes: 2 additions & 0 deletions .azure-pipelines/scripts/ut/run_3x_pt_hpu.sh
@@ -14,6 +14,8 @@ echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
sed -i '/^auto-round/d;/^torchvision/d' /neural-compressor/test/torch/requirements.txt
pip install -r /neural-compressor/test/torch/requirements.txt
pip install deepspeed@git+https://github.com/HabanaAI/DeepSpeed.git@main --no-deps
pip install msgpack hjson ninja # deepspeed dependency
pip install auto-round-hpu
pip install pytest-cov pytest-html pytest-html-merger beautifulsoup4==4.13.5
echo "##[endgroup]"
94 changes: 94 additions & 0 deletions examples/helloworld/fp8_example/b2b_unitest_2_steps.py
@@ -0,0 +1,94 @@

import argparse

import torch
import habana_frameworks.torch.core as htcore

# Initialize HPU environment (must be called before HPU operations)
htcore.hpu_set_env()

from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare

torch.manual_seed(1)


class B2BMatmul(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y, **kwargs):
        return torch.matmul(x, y, **kwargs)


class M(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.matmul = B2BMatmul()

    def forward(self, inp0, inp1):
        res = self.matmul(inp0, inp1)
        return res


def main():
    parser = argparse.ArgumentParser(
        description="Habana FP8 sample code with B2BMatmul.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--quant_config", type=str, help="JSON file of quantization config")
    args = parser.parse_args()

    # Build model & load config
    model = M().eval()
    config = FP8Config.from_json_file(args.quant_config)

    # Optional calibration preparation
    if config.measure:
        model = prepare(model, config)

    # Optional quantization
    if config.quantize:
        htcore.hpu_initialize()
        model = convert(model, config)
        print(model)

    # Create inputs and run
    with torch.no_grad():
        model.to("hpu")

        B = 6
        N = 100

        # Selector matrix: each one-hot row picks the matching row of inp1;
        # all-zero rows select nothing, so the outlier rows below stay inactive.
        inp0 = torch.tensor([
            [1, 0, 0, 0, 0, 0],  # row 0 <- inp1[0]
            [0, 0, 0, 1, 0, 0],  # row 1 <- inp1[3]
            [0, 1, 0, 0, 0, 0],  # row 2 <- inp1[1]
            [0, 0, 0, 0, 1, 0],  # row 3 <- inp1[4]
            [0, 0, 0, 0, 0, 0],  # row 4: all zeros (inactive)
            [0, 0, 0, 0, 0, 0],  # row 5: all zeros (inactive)
        ], dtype=torch.float32).to("hpu")

        # Input for Matmul: [B, N] -> here [6, 100]; rows 2 and 5 carry outliers
        inp1 = torch.randn(B, N).to("hpu")
        inp1[2, :] = 1000
        inp1[5, :] = 1000

        # Run the model
        output = model(inp0, inp1)
        print("Output shape:", output.shape)
        print(output)

    # Finalize calibration if measuring
    if config.measure:
        finalize_calibration(model)


if __name__ == "__main__":
    main()
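The script reads its quantization settings from the JSON file passed as --quant_config. A hedged sketch of a measure/quantize config pair it could be run with (key names follow the FP8Config JSON format from the INC FP8 docs; they are assumptions here, not files added by this PR):

import json

measure_cfg = {"mode": "MEASURE", "observer": "maxabs", "dump_stats_path": "./hqt_output/measure"}
quant_cfg = {
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "dump_stats_path": "./hqt_output/measure",
}
for name, cfg in (("maxabs_measure.json", measure_cfg), ("maxabs_quant.json", quant_cfg)):
    with open(name, "w") as f:
        json.dump(cfg, f, indent=2)

Running the script once with each file gives the two steps in the filename: the first pass dumps measurements through finalize_calibration, the second converts the model to FP8.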
@@ -1,3 +1,5 @@
loguru
hf_transfer
transformers==4.57.3
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
@@ -77,4 +77,4 @@ else
echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu."
usage
exit 1
fi
fi
9 changes: 6 additions & 3 deletions neural_compressor/torch/algorithms/autoround/autoround.py
@@ -218,7 +218,10 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
            model.autoround_config = weight_config
            return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
        else:  # pragma: no cover
            rounder.quantize_and_save(output_dir=self.output_dir, format=self.export_format, inplace=True)
            _, quantized_model_path = rounder.quantize_and_save(
                output_dir=self.output_dir, format=self.export_format, inplace=True
            )
            self.output_dir = quantized_model_path
            model = rounder.model
            model.autoround_config = rounder.layer_config

@@ -236,8 +239,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
                import transformers  # pylint: disable=E0401

                model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir)
            except:
                pass
            except Exception as e:
                logger.error(f"Error reloading model: {e}")

        return model

29 changes: 21 additions & 8 deletions neural_compressor/torch/algorithms/fp8_quant/_core/common.py
@@ -48,7 +48,7 @@ def dequant_original_fp8_weight_if_needed(mod: torch.nn.Module, param: torch.Ten
        else:
            raise RuntimeError(f"Got fp8 weight for {mod}, but dequant function is None, please check.")
    else:
        RuntimeError(f"Got fp8 weight for {mod}, but dequant function is not found, please check.")
        raise RuntimeError(f"Got fp8 weight for {mod}, but dequant function is not found, please check.")

    return param

@@ -326,14 +326,27 @@ def get_device_type_for_scales(mod):
    return config["device_for_scales"]


@lru_cache
def is_runtime_scale_patching():
    """Check whether runtime scale patching is enabled via environment variable.

    Returns:
        bool: True when runtime patching is enabled.
    """
    return os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]


class RuntimeState(Enum):
    STATIC = 0
    RUNTIME_SCALE_PATCHING = 1
    DYNAMIC_QUANTIZATION = 2


_runtime_state = RuntimeState.STATIC


@lru_cache()
def set_runtime_state(is_dynamic_quantization):
    global _runtime_state
    if is_dynamic_quantization:
        _runtime_state = RuntimeState.DYNAMIC_QUANTIZATION
    elif (os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ["true", "1"]):
        _runtime_state = RuntimeState.RUNTIME_SCALE_PATCHING
    else:
        _runtime_state = RuntimeState.STATIC


def is_runtime_scale_patching():
    return _runtime_state == RuntimeState.RUNTIME_SCALE_PATCHING


#TODO [SW-224612]: Use cguid to calc scales and remove the check
@lru_cache
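A minimal, self-contained sketch of the precedence the new RuntimeState machinery encodes (names mirror the hunk above; this is an illustration, not the module itself): an explicit dynamic-quantization request wins over the RUNTIME_SCALE_PATCHING environment variable, which only makes sense for static, pre-computed scales.

import os
from enum import Enum

class RuntimeState(Enum):
    STATIC = 0
    RUNTIME_SCALE_PATCHING = 1
    DYNAMIC_QUANTIZATION = 2

_state = RuntimeState.STATIC

def set_runtime_state(is_dynamic_quantization):
    # Dynamic quantization overrides the env var: there are no static scales to patch.
    global _state
    if is_dynamic_quantization:
        _state = RuntimeState.DYNAMIC_QUANTIZATION
    elif os.getenv("RUNTIME_SCALE_PATCHING", "False").lower() in ("true", "1"):
        _state = RuntimeState.RUNTIME_SCALE_PATCHING
    else:
        _state = RuntimeState.STATIC

os.environ["RUNTIME_SCALE_PATCHING"] = "1"
set_runtime_state(True)
assert _state is RuntimeState.DYNAMIC_QUANTIZATION  # env var is ignored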
2 changes: 2 additions & 0 deletions neural_compressor/torch/algorithms/fp8_quant/_core/measure.py
@@ -131,6 +131,8 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N
        d_shapes (dict, optional): Defaults to None.
    """
    top_level_config = get_hqt_config(model)
    if top_level_config is None:
        raise ValueError("HQT config is not initialized on the model.")
    config = top_level_config.cfg
    setup_calibration_counter(model, config)
    skip_outputs_measurements = config["measure_exclude"] & (MeasureExclude.OUTPUT | MeasureExclude.ALL)
@@ -57,6 +57,7 @@ def create_mod_info_recursion(parent):
"linear": ModuleType(1, ["weight"], 1, False),
"row_parallel_linear": ModuleType(1, ["weight"], 2, True),
"matmul": ModuleType(2, [], 1, False),
"b2b_matmul": ModuleType(2, [], 1, True),
"kv_cache": ModuleType(1, [], 1, False),
"softmax": ModuleType(1, [], 1, True),
"fused_sdpa": ModuleType(3, [], 2, True),
@@ -66,7 +67,8 @@


_mod_default_dict = {
    "Matmul": ModuleInfo("matmul", PatchedMatmul),
    "Matmul": ModuleInfo("matmul", PatchedMatmul, supports_dynamic_quantization=True),
    "B2BMatmul": ModuleInfo("b2b_matmul", PatchedMatmul, supports_dynamic_quantization=True),
    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
@@ -75,7 +77,7 @@
"QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
"FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
"KVCache": ModuleInfo("kv_cache", PatchedKVCache),
"VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
"VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache, supports_dynamic_quantization=True),
"Conv2d": ModuleInfo("linear", PatchedConv2d),
"LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
"LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
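For context on the two dictionaries above: each entry pairs a ModuleType (how many inputs, weight params, and outputs to measure) with a ModuleInfo (which patched class to install, and whether dynamic quantization is supported). A self-contained sketch of that registry shape, with field names guessed for illustration only (the real ModuleType/ModuleInfo signatures live in this file and may differ):

from collections import namedtuple

# Field names are assumptions; positional values copied from the hunk above.
ModuleType = namedtuple("ModuleType", ["num_inputs", "param_names", "num_outputs", "measure_outputs"])
ModuleInfo = namedtuple("ModuleInfo", ["type_name", "patched_module", "supports_dynamic_quantization"])

class PatchedMatmul:  # stand-in for the real patched op
    pass

mod_types = {
    "matmul": ModuleType(2, [], 1, False),
    "b2b_matmul": ModuleType(2, [], 1, True),  # new type added by this PR
}
mod_default_dict = {
    "Matmul": ModuleInfo("matmul", PatchedMatmul, True),
    "B2BMatmul": ModuleInfo("b2b_matmul", PatchedMatmul, True),  # B2BMatmul as in the example script
}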
48 changes: 29 additions & 19 deletions neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py
@@ -18,6 +18,7 @@
from abc import abstractmethod
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
from .quantized_func_wrappers import get_quantized_func_wrapper, OP_TYPE
from .fp_utils import invert_scale


cur_accelerator = auto_detect_accelerator()
@@ -69,23 +70,23 @@ def extra_repr(self) -> str:

class QuantDequantNone(QuantDequantBase):
    def __init__(self, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDequantNone, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)

    def forward(self, *args, **kwargs):
        return args[0]

    def extra_repr(self) -> str:
        repr = super(QuantDequantNone, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr}, doesn't quantize nor dequantize"


class QuantInput(QuantDequantBase):
    def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantInput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        scale_inv = scale_inv.unsqueeze(1) if (scale_inv.numel() > 1 and not self.use_qdq) else scale_inv
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        if self.use_qdq:
            self.register_scale("scale", 1 / self.scale_inv, self.scale_format)
            self.register_scale("scale", invert_scale(self.scale_inv), self.scale_format)
            op_type = OP_TYPE.QUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.QUANT
        else:
            op_type = OP_TYPE.CAST_TO_FP8
@@ -106,40 +107,40 @@ def forward_qdq(self, x):
        )

    def extra_repr(self) -> str:
        repr = super(QuantInput, self).extra_repr()
        repr = super().extra_repr()
        dtype = get_scale_dtype(self.scale_inv)
        return f"{repr}, scale_inv dtype={dtype}"


class QuantDynamicInput(QuantDequantBase):
    def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDynamicInput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.input_scales_creator = input_scales_creator
        self.cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, self.scale_format)

    def calculate_scales(self, x):
        scale = self.input_scales_creator.calc_scales(x, QuantTensorType.DYNAMIC)
    def calculate_scales(self, x, in_scale=None):
        if in_scale is None:
            scale = self.input_scales_creator.calc_scales(x, QuantTensorType.DYNAMIC)
        else:
            scale = in_scale
        scale_inv = self.input_scales_creator.invert_scales(scale)
        return scale, scale_inv

    def forward(self, x):
        scale, scale_inv = self.calculate_scales(x)
    def forward(self, x, in_scale=None):
        scale, scale_inv = self.calculate_scales(x, in_scale)
        ret = self.cast_to_op(x, scale_inv, False, False, self.lp_dtype)
        return ret, scale

    #TODO [SW-224609]: implement forward qdq

    def extra_repr(self) -> str:
        repr = super(QuantDynamicInput, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr} input_scales_creator={self.input_scales_creator}"


class DequantOutput(QuantDequantBase):
    def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs):
        super(DequantOutput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale", scale, self.scale_format)
        if self.use_qdq:
            op_type = OP_TYPE.DEQUANT_PC if self.scale_format == ScaleFormat.CONST and self.scale.numel() > 1 else OP_TYPE.DEQUANT
@@ -163,16 +164,25 @@ def forward_qdq(self, x):
        )

    def extra_repr(self) -> str:
        repr = super(DequantOutput, self).extra_repr()
        repr = super().extra_repr()
        dtype = get_scale_dtype(self.scale)
        return f"{repr}, scale dtype={dtype}"


class DequantDynamicOutput(QuantDequantBase):
    def __init__(self, lp_dtype, hp_dtype, *args, **kwargs):
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.cast_from_op = get_quantized_func_wrapper(OP_TYPE.CAST_FROM_FP8, self.scale_format)

    def forward(self, x, scale):
        return self.cast_from_op(x, scale, self.hp_dtype)


class QuantDequant(QuantDequantBase):
    def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs):
        super(QuantDequant, self).__init__(lp_dtype, hp_dtype, *args, **kwargs)
        super().__init__(lp_dtype, hp_dtype, *args, **kwargs)
        self.register_scale("scale_inv", scale_inv, self.scale_format)
        self.register_scale("scale", 1 / scale_inv, self.scale_format)
        self.register_scale("scale", invert_scale(scale_inv), self.scale_format)
        self.quantize_op = (
            get_quantized_func_wrapper(OP_TYPE.QUANT, self.scale_format)
            if self.use_qdq
@@ -215,5 +225,5 @@ def forward_qdq(self, x, *args, **kwargs):
        return z

    def extra_repr(self) -> str:
        repr = super(QuantDequant, self).extra_repr()
        repr = super().extra_repr()
        return f"{repr}, Quantize, and then dequantize"
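To ground the dynamic pair above (QuantDynamicInput computes a scale from the live tensor; DequantDynamicOutput casts back with that scale), here is a per-tensor maxabs roundtrip sketch. The FP8 constant and scaling rule are standard, but this illustrates the math only, not the fused HPU kernels behind cast_to_op/cast_from_op:

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3

def quant_dynamic(x):
    # Scale derived from the live input -- the "dynamic" part.
    scale = x.abs().amax().clamp(min=1e-12) / FP8_MAX
    x_fp8 = (x / scale).to(torch.float8_e4m3fn)  # multiply by scale_inv, cast down
    return x_fp8, scale

def dequant_dynamic(x_fp8, scale, hp_dtype=torch.float32):
    # Cast back up using the scale returned by the quant step.
    return x_fp8.to(hp_dtype) * scale

x = torch.randn(4, 8) * 100
x_fp8, scale = quant_dynamic(x)
err = (x - dequant_dynamic(x_fp8, scale)).abs().max()
print(f"max abs roundtrip error: {err:.3f}")  # small relative to x's magnitude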
@@ -268,7 +268,8 @@ def quantize(model, mod_list):
    elif config.cfg["mode"] == QuantMode.LOAD:
        # no measurement and scale file
        scale_method_config = {CfgStr.ACTIVATION: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES),
                               CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES)}
                               CfgStr.WEIGHT: ScaleMethodConfig(scale_value_type=ScaleValueType.DUMMY_SCALES,
                                                                granularity=scale_method_config[CfgStr.DEFAULT][CfgStr.WEIGHT].granularity)}
        prepare_model_with_dummy_measurement(model, mod_list, scale_method_config, scale_config)
    else:
        raise Exception("unexpected mode, expected QuantMode.QUANTIZE or QuantMode.LOAD")
@@ -43,6 +43,8 @@ def get_default_quantized_func(self):
        raise NotImplementedError()

    def get_scalar_quantized_func(self):
        if is_runtime_scale_patching():
            return self.get_default_quantized_func()
        return self.get_default_quantized_func().scalar

    def get_dynamic_scalar_quantized_func(self):
@@ -64,8 +66,10 @@ def get_quantized_func(self, scale_format, is_dynamic=False):
        else:
            if is_runtime_scale_patching() or scale_format == ScaleFormat.CONST:
                return self.get_default_quantized_func()
            else:
            elif scale_format == ScaleFormat.SCALAR:
                return self.get_scalar_quantized_func()
            else:
                return self.get_default_quantized_func()

    def __call__(self, *args, **kwargs):
        return self._quantized_func_(*args, **kwargs)
@@ -23,7 +23,7 @@

class QuantizedXPUFuncWrapperBase(QuantizedFuncWrapperBase, metaclass=ABCMeta):
    """
    Placeholder for base class for XPU quantized func wrapper.
    Placeholder for base class for XPU (Falcon/Jaguar Shores) quantized func wrapper.
    """
    def __init__(self, scale_format, is_dynamic=False):
        self._quantized_func_ = self.get_default_quantized_func()