Skip to content

vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4

__all__ module-attribute

# Public API of this module: the qutlass-NVFP4 scheme predicate and the
# linear method that wires NVFP4 quantization to qutlass input transforms.
__all__ = [
    "is_qutlass_fp4_scheme",
    "QutlassNvFP4LinearMethod",
]

QutlassNvFP4LinearMethod

Bases: CompressedTensorsLinearTransformMethod

Source code in vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod):
    """Linear method combining NVFP4 (W4A4) quantization with a qutlass
    input transform.

    Weight creation is delegated to the parent transform method; this class
    only validates that the layer's scheme is NVFP4 and that the single
    input-transform rotation matches the quantization group size.
    ``apply`` is intentionally unimplemented (fused kernel not wired up yet).
    """

    def create_weights(
        self,
        layer,
        input_size_per_partition,
        output_partition_sizes,
        input_size,
        output_size,
        params_dtype,
        **extra_weight_attrs,
    ):
        # Only the NVFP4 (W4A4) scheme is supported by this method.
        assert isinstance(layer.scheme, CompressedTensorsW4A4Fp4)

        # Parent call initializes the fp4 quantization parameters.
        result = super().create_weights(
            layer,
            input_size_per_partition,
            output_partition_sizes,
            input_size,
            output_size,
            params_dtype,
            **extra_weight_attrs,
        )

        # Sanity-check the input transform: exactly one rotation weight whose
        # leading dimension equals the fp4 quantization group size.
        assert self.input_transform is not None
        transform_weights = self.input_transform.weight
        assert len(transform_weights) == 1
        assert transform_weights[0].size(0) == layer.scheme.group_size

        return result

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Not implemented: the fused qutlass-NVFP4 forward path is pending.
        raise NotImplementedError()

apply

apply(
    layer: Module, x: Tensor, bias: Tensor | None = None
) -> Tensor
Source code in vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    raise NotImplementedError()

create_weights

create_weights(
    layer,
    input_size_per_partition,
    output_partition_sizes,
    input_size,
    output_size,
    params_dtype,
    **extra_weight_attrs,
)
Source code in vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
def create_weights(
    self,
    layer,
    input_size_per_partition,
    output_partition_sizes,
    input_size,
    output_size,
    params_dtype,
    **extra_weight_attrs,
):
    """Create fp4-quantized weights via the parent method, then validate
    the attached input transform.

    Requires ``layer.scheme`` to be NVFP4 (W4A4); the single input-transform
    rotation's leading dimension must equal the scheme's group size.
    """
    # Only the NVFP4 (W4A4) scheme is supported here.
    assert isinstance(layer.scheme, CompressedTensorsW4A4Fp4)

    # Parent call initializes the fp4 quantization parameters.
    result = super().create_weights(
        layer,
        input_size_per_partition,
        output_partition_sizes,
        input_size,
        output_size,
        params_dtype,
        **extra_weight_attrs,
    )

    # Exactly one input-transform rotation, sized to the fp4 group size.
    assert self.input_transform is not None
    transform_weights = self.input_transform.weight
    assert len(transform_weights) == 1
    assert transform_weights[0].size(0) == layer.scheme.group_size

    return result

is_qutlass_fp4_scheme

is_qutlass_fp4_scheme(
    quant_scheme: CompressedTensorsScheme | None,
    input_tfms: dict[int, TransformTuple],
) -> bool
Source code in vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
def is_qutlass_fp4_scheme(
    quant_scheme: CompressedTensorsScheme | None,
    input_tfms: dict[int, TransformTuple],
) -> bool:
    """Return True when the qutlass-NVFP4 linear method applies.

    The scheme must be NVFP4 (W4A4) and exactly one input transform must be
    registered, with a head dimension equal to the quantization group size.
    """
    if not isinstance(quant_scheme, CompressedTensorsW4A4Fp4):
        return False
    if len(input_tfms) != 1:
        return False
    return input_tfms[0].scheme.head_dim == quant_scheme.group_size