https://github.com/pytorch/pytorch/pull/134124 was reverted by https://github.com/pytorch/pytorch/pull/145392 due to a KleidiAI clone issue.

1. This reverts commit 0940eb6d44f3cf69dd840db990245cbe1f78e770 (https://github.com/pytorch/pytorch/pull/145392) and fixes the KleidiAI mirror issue.
2. KleidiAI is now cloned from the GitHub mirror instead of Arm's GitLab.

Change-Id: I7d6eee7214cd117d3057d615936fcc3ee6052fa2
Fixes https://github.com/pytorch/pytorch/issues/145273
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145505
Approved by: https://github.com/malfet
commit 41b38f755c
parent 34b8d8b0c0
committed by PyTorch MergeBot
@@ -498,6 +498,39 @@ def _group_quantize_tensor(w, n_bit=4, q_group_size=16):
     return out, scales_and_zeros
 
 
+def _group_quantize_tensor_symmetric(
+    w, n_bit=4, groupsize=32
+):
+    # W is of shape [K x N]
+    # We transpose W as quantization is applied on [N x K]
+    w = w.transpose(0, 1).contiguous()
+    assert w.dim() == 2
+    assert groupsize > 1
+    assert w.shape[-1] % groupsize == 0
+    # Calculate scales and zeros
+    to_quant = w.reshape(-1, groupsize)
+    max_val = to_quant.abs().amax(dim=1, keepdim=True)
+    eps = torch.finfo(max_val.dtype).eps
+    max_int = 2 ** (n_bit - 1) - 1  # For 4-bit, this is 7
+    scales = max_val.clamp(min=eps) / max_int
+    zeros = torch.zeros_like(scales)
+
+    # Quantize the weight
+    scales = scales.to(torch.float32).reshape(w.shape[0], -1)
+    zeros = zeros.to(torch.float32).reshape(w.shape[0], -1)
+    scales = scales.reshape(-1, 1)
+    zeros = zeros.reshape(-1, 1)
+    max_int = 2**n_bit - 1
+    w_int8 = to_quant.div(scales).add(8.5).to(torch.int8).clamp(max=max_int)
+    # We pack 2 signed int4 values into an unsigned uint8 container.
+    # This halves the weight size and improves load performance.
+    out_uint8 = (w_int8[::, 1::2] << 4 | w_int8[::, ::2]).to(torch.uint8)
+
+    scales_and_zeros = scales.squeeze().contiguous()
+
+    return out_uint8, scales_and_zeros
+
+
 def _dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):
     # source: https://github.com/pytorch-labs/gpt-fast/blob/main/quantize.py
     # default setup for affine quantization of activations
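As a sanity check on the helper added above, here is a minimal round-trip sketch (hypothetical test code, not part of the diff). It assumes the file's existing `torch` import and `_group_quantize_tensor_symmetric` as defined in the hunk; the nibble layout and the implicit zero point of 8 follow directly from the packing line and the `add(8.5)` step.

    import torch

    K, N, groupsize = 64, 32, 32
    w = torch.randn(K, N, dtype=torch.float32)

    packed, scales = _group_quantize_tensor_symmetric(w, n_bit=4, groupsize=groupsize)
    # packed: uint8, two 4-bit codes per byte -> [N * K / groupsize, groupsize / 2]
    # scales: float32, one scale per group    -> [N * K / groupsize]

    # Unpack: the low nibble holds even columns, the high nibble odd columns.
    low = (packed & 0x0F).to(torch.int8)
    high = (packed >> 4).to(torch.int8)
    codes = torch.stack([low, high], dim=-1).reshape(packed.shape[0], -1)

    # Dequantize around the implicit zero point of 8 (the kernel adds 8.5
    # before truncating, i.e. round-half-up plus an offset of 8).
    w_hat = (codes.float() - 8) * scales.reshape(-1, 1)

    # Each group should match the transposed input within half a quantization step.
    ref = w.transpose(0, 1).reshape(-1, groupsize)
    assert (w_hat - ref).abs().max() <= 0.51 * scales.max()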
@@ -530,7 +563,6 @@ def _dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):
     return quant, scales.to(x_dtype), zero_points
 
 
-
 # QuantizationTestCase used as a base class for testing quantization on modules
 class QuantizationTestCase(TestCase):
     def setUp(self):
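Only the signature and return line of `_dynamically_quantize_per_channel` appear in these hunks. For context, here is a minimal sketch of the symmetric per-channel scheme used by the gpt-fast `quantize.py` that its comment cites; the in-tree body may differ in detail, and the function name here is hypothetical.

    import torch

    def dynamically_quantize_per_channel_sketch(x, quant_min, quant_max, target_dtype):
        # Per-row (axis 0) extrema of the 2-D input.
        min_val, max_val = torch.aminmax(x, dim=1)

        # Symmetric range: each row is scaled by its largest magnitude.
        amax = torch.max(min_val.clamp(max=0).neg(), max_val.clamp(min=0))
        eps = torch.finfo(torch.float32).eps
        scales = (amax / (float(quant_max - quant_min) / 2)).clamp(min=eps)

        # Symmetric quantization: the zero point is 0 for every channel.
        zero_points = torch.zeros(scales.shape, dtype=torch.int64, device=x.device)

        # Divide by the scale, round, and clamp to the target integer range.
        q = torch.round(x / scales.unsqueeze(-1)) + zero_points.unsqueeze(-1)
        quant = torch.clamp(q, quant_min, quant_max).to(target_dtype)

        # Matches the return shown in the hunk above.
        return quant, scales.to(x.dtype), zero_points

For example, `quant, s, zp = dynamically_quantize_per_channel_sketch(torch.randn(8, 16), -128, 127, torch.int8)` yields int8 rows with one float scale per row and all-zero zero points.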