Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00.
Summary: This PR implements infrastructure for post-processing a model to apply int8 quantization to its `nn.Linear` modules. Highlights of the implementation: 1) Inputs and outputs are `float` (quantized and packed internally), but the weight is quantized and packed ahead of time for efficiency. This implementation performs well in small-batch size GEMM calls. It should not be considered a general-purpose quantized GEMM kernel. 2) Weight packing is dependent on machine architecture (e.g. vector register width), so it is done just-in-time. Concretely, it is done on model load for the weights and it is done during operator execution for the input value. 3) Biases are unquantized 4) We fail loudly if we are attempting to run this on a machine that does not support FBGEMM. This is because we do not want a model's numerics to differ based on which machine it is run on. A model containing these FBGEMM ops *must* be run with FBGEMM The API can be seen in the added test case. Highlights are: 1) `torch.jit.quantized.quantize_linear_modules` walks the module hierarchy of the passed-in Module and replaces all `nn.Linear` modules with a new `QuantizedLinear` module, which encapsulates the behavior described above. 2) `_pack()` and `_unpack()` script methods are present on `QuantizedLinear` modules. These methods should be called before serialization and after deserialization, respectively. This ensures that the weight matrix is properly packed for the running machine's architecture. Note that in the long term, we would like to move toward a more Pickle-style serialization technique, rather than having these explicit methods that mutate member values. This is blocked on being able to assign attributes in a ScriptMethod, among other things. Pull Request resolved: https://github.com/pytorch/pytorch/pull/13777 Differential Revision: D13383276 Pulled By: jamesr66a fbshipit-source-id: 00f29c9f34544add2b90107e3cf55a287802c344
55 lines · 2.0 KiB · Python
import torch
|
|
import copy
|
|
|
|
|
|
class QuantizedLinear(torch.jit.ScriptModule):
    """Int8-weight-quantized drop-in replacement for an ``nn.Linear``.

    Built from an existing float ``nn.Linear`` (``other``): the weight is
    quantized and packed ahead of time via FBGEMM, while inputs and outputs
    remain float (they are quantized/packed internally per forward call).
    The bias is kept unquantized.
    """

    # scale/zero_point are plain Python numbers; declaring them as constants
    # bakes them into the compiled script methods.
    __constants__ = ['scale', 'zero_point']

    def __init__(self, other):
        """Quantize the weights of ``other`` (an ``nn.Linear`` with a bias).

        Raises:
            AssertionError: if ``other`` has no bias.
        """
        super(QuantizedLinear, self).__init__()
        self.in_features = other.in_features
        self.out_features = other.out_features
        # Quantize weight and discard the original
        self.weight, self.col_offsets, self.scale, self.zero_point = torch.fbgemm_linear_quantize_weight(
            other.weight.clone().float())
        # Wrap as non-trainable Parameters so they travel with the module's
        # state (quantized values are not meant to receive gradients).
        self.weight = torch.nn.Parameter(self.weight, requires_grad=False)
        self.col_offsets = torch.nn.Parameter(self.col_offsets, requires_grad=False)
        assert other.bias is not None, 'QuantizedLinear requires a bias'
        # Bias stays in float (unquantized), cloned so the original is untouched.
        self.bias = torch.nn.Parameter(other.bias.clone().float())

        # Pre-pack the quantized weight for this machine's architecture.
        # The packed layout is machine-dependent, so this opaque handle must
        # not be serialized as-is; see _pack()/_unpack().
        self.register_buffer(
            'packed_tensor_ptr',
            torch.fbgemm_pack_quantized_matrix(self.weight.clone(), self.weight.size(1), self.weight.size(0)))

    @torch.jit.script_method
    def _unpack(self):
        # Rebuild the machine-specific packed weight from the (portable)
        # quantized weight. Intended to be called after deserialization.
        self.packed_tensor_ptr.set_(
            torch.fbgemm_pack_quantized_matrix(
                self.weight, self.weight.size(1), self.weight.size(0)))

    @torch.jit.script_method
    def _pack(self):
        # Replace the opaque packed handle with an empty uint8 placeholder so
        # the module can be serialized portably. Intended to be called before
        # serialization; _unpack() restores the handle afterwards.
        self.packed_tensor_ptr.set_(
            torch.zeros(torch.jit.annotate(List[int], []), dtype=torch.uint8).detach())

    @torch.jit.script_method
    def forward(self, input):
        # The FBGEMM op quantizes/packs `input` internally and returns a
        # float result; cast back to the caller's dtype at the end.
        out = torch.fbgemm_linear_int8_weight(
            input.float(), self.weight, self.packed_tensor_ptr, self.col_offsets,
            self.scale, self.zero_point, self.bias)
        return out.type_as(input)

    def extra_repr(self):
        # scale/zero_point are plain attributes (not buffers/parameters), so
        # format(**self.__dict__) can see them directly.
        repr = 'in_features={in_features}, out_features={out_features}, ' \
            'scale={scale}, zero_point={zero_point}'.format(**self.__dict__)
        return repr
|
|
|
|
|
|
def quantize_linear_modules(module):
    """Recursively replace every ``nn.Linear`` beneath ``module`` (but not
    ``module`` itself) with a ``QuantizedLinear`` wrapping it, in place.

    Args:
        module (torch.nn.Module): root module whose subtree is rewritten.

    Note: the previous implementation iterated ``named_modules()``, whose
    names are dotted for nested children (e.g. ``"sub.fc"``);
    ``setattr(module, "sub.fc", ...)`` registers a spurious dotted-named
    attribute on the root instead of replacing the child on its parent, and
    the accompanying per-module recursion re-traversed each subtree
    redundantly. Iterating direct children and recursing fixes both.
    """
    for name, child in module.named_children():
        if isinstance(child, torch.nn.Linear):
            # Replace the child on its immediate parent; QuantizedLinear
            # quantizes the weights of the wrapped Linear.
            setattr(module, name, QuantizedLinear(child))
        else:
            quantize_linear_modules(child)