mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Description:

1. Quantize Linear Layer Weights to 4-bits:
   Quantize the weights of the Linear layer to 4 bits, using symmetric quantization.
   Pack two 4-bit weights into one uint8 container.
   Choose a quantization scheme (channel-wise or group-wise), with the group size being a multiple of 32.

2. Prepare Quantized Weights, Scales, and Optional Bias:
   After quantizing, obtain the quantized_weights, scales, and groupsize.
   If the original Linear layer has a bias, prepare it as well.

3. Pack the Weights Efficiently:
   Use torch.ops.aten._dyn_quant_pack_4bit_weight to optimally pack the weights, scales, and optional bias.

   ```python
   packed_weights = torch.ops.aten._dyn_quant_pack_4bit_weight(weight, scales_and_zeros, bias, groupsize, in_features, out_features)
   ```

   Input parameters should include: in_features and out_features (the same as the Linear layer’s corresponding parameters).

4. Perform Dynamic Quantized Matrix Multiplication:
   Use torch.ops.aten._dyn_quant_matmul_4bit to perform matrix multiplication with quantized weights.

   ```python
   output = torch.ops.aten._dyn_quant_matmul_4bit(input, packed_weights, groupsize, in_features, out_features)
   ```

   Inputs required include: the input tensor, packed_weights, groupsize, and the in_features and out_features.

API Usage: https://github.com/pytorch/pytorch/issues/143289

Model Perf:
- 7B Transformer model: Prefill: 340 t/s, Decode: 40 t/s
- 2B Transformer model: Prefill: 747 t/s, Decode: 80 t/s

Tests:
- python test/test_linalg.py -k test__dyn_quant_pack_4bit_weight — Ran 1 test in 0.016s, OK
- python test/test_linalg.py -k test__dyn_quant_matmul_4bit — Ran 8 tests in 0.077s, OK
- python test/test_linalg.py -k test_compile_dyn_quant_matmul_4bit — Ran 8 tests in 11.454s

Change-Id: Ia1672bad5e6ec94e64d8bb1971395d60f4b3a452
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134124
Approved by: https://github.com/digantdesai, https://github.com/malfet
137 lines
4.7 KiB
Plaintext
137 lines
4.7 KiB
Plaintext
# pybind11 submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/pybind11"]
ignore = dirty
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
# eigen submodule (hosted on GitLab). ignore = dirty: uncommitted work-tree
# changes inside it are not reported by the superproject's status/diff.
[submodule "third_party/eigen"]
ignore = dirty
path = third_party/eigen
url = https://gitlab.com/libeigen/eigen.git
# googletest submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/googletest"]
ignore = dirty
path = third_party/googletest
url = https://github.com/google/googletest.git
# google/benchmark submodule. ignore = dirty: uncommitted work-tree changes
# inside it are not reported by the superproject's status/diff.
[submodule "third_party/benchmark"]
ignore = dirty
path = third_party/benchmark
url = https://github.com/google/benchmark.git
# protobuf submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/protobuf"]
ignore = dirty
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
# NNPACK submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/NNPACK"]
ignore = dirty
path = third_party/NNPACK
url = https://github.com/Maratyszcza/NNPACK.git
# gloo submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/gloo"]
ignore = dirty
path = third_party/gloo
url = https://github.com/facebookincubator/gloo
# pthreadpool submodule. ignore = dirty: uncommitted work-tree changes inside
# it are not reported by the superproject's status/diff.
# NOTE(review): the submodule name (third_party/NNPACK_deps/pthreadpool) differs
# from its path (third_party/pthreadpool); presumably kept so existing checkouts
# keep working - confirm before renaming.
[submodule "third_party/NNPACK_deps/pthreadpool"]
ignore = dirty
path = third_party/pthreadpool
url = https://github.com/Maratyszcza/pthreadpool.git
# FXdiv submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
# NOTE(review): the submodule name (third_party/NNPACK_deps/FXdiv) differs from
# its path (third_party/FXdiv); presumably kept so existing checkouts keep
# working - confirm before renaming.
[submodule "third_party/NNPACK_deps/FXdiv"]
ignore = dirty
path = third_party/FXdiv
url = https://github.com/Maratyszcza/FXdiv.git
# FP16 submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
# NOTE(review): the submodule name (third_party/NNPACK_deps/FP16) differs from
# its path (third_party/FP16); presumably kept so existing checkouts keep
# working - confirm before renaming.
[submodule "third_party/NNPACK_deps/FP16"]
ignore = dirty
path = third_party/FP16
url = https://github.com/Maratyszcza/FP16.git
# psimd submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
# NOTE(review): the submodule name (third_party/NNPACK_deps/psimd) differs from
# its path (third_party/psimd); presumably kept so existing checkouts keep
# working - confirm before renaming.
[submodule "third_party/NNPACK_deps/psimd"]
ignore = dirty
path = third_party/psimd
url = https://github.com/Maratyszcza/psimd.git
# cpuinfo submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/cpuinfo"]
ignore = dirty
path = third_party/cpuinfo
url = https://github.com/pytorch/cpuinfo.git
# PeachPy submodule (fetched from the malfet/PeachPy repository). ignore = dirty:
# uncommitted work-tree changes inside it are not reported by the
# superproject's status/diff.
[submodule "third_party/python-peachpy"]
ignore = dirty
path = third_party/python-peachpy
url = https://github.com/malfet/PeachPy.git
# onnx submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/onnx"]
ignore = dirty
path = third_party/onnx
url = https://github.com/onnx/onnx.git
# sleef submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/sleef"]
ignore = dirty
path = third_party/sleef
url = https://github.com/shibatch/sleef
# ideep submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/ideep"]
ignore = dirty
path = third_party/ideep
url = https://github.com/intel/ideep
# nccl submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/nccl/nccl"]
ignore = dirty
path = third_party/nccl/nccl
url = https://github.com/NVIDIA/nccl
# gemmlowp submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/gemmlowp/gemmlowp"]
ignore = dirty
path = third_party/gemmlowp/gemmlowp
url = https://github.com/google/gemmlowp.git
# fbgemm submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/fbgemm"]
ignore = dirty
path = third_party/fbgemm
url = https://github.com/pytorch/fbgemm
# fbjni submodule; the only entry in this file that lives outside third_party/.
# ignore = dirty: uncommitted work-tree changes inside it are not reported by
# the superproject's status/diff.
[submodule "android/libs/fbjni"]
ignore = dirty
path = android/libs/fbjni
url = https://github.com/facebookincubator/fbjni.git
# XNNPACK submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/XNNPACK"]
ignore = dirty
path = third_party/XNNPACK
url = https://github.com/google/XNNPACK.git
# fmt submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/fmt"]
ignore = dirty
path = third_party/fmt
url = https://github.com/fmtlib/fmt.git
# tensorpipe submodule. ignore = dirty: uncommitted work-tree changes inside it
# are not reported by the superproject's status/diff.
[submodule "third_party/tensorpipe"]
ignore = dirty
path = third_party/tensorpipe
url = https://github.com/pytorch/tensorpipe.git
# cudnn-frontend submodule (NVIDIA). No "ignore" key is set for this entry.
[submodule "third_party/cudnn_frontend"]
path = third_party/cudnn_frontend
url = https://github.com/NVIDIA/cudnn-frontend.git
# kineto submodule. No "ignore" key is set for this entry.
[submodule "third_party/kineto"]
path = third_party/kineto
url = https://github.com/pytorch/kineto
# pocketfft submodule. No "ignore" key is set for this entry.
[submodule "third_party/pocketfft"]
path = third_party/pocketfft
url = https://github.com/mreineck/pocketfft
# ittapi submodule (Intel). No "ignore" key is set for this entry.
[submodule "third_party/ittapi"]
path = third_party/ittapi
url = https://github.com/intel/ittapi.git
# flatbuffers submodule. No "ignore" key is set for this entry.
[submodule "third_party/flatbuffers"]
path = third_party/flatbuffers
url = https://github.com/google/flatbuffers.git
# nlohmann/json submodule; note the checkout directory is named "nlohmann",
# not "json". No "ignore" key is set for this entry.
[submodule "third_party/nlohmann"]
path = third_party/nlohmann
url = https://github.com/nlohmann/json.git
# VulkanMemoryAllocator submodule. No "ignore" key is set for this entry.
[submodule "third_party/VulkanMemoryAllocator"]
path = third_party/VulkanMemoryAllocator
url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git
# cutlass submodule (NVIDIA). No "ignore" key is set for this entry.
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = https://github.com/NVIDIA/cutlass.git
# mimalloc submodule. No "ignore" key is set for this entry.
[submodule "third_party/mimalloc"]
path = third_party/mimalloc
url = https://github.com/microsoft/mimalloc.git
# opentelemetry-cpp submodule. No "ignore" key is set for this entry.
[submodule "third_party/opentelemetry-cpp"]
path = third_party/opentelemetry-cpp
url = https://github.com/open-telemetry/opentelemetry-cpp.git
# cpp-httplib submodule. "branch" names the upstream ref that
# `git submodule update --remote` follows.
# NOTE(review): v0.15.3 looks like a release tag rather than a branch name -
# confirm that --remote updates behave as intended with this value.
[submodule "third_party/cpp-httplib"]
path = third_party/cpp-httplib
url = https://github.com/yhirose/cpp-httplib.git
branch = v0.15.3
# NVTX submodule (NVIDIA). No "ignore" key is set for this entry.
[submodule "third_party/NVTX"]
path = third_party/NVTX
url = https://github.com/NVIDIA/NVTX.git
# composable_kernel submodule (ROCm). "branch" names the upstream branch that
# `git submodule update --remote` follows; here that is "develop".
[submodule "third_party/composable_kernel"]
path = third_party/composable_kernel
url = https://github.com/ROCm/composable_kernel.git
branch = develop
# kleidiai submodule; fetched from Arm's GitLab host (git.gitlab.arm.com), not
# GitHub. No "ignore" key is set for this entry.
[submodule "third_party/kleidiai"]
path = third_party/kleidiai
url = https://git.gitlab.arm.com/kleidi/kleidiai.git