Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 04:04:57 +08:00)
Compare commits: csl/add_wi...document-a (69 commits)
Commits (abbreviated SHA1s):
feace9648e, b558c986e8, 415e641572, 11f5f65686, af32d16a71, ba480d6bf7, 4a6abba0d9, 96181d6f76,
2164b66121, bde18c445d, f3e43ff2d7, 39d0c06ed0, 4ab847bbc7, 4bd1505f84, 1f9614cef8, 35f66b83f8,
4a39820e5e, 600267ea56, f11ac803d7, ea42517e45, 91c211fb8c, 660e369a68, 2883b5ab77, 9fff8155c3,
331191ce4b, 2c5ed6e7c0, 5d7360bb03, 321e602692, 3c5ca685d6, 5178d0a480, cf0a00d4f3, 5ed4270440,
8c728e129d, 9fc2c6446d, 409aece3f9, b116c51330, 2e1742dd63, f7ad6dbad6, f46bb04dcc, 6f6a919366,
83d71dfb2f, 5103ecc5d8, 9580539e2f, a11a66ef32, 6b768e1890, 35c4130fd1, 34042a9145, 9d1ab4f4bb,
3e0826c9d7, 86c789849e, f3afbcf340, 40b25578e4, 412c6d28ec, 7d570129e0, 97ca21106d, 27234792ad,
b6b7a44dec, 3ddf2018d0, fac6f20ae3, 1894082000, 5a66ff4915, abadea70f3, f414aa8e0d, e438db2546,
10335ffb2c, f006aee601, 8d53d788fe, 0b4f2b46d9, 960c4b9937
| @ -1 +1 @@ | ||||
| e0dda9059d082537cee36be6c5e4fe3b18c880c0 | ||||
| deb42f2a8e48f5032b4a98ee781a15fa87a157cf | ||||
|  | ||||
| @ -19,8 +19,8 @@ pip_install \ | ||||
|   transformers==4.36.2 | ||||
|  | ||||
| pip_install coloredlogs packaging | ||||
| pip_install onnxruntime==1.22.1 | ||||
| pip_install onnxscript==0.4.0 | ||||
| pip_install onnxruntime==1.23.0 | ||||
| pip_install onnxscript==0.5.3 | ||||
|  | ||||
| # Cache the transformers model to be used later by ONNX tests. We need to run the transformers | ||||
| # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ | ||||
|  | ||||
| @ -341,7 +341,7 @@ onnx==1.18.0 | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| onnxscript==0.4.0 | ||||
| onnxscript==0.5.3 | ||||
| #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| @ -34,12 +34,14 @@ fi | ||||
|  | ||||
|  | ||||
| # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 | ||||
| NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) | ||||
| if [ -n "$NUMBA_CUDA_DIR" ]; then | ||||
|   NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" | ||||
|   pushd "$NUMBA_CUDA_DIR" | ||||
|   patch -p4 <"$NUMBA_PATCH" | ||||
|   popd | ||||
| if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then | ||||
|   NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) | ||||
|   if [ -n "$NUMBA_CUDA_DIR" ]; then | ||||
|     NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" | ||||
|     pushd "$NUMBA_CUDA_DIR" | ||||
|     patch -p4 <"$NUMBA_PATCH" | ||||
|     popd | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| echo "Environment variables:" | ||||
|  | ||||
| @ -66,6 +66,7 @@ readability-simplify-subscript-expr, | ||||
| readability-string-compare, | ||||
| -readability-redundant-access-specifiers, | ||||
| -readability-redundant-control-flow, | ||||
| -readability-redundant-inline-specifier, | ||||
| ' | ||||
| HeaderFilterRegex: '^(aten/|c10/|torch/).*$' | ||||
| WarningsAsErrors: '*' | ||||
|  | ||||
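For context, a minimal sketch (not taken from the PyTorch sources) of the pattern the `readability-redundant-inline-specifier` check targets: functions that are already implicitly inline, such as member functions defined inside the class body or constexpr functions, gain nothing from an explicit `inline` keyword. Several later hunks in this compare drop exactly this kind of specifier.

```cpp
// Illustrative only: 'inline' is implicit for in-class definitions and for
// constexpr functions, so the specifier below is redundant.
struct Example {
  inline void set(int v) { value_ = v; }  // flagged: already implicitly inline
  void reset() { value_ = 0; }            // equivalent, without the specifier
  int value_ = 0;
};

constexpr inline int twice(int x) { return 2 * x; }  // 'inline' adds nothing
constexpr int thrice(int x) { return 3 * x; }
```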
							
								
								
									
.github/ci_commit_pins/xla.txt (2 lines changed, vendored)
| @ -1 +1 @@ | ||||
| 0fc62aa26a30ed7ca419d285f285cb5ba02c4394 | ||||
| 2a9138a26ee257fef05310ad3fecf7c55fe80d73 | ||||
|  | ||||
							
								
								
									
.github/workflows/_get-changed-files.yml (9 lines changed, vendored)
| @ -40,6 +40,15 @@ jobs: | ||||
|               # Use gh CLI to get changed files in the PR with explicit repo | ||||
|               CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') | ||||
|  | ||||
|               # See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790 | ||||
|               PYI_FILES_TO_ADD="" | ||||
|               for file in ${CHANGED_FILES}; do | ||||
|                 if [[ "${file}" == *".pyi.in" ]]; then | ||||
|                   PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}" | ||||
|                 fi | ||||
|               done | ||||
|               CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}" | ||||
|  | ||||
|               if [ -z "$CHANGED_FILES" ]; then | ||||
|                 echo "No changed files found, setting to '*'" | ||||
|                 CHANGED_FILES="*" | ||||
|  | ||||
							
								
								
									
.github/workflows/inductor-periodic.yml (10 lines changed, vendored)
| @ -106,6 +106,16 @@ jobs: | ||||
|           { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/workflows/periodic.yml (6 lines changed, vendored)
| @ -213,9 +213,9 @@ jobs: | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/workflows/pull.yml (2 lines changed, vendored)
| @ -127,8 +127,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # More memory is needed to build with asan | ||||
|       runner: linux.2xlarge.memory | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-clang18-asan | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||
|  | ||||
							
								
								
									
.github/workflows/slow.yml (2 lines changed, vendored)
| @ -140,8 +140,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # More memory is needed to build with asan | ||||
|       runner: linux.2xlarge.memory | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-clang18-asan | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||
|  | ||||
| @ -1573,6 +1573,7 @@ exclude_patterns = [ | ||||
|     'torch/_inductor/fx_passes/serialized_patterns/**', | ||||
|     'torch/_inductor/autoheuristic/artifacts/**', | ||||
|     'test/dynamo/cpython/**', | ||||
|     'test/test_torchfuzz_repros.py', | ||||
|     'scripts/**', | ||||
|     'third_party/**', | ||||
|     'fb/**', | ||||
|  | ||||
| @ -40,41 +40,6 @@ namespace { | ||||
|                 ->conv | ||||
|                 ->rnn | ||||
| */ | ||||
| const std::map<std::string, std::vector<std::string>> _fp32_precisions = { | ||||
|     {"generic", {{"ieee", "tf32", "bf16", "none"}}}, | ||||
|     {"mkldnn", {{"ieee", "tf32", "bf16", "none"}}}, | ||||
|     {"cuda", {{"ieee", "tf32", "none"}}}}; | ||||
|  | ||||
| // Check whether the backend and op are legal | ||||
| void check_fp32_prec_backend_and_op( | ||||
|     const std::string& backend, | ||||
|     const std::string& op) { | ||||
|   static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"}; | ||||
|   static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"}; | ||||
|   TORCH_CHECK( | ||||
|       std::find(backends.begin(), backends.end(), backend) != backends.end(), | ||||
|       "Invalid backend: ", | ||||
|       backend); | ||||
|   TORCH_CHECK( | ||||
|       std::find(operators.begin(), operators.end(), op) != operators.end(), | ||||
|       "Invalid operator: ", | ||||
|       op); | ||||
|   if (backend == "generic") { | ||||
|     TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op); | ||||
|   } | ||||
|   } | ||||
|  | ||||
|   // Return whether the precision is supported by backends | ||||
|   bool validate_fp32_prec( | ||||
|       const std::string& backend, | ||||
|       const std::string& precision) { | ||||
|     auto iterp = _fp32_precisions.find(backend); | ||||
|     TORCH_CHECK(iterp != _fp32_precisions.end()); | ||||
|     auto precisions = iterp->second; | ||||
|     bool valid = std::find(precisions.begin(), precisions.end(), precision) != | ||||
|         precisions.end(); | ||||
|     return valid; | ||||
|   } | ||||
|  | ||||
|   C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ | ||||
|     TORCH_WARN_ONCE( | ||||
| @ -86,6 +51,54 @@ void check_fp32_prec_backend_and_op( | ||||
|   } | ||||
| } // namespace | ||||
|  | ||||
| Float32Backend str2backend(const std::string& name) { | ||||
|   if (name == "generic") | ||||
|     return Float32Backend::GENERIC; | ||||
|   else if (name == "cuda") | ||||
|     return Float32Backend::CUDA; | ||||
|   else if (name == "mkldnn") | ||||
|     return Float32Backend::MKLDNN; | ||||
|   TORCH_CHECK(false, "Unknown backend: ", name); | ||||
| } | ||||
|  | ||||
| Float32Op str2op(const std::string& name) { | ||||
|   if (name == "all") | ||||
|     return Float32Op::ALL; | ||||
|   else if (name == "conv") | ||||
|     return Float32Op::CONV; | ||||
|   else if (name == "rnn") | ||||
|     return Float32Op::RNN; | ||||
|   else if (name == "matmul") | ||||
|     return Float32Op::MATMUL; | ||||
|   TORCH_CHECK(false, "Unknown op: ", name); | ||||
| } | ||||
|  | ||||
| Float32Precision str2precision(const std::string& name) { | ||||
|   if (name == "none") | ||||
|     return Float32Precision::NONE; | ||||
|   else if (name == "ieee") | ||||
|     return Float32Precision::IEEE; | ||||
|   else if (name == "tf32") | ||||
|     return Float32Precision::TF32; | ||||
|   else if (name == "bf16") | ||||
|     return Float32Precision::BF16; | ||||
|   TORCH_CHECK(false, "Unknown precision: ", name); | ||||
| } | ||||
|  | ||||
| std::string precision2str(Float32Precision prec) { | ||||
|   switch (prec) { | ||||
|     case Float32Precision::NONE: | ||||
|       return "none"; | ||||
|     case Float32Precision::IEEE: | ||||
|       return "ieee"; | ||||
|     case Float32Precision::TF32: | ||||
|       return "tf32"; | ||||
|     case Float32Precision::BF16: | ||||
|       return "bf16"; | ||||
|   } | ||||
|   TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast<int>(prec), ")"); | ||||
| } | ||||
|  | ||||
| Context::Context() = default; | ||||
|  | ||||
| // TODO: This could be bad juju if someone calls globalContext() in the | ||||
| @ -179,10 +192,10 @@ void Context::setUserEnabledNNPACK(bool e) { | ||||
|   enabled_nnpack = e; | ||||
| } | ||||
|  | ||||
| bool Context::allowTF32CuDNN(const std::string& op) const { | ||||
|   if (op.empty()){ | ||||
|     bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; | ||||
|     bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; | ||||
| bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const { | ||||
|   if (!op.has_value()) { | ||||
|     bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32; | ||||
|     bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32; | ||||
|     TORCH_CHECK( | ||||
|         allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn, | ||||
|         "PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,", | ||||
| @ -191,15 +204,15 @@ bool Context::allowTF32CuDNN(const std::string& op) const { | ||||
|         "We suggest only using the new API to set the TF32 flag(s). See also: ", | ||||
|         "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); | ||||
|   } else { | ||||
|     return float32Precision("cuda", op) == "tf32"; | ||||
|     return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; | ||||
|   } | ||||
|   warn_deprecated_fp32_precision_api(); | ||||
|   return allow_tf32_cudnn; | ||||
| } | ||||
|  | ||||
| void Context::setAllowTF32CuDNN(bool b) { | ||||
|   setFloat32Precision("cuda", "rnn", b ? "tf32" : "none"); | ||||
|   setFloat32Precision("cuda", "conv", b ? "tf32" : "none"); | ||||
|   setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); | ||||
|   setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); | ||||
|   allow_tf32_cudnn = b; | ||||
|   warn_deprecated_fp32_precision_api(); | ||||
| } | ||||
| @ -305,7 +318,7 @@ void Context::setImmediateMiopen(bool b) { | ||||
|  | ||||
| bool Context::allowTF32CuBLAS() const { | ||||
|   bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; | ||||
|   bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; | ||||
|   bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32; | ||||
|   TORCH_CHECK( | ||||
|       legacy_allow_tf32 == allow_tf32_new, | ||||
|       "PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,", | ||||
| @ -318,17 +331,17 @@ bool Context::allowTF32CuBLAS() const { | ||||
|  | ||||
| void Context::setAllowTF32CuBLAS(bool b) { | ||||
|   float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; | ||||
|   setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); | ||||
|   setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE); | ||||
| } | ||||
|  | ||||
| Float32MatmulPrecision Context::float32MatmulPrecision() const { | ||||
|   bool invalid = float32Precision("cuda", "matmul") == "tf32" && | ||||
|   bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 && | ||||
|       float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST; | ||||
|   invalid = invalid || | ||||
|       (float32Precision("mkldnn", "matmul") == "bf16" && | ||||
|       (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 && | ||||
|        float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM); | ||||
|   invalid = invalid || | ||||
|       (float32Precision("mkldnn", "matmul") == "tf32" && | ||||
|       (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 && | ||||
|        float32_matmul_precision != at::Float32MatmulPrecision::HIGH); | ||||
|   TORCH_CHECK( | ||||
|       !invalid, | ||||
| @ -340,15 +353,26 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { | ||||
|   return float32_matmul_precision; | ||||
| } | ||||
|  | ||||
| std::string Context::float32Precision(const std::string& backend, const std::string& op) const { | ||||
|   check_fp32_prec_backend_and_op(backend, op); | ||||
|   auto precision = fp32_precision.find(backend)->second.find(op)->second; | ||||
|   if (precision == "none") | ||||
|     precision = fp32_precision.find(backend)->second.find("all")->second; | ||||
|   if (precision == "none") | ||||
|     precision = fp32_precision.find("generic")->second.find("all")->second; | ||||
|   bool valid_prec = validate_fp32_prec(backend, precision); | ||||
|   return valid_prec ? precision : "none"; | ||||
| Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const { | ||||
|   std::pair<Float32Backend, Float32Op> key{backend, op}; | ||||
|   auto it = fp32_precision.find(key); | ||||
|   TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")"); | ||||
|  | ||||
|   Float32Precision precision = it->second; | ||||
|   if (precision == Float32Precision::NONE) { | ||||
|     key.second = Float32Op::ALL; | ||||
|     precision = fp32_precision.find(key)->second; | ||||
|   } | ||||
|   if (precision == Float32Precision::NONE) { | ||||
|     key.first = Float32Backend::GENERIC; | ||||
|     precision = fp32_precision.find(key)->second; | ||||
|   } | ||||
|  | ||||
|   // "cuda" does not support "bf16" | ||||
|   if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) { | ||||
|     return Float32Precision::NONE; | ||||
|   } | ||||
|   return precision; | ||||
| } | ||||
|  | ||||
| void Context::setFloat32MatmulPrecision(const std::string &s) { | ||||
| @ -357,18 +381,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { | ||||
|     // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention | ||||
|     if (s_ == "highest") { | ||||
|       float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; | ||||
|       setFloat32Precision("cuda", "matmul", "ieee"); | ||||
|       setFloat32Precision("mkldnn", "matmul", "ieee"); | ||||
|       setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE); | ||||
|       setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE); | ||||
|       return true; | ||||
|     } else if (s_ == "high") { | ||||
|       float32_matmul_precision = at::Float32MatmulPrecision::HIGH; | ||||
|       setFloat32Precision("cuda", "matmul", "tf32"); | ||||
|       setFloat32Precision("mkldnn", "matmul", "tf32"); | ||||
|       setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); | ||||
|       setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32); | ||||
|       return true; | ||||
|     } else if (s_ == "medium") { | ||||
|       float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; | ||||
|       setFloat32Precision("cuda", "matmul", "tf32"); | ||||
|       setFloat32Precision("mkldnn", "matmul", "bf16"); | ||||
|       setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); | ||||
|       setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16); | ||||
|       return true; | ||||
|     } | ||||
|     return false; | ||||
| @ -382,25 +406,16 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { | ||||
|     "setFloat32MatmulPrecision call has no effect."); | ||||
| } | ||||
|  | ||||
| void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) { | ||||
|   check_fp32_prec_backend_and_op(backend, op); | ||||
|   if (validate_fp32_prec(backend, p)) { | ||||
|     fp32_precision[backend][op] = p; | ||||
|   } else { | ||||
|     std::string msg; | ||||
|     auto iterp = _fp32_precisions.find(backend); | ||||
|     TORCH_CHECK(iterp != _fp32_precisions.end()); | ||||
|     for (const auto& p : iterp->second) { | ||||
|       msg += p; | ||||
|       msg += " "; | ||||
|     } | ||||
|     TORCH_WARN( | ||||
|         "you have set wrong precision for backend:", | ||||
|         backend, | ||||
|         " setFloat32Precision call has no effect.", | ||||
|         "Please choose precision from: ", | ||||
|         msg); | ||||
|   } | ||||
| void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) { | ||||
|   auto it = fp32_precision.find(std::make_pair(backend, op)); | ||||
|   TORCH_CHECK( | ||||
|       it != fp32_precision.end(), | ||||
|       "Invalid (backend, op) pair: (", backend, ", ", op, ")"); | ||||
|   TORCH_CHECK( | ||||
|       !(backend == Float32Backend::CUDA && p == Float32Precision::BF16), | ||||
|       "backend 'cuda' does not support precision 'bf16'"); | ||||
|  | ||||
|   it->second = p; | ||||
| } | ||||
|  | ||||
| at::LinalgBackend Context::linalgPreferredBackend() const { | ||||
|  | ||||
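A hedged usage sketch of the new enum-based precision API introduced above (`at::str2backend`, `at::str2op`, `at::str2precision`, `at::precision2str`, and `Context::setFloat32Precision` / `Context::float32Precision`). The surrounding build setup and the specific call site are assumptions for illustration, not part of this diff; the program needs to be compiled and linked against libtorch.

```cpp
// Sketch: strings from user-facing configs are converted to enums once at the
// boundary, and the typed API is used on at::globalContext() thereafter.
#include <ATen/Context.h>
#include <iostream>

int main() {
  auto& ctx = at::globalContext();
  at::Float32Backend backend = at::str2backend("cuda");
  at::Float32Op op = at::str2op("matmul");
  ctx.setFloat32Precision(backend, op, at::str2precision("tf32"));
  // precision2str converts back to a string only for logging or error messages.
  std::cout << at::precision2str(ctx.float32Precision(backend, op)) << std::endl;
  return 0;
}
```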
| @ -25,17 +25,27 @@ | ||||
| #include <c10/util/CallOnce.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/env.h> | ||||
| #include <c10/util/hash.h> | ||||
| #include <c10/util/irange.h> | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <map> | ||||
| #include <mutex> | ||||
| #include <unordered_map> | ||||
|  | ||||
| namespace at { | ||||
|  | ||||
| class Tensor; | ||||
|  | ||||
| enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; | ||||
| enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN }; | ||||
| enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL }; | ||||
| enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 }; | ||||
|  | ||||
| TORCH_API Float32Backend str2backend(const std::string& name); | ||||
| TORCH_API Float32Op str2op(const std::string& name); | ||||
| TORCH_API Float32Precision str2precision(const std::string& name); | ||||
| TORCH_API std::string precision2str(Float32Precision prec); | ||||
|  | ||||
| class TORCH_API Context { | ||||
|  public: | ||||
| @ -336,19 +346,17 @@ class TORCH_API Context { | ||||
|  | ||||
|   void setFloat32MatmulPrecision(const std::string& s); | ||||
|   void setFloat32Precision( | ||||
|       const std::string& backend, | ||||
|       const std::string& op, | ||||
|       const std::string& s); | ||||
|   bool allowTF32CuDNN(const std::string& op = std::string()) const; | ||||
|       Float32Backend backend, | ||||
|       Float32Op op, | ||||
|       Float32Precision p); | ||||
|   bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const; | ||||
|   void setAllowTF32CuDNN(bool); | ||||
|   bool allowTF32OneDNN() const; | ||||
|   void setAllowTF32OneDNN(bool); | ||||
|   bool allowTF32CuBLAS() const; | ||||
|   void setAllowTF32CuBLAS(bool); | ||||
|   Float32MatmulPrecision float32MatmulPrecision() const; | ||||
|   std::string float32Precision( | ||||
|       const std::string& backend, | ||||
|       const std::string& op) const; | ||||
|   Float32Precision float32Precision(Float32Backend backend, Float32Op op) const; | ||||
|   bool allowFP16ReductionCuBLAS() const; | ||||
|   void setAllowFP16ReductionCuBLAS(bool); | ||||
|   bool allowBF16ReductionCuBLAS() const; | ||||
| @ -475,21 +483,20 @@ class TORCH_API Context { | ||||
|   bool enable_sparse_tensor_invariant_checks = false; | ||||
|   bool allow_fp16_reduction_cpu = false; | ||||
|  | ||||
|   std::map<std::string, std::map<std::string, std::string>> fp32_precision = { | ||||
|       {"generic", {{"all", "none"}}}, | ||||
|       {"mkldnn", | ||||
|        {{"matmul", "none"}, | ||||
|         {"conv", "none"}, | ||||
|         {"rnn", "none"}, | ||||
|         {"all", "none"}}}, | ||||
|       {"cuda", | ||||
|        {{"matmul", | ||||
|          float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST | ||||
|              ? "none" | ||||
|              : "tf32"}, | ||||
|         {"conv", "tf32"}, | ||||
|         {"rnn", "tf32"}, | ||||
|         {"all", "none"}}}, | ||||
|   using Key = std::pair<Float32Backend, Float32Op>; | ||||
|   std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = { | ||||
|       {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE}, | ||||
|       {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32}, | ||||
|       {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32}, | ||||
|       {{Float32Backend::CUDA, Float32Op::MATMUL}, | ||||
|        float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST | ||||
|            ? Float32Precision::NONE | ||||
|            : Float32Precision::TF32}, | ||||
|   }; | ||||
|  | ||||
|   Allocator* prev_allocator_ptr_{nullptr}; | ||||
| @ -671,5 +678,4 @@ struct TORCH_API ROCmBackwardPassGuard { | ||||
|   ~ROCmBackwardPassGuard(); | ||||
|   static bool is_backward_pass(); | ||||
| }; | ||||
|  | ||||
| } // namespace at | ||||
|  | ||||
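The header above replaces the nested string map with an `unordered_map` keyed by a `(Float32Backend, Float32Op)` pair. Below is a standalone sketch of the lookup-with-fallback behaviour that `Context::float32Precision` implements, using a plain `std::map` and illustrative enum names so it compiles without ATen (the cuda/bf16 special case is omitted here).

```cpp
// Fallback order: an exact (backend, op) entry set to NONE falls back to
// (backend, ALL), and then to (GENERIC, ALL).
#include <iostream>
#include <map>
#include <utility>

enum class Backend { GENERIC, CUDA, MKLDNN };
enum class Op { ALL, CONV, RNN, MATMUL };
enum class Precision { NONE, IEEE, TF32, BF16 };

using Key = std::pair<Backend, Op>;

Precision lookup(const std::map<Key, Precision>& table, Backend b, Op op) {
  Precision p = table.at({b, op});
  if (p == Precision::NONE) p = table.at({b, Op::ALL});
  if (p == Precision::NONE) p = table.at({Backend::GENERIC, Op::ALL});
  return p;
}

int main() {
  const std::map<Key, Precision> table = {
      {{Backend::GENERIC, Op::ALL}, Precision::NONE},
      {{Backend::CUDA, Op::ALL}, Precision::NONE},
      {{Backend::CUDA, Op::CONV}, Precision::TF32},
      {{Backend::CUDA, Op::MATMUL}, Precision::NONE},
  };
  // CONV resolves directly; MATMUL falls back to (CUDA, ALL), then (GENERIC, ALL).
  std::cout << (lookup(table, Backend::CUDA, Op::CONV) == Precision::TF32) << "\n";
  std::cout << (lookup(table, Backend::CUDA, Op::MATMUL) == Precision::NONE) << "\n";
  return 0;
}
```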
| @ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef | ||||
|     return; | ||||
|   } | ||||
|   const auto src_names = src.names(); | ||||
|   const auto result_dim = static_cast<int64_t>(result.dim()); | ||||
|   const auto result_dim = result.dim(); | ||||
|   const auto src_dim = static_cast<int64_t>(src_names.size()); | ||||
|   const auto excluded_dim = static_cast<int64_t>(excluded_idxs.size()); | ||||
|   TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim); | ||||
|  | ||||
| @ -214,7 +214,7 @@ inline Tensor applySlice( | ||||
|       "step must be greater than zero"); | ||||
|  | ||||
|   // See NOTE [nested tensor size for indexing] | ||||
|   if (self_sizes.has_value() && self_sizes.value().size() > 0) { | ||||
|   if (self_sizes.has_value() && !self_sizes.value().empty()) { | ||||
|     // Skip this optimization if we are tracing, as the trace may be polymorphic | ||||
|     // over the shape of the `self` tensor, and we still want to record | ||||
|     // the slice. | ||||
|  | ||||
| @ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout) | ||||
| } | ||||
|  | ||||
| void * maybe_data_ptr(const Tensor& tensor) { | ||||
|   return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; | ||||
|   return tensor.defined() ? tensor.data_ptr() : nullptr; | ||||
| } | ||||
|  | ||||
| void * maybe_data_ptr(const TensorArg& tensor) { | ||||
|   return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; | ||||
|   return tensor->defined() ? tensor->data_ptr() : nullptr; | ||||
| } | ||||
|  | ||||
| void check_dim_size( | ||||
|  | ||||
| @ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)> | ||||
|   return impl::GetVariableHooks()->_register_hook(*this, std::move(hook)); | ||||
| } | ||||
|  | ||||
| std::optional<ScalarType> TensorBase::grad_dtype() const { | ||||
|   return impl::GetVariableHooks()->grad_dtype(*this); | ||||
| } | ||||
|  | ||||
| void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const { | ||||
|   return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype); | ||||
| } | ||||
|  | ||||
| } // namespace at | ||||
|  | ||||
| @ -930,10 +930,6 @@ public: | ||||
|  | ||||
|   const TensorBase& requires_grad_(bool _requires_grad=true) const; | ||||
|  | ||||
|   std::optional<ScalarType> grad_dtype() const; | ||||
|  | ||||
|   void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const; | ||||
|  | ||||
|   // View Variables | ||||
|   //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
|  | ||||
| @ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) { | ||||
| template <> | ||||
| C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) { | ||||
|   // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function | ||||
|   return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5))); | ||||
|   return median + sigma * at::tan(c10::pi<double> * (val - 0.5)); | ||||
| } | ||||
|  | ||||
| /** | ||||
|  | ||||
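The specialization above samples a Cauchy variate through the inverse CDF. A self-contained sketch of the same transform follows; the constant and helper names are illustrative, not taken from ATen.

```cpp
// Inverse-CDF (quantile) sampling for the Cauchy distribution:
// u ~ Uniform(0, 1) maps to median + sigma * tan(pi * (u - 0.5)).
#include <cmath>
#include <iostream>
#include <random>

constexpr double kPi = 3.14159265358979323846;

double cauchy_icdf(double u, double median, double sigma) {
  return median + sigma * std::tan(kPi * (u - 0.5));
}

int main() {
  std::mt19937_64 gen(42);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  for (int i = 0; i < 3; ++i) {
    std::cout << cauchy_icdf(uniform(gen), /*median=*/0.0, /*sigma=*/1.0) << "\n";
  }
  return 0;
}
```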
| @ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface { | ||||
|       const c10::OperatorHandle& op, | ||||
|       c10::DispatchKeySet dispatch_keys, | ||||
|       torch::jit::Stack* stack) const = 0; | ||||
|   virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0; | ||||
|   virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0; | ||||
| }; | ||||
|  | ||||
| TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); | ||||
|  | ||||
| @ -2,7 +2,7 @@ | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {} | ||||
| inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} | ||||
|  | ||||
| inline BoxedKernel::BoxedKernel( | ||||
|     std::unique_ptr<OperatorKernel> functor, | ||||
|  | ||||
| @ -20,9 +20,7 @@ make_unique_base(Args&&... args) { | ||||
| } // namespace detail | ||||
|  | ||||
| inline KernelFunction::KernelFunction() | ||||
|     : boxed_kernel_func_(), | ||||
|       unboxed_kernel_func_(nullptr), | ||||
|       sym_unboxed_kernel_func_(nullptr) {} | ||||
|     : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} | ||||
|  | ||||
| inline KernelFunction::~KernelFunction() { | ||||
|   if (tokens_) { | ||||
|  | ||||
| @ -76,13 +76,7 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name, | ||||
|  | ||||
| OpRegistrationListener::~OpRegistrationListener()= default; | ||||
|  | ||||
| Dispatcher::Dispatcher() | ||||
| : operators_() | ||||
| , operatorLookupTable_() | ||||
| , backendFallbackKernels_() | ||||
| , listeners_(std::make_unique<detail::RegistrationListenerList>()) | ||||
| , cond_var_() | ||||
| , guard_(std::make_shared<Guard>()) | ||||
| Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique<detail::RegistrationListenerList>()), guard_(std::make_shared<Guard>()) | ||||
| {} | ||||
|  | ||||
| Dispatcher::~Dispatcher() { | ||||
|  | ||||
| @ -96,7 +96,7 @@ class TORCH_API Dispatcher final { | ||||
|   friend class TypedOperatorHandle; | ||||
|  | ||||
|   struct Guard final { | ||||
|     Guard() : alive(true), mutex() {} | ||||
|     Guard() : alive(true) {} | ||||
|     std::atomic<bool> alive; | ||||
|     std::mutex mutex; | ||||
|   }; | ||||
|  | ||||
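Several hunks above and below drop members from constructor initializer lists when default construction is sufficient (mutexes, containers, optionals, empty functors). A small illustrative sketch, not tied to the Dispatcher types:

```cpp
// Members with usable default constructors need not appear in the
// member-initializer list; listing them only adds noise.
#include <atomic>
#include <mutex>
#include <string>
#include <vector>

struct Guard {
  Guard() : alive(true) {}  // 'mutex' is default-constructed implicitly
  std::atomic<bool> alive;
  std::mutex mutex;
};

struct Registry {
  Registry() = default;  // no need to spell out table_() and names_()
  std::vector<int> table_;
  std::vector<std::string> names_;
};
```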
| @ -62,17 +62,7 @@ static const auto& getDispatchTableIndexToKey() { | ||||
| } | ||||
|  | ||||
| OperatorEntry::OperatorEntry(OperatorName&& operator_name) | ||||
| : name_(std::move(operator_name)) | ||||
| , schema_() | ||||
| #ifndef C10_MOBILE | ||||
| , tags_() | ||||
| #endif | ||||
| , dispatchTable_() | ||||
| , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) | ||||
| , kernels_() | ||||
| , cpp_signature_() | ||||
| , sym_cpp_signature_() | ||||
| , is_observed_(ObservedOperators::isObserved(name_)) | ||||
| : name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_)) | ||||
| { | ||||
|   // Pick up any backend fallbacks that were registered prior to this | ||||
|   // OperatorEntry being created. | ||||
|  | ||||
| @ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i | ||||
|         } | ||||
|         next++; | ||||
|       } else { | ||||
|         if (allowlist.substr(cur).compare(item) == 0) { | ||||
|         if (allowlist.substr(cur) == item) { | ||||
|           return true; | ||||
|         } | ||||
|         break; | ||||
|  | ||||
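The change above swaps `compare(...) == 0` for `operator==` on `std::string_view`; the two are equivalent, as this tiny standalone check illustrates (the sample allowlist string is made up for the example).

```cpp
#include <cassert>
#include <string_view>

int main() {
  std::string_view allowlist = "aten::add;aten::mul";
  std::string_view item = "aten::mul";
  // operator== and compare(...) == 0 agree; the former reads more directly.
  assert(allowlist.substr(10) == item);
  assert(allowlist.substr(10).compare(item) == 0);
  return 0;
}
```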
| @ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( | ||||
|  | ||||
|   std::optional<FunctionSchema> inferred_schema = std::nullopt; | ||||
|   for (const auto& kernel : options.kernels) { | ||||
|     if (nullptr != kernel.inferred_function_schema.get()) { | ||||
|     if (nullptr != kernel.inferred_function_schema) { | ||||
|       if (!inferred_schema.has_value()) { | ||||
|         inferred_schema = *kernel.inferred_function_schema; | ||||
|         break; | ||||
|  | ||||
| @ -411,7 +411,6 @@ public: | ||||
|  | ||||
|     Options() | ||||
|     : schemaOrName_(std::nullopt) | ||||
|     , kernels() | ||||
|     , aliasAnalysisKind_(std::nullopt) | ||||
|     {} | ||||
|  | ||||
| @ -420,7 +419,6 @@ public: | ||||
|     struct KernelRegistrationConfig final { | ||||
|       KernelRegistrationConfig() | ||||
|         : dispatch_key(std::nullopt) | ||||
|         , func() | ||||
|         , cpp_signature(std::nullopt) | ||||
|         , inferred_function_schema(nullptr) | ||||
|       {} | ||||
|  | ||||
| @ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi { | ||||
|     // Because loadu(const void* ptr, T count) requires zero initialization for | ||||
|     // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 | ||||
|     // bits of the result are undefined. | ||||
|     // TODO<leslie> We can use _mm256_zextsi128_si256 in the furture, | ||||
|     // TODO<leslie> We can use _mm256_zextsi128_si256 in the future, | ||||
|     // since gcc 9.3 doesn't support it now. | ||||
|     __m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr)); | ||||
|     return _mm256_castsi128_si256(input_128); | ||||
| @ -1844,7 +1844,7 @@ Vectorized<int16_t> inline shift_256_16( | ||||
|     c0 = _mm256_srav_epi32(a0, b0); | ||||
|   c0 = _mm256_shuffle_epi8(c0, ctl_1_0); | ||||
|  | ||||
|   // Peform shifting the same way for input array elements with | ||||
|   // Perform shifting the same way for input array elements with | ||||
|   // idx%2==1. | ||||
|   __m256i a1 = _mm256_and_si256(a, keep_1); | ||||
|   __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); | ||||
| @ -2180,7 +2180,7 @@ Vectorized<T> inline shift_256_8( | ||||
|     c0 = _mm256_srlv_epi32(a0, b0); | ||||
|   c0 = _mm256_shuffle_epi8(c0, ctl_3_0); | ||||
|  | ||||
|   // Peform shifting the same way for input array elements with | ||||
|   // Perform shifting the same way for input array elements with | ||||
|   // idx%4==1. | ||||
|   __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); | ||||
|   __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); | ||||
| @ -2193,7 +2193,7 @@ Vectorized<T> inline shift_256_8( | ||||
|     c1 = _mm256_srlv_epi32(a1, b1); | ||||
|   c1 = _mm256_shuffle_epi8(c1, ctl_3_1); | ||||
|  | ||||
|   // Peform shifting the same way for input array elements with | ||||
|   // Perform shifting the same way for input array elements with | ||||
|   // idx%4==2. | ||||
|   __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); | ||||
|   __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); | ||||
| @ -2206,7 +2206,7 @@ Vectorized<T> inline shift_256_8( | ||||
|     c2 = _mm256_srlv_epi32(a2, b2); | ||||
|   c2 = _mm256_shuffle_epi8(c2, ctl_3_2); | ||||
|  | ||||
|   // Peform shifting the same way for input array elements with | ||||
|   // Perform shifting the same way for input array elements with | ||||
|   // idx%4==3. | ||||
|   __m256i a3 = _mm256_and_si256(a, keep_3); | ||||
|   __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); | ||||
|  | ||||
| @ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi { | ||||
|     // Because loadu(const void* ptr, T count) requires zero initialization for | ||||
|     // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 | ||||
|     // bits of the result are undefined. | ||||
|     // TODO<leslie> We can use _mm512_zextsi128_si512 in the furture, | ||||
|     // TODO<leslie> We can use _mm512_zextsi128_si512 in the future, | ||||
|     // since gcc 9.3 doesn't support it now. | ||||
|     __m128i input_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); | ||||
|     return _mm512_castsi128_si512(input_128); | ||||
| @ -2022,7 +2022,7 @@ Vectorized<T> inline shift_512_8( | ||||
|     c0 = _mm512_srlv_epi16(a0, b0); | ||||
|   c0 = _mm512_shuffle_epi8(c0, ctl_1_0); | ||||
|  | ||||
|   // Peform shifting the same way for input array elements with | ||||
|   // Perform shifting the same way for input array elements with | ||||
|   // idx%2==1. | ||||
|   __m512i a1 = _mm512_and_si512(a, keep_1); | ||||
|   __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); | ||||
|  | ||||
| @ -323,7 +323,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< | ||||
|     descriptor_.reset(raw_descriptor); | ||||
|   } | ||||
|   template <typename T> | ||||
|   inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { | ||||
|   void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { | ||||
|     // NOLINTNEXTLINE(bugprone-sizeof-expression) | ||||
|     TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value))); | ||||
|   } | ||||
| @ -345,7 +345,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< | ||||
|     descriptor_.reset(raw_descriptor); | ||||
|   } | ||||
|   template <typename T> | ||||
|   inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { | ||||
|   void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { | ||||
|     TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); | ||||
|   } | ||||
| }; | ||||
| @ -360,7 +360,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< | ||||
|     descriptor_.reset(raw_descriptor); | ||||
|   } | ||||
|   template <typename T> | ||||
|   inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { | ||||
|   void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { | ||||
|     TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); | ||||
|   } | ||||
| }; | ||||
| @ -395,7 +395,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D | ||||
|     computeType = CUBLAS_COMPUTE_64F; | ||||
|     scaleType = CUDA_R_64F; | ||||
|   } else if constexpr (std::is_same_v<Dtype, float>) { | ||||
|     if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { | ||||
|     if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { | ||||
|       computeType = CUBLAS_COMPUTE_32F_FAST_TF32; | ||||
|     } | ||||
|   } else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) { | ||||
| @ -1559,7 +1559,7 @@ bool gemm_and_bias( | ||||
|     computeType = CUBLAS_COMPUTE_64F; | ||||
|     scaleType = CUDA_R_64F; | ||||
|   } else if constexpr (std::is_same_v<Dtype, float>) { | ||||
|     if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { | ||||
|     if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { | ||||
|       computeType = CUBLAS_COMPUTE_32F_FAST_TF32; | ||||
|     } | ||||
|   } else if constexpr (std::is_same_v<Dtype, at::Half>) { | ||||
|  | ||||
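The hunks above pick a cuBLAS compute type per template instantiation and honour the global TF32 setting. Below is a standalone sketch of that `if constexpr` dispatch pattern; the enum values are illustrative placeholders, not the real cuBLAS constants.

```cpp
#include <complex>
#include <iostream>
#include <type_traits>

enum class ComputeType { F32, F32_FAST_TF32, F64 };

template <typename Dtype>
ComputeType pick_compute_type(bool allow_tf32) {
  if constexpr (std::is_same_v<Dtype, double> ||
                std::is_same_v<Dtype, std::complex<double>>) {
    return ComputeType::F64;
  } else if constexpr (std::is_same_v<Dtype, float>) {
    // Mirrors the float branch above: TF32 only when the global setting allows it.
    return allow_tf32 ? ComputeType::F32_FAST_TF32 : ComputeType::F32;
  } else {
    return ComputeType::F32;
  }
}

int main() {
  std::cout << static_cast<int>(pick_compute_type<float>(true)) << "\n";   // 1
  std::cout << static_cast<int>(pick_compute_type<double>(false)) << "\n"; // 2
  return 0;
}
```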
| @ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) { | ||||
|         offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); | ||||
|     // Ensures the increment does not cause overflow. | ||||
|     TORCH_INTERNAL_ASSERT( | ||||
|         offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment, | ||||
|         offset_intragraph_ <= std::numeric_limits<uint64_t>::max() - increment, | ||||
|         "Increment causes overflow in the offset value."); | ||||
|     offset_intragraph_ += increment; | ||||
|   } else { | ||||
| @ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { | ||||
|  */ | ||||
| PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { | ||||
|   if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { | ||||
|     uint32_t offset = state_->offset_intragraph_; | ||||
|     uint64_t offset = state_->offset_intragraph_; | ||||
|     state_->increase(increment); | ||||
|     return PhiloxCudaState( | ||||
|         state_->seed_extragraph_.data_ptr<int64_t>(), | ||||
|  | ||||
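Widening the offset to `uint64_t` above keeps the overflow guard meaningful. A minimal standalone sketch of the check pattern (the function name is illustrative):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>

std::uint64_t advance_offset(std::uint64_t offset, std::uint64_t increment) {
  // "offset <= max - increment" avoids the wrap-around that a naive
  // "offset + increment <= max" test would silently permit.
  if (offset > std::numeric_limits<std::uint64_t>::max() - increment) {
    throw std::overflow_error("Increment causes overflow in the offset value.");
  }
  return offset + increment;
}

int main() {
  std::cout << advance_offset(0, 4) << "\n";  // 4
  try {
    advance_offset(std::numeric_limits<std::uint64_t>::max(), 4);
  } catch (const std::overflow_error& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```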
| @ -96,16 +96,16 @@ struct CUDAGraph; | ||||
| struct CUDAGeneratorState : public c10::intrusive_ptr_target { | ||||
|   uint64_t seed_; | ||||
|   uint64_t philox_offset_per_thread_; | ||||
|   uint32_t offset_intragraph_; | ||||
|   uint64_t offset_intragraph_; | ||||
|   bool capturing_{}; | ||||
|   std::unordered_set<cuda::CUDAGraph*> registered_graphs_; | ||||
|   at::TensorBase seed_extragraph_{}; | ||||
|   at::TensorBase offset_extragraph_{}; | ||||
|   at::TensorBase seed_extragraph_; | ||||
|   at::TensorBase offset_extragraph_; | ||||
|  | ||||
|   CUDAGeneratorState( | ||||
|       uint64_t seed = default_rng_seed_val, | ||||
|       uint64_t philox_offset_per_thread = 0, | ||||
|       uint32_t offset_intragraph = 0) | ||||
|       uint64_t offset_intragraph = 0) | ||||
|       : seed_(seed), | ||||
|         philox_offset_per_thread_(philox_offset_per_thread), | ||||
|         offset_intragraph_(offset_intragraph) {} | ||||
| @ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { | ||||
|   CUDAGeneratorImpl* clone_impl() const override; | ||||
|  | ||||
|   c10::intrusive_ptr<CUDAGeneratorState> state_; | ||||
|   std::atomic_flag no_reset_rnn_state_{}; | ||||
|   std::atomic_flag no_reset_rnn_state_; | ||||
| }; | ||||
|  | ||||
| namespace cuda::detail { | ||||
|  | ||||
| @ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { | ||||
|  | ||||
|   // the ID assigned by cuda during graph capture, | ||||
|   // used to identify when a stream is participating in capture | ||||
|   CaptureId_t capture_id_ = -1; | ||||
|   CaptureId_t capture_id_ = 0; | ||||
|  | ||||
|   // uuid used to request a particular private mempool from CUDACachingAllocator. | ||||
|   // By default, this will be set to {id_, 0}. | ||||
|  | ||||
| @ -6,43 +6,15 @@ | ||||
| #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch) | ||||
| #endif | ||||
|  | ||||
| // cuSparse Generic API added in CUDA 10.1 | ||||
| // Windows support added in CUDA 11.0 | ||||
| #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32))) | ||||
| #define AT_USE_CUSPARSE_GENERIC_API() 1 | ||||
| #else | ||||
| #define AT_USE_CUSPARSE_GENERIC_API() 0 | ||||
| #endif | ||||
|  | ||||
| // cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0 | ||||
| #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ | ||||
|     (CUSPARSE_VERSION < 12000) | ||||
| #define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1 | ||||
| #else | ||||
| #define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0 | ||||
| #endif | ||||
|  | ||||
| #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ | ||||
|     (CUSPARSE_VERSION >= 12000) | ||||
| #define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1 | ||||
| #else | ||||
| #define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0 | ||||
| #endif | ||||
|  | ||||
| #if defined(USE_ROCM) | ||||
| // hipSparse const API added in v2.4.0 | ||||
| #if HIPSPARSE_VERSION >= 200400 | ||||
| #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 | ||||
| #define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 | ||||
| #define AT_USE_HIPSPARSE_GENERIC_API() 1 | ||||
| #else | ||||
| #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 | ||||
| #define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 | ||||
| #define AT_USE_HIPSPARSE_GENERIC_API() 1 | ||||
| #endif | ||||
| #else // USE_ROCM | ||||
| #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 | ||||
| #define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 | ||||
| #define AT_USE_HIPSPARSE_GENERIC_API() 0 | ||||
| #endif // USE_ROCM | ||||
|  | ||||
|  | ||||
| @ -12,8 +12,6 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { | ||||
|   return cusparseDestroyDnMat(const_cast<cusparseDnMatDescr*>(dnMatDescr)); | ||||
| } | ||||
|  | ||||
| #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| // If a specific GPU model does not provide native support for a given data | ||||
| @ -210,6 +208,4 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 | ||||
|   descriptor_.reset(raw_descriptor); | ||||
| } | ||||
|  | ||||
| #endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() | ||||
|  | ||||
| } // namespace at::cuda::sparse | ||||
|  | ||||
| @ -35,7 +35,6 @@ class CuSparseDescriptor { | ||||
|   std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_; | ||||
| }; | ||||
|  | ||||
| #if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() | ||||
| template <typename T, cusparseStatus_t (*destructor)(const T*)> | ||||
| struct ConstCuSparseDescriptorDeleter { | ||||
|   void operator()(T* x) { | ||||
| @ -58,7 +57,6 @@ class ConstCuSparseDescriptor { | ||||
|  protected: | ||||
|   std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_; | ||||
| }; | ||||
| #endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS | ||||
|  | ||||
| #if defined(USE_ROCM) | ||||
| using cusparseMatDescr = std::remove_pointer_t<hipsparseMatDescr_t>; | ||||
| @ -123,39 +121,8 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info | ||||
|  | ||||
| #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE | ||||
|  | ||||
| #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() | ||||
|  | ||||
| cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); | ||||
|  | ||||
| #if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() | ||||
| class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor | ||||
|     : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> { | ||||
|  public: | ||||
|   explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); | ||||
| }; | ||||
|  | ||||
| class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor | ||||
|     : public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> { | ||||
|  public: | ||||
|   explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); | ||||
|   cusparseDnMatDescr* unsafe_mutable_descriptor() const { | ||||
|     return const_cast<cusparseDnMatDescr*>(descriptor()); | ||||
|   } | ||||
|   cusparseDnMatDescr* unsafe_mutable_descriptor() { | ||||
|     return const_cast<cusparseDnMatDescr*>(descriptor()); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor | ||||
|     : public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> { | ||||
|  public: | ||||
|   explicit CuSparseDnVecDescriptor(const Tensor& input); | ||||
| }; | ||||
|  | ||||
| class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor | ||||
|     : public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {}; | ||||
|  | ||||
| #elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() | ||||
|   class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor | ||||
|       : public ConstCuSparseDescriptor< | ||||
|             cusparseDnMatDescr, | ||||
| @ -194,7 +161,6 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor | ||||
|       : public ConstCuSparseDescriptor< | ||||
|             cusparseSpMatDescr, | ||||
|             &cusparseDestroySpMat> {}; | ||||
| #endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() | ||||
|  | ||||
| class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor | ||||
|     : public CuSparseSpMatDescriptor { | ||||
| @ -283,6 +249,4 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() | ||||
|  | ||||
| } // namespace at::cuda::sparse | ||||
|  | ||||
| @ -222,15 +222,15 @@ struct CUDACachingHostAllocatorImpl | ||||
|       size_t numThreads, | ||||
|       size_t pageSize) { | ||||
|     uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); | ||||
|     uintptr_t end = (uintptr_t)start + (size / numThreads); | ||||
|     uintptr_t end = start + (size / numThreads); | ||||
|     if (i == (numThreads - 1)) { | ||||
|       end = (uintptr_t)ptr + size; | ||||
|     } | ||||
|  | ||||
|     // pre-fault/map the pages by setting the first byte of the page | ||||
|     uintptr_t alignedStart = | ||||
|         (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); | ||||
|     for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { | ||||
|         ((start + pageSize - 1) & ~(pageSize - 1)); | ||||
|     for (uintptr_t p = alignedStart; p < (end); p += pageSize) { | ||||
|       // NOLINTNEXTLINE(performance-no-int-to-ptr) | ||||
|       memset((void*)p, 0, 1); | ||||
|     } | ||||
|  | ||||
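The pre-faulting loop above rounds the start address up to the next page boundary with a mask trick. A standalone sketch of that align-up idiom, which assumes the page size is a power of two:

```cpp
#include <cassert>
#include <cstdint>

std::uintptr_t align_up(std::uintptr_t start, std::uintptr_t page_size) {
  // Valid only for power-of-two page sizes: adding (page_size - 1) and masking
  // off the low bits lands on the first boundary at or after `start`.
  assert((page_size & (page_size - 1)) == 0 && "page_size must be a power of two");
  return (start + page_size - 1) & ~(page_size - 1);
}

int main() {
  assert(align_up(4097, 4096) == 8192);
  assert(align_up(4096, 4096) == 4096);
  return 0;
}
```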
| @ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { | ||||
|   // FP32 data type calculations based on the value of the allow_tf32 flag. | ||||
|   // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. | ||||
|   if (!NoTF32Guard::should_disable_tf32() && | ||||
|       at::globalContext().float32Precision("cuda", "matmul") == "tf32") { | ||||
|       at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { | ||||
|     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); | ||||
|   } else { | ||||
|     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); | ||||
|  | ||||
| @ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread | ||||
|  | ||||
|     // Called by the destructor.  Releases this thread's handles back into the pool. | ||||
|     void release() { | ||||
|         if(my_handles.size() > 0) { | ||||
|         if(!my_handles.empty()) { | ||||
|             auto parent = weak_parent.lock(); | ||||
|             if (!parent) { | ||||
|                 // If this thread exits after atexit handlers have completed, the | ||||
|  | ||||
| @ -19,7 +19,7 @@ struct PhiloxCudaState { | ||||
|   // Called if graph capture is underway | ||||
|   PhiloxCudaState(int64_t* seed, | ||||
|                   int64_t* offset_extragraph, | ||||
|                   uint32_t offset_intragraph) { | ||||
|                   uint64_t offset_intragraph) { | ||||
|     seed_.ptr = seed; | ||||
|     offset_.ptr = offset_extragraph; | ||||
|     offset_intragraph_ = offset_intragraph; | ||||
| @ -36,7 +36,7 @@ struct PhiloxCudaState { | ||||
|  | ||||
|   Payload seed_{}; | ||||
|   Payload offset_{}; | ||||
|   uint32_t offset_intragraph_ = 0; | ||||
|   uint64_t offset_intragraph_ = 0; | ||||
|   bool captured_ = false; | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -162,7 +162,7 @@ inline std::string ComputeTypeFor() { | ||||
| // ROCBLAS and hipBLASLt. | ||||
| template <> | ||||
| inline std::string ComputeTypeFor<float>() { | ||||
|   if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") { | ||||
|   if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { | ||||
|     return "f32_r"; | ||||
|   } else { | ||||
|     return "xf32_r"; | ||||
|  | ||||
| @ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable<ParamsT> { | ||||
|       } | ||||
|  | ||||
|       hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; | ||||
|       if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { | ||||
|       if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { | ||||
|         computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; | ||||
|       } | ||||
|       HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); | ||||
|  | ||||
| @ -141,7 +141,7 @@ class RocblasGemmOp : public Callable<GemmParams<T>> { | ||||
|  | ||||
|     TuningStatus Call(const GemmParams<T>* params) override { | ||||
|       auto input_output_type = RocBlasDataTypeFor<T>(); | ||||
|       if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) | ||||
|       if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) | ||||
|         return FAIL;  // no support for TF32 in rocBLAS | ||||
|       auto compute_type = RocBlasComputeTypeFor<T>(); | ||||
|       auto h_a = DoCastForHalfOrBfloat16(params->alpha); | ||||
| @ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> | ||||
|  | ||||
|     TuningStatus Call(const GemmStridedBatchedParams<T>* params) override { | ||||
|       auto input_output_type = RocBlasDataTypeFor<T>(); | ||||
|       if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) | ||||
|       if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) | ||||
|         return FAIL;  // no support for TF32 in rocBLAS | ||||
|       auto compute_type = RocBlasComputeTypeFor<T>(); | ||||
|       auto h_a = DoCastForHalfOrBfloat16(params->alpha); | ||||
|  | ||||
| @ -404,8 +404,6 @@ TuningContext::TuningContext() : | ||||
|     max_warmup_iterations_{0}, | ||||
|     icache_flush_{true}, | ||||
|     rotating_buffer_size_{-1}, | ||||
|     filename_{}, | ||||
|     untuned_file_{}, | ||||
|     results_count_from_input_file_{0}, | ||||
|     is_shutting_down_{false} | ||||
| { | ||||
|  | ||||
| @ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo | ||||
|     size[i] = (int) t.size(i); | ||||
|   } | ||||
|   for (const auto i : c10::irange(dim, pad)) { | ||||
|     size[i] = (int) 1; | ||||
|     size[i] = 1; | ||||
|   } | ||||
|   dim = std::max(dim, pad); | ||||
|   cudnnTensorFormat_t filter_format{}; | ||||
|  | ||||
| @ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper; | ||||
|  | ||||
| template <char const *op_name, typename F, F Func, typename A, typename... T> | ||||
| struct LinalgCheckMatrixUnaryRuleHelper<op_name, F, Func, typelist<A, T...>> { | ||||
|   static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) { | ||||
|   static Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) { | ||||
|     TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions."); | ||||
|     return moveBatchDimToFront(tensor, batch_dim); | ||||
|   } | ||||
| @ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper; | ||||
|  | ||||
| template <char const *op_name, typename F, F Func, typename A, typename B, typename... T> | ||||
| struct LinalgCheckMatrixBinaryRuleHelper<op_name, F, Func, typelist<A, B, T...>> { | ||||
|   static inline std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs( | ||||
|   static std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs( | ||||
|       const Tensor& first, std::optional<int64_t> first_bdim, | ||||
|       const Tensor& second, std::optional<int64_t> second_bdim) { | ||||
|     TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2, | ||||
|  | ||||
| @ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, | ||||
| template<typename scalar_t> | ||||
| scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy); | ||||
|  | ||||
| static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { | ||||
| static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) { | ||||
|   return n == 1 || lda >= std::max<int64_t>(1L, m); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) cons | ||||
| template <typename key_t, typename value_t> | ||||
| struct KernelCache  { | ||||
|   using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>; | ||||
|   static inline std::shared_ptr<value_t>&& fetch_or_create( | ||||
|   static std::shared_ptr<value_t>&& fetch_or_create( | ||||
|       const key_t& key, | ||||
|       const std::function<std::shared_ptr<value_t>()>& callback) { | ||||
|     auto&& search = get_store().find(key); | ||||
| @ -1003,7 +1003,7 @@ struct KernelCache  { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   static inline kstore_t& get_store() { | ||||
|   static kstore_t& get_store() { | ||||
|     static thread_local kstore_t cache_kernels; | ||||
|     return cache_kernels; | ||||
|   } | ||||
| @ -1067,7 +1067,7 @@ struct GemmHelper { | ||||
| struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> { | ||||
|   // Fetch/create GemmHelper object and execute brgemm with batch size = 1 | ||||
|   template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c> | ||||
|   static inline void call( | ||||
|   static void call( | ||||
|       int64_t M, | ||||
|       int64_t N, | ||||
|       int64_t K, | ||||
| @ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> { | ||||
|         .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data()); | ||||
|   } | ||||
|  | ||||
|   static inline std::shared_ptr<GemmHelper>& get_current() { | ||||
|   static std::shared_ptr<GemmHelper>& get_current() { | ||||
|     static thread_local std::shared_ptr<GemmHelper> current; | ||||
|     return current; | ||||
|   } | ||||
|  | ||||
|   static inline bool device_check(ScalarType dtype) { | ||||
|   static bool device_check(ScalarType dtype) { | ||||
|     if (!at::globalContext().userEnabledMkldnn()) { | ||||
|       return false; | ||||
|     } | ||||
| @ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B; | ||||
| using pack_t = dnnl::ukernel::transform; | ||||
| #endif | ||||
| struct Pack : public KernelCache <PackKey, pack_t> { | ||||
|   static inline void call( | ||||
|   static void call( | ||||
|       int64_t K, | ||||
|       int64_t N, | ||||
|       int64_t ld_in, | ||||
| @ -1182,7 +1182,7 @@ struct Pack : public KernelCache <PackKey, pack_t> { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   static inline bool could_pack(ScalarType dtype) { | ||||
|   static bool could_pack(ScalarType dtype) { | ||||
|     if (!at::globalContext().userEnabledMkldnn()) { | ||||
|       return false; | ||||
|     } | ||||
|  | ||||
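The KernelCache/Brgemm/Pack hunks above only drop redundant `inline` specifiers on static member functions, but the structure they touch is a thread-local fetch-or-create cache for ukernel objects. A generic sketch of that pattern follows; the template is an illustration (the real cache keys on hashed BrgemmKey/PackKey types with a custom hasher).

#include <functional>
#include <memory>
#include <unordered_map>

// Look the key up in a per-thread cache and only run the expensive factory
// callback on a miss, mirroring KernelCache::fetch_or_create above.
template <typename Key, typename Value>
std::shared_ptr<Value> fetch_or_create(
    const Key& key,
    const std::function<std::shared_ptr<Value>()>& factory) {
  static thread_local std::unordered_map<Key, std::shared_ptr<Value>> cache;
  auto it = cache.find(key);
  if (it != cache.end()) {
    return it->second;
  }
  auto created = factory();
  cache.emplace(key, created);
  return created;
}
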
| @ -702,7 +702,7 @@ static void check_shape_forward(const at::Tensor& input, | ||||
|       // If kernel size is incorrect | ||||
|       std::ostringstream input_ss; | ||||
|       std::ostringstream kernel_ss; | ||||
|       std::string separator = ""; | ||||
|       std::string separator; | ||||
|  | ||||
|       for (int i = 0, len = input_shape.size(); i < len; ++i) { | ||||
|         input_ss << separator << input_shape[i]; | ||||
| @ -1019,7 +1019,7 @@ static Tensor convolution_same( | ||||
|  | ||||
|   if (symmetric_padding) { | ||||
|     // All backends handle symmetric padding natively | ||||
|     SymDimVector output_padding(static_cast<size_t>(dim)); | ||||
|     SymDimVector output_padding(dim); | ||||
|     return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, | ||||
|                                false, output_padding, groups); | ||||
|   } | ||||
| @ -1039,7 +1039,7 @@ static Tensor convolution_same( | ||||
|     } | ||||
|   } | ||||
|   auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); | ||||
|   SymDimVector output_padding(static_cast<size_t>(dim)); | ||||
|   SymDimVector output_padding(dim); | ||||
|   return at::convolution_symint(padded_input, weight, bias, stride, padding_l, | ||||
|                                 dilation, false, output_padding, groups); | ||||
| } | ||||
| @ -1174,7 +1174,7 @@ at::Tensor convolution( | ||||
|   bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); | ||||
|   return at::_convolution(input, weight, bias, stride, padding, dilation, | ||||
|                           transposed, output_padding, groups, | ||||
|                           ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv")); | ||||
|                           ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV)); | ||||
| } | ||||
|  | ||||
| at::Tensor convolution_overrideable( | ||||
| @ -1319,7 +1319,7 @@ ConvBackend select_conv_backend( | ||||
|   params.benchmark = ctx.benchmarkCuDNN(); | ||||
|   params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); | ||||
|   params.cudnn_enabled = ctx.userEnabledCuDNN(); | ||||
|   params.allow_tf32 = ctx.allowTF32CuDNN("conv"); | ||||
|   params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); | ||||
|  | ||||
|   auto input = input_r; | ||||
|   auto weight = weight_r; | ||||
| @ -1699,7 +1699,7 @@ at::Tensor _convolution( | ||||
|   c10::MaybeOwned<Tensor> bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt); | ||||
|   const Tensor& bias_r = *bias_r_maybe_owned; | ||||
|  | ||||
|   return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv")); | ||||
|   return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV)); | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor, Tensor, Tensor> convolution_backward_overrideable( | ||||
| @ -1997,7 +1997,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward( | ||||
|   params.benchmark = ctx.benchmarkCuDNN(); | ||||
|   params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); | ||||
|   params.cudnn_enabled = ctx.userEnabledCuDNN(); | ||||
|   params.allow_tf32 = ctx.allowTF32CuDNN("conv"); | ||||
|   params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); | ||||
|  | ||||
|   // Validate inputs. | ||||
|   check_shape_backward(input, weight.sizes(), params); | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/native/Copy.h> | ||||
| #include <ATen/native/Copy.h> | ||||
|  | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/Dispatch.h> | ||||
|  | ||||
| @ -70,7 +70,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) | ||||
|         new_shape.emplace_back(input_sizes[i]); | ||||
|     } | ||||
|  | ||||
|     for (const auto i : c10::irange((size_t)l_pad)) { | ||||
|     for (const auto i : c10::irange(l_pad)) { | ||||
|         auto pad_idx = pad.size() - ((i + 1) * 2); | ||||
|         auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; | ||||
|         TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", | ||||
|  | ||||
| @ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar | ||||
|     int64_t sgn = (xstep > 0) - (xstep < 0); | ||||
|     size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); | ||||
|   } else { | ||||
|     size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>()) | ||||
|     size_d = std::ceil((end.to<double>() - start.to<double>()) | ||||
|                         / step.to<double>()); | ||||
|   } | ||||
|  | ||||
|  | ||||
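For context, the branch touched above computes how many elements arange produces from floating-point start/end/step; the removed cast was redundant because the operands are already doubles. A small sketch of the formula is below; the function name is an assumption, and the step-sign and overflow validation that surrounds this in the real routine is omitted.

#include <cmath>
#include <cstdint>

// Element count of arange(start, end, step) for floating-point inputs:
// ceil((end - start) / step).
int64_t arange_size(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// e.g. arange_size(0.0, 1.0, 0.3) == 4   -> {0.0, 0.3, 0.6, 0.9}
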
| @ -107,11 +107,6 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { | ||||
|   storage->set_nbytes(size_bytes); | ||||
| } | ||||
|  | ||||
| // Call the sparse implementation in SparseTensor.cpp directly. | ||||
| // A dynamic dispatch here is NOT necessary, so I didn't put | ||||
| // this function in native_functions.yaml | ||||
| const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); | ||||
|  | ||||
| // TODO(VitalyFedyunin): Move it to HTML docs. | ||||
| // | ||||
| // Strides of the output tensor of `resize_as_` operator is defined by input | ||||
|  | ||||
| @ -145,12 +145,6 @@ | ||||
| #include <utility> | ||||
| #include <vector> | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); | ||||
|  | ||||
| } // namespace at::native | ||||
|  | ||||
| namespace at::meta { | ||||
|  | ||||
| TORCH_META_FUNC(gather) | ||||
|  | ||||
| @ -73,7 +73,6 @@ | ||||
| #include <ATen/ops/where_native.h> | ||||
| #include <ATen/ops/zeros_like.h> | ||||
|  | ||||
| #include <iostream> | ||||
| #include <utility> | ||||
| #endif | ||||
|  | ||||
|  | ||||
| @ -124,7 +124,7 @@ struct IsUnique {}; | ||||
|  | ||||
| template <typename scalar_t> | ||||
| struct IsUnique<scalar_t, false> { | ||||
|   inline bool operator() (scalar_t* data_ptr, int64_t i) { | ||||
|   bool operator() (scalar_t* data_ptr, int64_t i) { | ||||
|     if (i == 0) { return true; } | ||||
|     return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]); | ||||
|   } | ||||
| @ -132,7 +132,7 @@ struct IsUnique<scalar_t, false> { | ||||
|  | ||||
| template <typename scalar_t> | ||||
| struct IsUnique<scalar_t, true> { | ||||
|   inline bool operator() (scalar_t* data_ptr, int64_t i) { | ||||
|   bool operator() (scalar_t* data_ptr, int64_t i) { | ||||
|     if (i == 0) { return true; } | ||||
|     return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1])) | ||||
|         && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1])); | ||||
|  | ||||
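The two IsUnique specializations above (the diff only drops `inline`) decide whether sorted element i starts a new run of unique values; the second one additionally treats adjacent NaNs as equal, since NaN != NaN would otherwise mark every NaN as unique. A sketch of that predicate for double data follows; the free-function form and its name are illustrative, not the PyTorch functor.

#include <cmath>
#include <cstdint>

// True when data[i] begins a new group in sorted input. With nan_aware set,
// two adjacent NaNs count as duplicates even though NaN != NaN.
bool is_new_unique_value(const double* data, int64_t i, bool nan_aware) {
  if (i == 0) {
    return true;
  }
  if (nan_aware && std::isnan(data[i]) && std::isnan(data[i - 1])) {
    return false;
  }
  return data[i] != data[i - 1];
}
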
| @ -4,7 +4,6 @@ | ||||
|  | ||||
| #include <ATen/OpMathType.h> | ||||
| #include <ATen/TensorUtils.h> | ||||
| #include <ATen/OpMathType.h> | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/cpu/vec/functional.h> | ||||
| #include <ATen/cpu/vec/vec.h> | ||||
|  | ||||
| @ -17,7 +17,7 @@ | ||||
|  | ||||
| namespace ao::sparse { | ||||
|  | ||||
| int register_linear_params(); | ||||
|  | ||||
|  | ||||
| #ifdef USE_FBGEMM | ||||
|  | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  | ||||
| namespace ao::sparse { | ||||
|  | ||||
| int register_linear_params(); | ||||
|  | ||||
|  | ||||
| #ifdef USE_FBGEMM | ||||
| namespace { | ||||
|  | ||||
| @ -16,7 +16,7 @@ | ||||
| #endif | ||||
|  | ||||
| namespace ao::sparse { | ||||
| int register_linear_params(); | ||||
|  | ||||
|  | ||||
| #ifdef USE_FBGEMM | ||||
|  | ||||
|  | ||||
| @ -22,7 +22,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue) | ||||
|   old_value.floatV = *dst; | ||||
|   new_value.floatV = old_value.floatV + fvalue; | ||||
|  | ||||
|   unsigned* old_intV = (unsigned*)(&old_value.intV); | ||||
|   unsigned* old_intV = &old_value.intV; | ||||
|   while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { | ||||
| #ifdef __aarch64__ | ||||
|     __asm__ __volatile__("yield;" : : : "memory"); | ||||
|  | ||||
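The hunk above only drops a redundant cast inside the compare-and-swap loop that implements atomic float accumulation. As a standalone illustration of the same technique, here is a sketch using std::atomic<float> directly; it is not the PyTorch helper, which works on the integer bit pattern instead.

#include <atomic>

// Retry a compare-exchange until the addition lands without a racing writer.
// On failure, compare_exchange_weak refreshes `expected` with the current
// value, so each retry works from an up-to-date snapshot.
inline void atomic_add_float(std::atomic<float>& target, float value) {
  float expected = target.load(std::memory_order_relaxed);
  while (!target.compare_exchange_weak(
      expected, expected + value,
      std::memory_order_release, std::memory_order_relaxed)) {
  }
}
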
| @ -118,7 +118,7 @@ gemm_notrans_( | ||||
|   scale_(m, n, beta, c, ldc); | ||||
|  | ||||
|   // c += alpha * (a @ b) | ||||
|   const uint64_t unsigned_m = static_cast<int64_t>(m); | ||||
|   const uint64_t unsigned_m = m; | ||||
|   const uint64_t i_m = unsigned_m / 4; | ||||
|   for (const uint64_t l : c10::irange(k)) { | ||||
|     for (const uint64_t j : c10::irange(n)) { | ||||
|  | ||||
| @ -8,7 +8,6 @@ | ||||
| #include <c10/util/irange.h> | ||||
| #include <ATen/OpMathType.h> | ||||
| #include <ATen/native/cpu/utils.h> | ||||
| #include <ATen/OpMathType.h> | ||||
|  | ||||
| namespace at::native { | ||||
| inline namespace CPU_CAPABILITY { | ||||
|  | ||||
| @ -17,7 +17,6 @@ | ||||
| #include <ATen/cpu/vec/functional.h> | ||||
| #include <ATen/cpu/vec/vec.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <ATen/OpMathType.h> | ||||
|  | ||||
| // [Note AVX-SSE transitions] In general we avoid calls into cmath for code | ||||
| // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in | ||||
|  | ||||
| @ -240,7 +240,7 @@ static void unfolded2d_copy( | ||||
|     int64_t output_height, | ||||
|     int64_t output_width) { | ||||
|   at::parallel_for( | ||||
|       0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { | ||||
|       0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { | ||||
|         for (const auto k : c10::irange(start, end)) { | ||||
|           int64_t nip = k / (kH * kW); | ||||
|           int64_t rest = k % (kH * kW); | ||||
| @ -316,7 +316,7 @@ static void unfolded2d_copy( | ||||
|                 for (int64_t x = 0; x < output_width; x++) | ||||
|                   memcpy( | ||||
|                       dst + (size_t)y * output_width + x, | ||||
|                       src + (size_t)iy * input_width + ix + (int64_t)x * dW, | ||||
|                       src + (size_t)iy * input_width + ix + x * dW, | ||||
|                       sizeof(scalar_t) * (1)); | ||||
|               } | ||||
|             } | ||||
|  | ||||
| @ -906,7 +906,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel( | ||||
|           // Round to nearest integer | ||||
|           const int32_t nudged_zero_point0 = lrintf(zero_point0); | ||||
|  | ||||
|           int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; | ||||
|           int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; | ||||
|  | ||||
|           // LHS offset at the beginning of the row | ||||
|           *((float*)(dst_ptr)) = recip_scale0; | ||||
| @ -1048,7 +1048,7 @@ static void ref_dyn_quant_matmul_4bit_groupwise_kernel( | ||||
|       zero_point0 = (std::min)(zero_point0, qmax); | ||||
|       const int32_t nudged_zero_point0 = lrintf(zero_point0); | ||||
|  | ||||
|       int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; | ||||
|       int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; | ||||
|  | ||||
|       *((float*)(dst_ptr)) = recip_scale0; | ||||
|       dst_ptr += sizeof(float); | ||||
|  | ||||
| @ -1919,7 +1919,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca | ||||
|   TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); | ||||
|  | ||||
|  | ||||
|   addmm_out_cuda_impl(const_cast<Tensor&>(out), out, self, mat2, 0, 1); | ||||
|   addmm_out_cuda_impl(out, out, self, mat2, 0, 1); | ||||
|  | ||||
|   return out; | ||||
| } | ||||
|  | ||||
| @ -127,8 +127,7 @@ void apply_ldl_solve_cusolver( | ||||
|     const Tensor& pivots, | ||||
|     const Tensor& B, | ||||
|     bool upper) { | ||||
| #if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \ | ||||
|     CUSOLVER_VERSION >= 11102) | ||||
| #if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION)) | ||||
|   TORCH_CHECK( | ||||
|       false, | ||||
|       "Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ", | ||||
|  | ||||
| @ -169,7 +169,10 @@ std::string repro_from_args(const ConvolutionParams& params) { | ||||
|   ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; | ||||
|   ss << "import torch\n"; | ||||
|   ss << "torch.backends.cuda.matmul.allow_tf32 = " | ||||
|      << pybool(at::globalContext().float32Precision("cuda", "matmul") == "tf32") | ||||
|      << pybool( | ||||
|             at::globalContext().float32Precision( | ||||
|                 at::Float32Backend::CUDA, at::Float32Op::MATMUL) == | ||||
|             at::Float32Precision::TF32) | ||||
|      << "\n"; | ||||
|   ss << "torch.backends.cudnn.benchmark = " | ||||
|      << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; | ||||
| @ -726,7 +729,7 @@ Tensor cudnn_convolution_relu( | ||||
|  | ||||
|   auto& ctx = at::globalContext(); | ||||
|   bool benchmark = ctx.benchmarkCuDNN(); | ||||
|   bool allow_tf32 = ctx.allowTF32CuDNN("conv"); | ||||
|   bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); | ||||
|   auto _bias = bias_t.has_value() | ||||
|       ? bias_t.value() | ||||
|       : at::zeros( | ||||
| @ -784,7 +787,7 @@ Tensor cudnn_convolution_add_relu( | ||||
|   } | ||||
|  | ||||
|   auto& ctx = at::globalContext(); | ||||
|   bool allow_tf32 = ctx.allowTF32CuDNN("conv"); | ||||
|   bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); | ||||
|   bool benchmark = ctx.benchmarkCuDNN(); | ||||
|   auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0; | ||||
|   auto _bias = bias_t.has_value() | ||||
|  | ||||
| @ -76,7 +76,6 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor( | ||||
|  | ||||
| #else // AT_CUDNN_ENABLED | ||||
|  | ||||
| #include <ATen/cudnn/Descriptors.h> | ||||
| #include <ATen/cudnn/Types.h> | ||||
| #include <ATen/cudnn/Utils.h> | ||||
|  | ||||
| @ -284,9 +283,9 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor( | ||||
|   checkBackend(c, {*targets}, Backend::CUDA); | ||||
|   const auto batch_size = log_probs->size(1); | ||||
|   int64_t input_lengths_size = | ||||
|       input_lengths_.sizes().size() ? input_lengths_.size(0) : 1; | ||||
|       !input_lengths_.sizes().empty() ? input_lengths_.size(0) : 1; | ||||
|   int64_t target_lengths_size = | ||||
|       target_lengths_.sizes().size() ? target_lengths_.size(0) : 1; | ||||
|       !target_lengths_.sizes().empty() ? target_lengths_.size(0) : 1; | ||||
|   TORCH_CHECK( | ||||
|       input_lengths_size == batch_size, | ||||
|       "input_lengths needs to have size to match batch_size"); | ||||
|  | ||||
| @ -142,8 +142,6 @@ void run_cudnn_SDP_bprop_nestedtensor( | ||||
| namespace at { | ||||
| namespace native { | ||||
|  | ||||
| #include <cudnn_frontend.h> | ||||
|  | ||||
| namespace fe = cudnn_frontend; | ||||
|  | ||||
| constexpr uint8_t MAX_MHA_DIM = 4; | ||||
| @ -1379,7 +1377,7 @@ void run_cudnn_SDP_fprop( | ||||
|   cudnnHandle_t handle = getCudnnHandle(); | ||||
|  | ||||
|   // NB: The key initialization will round up sequence length, stride data etc. | ||||
|   // if use_ragged_in_dense is enabled (to allow multiple sequence lenghths to | ||||
|   // if use_ragged_in_dense is enabled (to allow multiple sequence lengths to | ||||
|   // reuse the same cached value/graph) | ||||
|   auto key = MHACacheKeyWrapper( | ||||
|       b, | ||||
|  | ||||
| @ -245,7 +245,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { | ||||
|       datatype, | ||||
|       input_datatype, | ||||
|       algo, | ||||
|       at::globalContext().allowTF32CuDNN("rnn")); | ||||
|       at::globalContext().allowTF32CuDNN(at::Float32Op::RNN)); | ||||
| #else | ||||
|     rnn_desc.set( | ||||
|         handle, | ||||
| @ -261,7 +261,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { | ||||
|         datatype, | ||||
|         input_datatype, | ||||
|         algo, | ||||
|         at::globalContext().allowTF32CuDNN("rnn")); | ||||
|         at::globalContext().allowTF32CuDNN(at::Float32Op::RNN)); | ||||
| #endif | ||||
|   return rnn_desc; | ||||
| } | ||||
|  | ||||
| @ -38,7 +38,6 @@ REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_transpose_backward_stub) | ||||
|  | ||||
| #include <ATen/native/mkldnn/MKLDNNCommon.h> | ||||
| #include <ATen/native/mkldnn/Utils.h> | ||||
| #include <ATen/native/ConvUtils.h> | ||||
| #include <c10/util/irange.h> | ||||
|  | ||||
| namespace at::native { | ||||
| @ -105,7 +104,7 @@ static void check_shape_forward(const Tensor& input, | ||||
|     // If kernel size is incorrect | ||||
|     std::ostringstream input_ss; | ||||
|     std::ostringstream kernel_ss; | ||||
|     std::string separator = ""; | ||||
|     std::string separator; | ||||
|  | ||||
|     for (int i = 0, len = input_shape.size(); i < len; ++i) { | ||||
|       input_ss << separator << input_shape[i]; | ||||
| @ -156,12 +155,12 @@ static void check_shape_forward(const Tensor& input, | ||||
| // | ||||
|  | ||||
| static bool mkldnn_conv_enabled_fpmath_mode_bf16(){ | ||||
|   return at::globalContext().float32Precision("mkldnn", "conv") == "bf16" && | ||||
|   return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::BF16 && | ||||
|       mkldnn_bf16_device_check(); | ||||
| } | ||||
|  | ||||
| static bool mkldnn_conv_enabled_fpmath_mode_tf32(){ | ||||
|   return at::globalContext().float32Precision("mkldnn", "conv") == "tf32" && | ||||
|   return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::TF32 && | ||||
|       cpuinfo_has_x86_amx_fp16(); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -69,12 +69,12 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2, | ||||
| namespace at::native { | ||||
|  | ||||
| static bool use_mkldnn_bf32_linear() { | ||||
|   return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16" && | ||||
|   return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16 && | ||||
|       mkldnn_bf16_device_check(); | ||||
| } | ||||
|  | ||||
| static bool use_mkldnn_tf32_linear() { | ||||
|   return at::globalContext().float32Precision("mkldnn", "matmul") == "tf32" && | ||||
|   return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && | ||||
|       cpuinfo_has_x86_amx_fp16(); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -111,11 +111,11 @@ static bool use_mkldnn_fp16_matmul() { | ||||
| } | ||||
|  | ||||
| static bool use_mkldnn_bf32_matmul() { | ||||
|   return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision("mkldnn", "matmul") == "bf16"; | ||||
|   return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16; | ||||
| } | ||||
|  | ||||
| static bool use_mkldnn_tf32_matmul() { | ||||
|   return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision("mkldnn", "matmul") == "tf32"; | ||||
|   return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32; | ||||
| } | ||||
|  | ||||
| // returns an ideep::tensor | ||||
|  | ||||
| @ -316,7 +316,7 @@ Tensor NestedTensor_to_padded_tensor_generic( | ||||
|     TORCH_CHECK( | ||||
|         (int64_t)output_size_.size() == ret_val.dim(), | ||||
|         "Length of output_size does not match NestedTensor dims. Broadcasting is not supported."); | ||||
|     for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) { | ||||
|     for (int64_t i = 0; i < ret_val.dim(); i++) { | ||||
|       TORCH_CHECK( | ||||
|           output_size_[i] >= ret_val.size(i), | ||||
|           "Value in output_size is less than NestedTensor padded size. Truncation is not supported."); | ||||
|  | ||||
| @ -146,12 +146,12 @@ inline TensorQuantizationParams ChooseQuantizationParams( | ||||
|   // The arithmetic error on the zero point computed from either pair | ||||
|   // will be roughly machine_epsilon * (sum of absolute values of terms) | ||||
|   // so we want to use the variant that adds the smaller terms. | ||||
|   double zero_point_from_min = qmin - min / static_cast<double>(scale); | ||||
|   double zero_point_from_max = qmax - max / static_cast<double>(scale); | ||||
|   double zero_point_from_min = qmin - min / scale; | ||||
|   double zero_point_from_max = qmax - max / scale; | ||||
|   double zero_point_from_min_error = | ||||
|       std::abs(qmin) - std::abs(min / static_cast<double>(scale)); | ||||
|       std::abs(qmin) - std::abs(min / scale); | ||||
|   double zero_point_from_max_error = | ||||
|       std::abs(qmax) - std::abs(max / static_cast<double>(scale)); | ||||
|       std::abs(qmax) - std::abs(max / scale); | ||||
|   double initial_zero_point = | ||||
|       zero_point_from_min_error < zero_point_from_max_error | ||||
|       ? zero_point_from_min | ||||
|  | ||||
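The quantization hunk above only removes redundant static_casts; the logic itself, as the original comment explains, picks whichever zero-point candidate was computed from the smaller-magnitude terms to keep the floating-point error down. A condensed sketch of that selection is below; the function name is illustrative, and the final branch to zero_point_from_max continues the ternary that the diff context truncates.

#include <cmath>

// Prefer the candidate built from smaller terms: the rounding error on each
// grows with the magnitude of the values involved.
double choose_initial_zero_point(
    double qmin, double qmax, double min, double max, double scale) {
  double zero_point_from_min = qmin - min / scale;
  double zero_point_from_max = qmax - max / scale;
  double zero_point_from_min_error = std::abs(qmin) - std::abs(min / scale);
  double zero_point_from_max_error = std::abs(qmax) - std::abs(max / scale);
  return zero_point_from_min_error < zero_point_from_max_error
      ? zero_point_from_min
      : zero_point_from_max;
}
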
| @ -560,7 +560,7 @@ float hsum_sq(const int32_t* A, int len) { | ||||
|   alignas(64) float temp[8]; | ||||
|   _mm256_store_ps(temp, sum_ps); | ||||
|   for (const auto k : c10::irange(8)) { | ||||
|     row_sum += static_cast<float>(temp[k]); | ||||
|     row_sum += temp[k]; | ||||
|   } | ||||
| #elif defined(CPU_CAPABILITY_AVX512) | ||||
|   __m512 sum_ps = _mm512_setzero_ps(); | ||||
| @ -574,7 +574,7 @@ float hsum_sq(const int32_t* A, int len) { | ||||
|   alignas(64) float temp[16]; | ||||
|   _mm512_store_ps(temp, sum_ps); | ||||
|   for (const auto k : c10::irange(16)) { | ||||
|     row_sum += static_cast<float>(temp[k]); | ||||
|     row_sum += temp[k]; | ||||
|   } | ||||
| #endif // CPU_CAPABILITY_AVX2 or CPU_CAPABILITY_AVX512 | ||||
|  | ||||
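The hsum_sq hunks drop casts that were already no-ops (temp is a float array), but the reduction they sit in is the usual one: spill the vector accumulator to an aligned scratch array and sum the lanes scalar-wise. A minimal AVX sketch of that final horizontal step follows; the function name is an assumption, and it requires compiling with AVX enabled.

#include <immintrin.h>

// Horizontal sum of an 8-lane AVX accumulator: store to aligned scratch,
// then add the lanes one by one, as in the tail of hsum_sq above.
static float hsum_lanes(__m256 acc) {
  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, acc);
  float sum = 0.0f;
  for (int i = 0; i < 8; ++i) {
    sum += tmp[i];
  }
  return sum;
}
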
| @ -1282,7 +1282,7 @@ template <bool ReLUFused = false> | ||||
| void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) { | ||||
|   int64_t zero_point = out.q_zero_point(); | ||||
|   float scale = static_cast<float>(out.q_scale()); | ||||
|   float inv_scale = static_cast<float>(1.0f / scale); | ||||
|   float inv_scale = 1.0f / scale; | ||||
|   int64_t self_zero_point = self.q_zero_point(); | ||||
|   float self_scale = static_cast<float>(self.q_scale()); | ||||
|  | ||||
| @ -2915,7 +2915,7 @@ void fake_quantize_learnable_channel_grad_kernel_cpu( | ||||
|       // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) | ||||
|       *dx_output = (*dy_input) * (xqi >= quant_min && xqi <= quant_max); | ||||
|       // Calculate gradients for scale and zero point. | ||||
|       float xfqi = static_cast<float>((std::max(std::min(xqi, quant_max), quant_min) - (*zero_point_input)) * (*scale_input)); | ||||
|       float xfqi = ((std::max(std::min(xqi, quant_max), quant_min) - (*zero_point_input)) * (*scale_input)); | ||||
|       if (xqi < quant_min || xqi > quant_max) { | ||||
|         *dzero_point_output = (*dy_input) * (-1) * (*scale_input) * grad_factor; | ||||
|         *dscale_output = ((xqi < quant_min) ? ((*dy_input) * dscale_small) : ((*dy_input) * dscale_big)) * grad_factor; | ||||
| @ -4415,7 +4415,7 @@ void _qmul_tensor_cpu_impl( | ||||
|     uint8_t y_data = *(y_ptr + idx); | ||||
|     int32_t x_val = static_cast<int32_t>(x_data) - x_zero_point; | ||||
|     int32_t y_val = static_cast<int32_t>(y_data) - y_zero_point; | ||||
|     int32_t out_val = static_cast<int32_t>(x_val * y_val); | ||||
|     int32_t out_val = x_val * y_val; | ||||
|     float out_val_f = (float)out_val * multiplier; | ||||
|     if constexpr (std::is_same<T, float>::value) { | ||||
|       *(out_ptr + idx) = out_val_f; | ||||
|  | ||||
| @ -1198,7 +1198,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl( | ||||
|       kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc); | ||||
|   ideep::tensor src(src_desc, act_contig.data_ptr()); | ||||
|   // weights & bias | ||||
|   ideep::tensor& weights = *(weight_.get()); | ||||
|   ideep::tensor& weights = *(weight_); | ||||
|   bool with_bias = bias_.has_value(); | ||||
|   const auto& kernel_size = weights.get_dims(); | ||||
|   // dst | ||||
|  | ||||
| @ -812,7 +812,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_impl( | ||||
|  | ||||
|   auto is_input_qint8 = input.scalar_type() == c10::ScalarType::QInt8; | ||||
|   auto input_contig = input.expect_contiguous(); | ||||
|   auto& w = *(weight_.get()); | ||||
|   auto& w = *weight_; | ||||
|   auto K = input.size(dim - 1), M = input.numel() / K, N = w.get_dim(1); | ||||
|   auto input_dims = {M, K}; | ||||
|   auto input_data_type = is_input_qint8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::u8; | ||||
|  | ||||
| @ -545,7 +545,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( | ||||
|       /*reduce_range=*/reduce_range); | ||||
|   const std::vector<int32_t>& src_zero_point = std::vector<int32_t>(1, q_params.zero_point); | ||||
|   // weights, dst | ||||
|   auto w = *(weight_.get()); | ||||
|   auto w = *weight_; | ||||
|   auto dst_dims = {x.get_dim(0), w.get_dim(1)}; | ||||
|   const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/q_params.scale); | ||||
|   const ideep::scale_t& weights_scales = w.get_scale(); | ||||
|  | ||||
| @ -12,7 +12,6 @@ | ||||
| #include <ATen/quantized/Quantizer.h> | ||||
| #include <c10/core/QScheme.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <torch/library.h> | ||||
|  | ||||
| #include <utility> | ||||
|  | ||||
|  | ||||
| @ -10,7 +10,6 @@ | ||||
| #include <ATen/quantized/Quantizer.h> | ||||
| #include <c10/core/QScheme.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <torch/library.h> | ||||
|  | ||||
| int register_linear_params(); | ||||
|  | ||||
|  | ||||
| @ -65,7 +65,7 @@ Tensor& addmv_out_sparse_compressed( | ||||
|       return result.zero_(); | ||||
|     } else { | ||||
|       return at::mul_out( | ||||
|           const_cast<Tensor&>(result), | ||||
|           result, | ||||
|           self, | ||||
|           at::native::scalar_tensor( | ||||
|               beta, | ||||
|  | ||||
| @ -1330,18 +1330,18 @@ Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, IntArrayRef dims_to_ | ||||
|  | ||||
| template <typename scalar_t> | ||||
| struct ReductionAddOp { | ||||
|   inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { | ||||
|   scalar_t operator()(const scalar_t& a, const scalar_t& b) const { | ||||
|     return a + b; | ||||
|   } | ||||
|   inline scalar_t identity() const { return 0; } | ||||
|   scalar_t identity() const { return 0; } | ||||
| }; | ||||
|  | ||||
| template <typename scalar_t> | ||||
| struct ReductionMulOp { | ||||
|   inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { | ||||
|   scalar_t operator()(const scalar_t& a, const scalar_t& b) const { | ||||
|     return a * b; | ||||
|   } | ||||
|   inline scalar_t identity() const { return 1; } | ||||
|   scalar_t identity() const { return 1; } | ||||
| }; | ||||
|  | ||||
| }  // namespace | ||||
|  | ||||
| @ -55,7 +55,6 @@ | ||||
| #include <ATen/ops/is_pinned_native.h> | ||||
| #include <ATen/ops/resize_as_sparse.h> | ||||
| #include <ATen/ops/resize_as_sparse_native.h> | ||||
| #include <ATen/ops/sparse_coo_tensor.h> | ||||
| #include <ATen/ops/sparse_coo_tensor_native.h> | ||||
| #include <ATen/ops/sparse_dim_native.h> | ||||
| #include <ATen/ops/sparse_mask_native.h> | ||||
|  | ||||
| @ -244,7 +244,7 @@ Tensor& addmv_out_sparse_compressed_cuda( | ||||
|       return result.zero_(); | ||||
|     } else { | ||||
|       return at::mul_out( | ||||
|           const_cast<Tensor&>(result), | ||||
|           result, | ||||
|           self, | ||||
|           at::native::scalar_tensor( | ||||
|               beta, | ||||
|  | ||||
| @ -10,7 +10,6 @@ | ||||
| #include <ATen/native/cuda/MiscUtils.h> | ||||
| #include <ATen/native/sparse/SparseBlasImpl.h> | ||||
| #include <ATen/native/sparse/cuda/SparseBlasImpl.h> | ||||
| #include <ATen/native/sparse/cuda/SparseBlasLegacy.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| @ -94,15 +93,6 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) { | ||||
|       input.sizes()); | ||||
| } | ||||
|  | ||||
| void inline bsrsv2_bsrsm2_may_need_to_sync() { | ||||
| #if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11703 | ||||
|   // cusparse bsrsv2 and bsrsm2 have a synchronization issue that may cause illegal memory access in cuda <= 11.6.x | ||||
|   // See https://github.com/pytorch/pytorch/issues/71297 | ||||
|   ::c10::cuda::device_synchronize(); | ||||
| #endif | ||||
|   // else: do nothing! | ||||
| } | ||||
|  | ||||
| void block_sparse_triangular_solve_vec( | ||||
|     const at::sparse_csr::SparseCsrTensor& A, | ||||
|     const Tensor& B, | ||||
| @ -223,7 +213,6 @@ void block_sparse_triangular_solve_vec( | ||||
|             CUSPARSE_SOLVE_POLICY_NO_LEVEL, | ||||
|             work_data.get()); | ||||
|  | ||||
|         bsrsv2_bsrsm2_may_need_to_sync(); | ||||
|       }); | ||||
|   if (!X.is_same(*X_)) { | ||||
|     X.copy_(*X_); | ||||
| @ -364,7 +353,6 @@ void block_sparse_triangular_solve_mat( | ||||
|             CUSPARSE_SOLVE_POLICY_NO_LEVEL, | ||||
|             work_data.get()); | ||||
|  | ||||
|         bsrsv2_bsrsm2_may_need_to_sync(); | ||||
|       }); | ||||
|   if (!X.is_same(*X_)) { | ||||
|     X.copy_(*X_); | ||||
| @ -665,12 +653,6 @@ void spgemm( | ||||
|     const Scalar& beta, | ||||
|     const Scalar& alpha, | ||||
|     const at::sparse_csr::SparseCsrTensor& C) { | ||||
|   // older versions of cusparse on Windows segfault for complex128 dtype | ||||
| #if defined(_WIN32) && defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11400 | ||||
|   TORCH_CHECK( | ||||
|       !(A.scalar_type() == ScalarType::ComplexDouble), | ||||
|       "Sparse multiplication with complex128 dtype inputs is not supported with current CUDA version. Please upgrade to CUDA Toolkit 11.2.1+"); | ||||
| #endif | ||||
|  | ||||
|   IntArrayRef A_sizes = A.sizes(); | ||||
|   auto ndim = A.dim(); | ||||
| @ -953,13 +935,6 @@ void addmv_out_sparse_csr( | ||||
|   if (mat.layout() == kSparseBsr) { | ||||
|     return block_sparse_mv(mat, vec, beta, alpha, result); | ||||
|   } | ||||
| #if !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()) | ||||
|   TORCH_CHECK( | ||||
|       false, | ||||
|       "Calling addmv on a sparse GPU tensor requires compiling ", | ||||
|       "PyTorch with CUDA 10.2+ (CUDA 11+ on Windows). ", | ||||
|       "Please use PyTorch built with newer CUDA version."); | ||||
| #else | ||||
|   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; | ||||
|  | ||||
|   c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_cusparse(result); | ||||
| @ -970,11 +945,10 @@ void addmv_out_sparse_csr( | ||||
|   auto descX = at::cuda::sparse::CuSparseDnVecDescriptor(*vec_); | ||||
|   auto descY = at::cuda::sparse::CuSparseDnVecDescriptor(*result_); | ||||
|  | ||||
|   // cusparseSpMVAlg_t was updated in cuda 11.2.1 (cusparse 11.4.0) | ||||
| #if CUSPARSE_VERSION >= 11400 | ||||
|   cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; | ||||
| #else | ||||
| #ifdef USE_ROCM | ||||
|   cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; | ||||
| #else | ||||
|   cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; | ||||
| #endif | ||||
|  | ||||
|   // SpMV doesn't support uniform precision computation | ||||
| @ -1027,7 +1001,6 @@ void addmv_out_sparse_csr( | ||||
|   if (!result.is_same(*result_)) { | ||||
|     result.copy_(*result_); | ||||
|   } | ||||
| #endif // !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()) | ||||
| } | ||||
|  | ||||
| /* | ||||
| @ -1245,12 +1218,8 @@ void triangular_solve_out_sparse_csr( | ||||
|       return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular); | ||||
|     } | ||||
|   } | ||||
| #if !AT_USE_CUSPARSE_GENERIC_SPSV() | ||||
|   TORCH_CHECK( | ||||
|       false, | ||||
|       "Calling triangular solve on a sparse GPU tensor requires compiling ", | ||||
|       "PyTorch with at least CUDA 11.3. ", | ||||
|       "Please use PyTorch built with newer CUDA version."); | ||||
| #ifdef USE_ROCM | ||||
|   TORCH_CHECK(false, "ROCm is not supported"); | ||||
| #else | ||||
|   c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_cusparse(X); | ||||
|   // It should be possible to use mixed memory format | ||||
| @ -1317,13 +1286,6 @@ void triangular_solve_out_sparse_csr( | ||||
|               desc_spsv.descriptor())); | ||||
|         }); | ||||
|   } else { | ||||
| #if !AT_USE_CUSPARSE_GENERIC_SPSM() | ||||
|     TORCH_CHECK( | ||||
|         false, | ||||
|         "Calling triangular solve on a sparse GPU tensor requires compiling ", | ||||
|         "PyTorch with at least CUDA 11.3.1. ", | ||||
|         "Please use PyTorch built with newer CUDA version."); | ||||
| #else | ||||
|     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( | ||||
|         X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] { | ||||
|           scalar_t alpha = 1; | ||||
| @ -1377,12 +1339,11 @@ void triangular_solve_out_sparse_csr( | ||||
|               CUSPARSE_SPSM_ALG_DEFAULT, | ||||
|               desc_spsm.descriptor())); | ||||
|         }); | ||||
| #endif // !AT_USE_CUSPARSE_GENERIC_SPSM() | ||||
|   } | ||||
|   if (!X.is_same(*X_)) { | ||||
|     X.copy_(*X_); | ||||
|   } | ||||
| #endif // !AT_USE_CUSPARSE_GENERIC_SPSV() | ||||
| #endif | ||||
| } | ||||
|  | ||||
| void sampled_addmm_out_sparse_csr( | ||||
| @ -1391,13 +1352,6 @@ void sampled_addmm_out_sparse_csr( | ||||
|     const Scalar& beta, | ||||
|     const Scalar& alpha, | ||||
|     const at::sparse_csr::SparseCsrTensor& C) { | ||||
| #if !(AT_USE_CUSPARSE_GENERIC_SDDMM() || AT_USE_HIPSPARSE_GENERIC_API()) | ||||
|   TORCH_CHECK( | ||||
|       false, | ||||
|       "Calling sampled_addmm with sparse GPU tensors requires compiling ", | ||||
|       "PyTorch with CUDA 11.2.1+. ", | ||||
|       "Please use PyTorch built with newer CUDA version."); | ||||
| #else | ||||
|   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == Layout::Strided); | ||||
|   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(B.layout() == Layout::Strided); | ||||
|   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(C.is_sparse_csr()); | ||||
| @ -1472,7 +1426,6 @@ void sampled_addmm_out_sparse_csr( | ||||
|               buffer.get())); | ||||
|         } | ||||
|       }); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| } // namespace at::native::sparse::impl::cuda | ||||
|  | ||||
| @ -90,12 +90,12 @@ class TORCH_API TensorMaker { | ||||
|  | ||||
|   void* data_; | ||||
|   IntArrayRef sizes_; | ||||
|   OptionalIntArrayRef strides_{}; | ||||
|   std::optional<int64_t> storage_offset_{}; | ||||
|   std::function<void(void*)> deleter_{}; | ||||
|   OptionalIntArrayRef strides_; | ||||
|   std::optional<int64_t> storage_offset_; | ||||
|   std::function<void(void*)> deleter_; | ||||
|   std::unique_ptr<void, ContextDeleter> ctx_{nullptr, detail::noopDelete}; | ||||
|   std::optional<Device> device_{}; | ||||
|   TensorOptions opts_{}; | ||||
|   std::optional<Device> device_; | ||||
|   TensorOptions opts_; | ||||
|   bool resizeable_{}; | ||||
|   c10::Allocator* allocator_{}; | ||||
| }; | ||||
|  | ||||
| @ -203,7 +203,7 @@ class LocalCallbackManager { | ||||
|   // Runtime cache. | ||||
|   size_t global_version_{GlobalCallbackManager::NoVersion}; | ||||
|   std::array<CacheEntry, NumRecordScopes> active_callbacks_; | ||||
|   std::mt19937 generator_{}; | ||||
|   std::mt19937 generator_; | ||||
| }; | ||||
|  | ||||
| // ============================================================================ | ||||
|  | ||||
| @ -34,19 +34,24 @@ def check_accuracy(actual_csv, expected_csv, expected_filename): | ||||
|     if "rocm" in expected_filename: | ||||
|         flaky_models.update( | ||||
|             { | ||||
|                 "Background_Matting", | ||||
|                 "alexnet", | ||||
|                 "cait_m36_384", | ||||
|                 "dla102", | ||||
|                 "demucs", | ||||
|                 "densenet121", | ||||
|                 "detectron2_fcos_r_50_fpn", | ||||
|                 "doctr_det_predictor", | ||||
|                 "doctr_reco_predictor", | ||||
|                 "dpn107", | ||||
|                 "fbnetv3_b", | ||||
|                 "hf_BigBird", | ||||
|                 "hf_Longformer", | ||||
|                 "hf_Reformer", | ||||
|                 "hf_Roberta_base", | ||||
|                 "hf_T5", | ||||
|                 "hf_T5_base", | ||||
|                 "hf_T5_generate", | ||||
|                 "levit_128", | ||||
|                 "llava", | ||||
|                 "microbench_unbacked_tolist_sum", | ||||
| @ -64,6 +69,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename): | ||||
|                 "squeezenet1_1", | ||||
|                 "stable_diffusion_text_encoder", | ||||
|                 "stable_diffusion_unet", | ||||
|                 "swsl_resnext101_32x16d", | ||||
|                 "timm_efficientdet", | ||||
|                 "timm_efficientnet", | ||||
|                 "timm_nfnet", | ||||
|  | ||||
| @ -47,6 +47,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename): | ||||
|                 "levit_128", | ||||
|                 "llava", | ||||
|                 "microbench_unbacked_tolist_sum", | ||||
|                 "resnet50", | ||||
|                 "resnet152", | ||||
|                 "sam", | ||||
|                 "sam_fast", | ||||
|                 "stable_diffusion_text_encoder", | ||||
|  | ||||
| @ -378,7 +378,7 @@ vgg16,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| vision_maskrcnn,pass,20 | ||||
| vision_maskrcnn,pass,18 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -286,7 +286,7 @@ vgg16,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| vision_maskrcnn,pass,39 | ||||
| vision_maskrcnn,pass,37 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,7 | ||||
|  | ||||
|  | ||||
|  | ||||
| dla102,pass,7 | ||||
| dla102,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -170,7 +170,7 @@ mobilenet_v2_quantized_qat,eager_fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| mobilenet_v3_large,pass,7 | ||||
| mobilenet_v3_large,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -210,7 +210,7 @@ pytorch_unet,pass_due_to_skip,7 | ||||
|  | ||||
|  | ||||
|  | ||||
| resnet152,pass,7 | ||||
| resnet152,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -218,7 +218,7 @@ resnet18,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| resnet50,pass,6 | ||||
| resnet50,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -270,7 +270,7 @@ timm_nfnet,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| timm_regnet,pass,7 | ||||
| timm_regnet,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| DistillGPT2,pass,2 | ||||
| DistillGPT2,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -150,6 +150,10 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Roberta_base,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_T5,pass,0 | ||||
|  | ||||
|  | ||||
| @ -194,6 +198,10 @@ maml_omniglot,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| microbench_unbacked_tolist_sum,fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| mnasnet1_0,pass,0 | ||||
|  | ||||
|  | ||||
| @ -310,6 +318,10 @@ timm_efficientnet,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| timm_nfnet,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| timm_regnet,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.