Shard RegisterDispatchKey (#144364)

Should fix https://github.com/pytorch/pytorch/issues/143952 .

Testing: built PyTorch on a Raspberry Pi 5; this seemed to alleviate the high peak memory requirement. (I did increase shard counts for other generated files along the way, but I need to go back and figure out how much of that was strictly necessary vs. needing to use -j1 or -j2.)

Differential Revision: [D67925496](https://our.internmc.facebook.com/intern/diff/D67925496/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144364
Approved by: https://github.com/Skylion007, https://github.com/bdhirsh
ghstack dependencies: #144363
This commit is contained in:
Scott Wolchok
2025-01-09 15:00:21 -08:00
committed by PyTorch MergeBot
parent 4143312e67
commit b46d00c1b7
5 changed files with 129 additions and 79 deletions

View File

@@ -38,26 +38,29 @@ aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + ["aten/s
generated_cpu_cpp = [
"aten/src/ATen/RegisterBackendSelect.cpp",
"aten/src/ATen/RegisterCPU.cpp",
"aten/src/ATen/RegisterCPU_0.cpp",
"aten/src/ATen/RegisterCPU_1.cpp",
"aten/src/ATen/RegisterCPU_2.cpp",
"aten/src/ATen/RegisterCPU_3.cpp",
"aten/src/ATen/RegisterFunctionalization_0.cpp",
"aten/src/ATen/RegisterFunctionalization_1.cpp",
"aten/src/ATen/RegisterFunctionalization_2.cpp",
"aten/src/ATen/RegisterFunctionalization_3.cpp",
# "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
"aten/src/ATen/RegisterMkldnnCPU.cpp",
"aten/src/ATen/RegisterNestedTensorCPU.cpp",
"aten/src/ATen/RegisterQuantizedCPU.cpp",
"aten/src/ATen/RegisterSparseCPU.cpp",
"aten/src/ATen/RegisterSparseCsrCPU.cpp",
"aten/src/ATen/RegisterZeroTensor.cpp",
"aten/src/ATen/RegisterCompositeImplicitAutograd.cpp",
"aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp",
"aten/src/ATen/RegisterCompositeExplicitAutograd.cpp",
"aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional.cpp",
"aten/src/ATen/RegisterMeta.cpp",
"aten/src/ATen/RegisterSparseMeta.cpp",
"aten/src/ATen/RegisterQuantizedMeta.cpp",
"aten/src/ATen/RegisterNestedTensorMeta.cpp",
"aten/src/ATen/RegisterMkldnnCPU_0.cpp",
"aten/src/ATen/RegisterNestedTensorCPU_0.cpp",
"aten/src/ATen/RegisterQuantizedCPU_0.cpp",
"aten/src/ATen/RegisterSparseCPU_0.cpp",
"aten/src/ATen/RegisterSparseCsrCPU_0.cpp",
"aten/src/ATen/RegisterZeroTensor_0.cpp",
"aten/src/ATen/RegisterCompositeImplicitAutograd_0.cpp",
"aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor_0.cpp",
"aten/src/ATen/RegisterCompositeExplicitAutograd_0.cpp",
"aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp",
"aten/src/ATen/RegisterMeta_0.cpp",
"aten/src/ATen/RegisterSparseMeta_0.cpp",
"aten/src/ATen/RegisterQuantizedMeta_0.cpp",
"aten/src/ATen/RegisterNestedTensorMeta_0.cpp",
"aten/src/ATen/RegisterSchema.cpp",
"aten/src/ATen/CPUFunctions.h",
"aten/src/ATen/CPUFunctions_inl.h",
@@ -97,11 +100,11 @@ generated_cpu_cpp = [
generated_cuda_cpp = [
"aten/src/ATen/CUDAFunctions.h",
"aten/src/ATen/CUDAFunctions_inl.h",
"aten/src/ATen/RegisterCUDA.cpp",
"aten/src/ATen/RegisterNestedTensorCUDA.cpp",
"aten/src/ATen/RegisterQuantizedCUDA.cpp",
"aten/src/ATen/RegisterSparseCUDA.cpp",
"aten/src/ATen/RegisterSparseCsrCUDA.cpp",
"aten/src/ATen/RegisterCUDA_0.cpp",
"aten/src/ATen/RegisterNestedTensorCUDA_0.cpp",
"aten/src/ATen/RegisterQuantizedCUDA_0.cpp",
"aten/src/ATen/RegisterSparseCUDA_0.cpp",
"aten/src/ATen/RegisterSparseCsrCUDA_0.cpp",
]
generate_aten(