Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-25 16:14:55 +08:00)

Compare commits: eager_mode...transpose_ (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 38984de634 | |
| | 6ea3d9db41 | |
| | 457f154632 | |
@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi

python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()

print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0
@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0

popd
}

setup_executorch() {
pushd executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}

if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch
@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:

mypy==1.16.0 ; platform_system != "Windows"
mypy==1.16.0
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py
@ -41,6 +41,7 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {
@ -67,11 +68,14 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
@ -35,11 +35,10 @@ fi

print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -561,43 +561,6 @@ else
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
||||
fi
|
||||
|
||||
# Validate backend availability for dynamo_eager configs
|
||||
if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
|
||||
echo "Validating eager backend availability for TEST_CONFIG: ${TEST_CONFIG}"
|
||||
if ! python -c "import torch; backends = torch._dynamo.list_backends(); print('Available backends:', backends); assert 'eager' in backends, f'eager backend not available. Available: {backends}'"; then
|
||||
echo "ERROR: eager backend not available in this environment"
|
||||
echo "This might be due to missing dependencies or incorrect PyTorch installation"
|
||||
exit 1
|
||||
fi
|
||||
echo "eager backend validation successful"
|
||||
|
||||
# Additional validation: test that torch.compile works with eager backend
|
||||
echo "Testing torch.compile with eager backend..."
|
||||
if ! python -c "
|
||||
import torch
|
||||
import torch._dynamo as dynamo
|
||||
|
||||
def test_func(x):
|
||||
return x * 2
|
||||
|
||||
# Test that eager backend works
|
||||
try:
|
||||
compiled_func = torch.compile(test_func, backend='eager')
|
||||
result = compiled_func(torch.tensor([1.0, 2.0]))
|
||||
print('torch.compile with eager backend test successful')
|
||||
except Exception as e:
|
||||
print(f'ERROR: torch.compile with eager backend failed: {e}')
|
||||
exit(1)
|
||||
"; then
|
||||
echo "ERROR: torch.compile with eager backend failed"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Debug logging for backend selection
|
||||
echo "TEST_CONFIG: ${TEST_CONFIG}"
|
||||
echo "DYNAMO_BENCHMARK_FLAGS: ${DYNAMO_BENCHMARK_FLAGS[*]}"
|
||||
|
||||
test_cachebench() {
|
||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
@ -659,16 +622,6 @@ test_perf_for_dashboard() {
|
||||
shift
|
||||
|
||||
local backend=inductor
|
||||
# Allow surfacing eager metrics in CI by switching backend based on TEST_CONFIG
|
||||
if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
|
||||
backend=eager
|
||||
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
|
||||
backend=aot_eager
|
||||
fi
|
||||
|
||||
# Debug logging for backend selection in test_perf_for_dashboard
|
||||
echo "test_perf_for_dashboard: TEST_CONFIG=${TEST_CONFIG}, selected backend=${backend}"
|
||||
echo "DASHBOARD_TAG=${DASHBOARD_TAG}"
|
||||
local modes=()
|
||||
if [[ "$DASHBOARD_TAG" == *training-true* ]]; then
|
||||
modes+=(training)
|
||||
@ -722,37 +675,20 @@ test_perf_for_dashboard() {
|
||||
fi
|
||||
|
||||
if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
|
||||
echo "Running benchmark: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
echo "Command: $TASKSET python benchmarks/dynamo/$suite.py ${target_flag[*]} --$mode --$dtype --backend $backend --disable-cudagraphs $*"
|
||||
if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
|
||||
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
|
||||
echo "ERROR: Benchmark failed for ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
echo "This might indicate an issue with the eager backend or benchmark configuration"
|
||||
exit 1
|
||||
fi
|
||||
echo "Benchmark completed successfully: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
||||
fi
|
||||
if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
|
||||
echo "Running benchmark: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
|
||||
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
|
||||
echo "ERROR: Benchmark failed for ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Benchmark completed successfully: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
||||
fi
|
||||
if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
|
||||
echo "Running benchmark: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
|
||||
--dynamic-batch-only "$@" \
|
||||
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
|
||||
echo "ERROR: Benchmark failed for ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Benchmark completed successfully: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
|
||||
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
||||
fi
|
||||
if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
|
||||
TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
|
||||
@ -1614,10 +1550,14 @@ test_executorch() {
install_torchvision
install_torchaudio

INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"

pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake

echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1631,6 +1571,10 @@ test_executorch() {

popd

# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration

assert_git_not_dirty
}
@ -137,7 +137,7 @@ sccache --show-stats
|
||||
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
|
||||
(
|
||||
if "%BUILD_ENVIRONMENT%"=="" (
|
||||
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
|
||||
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
|
||||
) else (
|
||||
copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
|
||||
|
||||
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
|
||||
) else (
|
||||
set CONDA_PARENT_DIR=C:\Jenkins
|
||||
)
|
||||
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
|
||||
|
||||
|
||||
:: Be conservative here when rolling out the new AMI with conda. This will try
|
||||
:: to install conda as before if it couldn't find the conda installation. This
|
||||
:: can be removed eventually after we gain enough confidence in the AMI
|
||||
if not exist %CONDA_ROOT_DIR% (
|
||||
if not exist %CONDA_PARENT_DIR%\Miniconda3 (
|
||||
set INSTALL_FRESH_CONDA=1
|
||||
)
|
||||
|
||||
@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
|
||||
if errorlevel 1 exit /b
|
||||
if not errorlevel 0 exit /b
|
||||
|
||||
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
|
||||
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
|
||||
if errorlevel 1 exit /b
|
||||
if not errorlevel 0 exit /b
|
||||
)
|
||||
|
||||
:: Activate conda so that we can use its commands, i.e. conda, python, pip
|
||||
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
|
||||
:: Activate conda so that we can use its commands, i.e. conda, python, pip
|
||||
call conda activate py_tmp
|
||||
|
||||
call pip install -r .ci/docker/requirements-ci.txt
|
||||
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
|
||||
|
||||
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
|
||||
:: build\torch. Rather than changing all these references, making a copy of torch folder
|
||||
:: from conda to the current workspace is easier. The workspace will be cleaned up after
|
||||
:: the job anyway
|
||||
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
|
||||
xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
|
||||
|
||||
pushd .
|
||||
if "%VC_VERSION%" == "" (
|
||||
|
||||
@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
||||
fi
|
||||
|
||||
# TODO: Move both of them to Windows AMI
|
||||
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
|
||||
|
||||
# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
|
||||
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
|
||||
# scipy from 1.6.3 to 1.10
|
||||
# expecttest from 0.1.3 to 0.3.0
|
||||
# xdoctest from 1.0.2 to 1.3.0
|
||||
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
|
||||
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
|
||||
|
||||
# Install Z3 optional dependency for Windows builds.
|
||||
python -m pip install z3-solver==4.15.1.0
|
||||
@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
|
||||
# Install pulp for testing ilps under torch\distributed\_tools
|
||||
python -m pip install pulp==2.9.0
|
||||
|
||||
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
|
||||
python -m pip install expecttest==0.3.0
|
||||
|
||||
run_tests() {
|
||||
# Run nvidia-smi if available
|
||||
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
|
||||
|
||||
@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
16 .github/actions/setup-win/action.yml (vendored)
@ -6,12 +6,6 @@ inputs:
|
||||
cuda-version:
|
||||
description: which cuda version to install, 'cpu' for none
|
||||
required: true
|
||||
python-version:
|
||||
required: false
|
||||
type: string
|
||||
default: "3.10"
|
||||
description: |
|
||||
The python version to be used. Will be 3.10 by default
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
@ -44,24 +38,18 @@ runs:
|
||||
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
|
||||
|
||||
{
|
||||
echo "CONDA=${CONDA}";
|
||||
echo "CONDA_RUN=${CONDA} run --no-capture-output";
|
||||
echo "CONDA_BUILD=${CONDA} run conda-build";
|
||||
echo "CONDA_INSTALL=${CONDA} install";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Setup Python3
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
shell: bash
|
||||
run: |
|
||||
set +e
|
||||
set -x
|
||||
|
||||
# Create new py_tmp env with python-version
|
||||
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
|
||||
|
||||
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
|
||||
PYTHON3=$(${CONDA_RUN} which python3)
|
||||
EXIT_CODE=$?
|
||||
|
||||
if [[ "${EXIT_CODE}" == "0" ]]; then
|
||||
@ -74,7 +62,7 @@ runs:
|
||||
# installation, which is Python 3 based. Its Python is default to Python 3. Further, there
|
||||
# is also the Miniconda installation that is Python 2 based, and both can be installed if
|
||||
# needed. In both cases, Python binary is just called python
|
||||
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
|
||||
PYTHON=$(${CONDA_RUN} which python)
|
||||
EXIT_CODE=$?
|
||||
|
||||
if [[ "${EXIT_CODE}" == "0" ]]; then
|
||||
|
||||
2 .github/ci_commit_pins/vllm.txt (vendored)
@ -1 +1 @@
|
||||
367a480bd3534edf27a8dac3c6f7ea8af9d1ed45
|
||||
d119fc86140785e7efc8f125c17153544d1e0f20
|
||||
|
||||
3 .github/labeler.yml (vendored)
@ -130,6 +130,3 @@
|
||||
- torch/csrc/inductor/aoti_include/**
|
||||
- torchgen/aoti/**
|
||||
- torchgen/gen_aoti_c_shim.py
|
||||
|
||||
"ciflow/vllm":
|
||||
- .github/ci_commit_pins/vllm.txt
|
||||
|
||||
2 .github/scripts/generate_ci_workflows.py (vendored)
@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
|
||||
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
arches=["6.4"],
|
||||
python_versions=["3.10"],
|
||||
python_versions=["3.9"],
|
||||
),
|
||||
ciflow_config=CIFlowConfig(
|
||||
labels={
|
||||
|
||||
2 .github/workflows/_binary-test-linux.yml (vendored)
@ -187,6 +187,8 @@ jobs:
|
||||
|
||||
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
|
||||
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
|
||||
with:
|
||||
driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
|
||||
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
|
||||
|
||||
- name: configure aws credentials
|
||||
|
||||
2 .github/workflows/_win-build.yml (vendored)
@ -151,7 +151,7 @@ jobs:
|
||||
BUILD_WHEEL: 1
|
||||
MAX_JOBS: 8
|
||||
CUDA_VERSION: ${{ inputs.cuda-version }}
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: "3.9"
|
||||
SCCACHE_BUCKET: "ossci-compiler-cache"
|
||||
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
|
||||
SCCACHE_REGION: us-east-1
|
||||
|
||||
2 .github/workflows/_win-test.yml (vendored)
@ -184,7 +184,7 @@ jobs:
|
||||
env:
|
||||
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
|
||||
INSTALL_WINDOWS_SDK: 1
|
||||
PYTHON_VERSION: "3.10"
|
||||
PYTHON_VERSION: 3.9
|
||||
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
|
||||
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
|
||||
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
|
||||
|
||||
3 .github/workflows/docker-builds.yml (vendored)
@ -71,7 +71,8 @@ jobs:
|
||||
pytorch-linux-jammy-py3-clang12-onnx,
|
||||
pytorch-linux-jammy-linter,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
|
||||
pytorch-linux-jammy-py3-clang12-executorch,
|
||||
# Executorch pin needs update
|
||||
# pytorch-linux-jammy-py3-clang12-executorch,
|
||||
pytorch-linux-jammy-py3.12-triton-cpu,
|
||||
pytorch-linux-noble-riscv64-py3.12-gcc14
|
||||
]
|
||||
|
||||
14 .github/workflows/generated-linux-binary-manywheel-rocm-main.yml (generated, vendored)
@ -44,7 +44,7 @@ jobs:
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_10-rocm6_4-build:
|
||||
manywheel-py3_9-rocm6_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
@ -58,16 +58,16 @@ jobs:
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.10"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-rocm6_4
|
||||
build_name: manywheel-py3_9-rocm6_4
|
||||
build_environment: linux-binary-manywheel-rocm
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-rocm6_4-test: # Testing
|
||||
manywheel-py3_9-rocm6_4-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-rocm6_4-build
|
||||
- manywheel-py3_9-rocm6_4-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
@ -82,14 +82,14 @@ jobs:
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.10"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: manywheel-py3_10-rocm6_4
|
||||
name: manywheel-py3_9-rocm6_4
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
|
||||
19 .github/workflows/inductor-perf-test-nightly.yml (vendored)
@ -57,7 +57,7 @@ on:
|
||||
description: The list of configs used the benchmark
|
||||
required: false
|
||||
type: string
|
||||
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,dynamo_eager_huggingface_perf,dynamo_eager_timm_perf,dynamo_eager_torchbench_perf,cachebench
|
||||
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
@ -97,35 +97,18 @@ jobs:
|
||||
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "dynamo_eager_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
|
||||
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
|
||||
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
|
||||
]}
|
||||
|
||||
28 .github/workflows/pull.yml (vendored)
@ -127,6 +127,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
# More memory is needed to build with asan
|
||||
runner: linux.2xlarge.memory
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.10-clang18-asan
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
@ -316,6 +318,32 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang12-executorch-build:
|
||||
if: false # Docker build needs pin update
|
||||
name: linux-jammy-py3-clang12-executorch
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3-clang12-executorch
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang12-executorch-test:
|
||||
name: linux-jammy-py3-clang12-executorch
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-py3-clang12-executorch-build
|
||||
if: false # Has been broken for a while
|
||||
with:
|
||||
build-environment: linux-jammy-py3-clang12-executorch
|
||||
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
|
||||
name: cuda12.8-py3.10-gcc9-sm75
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
2 .github/workflows/slow.yml (vendored)
@ -140,6 +140,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
# More memory is needed to build with asan
|
||||
runner: linux.2xlarge.memory
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.10-clang18-asan
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
|
||||
24 .github/workflows/trunk.yml (vendored)
@ -259,27 +259,3 @@ jobs:
|
||||
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang12-executorch-build:
|
||||
name: linux-jammy-py3-clang12-executorch
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3-clang12-executorch
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang12-executorch-test:
|
||||
name: linux-jammy-py3-clang12-executorch
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-py3-clang12-executorch-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3-clang12-executorch
|
||||
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
2 .github/workflows/vllm.yml (vendored)
@ -36,8 +36,6 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
|
||||
allow-reuse-old-whl: false
|
||||
build-additional-packages: "vision audio"
|
||||
build-external-packages: "vllm"
|
||||
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
|
||||
|
||||
3 .gitignore (vendored)
@ -259,9 +259,6 @@ gen
|
||||
.pytest_cache
|
||||
aten/build/*
|
||||
|
||||
# Linker scripts for prioritized text optimization
|
||||
cmake/linker_script.ld
|
||||
|
||||
# Bram
|
||||
plsdontbreak
|
||||
|
||||
|
||||
@ -123,7 +123,6 @@ is_formatter = true
|
||||
code = 'MYPY'
|
||||
include_patterns = [
|
||||
'setup.py',
|
||||
'functorch/dim/**/*.py',
|
||||
'torch/**/*.py',
|
||||
'torch/**/*.pyi',
|
||||
'caffe2/**/*.py',
|
||||
@ -965,6 +964,7 @@ exclude_patterns = [
|
||||
'test/jit/**', # should be run through test/test_jit.py
|
||||
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
|
||||
'test/fx/**', # should be run through test/test_fx.py
|
||||
'test/bottleneck_test/**', # excluded by test/run_test.py
|
||||
'test/package/**', # excluded by test/run_test.py
|
||||
'test/distributed/argparse_util_test.py',
|
||||
'test/distributed/bin/test_script.py',
|
||||
@ -1410,6 +1410,8 @@ exclude_patterns = [
|
||||
'torch/utils/benchmark/utils/timer.py',
|
||||
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
|
||||
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
|
||||
'torch/utils/bottleneck/__init__.py',
|
||||
'torch/utils/bottleneck/__main__.py',
|
||||
'torch/utils/bundled_inputs.py',
|
||||
'torch/utils/checkpoint.py',
|
||||
'torch/utils/collect_env.py',
|
||||
|
||||
@ -22,7 +22,6 @@ COMMON_COPTS = [
|
||||
"-DHAVE_SHM_UNLINK=1",
|
||||
"-D_FILE_OFFSET_BITS=64",
|
||||
"-DUSE_FBGEMM",
|
||||
"-DUSE_DISTRIBUTED",
|
||||
"-DAT_PER_OPERATOR_HEADERS",
|
||||
"-DATEN_THREADING=NATIVE",
|
||||
"-DNO_CUDNN_DESTROY_HANDLE",
|
||||
@ -811,7 +810,7 @@ cc_library(
|
||||
name = "torch_python",
|
||||
srcs = libtorch_python_core_sources
|
||||
+ if_cuda(libtorch_python_cuda_sources)
|
||||
+ if_cuda(libtorch_python_distributed_sources)
|
||||
+ libtorch_python_distributed_sources
|
||||
+ GENERATED_AUTOGRAD_PYTHON,
|
||||
hdrs = glob([
|
||||
"torch/csrc/generic/*.cpp",
|
||||
|
||||
@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
|
||||
set(CPU_POWER ON)
|
||||
endif()
|
||||
|
||||
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
|
||||
# tested and likely won't work without additional changes.
|
||||
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
|
||||
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
|
||||
# still gets built
|
||||
if(NOT LINUX AND NOT WIN32)
|
||||
set(USE_DISTRIBUTED
|
||||
OFF
|
||||
@ -262,11 +263,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
|
||||
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
|
||||
option(USE_NATIVE_ARCH "Use -march=native" OFF)
|
||||
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
|
||||
option(USE_DISTRIBUTED "Use distributed" ON)
|
||||
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
|
||||
cmake_dependent_option(USE_NCCL "Use NCCL" ON
|
||||
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
|
||||
cmake_dependent_option(USE_XCCL "Use XCCL" ON
|
||||
"USE_XPU;UNIX;NOT APPLE" OFF)
|
||||
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
|
||||
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
|
||||
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
|
||||
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
|
||||
@ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
|
||||
OFF "USE_CUDA" OFF)
|
||||
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
|
||||
"CPU_AARCH64" OFF)
|
||||
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
|
||||
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
|
||||
if(LINUX AND CPU_AARCH64)
|
||||
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
|
||||
endif()
|
||||
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
|
||||
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
|
||||
|
||||
option(USE_MIMALLOC "Use mimalloc" OFF)
|
||||
# Enable third party mimalloc library to improve memory allocation performance
|
||||
@ -438,11 +432,10 @@ if(WIN32)
|
||||
PATH_SUFFIXES lib
|
||||
NO_DEFAULT_PATH)
|
||||
if(NOT libuv_tmp_LIBRARY)
|
||||
set(USE_DISTRIBUTED OFF)
|
||||
set(USE_GLOO OFF)
|
||||
message(
|
||||
WARNING
|
||||
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
|
||||
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
|
||||
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
|
||||
)
|
||||
else()
|
||||
@ -664,11 +657,6 @@ endif(MSVC)
|
||||
|
||||
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
|
||||
|
||||
# Set linker max-page-size to 64KiB on AArch64 Linux
|
||||
if(LINUX AND CPU_AARCH64)
|
||||
add_link_options_if_supported("-z,max-page-size=0x10000")
|
||||
endif()
|
||||
|
||||
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
|
||||
# applicable to mobile are disabled by this variable. Setting
|
||||
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
|
||||
@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
|
||||
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
||||
endif()
|
||||
|
||||
if(USE_PRIORITIZED_TEXT_FOR_LD)
|
||||
add_compile_options(
|
||||
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
|
||||
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
|
||||
)
|
||||
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
|
||||
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
|
||||
COMMENT "Generating prioritized text linker files"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||
|
||||
if(BUILD_PYTHON)
|
||||
set(LINKER_OPT_TARGETS torch_python)
|
||||
endif()
|
||||
|
||||
if(NOT BUILD_LIBTORCHLESS)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
|
||||
if(USE_CUDA)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
|
||||
endif()
|
||||
if(USE_XPU)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
|
||||
endif()
|
||||
if(USE_ROCM)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
|
||||
if(TARGET ${tgt})
|
||||
add_dependencies("${tgt}" generate_linker_script)
|
||||
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
|
||||
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||
else()
|
||||
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
else()
|
||||
if(LINUX AND CPU_AARCH64)
|
||||
message(WARNING [[
|
||||
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
|
||||
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
|
||||
]])
|
||||
endif()
|
||||
endif()
|
||||
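The CMakeLists.txt hunk above shows the prioritized-text linker setup: a custom command generates cmake/linker_script.ld from cmake/prioritized_text.txt and passes it to the linker via -T for the listed targets. Below is a minimal sketch, not part of the diff, of invoking the same generator by hand; the script path and the --filein/--fout flags are taken from the quoted add_custom_command, while the checkout location is an assumption.

```python
# Sketch only: run the linker-script generator outside of CMake.
import pathlib
import subprocess

repo = pathlib.Path("~/pytorch").expanduser()  # assumption: a local PyTorch checkout
subprocess.run(
    [
        "python",
        str(repo / "tools/setup_helpers/generate_linker_script.py"),
        "--filein", str(repo / "cmake/prioritized_text.txt"),
        "--fout", str(repo / "cmake/linker_script.ld"),
    ],
    check=True,  # fail loudly, mirroring the hard dependency the CMake target declares
)
```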
@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
|
||||
}
|
||||
|
||||
bool Context::allowTF32CuDNN(const std::string& op) const {
|
||||
if (op.empty()){
|
||||
if (op.size() == 0){
|
||||
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
|
||||
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
|
||||
TORCH_CHECK(
|
||||
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
|
||||
|
||||
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
|
||||
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
|
||||
#ifdef USE_ROCM
|
||||
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
|
||||
#endif
|
||||
|
||||
bool Context::checkCuBLASConfigDeterministic() {
|
||||
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
|
||||
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
|
||||
}
|
||||
|
||||
bool Context::allowTF32CuBLAS() const {
|
||||
#ifdef USE_ROCM
|
||||
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
|
||||
if (allow_tf32 != true) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
|
||||
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
|
||||
TORCH_CHECK(
|
||||
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
|
||||
}
|
||||
|
||||
void Context::setAllowTF32CuBLAS(bool b) {
|
||||
#ifdef USE_ROCM
|
||||
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
|
||||
if (allow_tf32 != true) {
|
||||
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
|
||||
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
|
||||
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
|
||||
}
|
||||
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
|
||||
std::string msg;
|
||||
auto iterp = _fp32_precisions.find(backend);
|
||||
TORCH_CHECK(iterp != _fp32_precisions.end());
|
||||
for (const auto& p : iterp->second) {
|
||||
for (auto p : iterp->second) {
|
||||
msg += p;
|
||||
msg += " ";
|
||||
}
|
||||
|
||||
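The Context.cpp hunks above concern the TF32-for-matmul switch (allowTF32CuBLAS / setAllowTF32CuBLAS) and its ROCm gating on the HIPBLASLT_ALLOW_TF32 environment variable. As a hedged illustration, not something introduced by the diff, these are the Python-level knobs that map onto that state:

```python
import torch

# Legacy flag: True corresponds to Float32MatmulPrecision::HIGH, False to HIGHEST,
# exactly the mapping visible in setAllowTF32CuBLAS above.
torch.backends.cuda.matmul.allow_tf32 = True

# Newer control over the same precision state ("highest", "high", "medium").
torch.set_float32_matmul_precision("high")

print(torch.backends.cuda.matmul.allow_tf32)   # True
print(torch.get_float32_matmul_precision())    # "high"

# On ROCm, the diff additionally requires HIPBLASLT_ALLOW_TF32=1 in the environment
# before either setter takes effect.
```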
@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
|
||||
: c10::TensorImpl(
|
||||
c10::DispatchKeySet(DispatchKey::Functionalize),
|
||||
view_value.dtype(),
|
||||
base->storage().data_ptr().device()
|
||||
view_value.device()
|
||||
),
|
||||
value_(view_value),
|
||||
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
|
||||
@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
|
||||
|
||||
|
||||
c10::Device FunctionalTensorWrapper::device_custom() const {
|
||||
// The storage pointer already uses the underlying tensor custom device (if
|
||||
// applicable) to extract the device. So, we dont have to recurse again by
|
||||
// doing value_.unsafeGetTensorImpl()->device().
|
||||
return storage().data_ptr().device();
|
||||
return value_.unsafeGetTensorImpl()->device();
|
||||
}
|
||||
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
|
||||
return value_.unsafeGetTensorImpl()->sizes();
|
||||
|
||||
@ -149,5 +149,105 @@ static inline void pack_vnni4(
|
||||
#endif
|
||||
}
|
||||
|
||||
// This is a helper function for transpose_pack_vnni4
|
||||
// Transform a [4, 16] block (with incontiguous output)
|
||||
// Src:
|
||||
// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
|
||||
// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16
|
||||
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
|
||||
// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16
|
||||
// Dst:
|
||||
// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
|
||||
// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8
|
||||
// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12
|
||||
// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void transpose_vnni4_pad_4x16_block(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t ld_dst,
|
||||
int krem = 4) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
__m128i r[4];
|
||||
for (int i = 0; i < krem; ++i) {
|
||||
r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src));
|
||||
}
|
||||
for (int i = krem; i < 4; ++i) {
|
||||
r[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
// Transpose 4x16 bytes using unpack and shuffle
|
||||
__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
|
||||
__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
|
||||
__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
|
||||
__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);
|
||||
|
||||
__m128i r0 = _mm_unpacklo_epi64(t0, t2);
|
||||
__m128i r1 = _mm_unpackhi_epi64(t0, t2);
|
||||
__m128i r2 = _mm_unpacklo_epi64(t1, t3);
|
||||
__m128i r3 = _mm_unpackhi_epi64(t1, t3);
|
||||
|
||||
// Store output
|
||||
if (krem == 4) {
|
||||
// normal case
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3);
|
||||
} else {
|
||||
// masked case
|
||||
__mmask16 mask = (1ULL << (krem * 4)) - 1;
|
||||
_mm_mask_storeu_epi8(dst, mask, r0);
|
||||
_mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1);
|
||||
_mm_mask_storeu_epi8(
|
||||
reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2);
|
||||
_mm_mask_storeu_epi8(
|
||||
reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3);
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
// Do the transpose packing fusion with VNNI4
|
||||
// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8)
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void transpose_pack_vnni4(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t K,
|
||||
int64_t N) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
TORCH_CHECK(
|
||||
N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4");
|
||||
int64_t bk = 0;
|
||||
int64_t _K = K / 4 * 4;
|
||||
for (; bk < _K; bk += 4) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < N; bn += 16) {
|
||||
transpose_vnni4_pad_4x16_block(
|
||||
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle leftover K rows (< 4)
|
||||
if (K % 4 != 0) {
|
||||
int krem = K - bk;
|
||||
int64_t bn = 0;
|
||||
for (; bn < N; bn += 16) {
|
||||
transpose_vnni4_pad_4x16_block(
|
||||
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem);
|
||||
}
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false, "transpose_pack_vnni4 is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
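As a plain-Python reference for the layout described in the comments above (a sketch only, not code from the diff): transpose_pack_vnni4 reorders a [K, N] bit8 matrix into [N/4, K, 4], and the 4x16 block kernel with its masked stores is an AVX-512 tiling of that mapping, with leftover K rows zero-padded inside each block.

```python
import numpy as np

def transpose_pack_vnni4_ref(src: np.ndarray) -> np.ndarray:
    """Reference for the VNNI4 reorder: dst[n // 4, k, n % 4] == src[k, n]."""
    K, N = src.shape
    assert N % 16 == 0, "matches the kernel's TORCH_CHECK on N"
    return np.ascontiguousarray(src.reshape(K, N // 4, 4).transpose(1, 0, 2))

src = np.arange(6 * 16, dtype=np.int8).reshape(6, 16)  # K=6 exercises the krem < 4 path
dst = transpose_pack_vnni4_ref(src)
print(dst.shape)                   # (4, 6, 4) -> [N/4, K, 4]
assert dst[3, 2, 1] == src[2, 13]  # dst[n // 4, k, n % 4] == src[k, n]
```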
@ -1954,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
for (const auto i : c10::irange((size_t)l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
"which is invalid. Check dimension ", l_diff + i, " of your input.");
new_shape.emplace_back(new_dim);
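For context (an illustration, not part of the diff): the only inputs that the two quoted checks treat differently are those where negative padding shrinks a dimension to exactly zero. A minimal sketch:

```python
import torch
import torch.nn.functional as F

x = torch.ones(1, 4)

# 4 + (-2) + (-2) == 0: accepted by the `new_dim >= 0` form (empty output dimension);
# the `new_dim > 0` form would reject this same call.
print(F.pad(x, (-2, -2)).shape)   # torch.Size([1, 0])

# 4 + (-3) + (-2) == -1: rejected by both forms with the "negative output size" error.
try:
    F.pad(x, (-3, -2))
except RuntimeError as err:
    print(err)
```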
@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}

bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");

TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
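The shape arithmetic in is_blockwise_1x32_scaling above can be restated in a few lines of Python (a sketch only; the helper names below are not from the diff): one e8m0 scale per 1x32 block, with the row count rounded up to 128, the per-row block count rounded up to 4, and a doubled logical column count on the packed FP4 path.

```python
def ceil_div(a: int, b: int) -> int:
    return -(-a // b)

def round_up(a: int, b: int) -> int:
    return ceil_div(a, b) * b

def expected_1x32_scale_numel(rows: int, cols: int, packed_fp4: bool = False) -> int:
    # For packed FP4 (Float4_e2m1fn_x2 on ROCm in the diff), each byte holds two values,
    # so the logical column count is cols * 2.
    logical_cols = cols * 2 if packed_fp4 else cols
    return round_up(rows, 128) * round_up(ceil_div(logical_cols, 32), 4)

# e.g. a 256 x 512 fp8 tensor: 256 rows (already a multiple of 128) and 512 / 32 = 16
# blocks per row (already a multiple of 4) -> 256 * 16 = 4096 scale elements.
print(expected_1x32_scale_numel(256, 512))   # 4096
```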
@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
|
||||
Op<opmath_t>(),
|
||||
alpha.to<opmath_t>());
|
||||
|
||||
return std::move(tensor_lists[2]);
|
||||
return tensor_lists[2];
|
||||
}
|
||||
|
||||
template <typename T, template <class> class Op>
|
||||
|
||||
@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
|
||||
/* res_arg_index */ 1>(),
|
||||
Op<opmath_t>(),
|
||||
scalar.to<opmath_t>());
|
||||
return std::move(tensor_lists[1]);
|
||||
return tensor_lists[1];
|
||||
}
|
||||
|
||||
template <typename T, template <class> class Op>
|
||||
|
||||
@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
|
||||
}
|
||||
|
||||
tensor_lists.emplace_back(tensors.vec());
|
||||
tensor_lists.emplace_back(std::move(vec_res));
|
||||
tensor_lists.emplace_back(vec_res);
|
||||
|
||||
using opmath_t = at::opmath_type<T>;
|
||||
multi_tensor_apply<2, opmath_t>(
|
||||
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
|
||||
/* res_arg_index */ 1>(),
|
||||
|
||||
Op<opmath_t>());
|
||||
return std::move(tensor_lists[1]);
|
||||
return tensor_lists[1];
|
||||
}
|
||||
|
||||
template <typename T, template <class> class Op>
|
||||
|
||||
@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
|
||||
Op<opmath_t>(),
|
||||
scalar.data_ptr<T>(),
|
||||
alpha.to<opmath_t>());
|
||||
return std::move(tensor_lists[1]);
|
||||
return tensor_lists[1];
|
||||
}
|
||||
|
||||
template <typename T, template <class> class Op>
|
||||
|
||||
@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
|
||||
scalar.to<opmath_t>());
|
||||
});
|
||||
|
||||
return std::move(tensor_lists[3]);
|
||||
return tensor_lists[3];
|
||||
}
|
||||
|
||||
template <template <class> class Op>
|
||||
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
|
||||
Op<opmath_t>());
|
||||
});
|
||||
|
||||
return std::move(tensor_lists[3]);
|
||||
return tensor_lists[3];
|
||||
}
|
||||
|
||||
#define FOREACH_POINTWISE_OP_SCALAR(NAME, OP) \
|
||||
|
||||
@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
|
||||
vec_res.emplace_back(at::native::empty_like(t));
|
||||
}
|
||||
std::vector<std::vector<at::Tensor>> tensor_lists{
|
||||
tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};
|
||||
tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
|
||||
LerpFunctor<opmath_t>());
|
||||
});
|
||||
|
||||
return std::move(tensor_lists[3]);
|
||||
return tensor_lists[3];
|
||||
}
|
||||
|
||||
void foreach_tensor_lerp_ternary_cuda_(
|
||||
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
|
||||
vec_res.emplace_back(at::native::empty_like(t));
|
||||
}
|
||||
std::vector<std::vector<at::Tensor>> tensor_lists{
|
||||
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
|
||||
tensors1.vec(), tensors2.vec(), vec_res};
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
|
||||
weight.to<opmath_t>());
|
||||
});
|
||||
|
||||
return std::move(tensor_lists[2]);
|
||||
return tensor_lists[2];
|
||||
}
|
||||
|
||||
void foreach_tensor_lerp_list_cuda_(
|
||||
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
|
||||
vec_res.emplace_back(at::native::empty_like(t));
|
||||
}
|
||||
std::vector<std::vector<at::Tensor>> tensor_lists{
|
||||
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
|
||||
tensors1.vec(), tensors2.vec(), vec_res};
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
|
||||
LerpFunctor<opmath_t>());
|
||||
});
|
||||
|
||||
return std::move(tensor_lists[2]);
|
||||
return tensor_lists[2];
|
||||
}
|
||||
|
||||
void foreach_tensor_lerp_scalarlist_cuda_(
|
||||
|
||||
@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
|
||||
/* res_arg_index */ 1>(),
|
||||
Op<opmath_t>());
|
||||
|
||||
return std::move(tensor_lists[1]);
|
||||
return tensor_lists[1];
|
||||
}
|
||||
|
||||
template <typename scalar_t, template <class> class Op>
|
||||
|
||||
@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}

template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {

IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;

if(tid >= nElements) return;

const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes

IndexType stride = gridDim.x * blockDim.x;

while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}

/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;

// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i

at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;

// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}

catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif

if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}

if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#include <ATen/div_rtn.h>
#include <c10/util/safe_numerics.h>

namespace at::native {

@ -55,14 +54,6 @@ inline void col2im_shape_check(

int64_t batch_dim = (ndim == 3) ? 0 : -1;
int64_t n_input_plane = input.size(batch_dim + 1);
uint64_t prod_kernel_size = 1;

TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
"Given kernel_width = ",
kernel_width,
" and kernel_height = ",
kernel_height,
" the product of kernel_width and kernel_height overflowed.");

if (n_input_plane % (kernel_width * kernel_height) != 0) {
TORCH_CHECK(false,
@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}

Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);

TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);

TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);

auto C = at::empty({M, N}, A.options());

// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();

at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);

return C;
}
} // namespace at::native
@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,
@ -534,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);

if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);

int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();

if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}

id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();
@ -4243,7 +4243,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu

- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@ -10849,7 +10848,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out

# foreach_minimum/maximum dispatches to clamp_max/min
@ -64,6 +64,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
// create sparse descriptor, dtype
cusparseLtMatDescriptor_t sparse_input_descriptor;
cudaDataType type;
auto compression_factor = 9;

#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -72,6 +73,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
switch (sparse_input.scalar_type()) {
case at::ScalarType::Char:
type = CUDA_R_8I;
compression_factor = 10;
break;
case at::ScalarType::Half:
type = CUDA_R_16F;
@ -87,6 +89,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
case at::ScalarType::Float8_e4m3fn:
type = CUDA_R_8F_E4M3;
compression_factor = 10;
break;
#endif
default:
@ -94,6 +97,10 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
break;
}

// create a new compressed tensor with the same dtype as
auto compressed_tensor =
sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);

TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
&handle,
&sparse_input_descriptor,
@ -114,15 +121,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
&compressed_size,
&compressed_buffer_size));

// create a new compressed tensor with the same dtype as the input,
// and with packed data/metadata stored in an array with original
// number of rows, and sufficient columns to provide compressed_size
// buffer (in bytes)
size_t orig_m = sparse_input.size(0);
size_t div = orig_m * sparse_input.itemsize();
size_t new_n = (compressed_size + div - 1) / div; // floor
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -167,6 +165,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
cudaDataType output_type;
cudaDataType C_type;
cusparseComputeType compute_type;
auto compression_factor = 9;

#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -178,6 +177,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8I;
C_type = CUDA_R_8I;
compute_type = CUSPARSE_COMPUTE_32I;
compression_factor = 10;
break;

// cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,6 +210,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8F_E4M3;
C_type = CUDA_R_16F;
compute_type = CUSPARSE_COMPUTE_32F;
compression_factor = 10;
break;
#endif
// cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -299,10 +300,9 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
}
}

TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
int64_t k = dense_B.size(0);
int64_t n = dense_B.size(1);
int64_t m = compressed_A.size(0);
int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;

// initialize sparse descriptor
cusparseLtMatDescriptor_t sparse_input_descriptor;
@ -5,6 +5,51 @@

#include <ATen/test/allocator_clone_test.h>

#include <torch/csrc/cuda/CUDAPluggableAllocator.h>

TEST(AllocatorTestCUDA, test_clone) {
test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
}

static int called_dummy_free_0 = 0;
static int called_dummy_free_1 = 0;

void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
void dummy_free_0(void* data, size_t size, int device, void* stream) {
called_dummy_free_0++;
}
void dummy_free_1(void* data, size_t size, int device, void* stream) {
called_dummy_free_1++;
}

// Tests that data_ptrs have their respective deleters
// when mixing allocators
TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
// Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));

// Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));

// Manually use a's deleter
auto* ctx = a.storage().data_ptr().get_context();
a.storage().data_ptr().get_deleter()(ctx);
a.storage().mutable_data_ptr().release_context();

// a's deleter is dummy_free_0
// dummy_free_0 should be called above, so called_dummy_free_0 should be 1
ASSERT_TRUE(called_dummy_free_0 == 1);

// Manually use b's deleter
ctx = b.storage().data_ptr().get_context();
b.storage().data_ptr().get_deleter()(ctx);
b.storage().mutable_data_ptr().release_context();

// b's deleter is dummy_free_1
// dummy_free_1 should be called above, so called_dummy_free_1 should be 1
ASSERT_TRUE(called_dummy_free_1 == 1);
}
@ -1561,6 +1561,38 @@ namespace {
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
#if defined(CPU_CAPABILITY_AVX512)
TYPED_TEST(Quantization8BitTests, TransposePackVNNI4) {
using VT = ValueType<TypeParam>;
constexpr auto K = 197;
constexpr auto N = 64;
constexpr auto L = K * N;
constexpr auto ld_src = N;
constexpr auto ld_dst = K * 4;
CACHE_ALIGN VT x[L];
CACHE_ALIGN VT y[L];
CACHE_ALIGN VT ref[L];
auto seed = TestSeed();
ValueGen<VT> generator(VT(-100), VT(100), seed);
for (const auto i : c10::irange(L)) {
x[i] = generator.get();
}
at::vec::transpose_pack_vnni4(x, y, ld_src, K, N);
int64_t _N = N / 4;
for (int64_t k = 0; k < K; k++) {
for(int64_t n = 0; n < _N; n++) {
for(int64_t l = 0; l < 4; l++) {
ref[n * ld_dst + k * 4 + l] =
c10::load(&(x[k * ld_src + n * 4 + l]));
}
}
}
for (const auto i : c10::irange(L)) {
ASSERT_EQ(y[i], ref[i])
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
TYPED_TEST(FunctionalTests, Map) {
using vec = TypeParam;
@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -170,15 +170,15 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0

google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0

google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0

@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0

Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8
@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -122,7 +122,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -142,7 +142,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5

YituTechConvBert,pass,5

meta-llama/Llama-3.2-1B,eager_fail_to_run,0

google/gemma-2-2b,eager_fail_to_run,0

google/gemma-3-4b-it,eager_fail_to_run,0

openai/whisper-tiny,eager_fail_to_run,0

Qwen/Qwen3-0.6B,eager_fail_to_run,0

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -181,7 +181,7 @@ hf_T5_base,pass,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -205,7 +205,7 @@ llama,pass,0

llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0

@ -178,7 +178,7 @@ llama,fail_to_run,0

llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0

@ -181,7 +181,7 @@ hf_T5_base,pass,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -198,7 +198,7 @@ llama,pass,0

llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5

YituTechConvBert,pass,5

meta-llama/Llama-3.2-1B,eager_failed_to_run,0

google/gemma-2-2b,eager_failed_to_run,0

google/gemma-3-4b-it,eager_failed_to_run,0

openai/whisper-tiny,eager_failed_to_run,0

Qwen/Qwen3-0.6B,eager_failed_to_run,0

@ -181,7 +181,7 @@ hf_T5_base,pass,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -198,7 +198,7 @@ llama,pass,0

llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0
@ -3580,10 +3580,18 @@ def process_caching_precompile():
)
from torch._dynamo.precompile_context import PrecompileContext

debug_info = PrecompileContext.save_to_dynamo_cache()
print(
f"Saved {len(debug_info['dynamo'])} precompile artifacts with {len(debug_info['backends'])} backends"
)
# Serialize all callables, clear PrecompileContext
# TODO: put this under torch.compiler API once ready
serialized = PrecompileContext.serialize()
PrecompileContext.clear()
if serialized is not None:
artifacts, info = serialized
print(
f"Saving {len(info.precompile_dynamo_artifacts)} Precompile Artifact(s)..."
)
results = PrecompileContext.deserialize(artifacts)
assert results is not None
PrecompileContext.populate_caches(results)

def process_entry(rank, runner, original_dir, args):
@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1

add_loop_inductor,compile_time_instruction_count,29660000000,0.1
add_loop_inductor,compile_time_instruction_count,30280000000,0.1

@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1

aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1

aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1

aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1

aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1

aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1

aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1

@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1

mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1

@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1

basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
# for targets in subfolders
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"

C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")

# a dictionary maps third party library name to fbsource and oss target
THIRD_PARTY_LIBS = {
@ -948,6 +948,7 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -1997,21 +1998,7 @@ def define_buck_targets(
third_party("sleef_arm"),
],
}),
compiler_flags = get_aten_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android-arm32": [
"-mfpu=vfpv3-d16",
"-march=armv7-a",
"-mthumb",
"-mfpu=neon",
],
"ovr_config//os:android-x86_32": [
"-mssse3",
],
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}),
compiler_flags = get_aten_compiler_flags(),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [
":aten_header",
@ -2047,6 +2034,7 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
@ -1,5 +1,4 @@
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/FakeGuardImpl.h>
#include <array>

namespace c10::impl {
@ -15,26 +14,4 @@ DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
}

namespace {
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
nullptr;
}

void ensureCUDADeviceGuardSet() {
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);

const DeviceGuardImplInterface* p =
device_guard_impl_registry[cuda_idx].load();

// A non-null `ptr` indicates that the CUDA guard is already set up,
// implying this is using cuda build
if (p && p->deviceCount() == 0) {
// In following cases, we override CUDA guard interface with a no-op
// device guard. When p->deviceCount() == 0, cuda build is enabled, but no
// cuda devices available.
tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
}
}

} // namespace c10::impl

@ -6,7 +6,6 @@
#include <c10/util/Exception.h>

// Just for C10_ANONYMOUS_VARIABLE
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Registry.h>

#include <array>
@ -252,7 +251,7 @@ struct C10_API DeviceGuardImplInterface {
// for devices that don't actually have a concept of device index. Prominent
// examples are CPU and Meta.
template <DeviceType D>
struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface {
NoOpDeviceGuardImpl() = default;
DeviceType type() const override {
return D;
@ -372,7 +371,5 @@ inline bool hasDeviceGuardImpl(DeviceType type) {
return device_guard_impl_registry[static_cast<size_t>(type)].load();
}

void C10_API ensureCUDADeviceGuardSet();

} // namespace impl
} // namespace c10
@ -13,11 +13,10 @@ struct C10_API PyInterpreterHooksInterface {

// Get the PyInterpreter instance
// Stub implementation throws error when Python is not available
// We return nullptr rather than throwing an error since there are bits of c10
// that expect an empty PyObjectSlot when python is not available.
virtual PyInterpreter* getPyInterpreter() const {
TORCH_CHECK(
false,
"PyTorch was compiled without Python support. "
"Cannot access Python interpreter from C++.");
return nullptr;
}
};

@ -2,7 +2,7 @@

namespace c10::impl {

PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}

PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {

void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*pyobj_interpreter_.load(std::memory_order_acquire))
(*getGlobalPyInterpreter())
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
}

PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return pyobj_interpreter_.load(std::memory_order_acquire);
return getGlobalPyInterpreter();
}

PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
}

PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
auto interpreter = getGlobalPyInterpreter();
if (interpreter) {
return *interpreter;
}
@ -6,10 +6,17 @@
#include <c10/util/python_stub.h>
#include <optional>

#include <atomic>

namespace c10::impl {

// Function pointer type for getting the global interpreter
using GetPyInterpreterFn = PyInterpreter* (*)();

// Global function pointer (set by csrc initialization)
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;

// Helper function to get the global interpreter
C10_API PyInterpreter* getGlobalPyInterpreter();

struct C10_API PyObjectSlot {
public:
PyObjectSlot();
@ -26,8 +33,6 @@ struct C10_API PyObjectSlot {
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}

@ -55,18 +60,15 @@ struct C10_API PyObjectSlot {

// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
std::optional<PyObject*> check_pyobj() const {
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
if (interpreter == nullptr || pyobj_ == nullptr) {
return std::nullopt;
}

if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
if (c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
return _unchecked_untagged_pyobj();
}

PyInterpreter& load_pyobj_interpreter() const;
@ -76,30 +78,6 @@ struct C10_API PyObjectSlot {
void set_owns_pyobj(bool b);

private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
std::atomic<PyInterpreter*> pyobj_interpreter_;

// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be
@ -18,9 +18,9 @@ cuda_supported_platforms = [

def define_c10_ovrsource(name, is_mobile):
if is_mobile:
pp_flags = ["-DC10_MOBILE=1"]
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
else:
pp_flags = []
pp_flags = ["-DC10_USE_GLOG"]

oxx_static_library(
name = name,
@ -540,11 +540,9 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)

if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()

@ -575,32 +573,30 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()

set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -633,11 +629,9 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1358,12 +1352,10 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1468,46 +1460,40 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()

# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()

if(NOT INTERN_BUILD_MOBILE)
@ -114,20 +114,14 @@ inline float32x4_t vexpq_f32(float32x4_t x) {

auto poly = svset_neonq(svundef_f32(), vfmaq_f32(scale, p12345, scale));

auto pHigh = svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input);
auto pLow = svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input);

auto bound = svsel_f32(
pHigh,
inf,
zero);

auto pCombined = svorr_b_z(svptrue_b8(), pLow, pHigh);

// Handle underflow and overflow.
poly = svsel_f32(
pCombined,
bound,
svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input),
zero,
poly);
poly = svsel_f32(
svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input),
inf,
poly);

return svget_neonq(poly);

@ -73,6 +73,19 @@ void box_cox_zero_lambda(
}
}

template <typename T>
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
at::vec::Vectorized<T> data,
at::vec::Vectorized<T> lambda1,
at::vec::Vectorized<T> lambda2,
at::vec::Vectorized<T> k_eps) {
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
}

template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j);
}
for ( ;j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * lambda_over_1 - lambda_over_1;
if (j < D) {
auto remaining = D - j;
auto data = Vec::loadu(data_ptr + j, remaining);
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j, remaining);
}
}
#else
@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

@ -158,7 +158,6 @@ function(caffe2_print_configuration_summary)
if(${USE_KLEIDIAI})
message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
endif()
message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
@ -193,13 +192,11 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()

@ -482,7 +482,6 @@ function(torch_update_find_cuda_flags)
endfunction()

include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)

##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
@ -512,22 +511,3 @@ function(target_compile_options_if_supported target flag)
target_compile_options(${target} PRIVATE ${flag})
endif()
endfunction()

# Check if a global link option is supported
function(add_link_options_if_supported flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
add_link_options("LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()

function(target_link_options_if_supported tgt flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()
Some files were not shown because too many files have changed in this diff.