From c1c94cb0be098104ebec52e368abf00fef75b326 Mon Sep 17 00:00:00 2001
From: Andrea Frittoli <andrea.frittoli@uk.ibm.com>
Date: Fri, 8 Nov 2024 13:28:27 +0000
Subject: [PATCH] Build magma binary tarballs for various cuda (#139888)

This is a first step towards removing the build's dependency on conda.

Currently we build magma as a conda package in a pytorch conda channel, implemented in https://github.com/pytorch/builder/tree/a1b372dbda2e9e3bd946cf135aa3b0137dfdf052/magma.

This commit adapts the logic from pytorch/builder as follows:
- use pytorch/manylinux-cuda<cuda-version> as base image
- apply patches and invoke the build.sh script directly (no longer through conda build)
- store license and build files along with the built artifact, in an info subfolder
- create a tarball file which resembles that created by conda, without any conda-specific metadata

A new matrix workflow is added, which runs the build for each supported cuda version, and uploads the binaries to the pytorch s3 bucket.

For the upload, define an upload.sh script, which will be used by the magma windows job as well, to upload to `s3://ossci-*` buckets.

The build runs on PR and push, upload runs in DRY_RUN mode in case of PR.

Fixes #139397

Pull Request resolved: https://github.com/pytorch/pytorch/pull/139888
Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/seemethere
---
 .ci/magma/.gitignore                        |   2 +
 .ci/magma/Makefile                          |  48 +++
 .ci/magma/README.md                         |  50 +++
 .ci/magma/build_magma.sh                    |  50 +++
 .ci/magma/package_files/CMake.patch         |  40 ++
 .ci/magma/package_files/build.sh            |  12 +
 .ci/magma/package_files/cmakelists.patch    | 388 ++++++++++++++++++++
 .ci/magma/package_files/getrf_nbparam.patch |  40 ++
 .ci/magma/package_files/getrf_shfl.patch    |  15 +
 .ci/magma/package_files/magma-2.6.1.sha256  |   1 +
 .ci/magma/package_files/thread_queue.patch  |  20 +
 .github/scripts/upload_aws_ossci.sh         |  41 +++
 .github/workflows/build-magma-linux.yml     |  67 ++++
 .lintrunner.toml                            |   1 +
 14 files changed, 775 insertions(+)
 create mode 100644 .ci/magma/.gitignore
 create mode 100644 .ci/magma/Makefile
 create mode 100644 .ci/magma/README.md
 create mode 100755 .ci/magma/build_magma.sh
 create mode 100644 .ci/magma/package_files/CMake.patch
 create mode 100755 .ci/magma/package_files/build.sh
 create mode 100644 .ci/magma/package_files/cmakelists.patch
 create mode 100644 .ci/magma/package_files/getrf_nbparam.patch
 create mode 100644 .ci/magma/package_files/getrf_shfl.patch
 create mode 100644 .ci/magma/package_files/magma-2.6.1.sha256
 create mode 100644 .ci/magma/package_files/thread_queue.patch
 create mode 100644 .github/scripts/upload_aws_ossci.sh
 create mode 100644 .github/workflows/build-magma-linux.yml
diff --git a/.ci/magma/.gitignore b/.ci/magma/.gitignore
new file mode 100644
index 000000000000..cf874d9dd01b
--- /dev/null
+++ b/.ci/magma/.gitignore
@@ -0,0 +1,2 @@
+output/
+magma-cuda*/
diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile
new file mode 100644
index 000000000000..1c4599ee240e
--- /dev/null
+++ b/.ci/magma/Makefile
@@ -0,0 +1,58 @@
+SHELL=/usr/bin/env bash
+
+# Container runtime used to invoke the build image; override with e.g. DOCKER_CMD=podman.
+DOCKER_CMD ?= docker
+# Default CUDA version; each magma-cudaNNn target below overrides it.
+DESIRED_CUDA ?= 11.8
+# DESIRED_CUDA with the dots removed (e.g. 12.4 -> 124); used in image and package names.
+DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
+PACKAGE_NAME = magma-cuda
+# nvcc -gencode flags, passed to the container as the CUDA_ARCH_LIST environment variable.
+CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
+
+# Runs magma/build_magma.sh in the pytorch/manylinux-cuda<DESIRED_CUDA_SHORT> image with
+# the repository's .ci directory mounted at /builder. Deliberately recursively expanded
+# (=) so that target-specific DESIRED_CUDA / CUDA_ARCH_LIST values apply at recipe time.
+DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
+	-v "$(shell git rev-parse --show-toplevel)/.ci:/builder" \
+	-w /builder \
+	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
+	-e DESIRED_CUDA=${DESIRED_CUDA} \
+	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
+	"pytorch/manylinux-cuda${DESIRED_CUDA_SHORT}" \
+	magma/build_magma.sh
+
+# Build the tarballs for every CUDA version listed below.
+.PHONY: all
+all: magma-cuda126
+all: magma-cuda124
+all: magma-cuda121
+all: magma-cuda118
+
+# Remove the per-version build workspaces and the output tarballs.
+.PHONY: clean
+clean:
+	$(RM) -r magma-*
+	$(RM) -r output
+
+.PHONY: magma-cuda126
+magma-cuda126: DESIRED_CUDA := 12.6
+magma-cuda126:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda124
+magma-cuda124: DESIRED_CUDA := 12.4
+magma-cuda124:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda121
+magma-cuda121: DESIRED_CUDA := 12.1
+magma-cuda121:
+	$(DOCKER_RUN)
+
+# The CUDA 11.8 build appends an sm_37 -gencode entry to the arch list.
+.PHONY: magma-cuda118
+magma-cuda118: DESIRED_CUDA := 11.8
+magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
+magma-cuda118:
+	$(DOCKER_RUN)
diff --git a/.ci/magma/README.md b/.ci/magma/README.md
new file mode 100644
index 000000000000..c343b4a8cdce
--- /dev/null
+++ b/.ci/magma/README.md
@@ -0,0 +1,50 @@
+# Magma
+
+This folder contains the scripts and configurations to build magma, statically linked for various versions of CUDA.
+
+## Building
+
+Look in the `Makefile` for available targets to build. To build any target, for example `magma-cuda118`, run
+
+```
+# Using `docker`
+make magma-cuda118
+
+# Using `podman`
+DOCKER_CMD=podman make magma-cuda118
+```
+
+This spawns a `pytorch/manylinux-cuda<version>` docker image, which has the required `devtoolset` and CUDA versions installed.
+Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
+into a tarball, with the following structure:
+
+```
+.
+├── include       # header files
+├── lib           # libmagma.a
+├── info
+│   ├── licenses  # license file
+│   └── recipe    # build script and patches
+```
+
+More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the CUDA version.
+Outputted binaries should be in the `output` folder.
+
+
+## Pushing
+
+Packages can be uploaded to an S3 bucket using:
+
+```
+aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path>
+```
+
+If you do not have upload permissions, please ping @seemethere or @soumith to gain access
+
+## New versions
+
+New CUDA versions can be added by creating a new make target with the next desired version. For CUDA version NN.n, the target should be named `magma-cudaNNn`.
+
+Make sure to edit the appropriate environment variables (e.g., DESIRED_CUDA, CUDA_ARCH_LIST) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
+
+New patches can be added by editing `Makefile` and `build_magma.sh` the same way `getrf_nbparam.patch` is implemented.
diff --git a/.ci/magma/build_magma.sh b/.ci/magma/build_magma.sh
new file mode 100755
index 000000000000..541ae64b7537
--- /dev/null
+++ b/.ci/magma/build_magma.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+# Environment variables
+# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+MAGMA_VERSION=2.6.1
+
+# Folders for the build
+PACKAGE_FILES=${ROOT_DIR}/magma/package_files # source patches and metadata
+PACKAGE_DIR=${ROOT_DIR}/magma/${PACKAGE_NAME} # build workspace
+PACKAGE_OUTPUT=${ROOT_DIR}/magma/output # where tarballs are stored
+PACKAGE_BUILD=${PACKAGE_DIR}/build # where the content of the tarball is prepared
+PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
+PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
+mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
+
+# Fetch magma sources and verify checksum
+pushd ${PACKAGE_DIR}
+curl -LO http://icl.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz
+tar zxf magma-${MAGMA_VERSION}.tar.gz
+sha256sum --check < ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256
+popd
+
+# Apply patches and build
+pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
+patch < ${PACKAGE_FILES}/CMake.patch
+patch < ${PACKAGE_FILES}/cmakelists.patch
+patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
+patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
+patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
+# The build.sh script expects to be executed from the sources root folder
+INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
+popd
+
+# Package recipe, license and tarball
+# Folder and package name are backward compatible for the build workflow
+cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
+cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
+cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
+cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
+cp ${PACKAGE_FILES}/getrf_nbparam.patch ${PACKAGE_RECIPE}/getrf_nbparam.patch
+cp ${PACKAGE_FILES}/CMake.patch ${PACKAGE_RECIPE}/CMake.patch
+cp ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256 ${PACKAGE_RECIPE}/magma-${MAGMA_VERSION}.sha256
+cp ${PACKAGE_DIR}/magma-${MAGMA_VERSION}/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
+pushd ${PACKAGE_BUILD}
+tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
+echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
+popd
\ No newline at end of file
diff --git a/.ci/magma/package_files/CMake.patch b/.ci/magma/package_files/CMake.patch
new file mode 100644
index 000000000000..5d4636bfa09f
--- /dev/null
+++ b/.ci/magma/package_files/CMake.patch
@@ -0,0 +1,40 @@
+--- CMake.src.cuda	2023-03-29 10:05:32.136954140 +0000
++++ CMake.src.cuda	2023-03-29 10:05:50.281318043 +0000
+@@ -283,10 +283,10 @@
+ magmablas/zgeadd.cu
+ magmablas/zgeadd2.cu
+ magmablas/zgeam.cu
+-magmablas/zgemm_fermi.cu
++#magmablas/zgemm_fermi.cu
+ magmablas/zgemm_reduce.cu
+ magmablas/zgemv_conj.cu
+-magmablas/zgemv_fermi.cu
++#magmablas/zgemv_fermi.cu
+ magmablas/zgerbt.cu
+ magmablas/zgerbt_kernels.cu
+ magmablas/zgetmatrix_transpose.cpp
+@@ -1009,18 +1009,18 @@
+ magmablas/sgeam.cu
+ magmablas/dgeam.cu
+ magmablas/cgeam.cu
+-magmablas/sgemm_fermi.cu
+-magmablas/dgemm_fermi.cu
+-magmablas/cgemm_fermi.cu
++#magmablas/sgemm_fermi.cu
++#magmablas/dgemm_fermi.cu
++#magmablas/cgemm_fermi.cu
+ magmablas/sgemm_reduce.cu
+ magmablas/dgemm_reduce.cu
+ magmablas/cgemm_reduce.cu
+ magmablas/sgemv_conj.cu
+ magmablas/dgemv_conj.cu
+ magmablas/cgemv_conj.cu
+-magmablas/sgemv_fermi.cu
+-magmablas/dgemv_fermi.cu
+-magmablas/cgemv_fermi.cu
++#magmablas/sgemv_fermi.cu
++#magmablas/dgemv_fermi.cu
++#magmablas/cgemv_fermi.cu
+ magmablas/sgerbt.cu
+ magmablas/dgerbt.cu
+ magmablas/cgerbt.cu
diff --git a/.ci/magma/package_files/build.sh b/.ci/magma/package_files/build.sh
new file mode 100755
index 000000000000..8aa79a92d472
--- /dev/null
+++ b/.ci/magma/package_files/build.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Configures, builds and installs magma; must be run from the magma source root.
+# Reads DESIRED_CUDA, INSTALL_DIR and CUDA_ARCH_LIST from the environment.
+
+# Abort on the first failing command so a broken cmake/make step is not masked
+# by the exit status of the final `cd ..`.
+set -e
+
+CUDA__VERSION=$(nvcc --version|sed -n 4p|cut -f5 -d" "|cut -f1 -d",")
+if [ "$CUDA__VERSION" != "$DESIRED_CUDA" ]; then
+    echo "CUDA Version is not $DESIRED_CUDA. CUDA Version found: $CUDA__VERSION"
+    exit 1
+fi
+
+mkdir -p build
+cd build
+cmake .. -DUSE_FORTRAN=OFF -DGPU_TARGET="All" -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DCUDA_ARCH_LIST="$CUDA_ARCH_LIST"
+make -j"$(getconf _NPROCESSORS_CONF)"
+make install
+cd ..
diff --git a/.ci/magma/package_files/cmakelists.patch b/.ci/magma/package_files/cmakelists.patch
new file mode 100644
index 000000000000..52c21720d6a6
--- /dev/null
+++ b/.ci/magma/package_files/cmakelists.patch
@@ -0,0 +1,388 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index d5d8d87d..8a507334 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 2.8.1 )
+ # ----------------------------------------
+ # to disable Fortran, set this to "off"
+ # see also -DADD_ below
+-option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" ON )
++option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" OFF )
+
+ if (USE_FORTRAN)
+     project( MAGMA C CXX Fortran )
+@@ -75,6 +75,8 @@ else()
+     message( WARNING "The compiler ${CMAKE_CXX_COMPILER} doesn't support the -std=c++11 flag. Some code may not compile.")
+ endif()
+
++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++ -fno-exceptions")
++
+ CHECK_C_COMPILER_FLAG("-std=c99" COMPILER_SUPPORTS_C99)
+ if (COMPILER_SUPPORTS_C99)
+     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
+@@ -101,15 +103,15 @@ endif()
+
+
+ # ----------------------------------------
+-# locate OpenMP
+-find_package( OpenMP )
+-if (OPENMP_FOUND)
+-    message( STATUS "Found OpenMP" )
+-    message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
+-    message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
+-    set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
+-    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
+-endif()
++# # locate OpenMP
++# find_package( OpenMP )
++# if (OPENMP_FOUND)
++#     message( STATUS "Found OpenMP" )
++#     message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
++#     message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
++#     set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
++#     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
++# endif()
+
+ if (MAGMA_ENABLE_CUDA)
+   # ----------------------------------------
+@@ -132,7 +134,7 @@ if (MAGMA_ENABLE_CUDA)
+     set( NV_SM    "" )
+     set( NV_COMP  "" )
+
+-    set(CUDA_SEPARABLE_COMPILATION ON)
++    set(CUDA_SEPARABLE_COMPILATION OFF)
+
+     # nvcc >= 6.5 supports -std=c++11, so propagate CXXFLAGS to NVCCFLAGS.
+     # Older nvcc didn't support -std=c++11, so previously we disabled propagation.
+@@ -294,11 +296,18 @@ if (MAGMA_ENABLE_CUDA)
+         message( STATUS "    compile for CUDA arch 8.0 (Ampere)" )
+     endif()
+
++    if ( ${GPU_TARGET} MATCHES "All")
++        set( MIN_ARCH 370)
++        SET( NV_SM ${CUDA_ARCH_LIST})
++        SET( NV_COMP "")
++    endif()
++
+     if (NOT MIN_ARCH)
+         message( FATAL_ERROR "GPU_TARGET must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, or valid sm_[0-9][0-9]" )
+     endif()
+
+-    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
++    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DHAVE_CUBLAS -Xfatbin -compress-all -Xcompiler -fPIC -std=c++11 ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
++    MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+     #add_definitions( "-DMAGMA_HAVE_CUDA -DMAGMA_CUDA_ARCH_MIN=${MIN_ARCH}" )
+     set(MAGMA_HAVE_CUDA "1")
+     set(MAGMA_CUDA_ARCH_MIN "${MIN_ARCH}")
+@@ -413,7 +422,7 @@ set_property(CACHE BLA_VENDOR PROPERTY STRINGS
+ set( LAPACK_LIBRARIES "" CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" )
+ if (LAPACK_LIBRARIES STREQUAL "")
+     message( STATUS "Searching for BLAS and LAPACK. To override, set LAPACK_LIBRARIES using ccmake." )
+-    find_package( LAPACK )
++    # find_package( LAPACK )
+     # force showing updated LAPACK_LIBRARIES in ccmake / cmake-gui.
+     set( LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" FORCE )
+ else()
+@@ -552,12 +561,12 @@ if (WIN32)
+     #message( "libmagma_all_f   ${libmagma_all_f}"   )
+
+     # on Windows, Fortran files aren't compiled if listed here...
+-    cuda_add_library( magma ${libmagma_all_cpp} )
++    cuda_add_library( magma STATIC ${libmagma_all_cpp} OPTIONS --compiler-options "-fPIC")
+     target_link_libraries( magma
+         ${LAPACK_LIBRARIES}
+         ${CUDA_CUDART_LIBRARY}
+         ${CUDA_CUBLAS_LIBRARIES}
+-        ${CUDA_cusparse_LIBRARY}
++        # ${CUDA_cusparse_LIBRARY}
+     )
+
+     # no Fortran files at the moment (how to test libmagma_all_f is not empty?),
+@@ -575,13 +584,13 @@ if (WIN32)
+ else()
+     # Unix doesn't seem to have a problem with mixing C, CUDA, and Fortran files
+     if (MAGMA_ENABLE_CUDA)
+-      cuda_add_library( magma ${libmagma_all} )
++      cuda_add_library( magma STATIC ${libmagma_all} OPTIONS --compiler-options "-fPIC")
+       target_link_libraries( magma
+         ${blas_fix}
+         ${LAPACK_LIBRARIES}
+         ${CUDA_CUDART_LIBRARY}
+         ${CUDA_CUBLAS_LIBRARIES}
+-        ${CUDA_cusparse_LIBRARY}
++        # ${CUDA_cusparse_LIBRARY}
+ 	)
+     else()
+       find_package( hipBLAS )
+@@ -614,138 +623,139 @@ else()
+     endif()
+ endif()
+ add_custom_target( lib DEPENDS magma )
+-
+-
+-# ----------------------------------------
+-# compile lapacktest library
+-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
+-# else,           compile only C++     files, not Fortran files
+-if (USE_FORTRAN)
+-    foreach( filename ${liblapacktest_all} )
+-        if (filename MATCHES "\\.(f|f90|F90)$")
+-            list( APPEND liblapacktest_all_f ${filename} )
+-        endif()
+-    endforeach()
+-    add_library( lapacktest ${liblapacktest_all_f} )
+-else()
+-    # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
+-    foreach( filename ${liblapacktest_all} )
+-        if (filename MATCHES "\\.(c|cu|cpp)$")
+-            list( APPEND liblapacktest_all_cpp ${filename} )
+-        endif()
+-    endforeach()
+-    add_library( lapacktest ${liblapacktest_all_cpp} )
+-endif()
+-target_link_libraries( lapacktest
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-)
+-
+-
+-# ----------------------------------------
+-# compile tester library
+-add_library( tester ${libtest_all} )
+-target_link_libraries( tester
+-    magma
+-    lapacktest
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-)
++set_target_properties(magma PROPERTIES POSITION_INDEPENDENT_CODE ON)
++
++
++# # ----------------------------------------
++# # compile lapacktest library
++# # If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
++# # else,           compile only C++     files, not Fortran files
++# if (USE_FORTRAN)
++#     foreach( filename ${liblapacktest_all} )
++#         if (filename MATCHES "\\.(f|f90|F90)$")
++#             list( APPEND liblapacktest_all_f ${filename} )
++#         endif()
++#     endforeach()
++#     add_library( lapacktest ${liblapacktest_all_f} )
++# else()
++#     # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
++#     foreach( filename ${liblapacktest_all} )
++#         if (filename MATCHES "\\.(c|cu|cpp)$")
++#             list( APPEND liblapacktest_all_cpp ${filename} )
++#         endif()
++#     endforeach()
++#     add_library( lapacktest ${liblapacktest_all_cpp} )
++# endif()
++# target_link_libraries( lapacktest
++#     ${blas_fix}
++#     ${LAPACK_LIBRARIES}
++# )
++
++
++# # ----------------------------------------
++# # compile tester library
++# add_library( tester ${libtest_all} )
++# target_link_libraries( tester
++#     magma
++#     lapacktest
++#     ${blas_fix}
++#     ${LAPACK_LIBRARIES}
++# )
+
+
+ # ----------------------------------------
+ # compile MAGMA sparse library
+
+ # sparse doesn't have Fortran at the moment, so no need for above shenanigans
+-if (MAGMA_ENABLE_CUDA)
+-  include_directories( sparse/include )
+-  include_directories( sparse/control )
+-else()
+-  include_directories( sparse_hip/include )
+-  include_directories( sparse_hip/control )
+-endif()
+-include_directories( testing )
+-
+-if (MAGMA_ENABLE_CUDA)
+-  cuda_add_library( magma_sparse ${libsparse_all} )
+-  target_link_libraries( magma_sparse
+-    magma
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-    ${CUDA_CUDART_LIBRARY}
+-    ${CUDA_CUBLAS_LIBRARIES}
+-    ${CUDA_cusparse_LIBRARY}
+-    )
+-else()
+-  add_library( magma_sparse ${libsparse_all} )
+-  target_link_libraries( magma_sparse
+-    magma
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-    hip::device
+-    roc::hipblas
+-    roc::hipsparse
+-    )
+-endif()
+-add_custom_target( sparse-lib DEPENDS magma_sparse )
+-
+-
+-# ----------------------------------------
+-# compile each tester
+-
+-# save testers to testing/
+-# save tester lib files to testing_lib/ to avoid cluttering lib/
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
+-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
+-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
+-
+-# skip Fortran testers, which require an extra file from CUDA
+-foreach( filename ${testing_all} )
+-    if (filename MATCHES "\\.(c|cu|cpp)$")
+-        list( APPEND testing_all_cpp ${filename} )
+-    endif()
+-endforeach()
+-foreach( TEST ${testing_all_cpp} )
+-    string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
+-    string( REGEX REPLACE "testing/" "" EXE ${EXE} )
+-    #message( "${TEST} --> ${EXE}" )
+-    add_executable( ${EXE} ${TEST} )
+-    target_link_libraries( ${EXE} tester lapacktest magma )
+-    list( APPEND testing ${EXE} )
+-endforeach()
+-add_custom_target( testing DEPENDS ${testing} )
+-
+-
+-# ----------------------------------------
+-# compile each sparse tester
+-
+-if (MAGMA_ENABLE_CUDA)
+-  set(SPARSE_TEST_DIR "sparse/testing")
+-else()
+-  set(SPARSE_TEST_DIR "sparse_hip/testing")
+-endif()
+-
+-
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
+-cmake_policy( SET CMP0037 OLD)
+-foreach( TEST ${sparse_testing_all} )
+-    string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
+-    string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
+-    #message( "${TEST} --> ${EXE}" )
+-    add_executable( ${EXE} ${TEST} )
+-    target_link_libraries( ${EXE} magma_sparse magma )
+-    list( APPEND sparse-testing ${EXE} )
+-endforeach()
+-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
++# if (MAGMA_ENABLE_CUDA)
++#   include_directories( sparse/include )
++#   include_directories( sparse/control )
++# else()
++#   include_directories( sparse_hip/include )
++#   include_directories( sparse_hip/control )
++# endif()
++# include_directories( testing )
++
++# if (MAGMA_ENABLE_CUDA)
++#   cuda_add_library( magma_sparse ${libsparse_all} )
++#   target_link_libraries( magma_sparse
++#     magma
++#     ${blas_fix}
++#     ${LAPACK_LIBRARIES}
++#     ${CUDA_CUDART_LIBRARY}
++#     ${CUDA_CUBLAS_LIBRARIES}
++#     ${CUDA_cusparse_LIBRARY}
++#     )
++# else()
++#   add_library( magma_sparse ${libsparse_all} )
++#   target_link_libraries( magma_sparse
++#     magma
++#     ${blas_fix}
++#     ${LAPACK_LIBRARIES}
++#     hip::device
++#     roc::hipblas
++#     roc::hipsparse
++#     )
++# endif()
++# add_custom_target( sparse-lib DEPENDS magma_sparse )
++
++
++# # ----------------------------------------
++# # compile each tester
++
++# # save testers to testing/
++# # save tester lib files to testing_lib/ to avoid cluttering lib/
++# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
++# set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
++# set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
++
++# # skip Fortran testers, which require an extra file from CUDA
++# foreach( filename ${testing_all} )
++#     if (filename MATCHES "\\.(c|cu|cpp)$")
++#         list( APPEND testing_all_cpp ${filename} )
++#     endif()
++# endforeach()
++# foreach( TEST ${testing_all_cpp} )
++#     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
++#     string( REGEX REPLACE "testing/" "" EXE ${EXE} )
++#     #message( "${TEST} --> ${EXE}" )
++#     add_executable( ${EXE} ${TEST} )
++#     target_link_libraries( ${EXE} tester lapacktest magma )
++#     list( APPEND testing ${EXE} )
++# endforeach()
++# add_custom_target( testing DEPENDS ${testing} )
++
++
++# # ----------------------------------------
++# # compile each sparse tester
++
++# if (MAGMA_ENABLE_CUDA)
++#   set(SPARSE_TEST_DIR "sparse/testing")
++# else()
++#   set(SPARSE_TEST_DIR "sparse_hip/testing")
++# endif()
++
++
++# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
++# cmake_policy( SET CMP0037 OLD)
++# foreach( TEST ${sparse_testing_all} )
++#     string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
++#     string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
++#     #message( "${TEST} --> ${EXE}" )
++#     add_executable( ${EXE} ${TEST} )
++#     target_link_libraries( ${EXE} magma_sparse magma )
++#     list( APPEND sparse-testing ${EXE} )
++# endforeach()
++# add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
+
+
+ # ----------------------------------------
+ # what to install
+-install( TARGETS magma magma_sparse ${blas_fix}
++install( TARGETS magma ${blas_fix}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib )
+-file( GLOB headers include/*.h sparse/include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
++file( GLOB headers include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
+ if (USE_FORTRAN)
+     install( FILES ${headers} ${modules}
+              DESTINATION include )
+@@ -769,9 +779,9 @@ else()
+     "${blas_fix_lib} ${LAPACK_LIBS} hip::device roc::hipblas roc::hipsparse" )
+ endif()
+ set( MAGMA_REQUIRED "" )
+-configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
+-install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
+-         DESTINATION lib/pkgconfig )
++# configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
++# install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
++#          DESTINATION lib/pkgconfig )
+
+ # ----------------------------------------
+ get_directory_property( compile_definitions COMPILE_DEFINITIONS )
diff --git a/.ci/magma/package_files/getrf_nbparam.patch b/.ci/magma/package_files/getrf_nbparam.patch
new file mode 100644
index 000000000000..ce69c5281d03
--- /dev/null
+++ b/.ci/magma/package_files/getrf_nbparam.patch
@@ -0,0 +1,40 @@
+diff --git a/control/get_batched_crossover.cpp b/control/get_batched_crossover.cpp
+index 4ec57306..912f8608 100644
+--- a/control/get_batched_crossover.cpp
++++ b/control/get_batched_crossover.cpp
+@@ -119,7 +119,7 @@ void magma_get_spotrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 64;
+-    *recnb = 32;
++    *recnb = 16;
+     return;
+ }
+ 
+@@ -127,7 +127,7 @@ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
++    *recnb =  16;
+     return;
+ }
+ 
+@@ -135,7 +135,7 @@ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
++    *recnb =  16;
+     return;
+ }
+ 
+@@ -143,7 +143,7 @@ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_sgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
++    *recnb =  16;
+     return;
+ }
+ 
diff --git a/.ci/magma/package_files/getrf_shfl.patch b/.ci/magma/package_files/getrf_shfl.patch
new file mode 100644
index 000000000000..49baae01227c
--- /dev/null
+++ b/.ci/magma/package_files/getrf_shfl.patch
@@ -0,0 +1,15 @@
+diff --git a/src/zgetrf_batched.cpp b/src/zgetrf_batched.cpp
+index 24a65a90..884d9352 100644
+--- a/src/zgetrf_batched.cpp
++++ b/src/zgetrf_batched.cpp
+@@ -116,7 +116,9 @@ magma_zgetrf_batched(
+             return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+         }
+         else{
+-            return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
++            // magma_cgetrf_batched_smallsq_shfl is broken, therefore let's call noshfl version for arch < 700
++            // return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
++            return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+         }
+         #else
+         return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
diff --git a/.ci/magma/package_files/magma-2.6.1.sha256 b/.ci/magma/package_files/magma-2.6.1.sha256
new file mode 100644
index 000000000000..1a0b85508ba1
--- /dev/null
+++ b/.ci/magma/package_files/magma-2.6.1.sha256
@@ -0,0 +1 @@
+6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213  magma-2.6.1.tar.gz
\ No newline at end of file
diff --git a/.ci/magma/package_files/thread_queue.patch b/.ci/magma/package_files/thread_queue.patch
new file mode 100644
index 000000000000..1c2fa400ff13
--- /dev/null
+++ b/.ci/magma/package_files/thread_queue.patch
@@ -0,0 +1,20 @@
+--- control/thread_queue.cpp	2016-08-30 06:37:49.000000000 -0700
++++ control/thread_queue.cpp	2016-10-10 19:47:28.911580965 -0700
+@@ -15,7 +15,7 @@
+ {
+     if ( err != 0 ) {
+         fprintf( stderr, "Error: %s (%d)\n", strerror(err), err );
+-        throw std::exception();
++        // throw std::exception();
+     }
+ }
+ 
+@@ -172,7 +172,7 @@
+     check( pthread_mutex_lock( &mutex ));
+     if ( quit_flag ) {
+         fprintf( stderr, "Error: push_task() called after quit()\n" );
+-        throw std::exception();
++        // throw std::exception();
+     }
+     q.push( task );
+     ntask += 1;
diff --git a/.github/scripts/upload_aws_ossci.sh b/.github/scripts/upload_aws_ossci.sh
new file mode 100644
index 000000000000..680bbf7ba733
--- /dev/null
+++ b/.github/scripts/upload_aws_ossci.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+# Upload a binary to a bucket, supports dry-run mode
+
+set -euo pipefail
+
+# Optional inputs. By default upload to s3://ossci-linux
+TARGET_OS=${TARGET_OS:-linux}
+UPLOAD_BUCKET=${UPLOAD_BUCKET:-s3://ossci-${TARGET_OS}}
+UPLOAD_SUBFOLDER=${UPLOAD_SUBFOLDER:-}
+
+# Download to ${{ runner.temp }}/artifacts to match the default
+PKG_DIR=${PKG_DIR:-/tmp/workspace/artifacts}
+
+# Optional package include.
+# By default looks for and uploads *.tar.bz2 files only
+PKG_INCLUDE=${PKG_INCLUDE:-'*.tar.bz2'}
+
+# Dry-run logs the upload command without actually executing it
+# Dry-run is enabled by default, it has to be disabled to upload
+DRY_RUN=${DRY_RUN:-enabled}
+# Don't actually do work unless explicit
+AWS_S3_CP="aws s3 cp --dryrun"
+if [[ "${DRY_RUN}" = "disabled" ]]; then
+  AWS_S3_CP="aws s3 cp"
+fi
+
+# Install dependencies (should be a no-op if previously installed)
+pip install -q awscli
+
+# Handle subfolders, if provided
+s3_root_dir="${UPLOAD_BUCKET}"
+if [[ -z ${UPLOAD_SUBFOLDER:-} ]]; then
+    s3_upload_dir="${s3_root_dir}/"
+else
+    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
+fi
+
+# Upload all packages that match PKG_INCLUDE within PKG_DIR and subdirs
+set -x
+${AWS_S3_CP} --no-progress --acl public-read --exclude="*" --include="${PKG_INCLUDE}" --recursive "${PKG_DIR}" "${s3_upload_dir}"
diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml
new file mode 100644
index 000000000000..a4265160ce00
--- /dev/null
+++ b/.github/workflows/build-magma-linux.yml
@@ -0,0 +1,71 @@
+name: build-linux-magma
+
+# Builds the magma tarballs for each CUDA version in the matrix.
+# Runs on pull requests and on pushes to main that touch the magma build files.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - .ci/magma/*
+      - .ci/magma/package_files/*
+      - .github/workflows/build-magma-linux.yml
+  pull_request:
+    paths:
+      - .ci/magma/*
+      - .ci/magma/package_files/*
+      - .github/workflows/build-magma-linux.yml
+
+defaults:
+  run:
+    shell: bash -x -e -l {0}
+env:
+  BUILD_ENVIRONMENT: build-linux-magma
+  IN_CI: 1
+  IS_GHA: 1
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-linux-magma:
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.2xlarge
+    strategy:
+      matrix:
+        cuda_version: ["124", "121", "118"]  # There is no pytorch/manylinux-cuda126 yet
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+      - name: Build Magma Cuda
+        working-directory: .ci/magma
+        run: |
+          # Produces artifacts under .ci/magma/output/linux-64/magma-cuda*.bz2
+          make magma-cuda${{ matrix.cuda_version }}
+      - name: Save as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: .ci/magma/output/linux-64/magma-cuda*.bz2
+          name: artifact_${{ matrix.cuda_version }}
+      # AWS credentials are only configured for pushes to main
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@v3
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
+          aws-region: us-east-1
+      # upload_aws_ossci.sh defaults to a dry run; only pushes to main upload for real
+      - name: Set DRY_RUN
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        run: |
+            echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
+      - name: Upload binaries
+        shell: bash
+        env:
+            PKG_DIR: ".ci/magma/output/linux-64/"
+            TARGET_OS: "linux"
+            PKG_INCLUDE: "magma-cuda*.tar.bz2"
+        run: |
+            set -ex
+            bash .github/scripts/upload_aws_ossci.sh
\ No newline at end of file
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 191a905719ad..a2c57f940219 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -427,6 +427,7 @@ exclude_patterns = [
     'test/cpp/jit/upgrader_models/*.ptl.ff',
     '.ci/docker/common/install_rocm_drm.sh',
     '.lintrunner.toml',
+    '.ci/magma/package_files/*.patch',
 ]
 command = [
     'python3',