#!/usr/bin/env python3

"""Generates a matrix to be utilized through github actions

Will output a condensed version of the matrix if on a pull request that only
includes the latest version of python we support built on four different
architectures:
    * CPU
    * Latest CUDA
    * Latest ROCM
    * Latest XPU
"""

import os
from typing import Optional


# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
# CUDA versions that arch_type() classifies as "cuda". The same strings key the
# container image tables and PYTORCH_EXTRA_INSTALL_REQUIREMENTS in this file.
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
# One of the CUDA_ARCHES entries singled out as "stable". It is not referenced
# in the part of the file reviewed here -- presumably read by importers; verify.
CUDA_STABLE = "12.8"
# Maps each CUDA_ARCHES entry to a three-component CUDA version string.
CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
    "12.9": "12.9.1",
    "13.0": "13.0.0",
}
# Maps each CUDA_ARCHES entry to a cuDNN version string (presumably the cuDNN
# major version -- confirm against the consumers of this table).
CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
    "12.8": "9",
    "12.9": "9",
    "13.0": "9",
}

# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
# ROCm versions that arch_type() classifies as "rocm".
ROCM_ARCHES = ["6.4", "7.0"]

# Arch keys that arch_type() classifies as "xpu".
XPU_ARCHES = ["xpu"]

# Arch keys that arch_type() classifies as "cpu-aarch64".
CPU_AARCH64_ARCH = ["cpu-aarch64"]

# Arch keys that arch_type() classifies as "cpu-s390x".
CPU_S390X_ARCH = ["cpu-s390x"]

# Arch keys that arch_type() classifies as "cuda-aarch64". Each entry is a
# CUDA_ARCHES entry with an "-aarch64" suffix; translate_desired_cuda() and
# generate_wheels_matrix() strip that suffix again when they need the bare
# CUDA version.
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]


# Extra pip requirements attached to wheel builds, keyed by arch version.
#
# Each value is ONE string: requirement specifiers (optionally followed by
# "; <environment marker>") joined with " | ". generate_wheels_matrix() copies
# the string into the "pytorch_extra_install_requirements" matrix field, and
# get_nccl_wheel_version() splits it on ";" and "|" to find the pinned
# "nvidia-nccl*" version, which validate_nccl_dep_consistency() compares with
# the NCCL pin file. Every CUDA entry restricts its requirements to Linux via
# the platform_system marker.
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
    "12.6": (
        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
    ),
    "12.8": (
        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
    ),
    # NOTE(review): this entry pins nvidia-nvshmem-cu12 to 3.3.20 while the
    # 12.6 and 12.8 entries pin 3.3.24 -- confirm the difference is intentional.
    "12.9": (
        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
    ),
    # Several package names in this entry carry no "-cu12"-style suffix, unlike
    # the CUDA 12.x entries above; only cudnn, cusparselt, nccl and nvshmem are
    # suffixed with "-cu13".
    "13.0": (
        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
        "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
    ),
    # Only the oneccl-devel, oneccl and impi-rt requirements carry an
    # environment marker (Linux on x86_64); the rest are unconditional.
    "xpu": (
        "intel-cmplr-lib-rt==2025.2.1 | "
        "intel-cmplr-lib-ur==2025.2.1 | "
        "intel-cmplr-lic-rt==2025.2.1 | "
        "intel-sycl-rt==2025.2.1 | "
        "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "onemkl-sycl-blas==2025.2.0 | "
        "onemkl-sycl-dft==2025.2.0 | "
        "onemkl-sycl-lapack==2025.2.0 | "
        "onemkl-sycl-rng==2025.2.0 | "
        "onemkl-sycl-sparse==2025.2.0 | "
        "dpcpp-cpp-rt==2025.2.1 | "
        "intel-opencl-rt==2025.2.1 | "
        "mkl==2025.2.0 | "
        "intel-openmp==2025.2.1 | "
        "tbb==2022.2.0 | "
        "tcmlib==1.4.0 | "
        "umf==0.11.0 | "
        "intel-pti==0.13.1"
    ),
}


def get_nccl_wheel_version(arch_version: str) -> str:
    """Return the version pinned for the ``nvidia-nccl*`` wheel of an arch.

    Args:
        arch_version: Key into ``PYTORCH_EXTRA_INSTALL_REQUIREMENTS``.

    Returns:
        The text after ``==`` in the first requirement fragment whose name
        starts with ``nvidia-nccl``.

    Raises:
        KeyError: ``arch_version`` has no requirements entry.
        StopIteration: the entry contains no ``nvidia-nccl*`` requirement.
    """
    import re

    raw_requirements = PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
    # Splitting on both separators turns package pins and environment markers
    # into separate fragments; only a package pin can start with "nvidia-nccl".
    fragments = (piece.strip() for piece in re.split("[;|]", raw_requirements))
    nccl_pin = next(filter(lambda piece: piece.startswith("nvidia-nccl"), fragments))
    return nccl_pin.split("==")[1]


def read_nccl_pin(arch_version: str) -> str:
    """Read the NCCL pin stored for the given arch version.

    The pin file is ``.ci/docker/ci_commit_pins/nccl-cu<NN>.txt`` under the
    third ancestor directory of this file (presumably the repository root),
    where ``<NN>`` is the first two characters of ``arch_version``.

    Args:
        arch_version: Arch version string such as ``"12.8"``.

    Returns:
        The file contents with surrounding whitespace stripped.
    """
    from pathlib import Path

    base_dir = Path(__file__).absolute().parents[2]
    version_prefix = arch_version[:2]
    pin_file = (
        base_dir / ".ci" / "docker" / "ci_commit_pins" / f"nccl-cu{version_prefix}.txt"
    )
    with pin_file.open() as handle:
        contents = handle.read()
    return contents.strip()


def validate_nccl_dep_consistency(arch_version: str) -> None:
    """Check that the NCCL pin file agrees with the NCCL wheel requirement.

    The pinned release tag must start with ``v`` followed by the wheel version
    found in ``PYTORCH_EXTRA_INSTALL_REQUIREMENTS`` for ``arch_version``.

    Args:
        arch_version: Arch version string such as ``"12.8"``.

    Raises:
        RuntimeError: the pinned tag does not start with the wheel version.
    """
    pinned_tag = read_nccl_pin(arch_version)
    wheel_version = get_nccl_wheel_version(arch_version)
    expected_prefix = f"v{wheel_version}"
    if pinned_tag.startswith(expected_prefix):
        return
    raise RuntimeError(
        f"{arch_version} NCCL release tag version {pinned_tag} does not correspond to wheel version {wheel_version}"
    )


def arch_type(arch_version: str) -> str:
    """Classify an arch version string into its arch type.

    Args:
        arch_version: An entry of one of the ``*_ARCHES`` / ``*_ARCH`` lists,
            or any other string.

    Returns:
        ``"cuda"``, ``"rocm"``, ``"xpu"``, ``"cpu-aarch64"``, ``"cpu-s390x"``
        or ``"cuda-aarch64"`` for a member of the corresponding list;
        ``"cpu"`` for anything else.
    """
    # Checked in order; the first list containing arch_version wins.
    known_arches = (
        (CUDA_ARCHES, "cuda"),
        (ROCM_ARCHES, "rocm"),
        (XPU_ARCHES, "xpu"),
        (CPU_AARCH64_ARCH, "cpu-aarch64"),
        (CPU_S390X_ARCH, "cpu-s390x"),
        (CUDA_AARCH64_ARCHES, "cuda-aarch64"),
    )
    for members, label in known_arches:
        if arch_version in members:
            return label
    # arch_version should always be "cpu" in this case
    return "cpu"


# Value of the RELEASE_VERSION_TAG environment variable, or "main" when unset.
DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main")

# Maps an arch version key to "<image name>:<tag prefix>". generate_wheels_matrix()
# splits each value on ":" to fill the "container_image" and
# "container_image_tag_prefix" matrix fields. The aarch64 CUDA keys keep their
# "-aarch64" suffix, but the suffix is removed from the tag prefix.
WHEEL_CONTAINER_IMAGES = {
    **{gpu_arch: f"manylinux2_28-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES},
    **{
        gpu_arch: f"manylinuxaarch64-builder:cuda{gpu_arch.replace('-aarch64', '')}"
        for gpu_arch in CUDA_AARCH64_ARCHES
    },
    **{gpu_arch: f"manylinux2_28-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES},
    "xpu": "manylinux2_28-builder:xpu",
    "cpu": "manylinux2_28-builder:cpu",
    "cpu-aarch64": "manylinux2_28_aarch64-builder:cpu-aarch64",
    "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x",
}

# Build configuration names; presumably the values passed as `release_type` to
# generate_libtorch_matrix() -- verify against callers.
RELEASE = "release"
DEBUG = "debug"

# Same "<image name>:<tag prefix>" layout as WHEEL_CONTAINER_IMAGES, for libtorch
# builds. Only CUDA, ROCm and plain "cpu" keys exist, so
# generate_libtorch_matrix() raises KeyError for any other arch on a
# non-Windows `os`.
LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
    **{gpu_arch: f"libtorch-cxx11-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES},
    **{gpu_arch: f"libtorch-cxx11-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES},
    "cpu": "libtorch-cxx11-builder:cpu",
}

# Default `python_versions` for generate_wheels_matrix(). The "t" suffix
# presumably marks free-threaded builds -- confirm.
FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]


def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
    """Translate an arch type/version pair into the ``desired_cuda`` tag.

    Args:
        gpu_arch_type: Arch type as returned by ``arch_type()``.
        gpu_arch_version: Arch version string (may be empty for CPU/XPU).

    Returns:
        ``"cpu"`` for every CPU flavour, ``"xpu"`` for XPU,
        ``"rocm<version>"`` for ROCm, and ``"cu<digits>"`` for CUDA (dots and
        any ``-aarch64`` suffix removed). An unrecognized arch type yields
        ``gpu_arch_version`` unchanged.
    """
    if gpu_arch_type in ("cpu", "cpu-aarch64", "cpu-s390x"):
        return "cpu"
    if gpu_arch_type == "xpu":
        return "xpu"
    if gpu_arch_type == "rocm":
        return f"rocm{gpu_arch_version}"
    if gpu_arch_type == "cuda":
        return "cu" + gpu_arch_version.replace(".", "")
    if gpu_arch_type == "cuda-aarch64":
        bare_version = gpu_arch_version.replace("-aarch64", "")
        return "cu" + bare_version.replace(".", "")
    return gpu_arch_version


def list_without(in_list: list[str], without: list[str]) -> list[str]:
    """Return the items of ``in_list`` that do not appear in ``without``.

    Order and duplicates of the kept items are preserved; neither argument is
    modified.
    """
    kept: list[str] = []
    for candidate in in_list:
        if candidate in without:
            continue
        kept.append(candidate)
    return kept


def generate_libtorch_matrix(
    os: str,
    release_type: str,
    arches: Optional[list[str]] = None,
    libtorch_variants: Optional[list[str]] = None,
) -> list[dict[str, str]]:
    """Build the list of libtorch build configurations for one platform.

    Args:
        os: Target platform name (this parameter shadows the ``os`` module
            inside the function). ``"linux"`` and ``"windows"`` select extra
            default arches; ``"windows"`` and ``"windows-arm64"`` leave the
            container image fields empty.
        release_type: Stored as ``libtorch_config`` and embedded in the build
            name.
        arches: Arch versions to build. Defaults to ``"cpu"`` plus, on Linux,
            all CUDA and ROCm arches, or on Windows, all CUDA arches except
            ``"12.9"``.
        libtorch_variants: Variants to build. Defaults to the four
            shared/static x with/without-deps combinations.

    Returns:
        One dict per (arch, variant) pair, arches outermost. ROCm
        ``without-deps`` combinations are omitted.
    """
    if arches is None:
        if os == "linux":
            arches = ["cpu", *CUDA_ARCHES, *ROCM_ARCHES]
        elif os == "windows":
            # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
            # in 2.10
            cuda_for_windows = list(CUDA_ARCHES)
            cuda_for_windows.remove("12.9")
            arches = ["cpu", *cuda_for_windows]
        else:
            arches = ["cpu"]

    if libtorch_variants is None:
        libtorch_variants = [
            f"{linkage}-{deps}"
            for linkage in ("shared", "static")
            for deps in ("with-deps", "without-deps")
        ]

    # Windows builds do not run in a container, so their image fields stay empty.
    uses_container = os not in ("windows", "windows-arm64")

    matrix: list[dict[str, str]] = []
    for arch_version in arches:
        # Both values depend only on the arch, so compute them once per arch.
        gpu_arch_type = arch_type(arch_version)
        gpu_arch_version = arch_version if arch_version != "cpu" else ""
        is_rocm = gpu_arch_type == "rocm"

        for libtorch_variant in libtorch_variants:
            # ROCm builds without-deps failed even in ROCm runners; skip for now
            if is_rocm and "without-deps" in libtorch_variant:
                continue

            if uses_container:
                image_parts = LIBTORCH_CONTAINER_IMAGES[arch_version].split(":")
                image_name, image_tag_prefix = image_parts[0], image_parts[1]
            else:
                image_name = image_tag_prefix = ""

            raw_build_name = (
                f"libtorch-{gpu_arch_type}{gpu_arch_version}"
                f"-{libtorch_variant}-{release_type}"
            )
            matrix.append(
                {
                    "gpu_arch_type": gpu_arch_type,
                    "gpu_arch_version": gpu_arch_version,
                    "desired_cuda": translate_desired_cuda(
                        gpu_arch_type, gpu_arch_version
                    ),
                    "libtorch_config": release_type,
                    "libtorch_variant": libtorch_variant,
                    "container_image": image_name,
                    "container_image_tag_prefix": image_tag_prefix,
                    "package_type": "libtorch",
                    "build_name": raw_build_name.replace(".", "_"),
                }
            )
    return matrix


def generate_wheels_matrix(
    os: str,
    arches: Optional[list[str]] = None,
    python_versions: Optional[list[str]] = None,
) -> list[dict[str, str]]:
    """Build the list of wheel build configurations for one platform.

    Args:
        os: Target platform name (this parameter shadows the ``os`` module
            inside the function). ``"linux"``, ``"linux-aarch64"`` and
            ``"linux-s390x"`` produce ``manywheel`` packages; everything else
            produces ``wheel`` packages.
        arches: Arch versions to build. Defaults depend on ``os``:
            ``"cpu"`` alone, plus CUDA/ROCm/XPU on Linux, plus CUDA (except
            ``"12.9"``) and XPU on Windows; the aarch64 CPU and CUDA arches
            on ``"linux-aarch64"``; ``"cpu-s390x"`` on ``"linux-s390x"``.
        python_versions: Python versions to build. Defaults to
            ``FULL_PYTHON_VERSIONS``.

    Returns:
        One dict per (python version, arch) pair, python versions outermost.
        Python 3.14/3.14t entries are omitted on platforms where they are not
        enabled yet.

    Raises:
        KeyError: an arch has no entry in ``WHEEL_CONTAINER_IMAGES``.
    """
    # NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
    if os in ("linux", "linux-aarch64", "linux-s390x"):
        package_type = "manywheel"
    else:
        package_type = "wheel"

    if python_versions is None:
        python_versions = FULL_PYTHON_VERSIONS

    if arches is None:
        # Define default compute architectures
        arches = ["cpu"]
        if os == "linux":
            arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
            # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
            # in 2.10
            windows_cuda_arches = CUDA_ARCHES.copy()
            windows_cuda_arches.remove("12.9")
            arches += windows_cuda_arches + XPU_ARCHES
        elif os == "linux-aarch64":
            # Separate new if as the CPU type is different and
            # uses different build/test scripts
            arches = CPU_AARCH64_ARCH + CUDA_AARCH64_ARCHES
        elif os == "linux-s390x":
            # Only want the one arch as the CPU type is different and
            # uses different build/test scripts
            arches = ["cpu-s390x"]

    # TODO: Enable python 3.14 for rest
    py314_platforms = (
        "linux",
        "linux-aarch64",
        "linux-s390x",
        "macos-arm64",
        "windows",
    )

    ret: list[dict[str, str]] = []
    for python_version in python_versions:
        # The 3.14 restriction does not depend on the arch, so decide it once
        # per python version.
        if python_version in ("3.14", "3.14t") and os not in py314_platforms:
            continue

        for arch_version in arches:
            gpu_arch_type = arch_type(arch_version)
            gpu_arch_version = (
                ""
                if arch_version in ("cpu", "cpu-aarch64", "cpu-s390x", "xpu")
                else arch_version
            )
            desired_cuda = translate_desired_cuda(gpu_arch_type, gpu_arch_version)
            image_parts = WHEEL_CONTAINER_IMAGES[arch_version].split(":")

            # Fields shared by every entry, in the order the matrix emits them.
            entry: dict[str, str] = {
                "python_version": python_version,
                "gpu_arch_type": gpu_arch_type,
                "gpu_arch_version": gpu_arch_version,
                "desired_cuda": desired_cuda,
                "container_image": image_parts[0],
                "container_image_tag_prefix": image_parts[1],
                "package_type": package_type,
            }

            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install.
            # Membership is checked against CUDA_ARCHES rather than a second
            # hard-coded version list, so a newly added CUDA version is picked
            # up automatically.
            needs_cuda_requirements = (
                arch_version in CUDA_ARCHES and os == "linux"
            ) or arch_version in CUDA_AARCH64_ARCHES

            if needs_cuda_requirements:
                # The requirements table is keyed by the bare CUDA version, so
                # drop the "-aarch64" suffix (e.g. "12.6-aarch64" -> "12.6").
                # This keys off the arch itself instead of `os`, so an aarch64
                # CUDA arch resolves correctly whatever `os` was passed.
                cuda_version = arch_version.replace("-aarch64", "")
                entry["pytorch_extra_install_requirements"] = (
                    PYTORCH_EXTRA_INSTALL_REQUIREMENTS[cuda_version]
                )
                # include special case for aarch64 build, remove the -aarch64 postfix
                separator = "-" if "aarch64" in gpu_arch_type else ""
                entry["build_name"] = (
                    f"{package_type}-py{python_version}-{gpu_arch_type}"
                    f"{separator}{cuda_version}"
                ).replace(".", "_")
            else:
                entry["build_name"] = (
                    f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}"
                ).replace(".", "_")
                entry["pytorch_extra_install_requirements"] = (
                    PYTORCH_EXTRA_INSTALL_REQUIREMENTS["xpu"]
                    if gpu_arch_type == "xpu"
                    else ""
                )

            ret.append(entry)

    return ret


# Import-time sanity check: each call reads the NCCL pin file for that CUDA
# version and raises RuntimeError if the pinned release tag does not start with
# "v" + the nvidia-nccl wheel version in PYTORCH_EXTRA_INSTALL_REQUIREMENTS.
# The version list is hard-coded here; keep it in sync with CUDA_ARCHES.
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")