Compare commits

...

8 Commits

Author SHA1 Message Date
c263bd43e8 [inductor] use triu ref instead of lowering (#96040) (#96462)
Fixes #95958
The generated code is functionally identical with the ref and the lowering; there are only minor differences.
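
A quick parity check between eager and the compiled path, as a minimal sketch (assumes a PyTorch 2.0 build with the inductor backend available):

    import torch

    def fn(x):
        return torch.triu(x, diagonal=1)

    compiled_fn = torch.compile(fn, backend="inductor")
    x = torch.randn(8, 8)
    # The ref-based decomposition should produce the same result as eager triu
    torch.testing.assert_close(fn(x), compiled_fn(x))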

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96040
Approved by: https://github.com/jansel

Co-authored-by: Natalia Gimelshein <ngimel@fb.com>
2023-03-09 17:42:00 -05:00
c9913cf66f Add jinja2 as mandatory dependency (#95691) (#96450)
Should fix #95671, the nightly wheels issue. The v2.0.0 RC does not need this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95691
Approved by: https://github.com/malfet

Co-authored-by: Wei Wang <weiwangmeta@meta.com>
2023-03-09 17:31:12 -05:00
2f7d8bbf17 Fix expired deprecation of comparison dtype for NumPy 1.24+ (#91517) (#96452)
> The `dtype=` argument to comparison ufuncs is now applied correctly. That
> means that only `bool` and `object` are valid values and `dtype=object` is
> enforced.

Source: https://numpy.org/doc/stable/release/1.24.0-notes.html#expired-deprecations

Fixes #91516
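
To illustrate the behavior change, a minimal sketch (run against NumPy 1.24 or newer):

    import numpy as np

    a = np.arange(3)
    np.equal(a, a, dtype=bool)    # OK: bool is an accepted dtype for comparison ufuncs
    np.equal(a, a, dtype=object)  # OK: object is also accepted
    # np.equal(a, a, dtype=np.int32)  # raises on NumPy 1.24+
    np.equal(a, a).astype(np.int32)   # portable spelling: compare first, cast afterwards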

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91517
Approved by: https://github.com/zou3519, https://github.com/huydhn

Co-authored-by: Johnson <j3.soon@msa.hinet.net>
2023-03-09 14:30:00 -08:00
ca0cdf52ca dl_open_guard should restore flag even after exception (#96231) (#96457)
I.e. follow pattern outlined in https://docs.python.org/3.8/library/contextlib.html#contextlib.contextmanager

Also, return early on non-Unix platforms (where `sys.getdlopenflags` is not defined).
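
Condensed into a standalone sketch (the name `dl_open_guard_sketch` is illustrative; the real change is in the `torch` diff below):

    import contextlib
    import ctypes
    import sys

    @contextlib.contextmanager
    def dl_open_guard_sketch():
        if not hasattr(sys, "getdlopenflags"):
            # Non-Unix platforms: nothing to save or restore
            yield
            return
        old_flags = sys.getdlopenflags()
        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
        try:
            yield
        finally:
            # Restored even if the body raised, per the contextlib docs pattern
            sys.setdlopenflags(old_flags)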

Fixes https://github.com/pytorch/pytorch/issues/96159

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96231
Approved by: https://github.com/atalman

(cherry picked from commit 941ff109d32d51d6e93a2c2f4a028ff3826ece31)
2023-03-09 14:29:17 -08:00
9cfa076da8 [Release/2.0] Use Triton from PYPI (#96010)
* [Release/2.0] Use Triton from PYPI

Remove `[dynamo]` extras from setup.py

Build torchtriton conda wheels as 2.0.0

* Also, upload triton conda packages to test channel
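
The requirement strings in the workflow diffs below gate each wheel with PEP 508 environment markers; a small illustration using the `packaging` library (an assumption here, any PEP 508 evaluator behaves the same):

    from packaging.markers import Marker

    marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # Evaluates to True only on Linux x86_64, the sole platform the
    # triton==2.0.0 wheel is attached to in these requirement lists
    print(marker.evaluate())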
2023-03-03 20:15:48 -05:00
8e05e41dbc [Release/2.0] Use builder release branch for tests 2023-03-03 16:22:04 -08:00
d8ffc60bc1 Remove mention of dynamo.optimize() in docs (#95802) (#96007)
This should be self-contained enough to merge, but other things that have been bugging me are:
* Instructions on debugging IMA issues
* Dynamic shape instructions
* Explaining config options better

I will look at adding a config options doc.
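
For reference, the mechanical substitution applied throughout the docs (`torch.compile` wraps `torch._dynamo.optimize`, as the TorchDynamo overview diff below notes):

    import torch

    def fn(x):
        return torch.sin(x) + torch.cos(x)

    # Before: opt_fn = torch._dynamo.optimize("inductor")(fn)
    opt_fn = torch.compile(fn, backend="inductor")
    opt_fn(torch.randn(16))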

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95802
Approved by: https://github.com/svekars
2023-03-03 17:43:31 -05:00
1483723037 [MPS] Disallow reshape in slice (#95905) (#95978)
Disallow reshapes for array views.
The current code allows a base shape of `[2, 4, 256]` to be sliced into `[4, 1, 256]` (the view's shape), which is not possible: slicing a smaller dimension into a bigger one will now always error out.
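
The constraint, restated as a small Python sketch of the check that the C++ diff below implements in `canSliceViewTensor`:

    def can_slice(base_shape, view_shape):
        # A slice view must keep the base's rank and can never exceed
        # the base shape in any dimension
        if len(base_shape) != len(view_shape):
            return False
        return all(v <= b for v, b in zip(view_shape, base_shape))

    print(can_slice([2, 4, 256], [4, 1, 256]))  # False: dim 0 grows from 2 to 4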

Fixes https://github.com/pytorch/pytorch/issues/95883
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95905
Approved by: https://github.com/razarmehr, https://github.com/kulinseth

Co-authored-by: Denis Vieriu <dvieriu@apple.com>
2023-03-03 10:15:10 -08:00
18 changed files with 119 additions and 89 deletions

View File

@@ -38,7 +38,7 @@ def build_triton(commit_hash: str, build_conda: bool = False, py_version : Optio
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
if build_conda:
with open(triton_basedir / "meta.yaml", "w") as meta:
print(f"package:\n name: torchtriton\n version: 2.0.0+{commit_hash[:10]}\n", file=meta)
print("package:\n name: torchtriton\n version: 2.0.0\n", file=meta)
print("source:\n path: .\n", file=meta)
print("build:\n string: py{{py}}\n number: 1\n script: cd python; "
"python setup.py install --single-version-externally-managed --record=record.txt\n", file=meta)

View File

@@ -226,7 +226,8 @@ def generate_wheels_matrix(os: str,
"nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'",
"nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'",
"build_name":
f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn"
.replace(

View File

@@ -153,7 +153,7 @@ jobs:
- name: Checkout pytorch/builder to builder dir
uses: malfet/checkout@silent-checkout
with:
ref: main
ref: release/2.0
submodules: recursive
repository: pytorch/builder
path: builder

View File

@@ -137,7 +137,7 @@ jobs:
run: |
set -ex
pip install -q awscli
s3_dir="${UPLOAD_BUCKET}/whl/nightly/"
s3_dir="${UPLOAD_BUCKET}/whl/test/"
for pkg in "${PKG_DIR}/"*.whl; do
aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
done
@@ -193,7 +193,7 @@ jobs:
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
run: |
container_name=$(docker container ps --format '{{.ID}}')
docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-nightly --label main --no-progress --force"
docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-test --label main --no-progress --force"
- name: Chown artifacts
run: |

View File

@@ -47,7 +47,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -169,7 +169,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -667,7 +667,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1165,7 +1165,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1663,7 +1663,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64' | triton==2.0.0; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -478,25 +478,19 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
}
IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
std::vector<int64_t> src_base_squeezed_shape = getSqueezedBaseShape(src, src_base_shape);
size_t src_ndim_base = src_base_shape.size();
size_t src_squeezed_ndim_base = src_base_squeezed_shape.size();
std::vector<int64_t> src_view_squeezed_shape = getViewShape(src, mpsShape, true);
size_t src_ndim_view = getViewShape(src, mpsShape, false).size();
size_t src_squeezed_ndim_view = src_view_squeezed_shape.size();
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
size_t src_ndim_view = src_view_shape.size();
if (src_ndim_base != src_ndim_view) {
return false;
}
if (src_squeezed_ndim_base == src_squeezed_ndim_view) {
for (const auto i: c10::irange(src_squeezed_ndim_base)) {
if (src_view_squeezed_shape[i] > src_base_squeezed_shape[i]) {
return false;
}
}
}
for (const auto i: c10::irange(src_ndim_base)) {
if (src_view_shape[i] > src_base_shape[i]) {
return false;
}
}
return true;
}

View File

@@ -6,13 +6,12 @@ significant speedups the newer your GPU is.
.. code:: python
from torch._dynamo import optimize
import torch
def fn(x, y):
a = torch.cos(x).cuda()
b = torch.sin(y).cuda()
return a + b
new_fn = optimize("inductor")(fn)
new_fn = torch.compile(fn, backend="inductor")
input_tensor = torch.randn(10000).to(device="cuda:0")
a = new_fn(input_tensor, input_tensor)
@@ -54,7 +53,7 @@ with the actual generated kernel being
tmp2 = tl.sin(tmp1)
tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
And you can verify that fusing the two ``sins`` did actually occur
And you can verify that fusing the two ``sin`` did actually occur
because the two ``sin`` operations occur within a single Triton kernel
and the temporary variables are held in registers with very fast access.
@@ -69,13 +68,12 @@ hub.
.. code-block:: python
import torch
import torch._dynamo as dynamo
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
opt_model = dynamo.optimize("inductor")(model)
opt_model = torch.compile(model, backend="inductor")
model(torch.randn(1,3,64,64))
And that is not the only available backend, you can run in a REPL
``dynamo.list_backends()`` to see all the available backends. Try out the
``torch._dynamo.list_backends()`` to see all the available backends. Try out the
``cudagraphs`` or ``nvfuser`` next as inspiration.
Lets do something a bit more interesting now, our community frequently
@@ -92,11 +90,10 @@ HuggingFace hub and optimize it:
import torch
from transformers import BertTokenizer, BertModel
import torch._dynamo as dynamo
# Copy pasted from here https://huggingface.co/bert-base-uncased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed
model = torch.compile(model, backend="inductor") # This is the only line of code that we changed
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
output = model(**encoded_input)
@@ -116,7 +113,7 @@ Similarly lets try out a TIMM example
import torch._dynamo as dynamo
import torch
model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
opt_model = dynamo.optimize("inductor")(model)
opt_model = torch.compile(model, backend="inductor")
opt_model(torch.randn(64,3,7,7))
Our goal with Dynamo and inductor is to build the highest coverage ML compiler
@@ -132,16 +129,16 @@ or ``torch._dynamo.list_backends()`` each of which with its optional dependencie
Some of the most commonly used backends include:
**Training & inference backends**:
* ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
* ``dynamo.optimize("aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``dynamo.optimize("nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``dynamo.optimize("cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
* ``torch.compile(m, backend="inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
* ``torch.compile(m, backend="aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``torch.compile(m, backend=""nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
* ``torch.compile(m, backend="cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
**Inference-only backends**:
* ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
* ``dynamo.optimize("tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
* ``dynamo.optimize("tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__
* ``torch.compile(m, backend="onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
* ``torch.compile(m, backend="tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
* ``torch.compile(m, backend="ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
* ``torch.compile(m, backend="tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__
Why do you need another way of optimizing PyTorch code?
-------------------------------------------------------

View File

@@ -15,7 +15,7 @@ Where a complete example looks like this:
from typing import List
import torch
import torchdynamo
from torch import _dynamo as torchdynamo
def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
print("my_compiler() called with FX graph:")
gm.graph.print_tabular()

View File

@@ -14,7 +14,7 @@ worlds — usability and performance.
TorchDynamo makes it easy to experiment with different compiler
backends to make PyTorch code faster with a single line decorator
``torch._dynamo.optimize()``
``torch._dynamo.optimize()`` which is wrapped for convenience by ``torch.compile()``
.. image:: ../_static/img/dynamo/TorchDynamo.png

View File

@@ -27,7 +27,7 @@ TorchDynamo dependencies (for CUDA 11.7):
.. code-block:: shell
pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
CPU requirements
~~~~~~~~~~~~~~~~
@@ -41,16 +41,6 @@ To install, run the following command:
pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
Install from Local Source
~~~~~~~~~~~~~~~~~~~~~~~~~
Alternatively, you can build PyTorch from `source
<https://github.com/pytorch/pytorch#from-source>`__, which has TorchDynamo
included.
To install GPU TorchDynamo dependencies, run ``make triton`` in the
PyTorch repo root directory.
Verify Installation
~~~~~~~~~~~~~~~~~~~

View File

@@ -1024,17 +1024,12 @@ def main():
'typing-extensions',
'sympy',
'networkx',
'jinja2',
]
extras_require = {
'opt-einsum': ['opt-einsum>=3.3']
}
if platform.system() == 'Linux':
triton_pin_file = os.path.join(cwd, ".github", "ci_commit_pins", "triton.txt")
if os.path.exists(triton_pin_file):
with open(triton_pin_file) as f:
triton_pin = f.read().strip()
extras_require['dynamo'] = ['pytorch-triton==2.0.0+' + triton_pin[:10], 'jinja2']
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.

View File

@@ -448,6 +448,7 @@ inductor_all_samples = {
"mT",
"mH",
"rsub",
"triu",
}

View File

@@ -1807,6 +1807,78 @@ class TestMPS(TestCaseMPS):
x_cpu = x_cpu + 2
self.assertEqual(x, x_cpu)
def test_reshape_storage_offset(self):
# https://github.com/pytorch/pytorch/issues/95883
B = 4
T = 1
lin_cpu = nn.Linear(10, 256)
lin_mps = nn.Linear(10, 256, device="mps")
# Use the same weights and bias as the ones from the cpu
lin_mps.weight.data = lin_cpu.weight.data.detach().clone().to("mps").requires_grad_()
lin_mps.bias.data = lin_cpu.bias.data.detach().clone().to("mps").requires_grad_()
x_mps = torch.rand([B, T, 10], device="mps", requires_grad=True)
x_cpu = x_mps.detach().clone().cpu().requires_grad_()
x_mps = lin_mps(x_mps)
x_cpu = lin_cpu(x_cpu)
self.assertEqual(x_mps.shape, (B, T, 256))
self.assertEqual(x_cpu.shape, (B, T, 256))
cls_token_mps = torch.rand([1, 256], device="mps", requires_grad=True).repeat(B, 1, 1)
cls_token_cpu = cls_token_mps.detach().clone().cpu()
x_mps = torch.cat([cls_token_mps, x_mps], dim=1)
x_cpu = torch.cat([cls_token_cpu, x_cpu], dim=1)
x_mps = x_mps.transpose(0, 1)
x_cpu = x_cpu.transpose(0, 1)
target_mps = torch.rand_like(x_mps)
target_cpu = target_mps.detach().clone().cpu()
loss_mps = F.mse_loss(x_mps, target_mps)
loss_cpu = F.mse_loss(x_cpu, target_cpu)
self.assertEqual(loss_mps, loss_cpu)
loss_mps.backward()
loss_cpu.backward()
self.assertEqual(x_mps.grad, x_cpu.grad)
def test_stack(self):
# https://github.com/pytorch/pytorch/issues/87856
x_cpu = torch.tensor([[1, 2]])
x_mps = x_cpu.detach().clone().to("mps")
y_cpu = torch.stack((x_cpu[:, :1], x_cpu[:, -1:]), dim=-1)
y_mps = torch.stack((x_mps[:, :1], x_mps[:, -1:]), dim=-1)
self.assertEqual(y_cpu, y_mps)
t_mps = torch.tensor([1, 2, 3, 4], device="mps")
t_cpu = t_mps.detach().cpu().detach()
x_mps = t_mps[2:]
y_mps = t_mps[:2]
x_cpu = t_cpu[2:]
y_cpu = t_cpu[:2]
res_mps = torch.stack((y_mps, x_mps), dim=-1)
res_cpu = torch.stack((y_cpu, x_cpu), dim=-1)
self.assertEqual(res_mps, res_cpu)
def test_unsafe_chunk(self):
# https://github.com/pytorch/pytorch/issues/91065
a = torch.rand(5, dtype=torch.float32, device="cpu")
ret = a.unsafe_chunk(4, 0)
y = ret[0] * ret[2]
a_mps = a.to("mps")
ret_mps = a_mps.unsafe_chunk(4, 0)
y_mps = ret_mps[0] * ret_mps[2]
self.assertEqual(y, y_mps)
def test_slice_casting(self):
# generate random binary numbers
cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8)

View File

@@ -308,6 +308,7 @@ def core_aten_decompositions() -> Dict[OpOverload, Callable]:
aten.trace,
aten.transpose.int,
aten.tril.default,
aten.triu.default,
aten.unfold,
aten.unfold_backward,
aten.upsample_bilinear2d,

View File

@@ -1505,30 +1505,6 @@ def iota(
)
@register_lowering(aten.triu)
def triu(x, diagonal=0):
x_loader = x.make_loader()
dtype = x.get_dtype()
def inner_fn(index):
*_, i, j = index
return ops.where(
ops.ge(
ops.index_expr(j - i - diagonal, torch.int32),
ops.constant(0, torch.int32),
),
x_loader(index),
ops.constant(0, dtype),
)
return Pointwise.create(
device=x.get_device(),
dtype=dtype,
inner_fn=inner_fn,
ranges=list(x.get_size()),
)
@register_lowering(aten.select_scatter, type_promotion_kind=None)
def select_scatter(x, src, dim: int, index: int):
assert x.get_dtype() == src.get_dtype()

View File

@@ -22,11 +22,14 @@ def dl_open_guard():
Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
shared library to load custom operators.
"""
if _SET_GLOBAL_FLAGS:
old_flags = sys.getdlopenflags()
sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
yield
if _SET_GLOBAL_FLAGS:
if not _SET_GLOBAL_FLAGS:
yield
return
old_flags = sys.getdlopenflags()
sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
try:
yield
finally:
sys.setdlopenflags(old_flags)

View File

@@ -380,7 +380,7 @@ def make_histogram(values, bins, max_bins=None):
limits = new_limits
# Find the first and the last bin defining the support of the histogram:
cum_counts = np.cumsum(np.greater(counts, 0, dtype=np.int32))
cum_counts = np.cumsum(np.greater(counts, 0))
start, end = np.searchsorted(cum_counts, [0, cum_counts[-1] - 1], side="right")
start = int(start)
end = int(end) + 1
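
The replacement works because `np.cumsum` promotes a boolean mask to an integer dtype on its own, so the explicit (and now invalid) `dtype=np.int32` argument is unnecessary; a quick check:

    import numpy as np

    counts = np.array([0, 3, 0, 5])
    cum_counts = np.cumsum(np.greater(counts, 0))
    print(cum_counts.dtype, cum_counts)  # an integer dtype, [0 1 1 2]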