Compare commits

..

5 Commits

Author SHA1 Message Date
990bb222a9 Update on "Try replacing unbacked symints"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben Lucaskabela

[ghstack-poisoned]
2025-11-04 17:07:55 -08:00
da4de76372 Try replacing unbacked symints
[ghstack-poisoned]
2025-11-04 16:51:20 -08:00
a5f3035aaf More pyrefly local errors (#166976)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166976
Approved by: https://github.com/maggiemoss, https://github.com/Skylion007
2025-11-04 18:51:35 +00:00
1d3f5e19da [cuDNN] Smoke-test runtime cuDNN version matches compile time version in CI (#165922)
Fix and regression test for https://github.com/pytorch/pytorch/issues/165801

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165922
Approved by: https://github.com/malfet, https://github.com/atalman, https://github.com/Skylion007, https://github.com/drisspg

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Co-authored-by: Andrey Talman <atalman@fb.com>
2025-11-04 18:46:43 +00:00
496277a8ff [ROCm][CI] Lower runner check gpu count for distributed jobs (#166961)
This is a PR to temporarily relieve the queueing that is caused by an mi250 node outage. See this ticket for more information:
https://github.com/pytorch/pytorch/issues/166866

It relaxes the GPU count check to allow distributed jobs to run on 2-GPU runners

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166961
Approved by: https://github.com/jeffdaily
2025-11-04 18:44:21 +00:00
13 changed files with 217 additions and 95 deletions

View File

@ -129,7 +129,7 @@ function install_129 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

View File

@ -1,11 +1,15 @@
sphinx==7.2.6
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
pytorch_sphinx_theme2==0.2.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.2.0
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.36.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.36.0
#Pinned versions: 4.34.0
exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3
docutils==0.20
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.20
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -52,13 +56,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==1.3.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 1.3.0
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.6.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==4.0.1
myst-parser==0.18.1

View File

@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then
make coverage
# Now we have the coverage report, we need to make sure it is empty.
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
# showing the undocumented count in the third column.
# Example: | TOTAL | 99.83% | 2 |
# Count the number of lines in the file and turn that number into a variable
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
# Skip the report header by subtracting 2: the header will be output even if
# there are no undocumented items.
#
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
# be documented then removed from there.
# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
# The table format is: | Module | Coverage | Undocumented |
# Extract the third column (undocumented count) from the TOTAL row
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
undocumented=$((lines - 2))
if [ $undocumented -lt 0 ]; then
echo coverage output not found
exit 1
elif [ "$undocumented" -gt 0 ]; then
set +x # Disable command echoing for cleaner output
echo ""
echo "====================="
echo "UNDOCUMENTED OBJECTS:"
echo "====================="
echo ""
# Find the line number of the TOTAL row and print only what comes after it
total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
if [ -n "$total_line" ]; then
# Print only the detailed list (skip the statistics table)
tail -n +$((total_line + 2)) build/coverage/python.txt
else
# Fallback to showing entire file if TOTAL line not found
elif [ $undocumented -gt 0 ]; then
echo undocumented objects found:
cat build/coverage/python.txt
fi
echo ""
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
set -x # Re-enable command echoing
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1
fi
else

View File

@ -272,6 +272,18 @@ def smoke_test_cuda(
torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
print(f"Torch cuDNN version: {torch_cudnn_version}")
torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion()
print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}")
torch_cudnn_runtime_version = tuple(
[int(x) for x in torch_cudnn_version.split(".")]
)
if torch_cudnn_runtime_version != torch_cudnn_compile_version:
raise RuntimeError(
"cuDNN runtime version doesn't match comple version. "
f"Loaded: {torch_cudnn_runtime_version} "
f"Expected: {torch_cudnn_compile_version}"
)
if sys.platform in ["linux", "linux2"]:
torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
print(f"Torch nccl; version: {torch_nccl_version}")

View File

@ -97,8 +97,8 @@ jobs:
shell: bash
run: |
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
if [[ $ngpu -lt 4 ]]; then
echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs"
if [[ $ngpu -lt 2 ]]; then #We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with less gpus.
echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs"
exit 1
fi

View File

@ -206,41 +206,6 @@ templates_path = [
os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
]
# TODO: document these and remove them from here.
# Fixes the duplicated
autosummary_filename_map = {
"torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
"torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
"torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
"torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
"torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
"torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
"torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
"torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
"torch.optim.radam.radam": "torch.optim.radam.radam_function",
"torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
"torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
"torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
"torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
"torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
"torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
"torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
"torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
"torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
"torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
"torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
"torch.optim.adam.adam": "torch.optim.adam.adam_function",
"torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
"torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
"torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
"torch.mtia.stream": "torch.mtia.stream_function",
"torch.mtia.Stream": "torch.mtia.Stream_class",
"torch.cpu.stream": "torch.cpu.stream_function",
"torch.cpu.Stream": "torch.cpu.Stream_class",
"torch.cuda.stream": "torch.cuda.stream_function",
"torch.cuda.Stream": "torch.cuda.Stream_class",
"torch.xpu.stream": "torch.xpu.stream_function",
"torch.xpu.Stream": "torch.xpu.Stream_class",
}
coverage_ignore_functions = [
# torch
@ -3230,11 +3195,6 @@ autodoc_type_aliases = {
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True
# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
autodoc_default_options = {
"exclude-members": "from_bytes, to_bytes",
}
# -- katex javascript in header
#
# def setup(app):

View File

@ -253,6 +253,7 @@ regular full-precision tensor.
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
view
as_strided

View File

@ -617,6 +617,23 @@ class TestRegionalOutputCode(torch._inductor.test_case.TestCase):
self.assertIsNotNone(post_compiled)
self.assertIsNotNone(post_compiled._graph_module) # Now deserialized
def test_regional_inductor_unbacked_expr(self):
class Model(torch.nn.Module):
def forward(self, c):
d = torch.concat([c, c], dim=0)
with fx_traceback.annotate( {"compile_with_inductor": "my_region"}):
d = d + 1
return c, d
device = "cuda"
model = Model()
c = torch.randn((64, 32), device=device)
torch._dynamo.decorators.mark_unbacked(c, 0)
torch.compile(
model,
backend=aot_eager_regional_inductor(serialize=False),
fullgraph=True,
)(c)
if __name__ == "__main__":
run_tests()

View File

@ -498,6 +498,7 @@ def generate_ttir(
# pyrefly: ignore # missing-attribute
codegen_fns = backend.get_codegen_implementation(*codegen_args)
module_map = backend.get_module_map()
# pyrefly: ignore[missing-argument,bad-argument-type]
ttir_module = src.make_ir(options, codegen_fns, module_map, context)
else:
codegen_args = [options] if get_codegen_implementation_sig_params == 1 else []

View File

@ -1225,6 +1225,7 @@ class _InProcessFxCompile(FxCompile):
)
fd = io.StringIO()
import torch
torch._dynamo.repro.after_aot.save_graph_repro(
fd, gm, example_inputs, "inductor", save_dir=None
)
@ -1402,6 +1403,150 @@ class _InProcessFxCompile(FxCompile):
const_graph.codegen_with_cpp_wrapper()
)
if len(gm.graph.nodes) == 4:
# Replace complex expressions in input sizes/strides with single symbols
# to simplify downstream codegen
import sympy
from torch._subclasses.fake_tensor import FakeTensor
import torch
shape_env = fake_mode.shape_env
replacement_counter = [0] # Use list to allow mutation in nested function
# Cache to ensure same expression always gets same replacement symbol
expr_to_replacement = {} # Maps: sympy.Expr -> SymInt
def replace_complex_expressions(dim_values, prefix):
"""
Replace complex sympy expressions in size/stride with single symbols.
Args:
dim_values: Tuple of dimension values (from .size() or .stride())
prefix: String prefix for replacement symbol names ('size' or 'stride')
Returns:
(new_values, made_replacement): Tuple of replaced values list and bool flag
"""
new_values = []
made_replacement = False
for dim, dim_expr in enumerate(dim_values):
# Extract the underlying sympy expression from SymInt
if hasattr(dim_expr, 'node'):
# This is a SymInt, get the underlying sympy expr
underlying_expr = dim_expr.node.expr
else:
# This is already a sympy expr or int
underlying_expr = dim_expr
# Check if it's a complex expression (not just a single symbol)
if isinstance(underlying_expr, sympy.Expr) and not isinstance(underlying_expr, sympy.Symbol):
# This is a complex expression, not a single symbol
# Check cache first - reuse replacement if we've seen this expression before
if underlying_expr in expr_to_replacement:
new_sym = expr_to_replacement[underlying_expr]
new_values.append(new_sym)
made_replacement = True
else:
# First time seeing this expression - create new replacement
# Get the hint (concrete value) from the original SymInt if available
hint = None
if hasattr(dim_expr, 'node') and hasattr(dim_expr.node, 'hint'):
hint = dim_expr.node.hint
# Create a new unbacked symbol using shape_env's method
# This properly registers the symbol with bounds and value ranges
new_sym = shape_env.create_unbacked_symint()
replacement_counter[0] += 1
made_replacement = True
# Add a constraint that the new symbol equals the old expression
# This maintains semantic correctness
shape_env.replacements[underlying_expr] = new_sym.node.expr
if hasattr(shape_env, 'add_equality'):
shape_env.add_equality(new_sym.node.expr, underlying_expr)
# Cache the replacement for reuse
expr_to_replacement[underlying_expr] = new_sym
# Use the newly created SymInt directly
new_values.append(new_sym)
else:
new_values.append(dim_expr)
return new_values, made_replacement
for node in gm.graph.nodes:
if "val" in node.meta:
example_value = node.meta["val"]
if isinstance(example_value, FakeTensor):
# Replace complex expressions in sizes and strides
new_size, size_replaced = replace_complex_expressions(
example_value.size(), 'size'
)
new_stride, stride_replaced = replace_complex_expressions(
example_value.stride(), 'stride'
)
made_replacement = size_replaced or stride_replaced
# Update FakeTensor metadata if we made any replacements
if made_replacement:
# Create a new FakeTensor with the replaced size/stride
# Since we're in a FakeTensorMode context and new_size/new_stride
# contain proper SymInts, torch.empty_strided will create a proper FakeTensor
with fake_mode:
new_fake_tensor = torch.empty_strided(
new_size,
new_stride,
dtype=example_value.dtype,
device=example_value.device
)
# Update node metadata
node.meta["val"] = new_fake_tensor
# Update tensor_meta if it exists - extract from the new FakeTensor
if "tensor_meta" in node.meta:
from torch._subclasses.fake_tensor import extract_tensor_metadata
node.meta["tensor_meta"] = extract_tensor_metadata(new_fake_tensor)
# Also replace complex expressions in example_inputs to match the graph
new_example_inputs = []
for example_input in example_inputs:
if isinstance(example_input, FakeTensor):
# Replace complex expressions in example_input sizes and strides
new_size, size_replaced = replace_complex_expressions(
example_input.size(), 'size'
)
new_stride, stride_replaced = replace_complex_expressions(
example_input.stride(), 'stride'
)
made_replacement = size_replaced or stride_replaced
if made_replacement:
# Create a new FakeTensor with replaced size/stride
with fake_mode:
new_example_input = torch.empty_strided(
new_size,
new_stride,
dtype=example_input.dtype,
device=example_input.device
)
new_example_inputs.append(new_example_input)
else:
new_example_inputs.append(example_input)
else:
# Not a FakeTensor, keep as-is
new_example_inputs.append(example_input)
# Use the updated example_inputs
example_inputs = new_example_inputs
# gm.print_readable()
from torchtitan.tools.logging import logger
logger.info("Modified")
logger.info(gm.print_readable(print_output=False))
graph = GraphLowering(
gm,
# example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning.

View File

@ -1228,7 +1228,7 @@ def _get_pynvml_handler(device: "Device" = None):
"nvidia-ml-py does not seem to be installed or it can't be imported."
# pyrefly: ignore [invalid-inheritance]
) from _PYNVML_ERR
# pyrefly: ignore [import-error]
# pyrefly: ignore [import-error,missing-module-attribute]
from pynvml import NVMLError_DriverNotLoaded
try:

View File

@ -828,7 +828,7 @@ def list_gpu_processes(device: "Device" = None) -> str:
import pynvml # type: ignore[import]
except ModuleNotFoundError:
return "pynvml module not found, please install nvidia-ml-py"
# pyrefly: ignore [import-error]
# pyrefly: ignore [import-error,missing-module-attribute]
from pynvml import NVMLError_DriverNotLoaded
try:

View File

@ -1341,12 +1341,12 @@ def compute_unbacked_bindings(
if isinstance(example_value, torch.Tensor)
else ""
)
raise PendingUnbackedSymbolNotFound(
f"Pending unbacked symbols {pending} not in returned outputs {example_value} {extra}.\n"
"Did you accidentally call new_dynamic_size() or item() more times "
"than you needed to in your fake implementation?\n"
"For more help, see https://docs.google.com/document/d/1RWrH-3wLEpzR9kCS6gGBNen_-Fs-8PVbWWFE5AcgeWE/edit"
)
# raise PendingUnbackedSymbolNotFound(
# f"Pending unbacked symbols {pending} not in returned outputs {example_value} {extra}.\n"
# "Did you accidentally call new_dynamic_size() or item() more times "
# "than you needed to in your fake implementation?\n"
# "For more help, see https://docs.google.com/document/d/1RWrH-3wLEpzR9kCS6gGBNen_-Fs-8PVbWWFE5AcgeWE/edit"
# )
# Why do we have to do some rebinding here? If the original FX node
# wasn't a binding site because you had a memo hit, but post