Enable ruff check for torch/utils/data/*.ipynb (#148654)

Fixes part of #146411

Enable ruff check for `torch/utils/data/*.ipynb` files

## Test Result

```bash
lintrunner -a --take RUFF torch/utils/data/*.ipynb
```

![image](https://github.com/user-attachments/assets/88fddc91-3f19-4704-9aef-2cabd2cdc96e)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148654
Approved by: https://github.com/Skylion007
This commit is contained in:
zeshengzong
2025-05-14 06:21:47 +00:00
committed by PyTorch MergeBot
parent f7798d8645
commit 0f891cad5a
4 changed files with 583 additions and 297 deletions

View File

@ -1518,7 +1518,7 @@ command = [
[[linter]]
code = 'RUFF'
include_patterns = ['**/*.py', '**/*.pyi']
include_patterns = ['**/*.py', '**/*.pyi', 'torch/utils/data/*.ipynb']
exclude_patterns = [
'caffe2/**',
'functorch/docs/**',

View File

@ -1,32 +1,11 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3610jvsc74a57bd0eb5e09632d6ea1cbf3eb9da7e37b7cf581db5ed13074b21cc44e159dc62acdab",
"display_name": "Python 3.6.10 64-bit ('dataloader': conda)"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## \\[RFC\\] How DataFrames (DF) and DataPipes (DP) work together"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -51,8 +30,7 @@
" def __init__(self, range = 20):\n",
" self.range = range\n",
" def __iter__(self):\n",
" for i in range(self.range):\n",
" yield i\n",
" yield from self.range\n",
"\n",
"def get_dataframes_pipe(range = 10, dataframe_size = 7):\n",
" return ExampleIterPipe(range = range).map(lambda i: (i, i % 3))._to_dataframes_pipe(columns = ['i','j'], dataframe_size = dataframe_size)\n",
@ -62,11 +40,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Doesn't matter how DF composed internally, iterator over DF Pipe gives single rows to user. This is similar to regular DataPipe."
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -74,10 +52,31 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrames Pipe\n(0, 0)\n(1, 1)\n(2, 2)\n(3, 0)\n(4, 1)\n(5, 2)\n(6, 0)\n(7, 1)\n(8, 2)\n(9, 0)\nRegular DataPipe\n(0, 0)\n(1, 1)\n(2, 2)\n(3, 0)\n(4, 1)\n(5, 2)\n(6, 0)\n(7, 1)\n(8, 2)\n(9, 0)\n"
"DataFrames Pipe\n",
"(0, 0)\n",
"(1, 1)\n",
"(2, 2)\n",
"(3, 0)\n",
"(4, 1)\n",
"(5, 2)\n",
"(6, 0)\n",
"(7, 1)\n",
"(8, 2)\n",
"(9, 0)\n",
"Regular DataPipe\n",
"(0, 0)\n",
"(1, 1)\n",
"(2, 2)\n",
"(3, 0)\n",
"(4, 1)\n",
"(5, 2)\n",
"(6, 0)\n",
"(7, 1)\n",
"(8, 2)\n",
"(9, 0)\n"
]
}
],
@ -94,11 +93,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can iterate over raw DF using `raw_iterator`"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -106,10 +105,21 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
" i j\n0 0 0\n1 1 1\n2 2 2\n3 3 0\n4 4 1\n5 5 2\n6 6 0\n i j\n0 7 1\n1 8 2\n2 9 0\n"
" i j\n",
"0 0 0\n",
"1 1 1\n",
"2 2 2\n",
"3 3 0\n",
"4 4 1\n",
"5 5 2\n",
"6 6 0\n",
" i j\n",
"0 7 1\n",
"1 8 2\n",
"2 9 0\n"
]
}
],
@ -120,11 +130,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Operations over DF Pipe is captured"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -134,10 +144,13 @@
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"var_3 = input_var_2.i * 100\nvar_4 = var_3 + input_var_2.j\nvar_5 = var_4 - 2.7\ninput_var_2[\"y\"] = var_5\n"
"var_3 = input_var_2.i * 100\n",
"var_4 = var_3 + input_var_2.j\n",
"var_5 = var_4 - 2.7\n",
"input_var_2[\"y\"] = var_5\n"
]
}
],
@ -148,11 +161,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Captured operations executed on `__next__` calls of constructed DataPipe"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -160,10 +173,23 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
" i j y\n0 0 0 -2.7\n1 1 1 98.3\n2 2 2 199.3\n i j y\n0 3 0 297.3\n1 4 1 398.3\n2 5 2 499.3\n i j y\n0 6 0 597.3\n1 7 1 698.3\n2 8 2 799.3\n i j y\n0 9 0 897.3\n"
" i j y\n",
"0 0 0 -2.7\n",
"1 1 1 98.3\n",
"2 2 2 199.3\n",
" i j y\n",
"0 3 0 297.3\n",
"1 4 1 398.3\n",
"2 5 2 499.3\n",
" i j y\n",
"0 6 0 597.3\n",
"1 7 1 698.3\n",
"2 8 2 799.3\n",
" i j y\n",
"0 9 0 897.3\n"
]
}
],
@ -175,11 +201,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`shuffle` of DataFramePipe affects rows in an individual manner"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -187,10 +213,46 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Raw DataFrames iterator\n i j\n2 8 2\n2 2 2\n2 5 2\n i j\n1 4 1\n1 1 1\n0 3 0\n i j\n1 7 1\n0 9 0\n0 6 0\n i j\n0 0 0\nRegular DataFrames iterator\n(1, 1)\n(5, 2)\n(8, 2)\n(9, 0)\n(7, 1)\n(6, 0)\n(3, 0)\n(4, 1)\n(0, 0)\n(2, 2)\nRegular iterator\n(5, 2)\n(6, 0)\n(0, 0)\n(9, 0)\n(3, 0)\n(1, 1)\n(2, 2)\n(8, 2)\n(4, 1)\n(7, 1)\n"
"Raw DataFrames iterator\n",
" i j\n",
"2 8 2\n",
"2 2 2\n",
"2 5 2\n",
" i j\n",
"1 4 1\n",
"1 1 1\n",
"0 3 0\n",
" i j\n",
"1 7 1\n",
"0 9 0\n",
"0 6 0\n",
" i j\n",
"0 0 0\n",
"Regular DataFrames iterator\n",
"(1, 1)\n",
"(5, 2)\n",
"(8, 2)\n",
"(9, 0)\n",
"(7, 1)\n",
"(6, 0)\n",
"(3, 0)\n",
"(4, 1)\n",
"(0, 0)\n",
"(2, 2)\n",
"Regular iterator\n",
"(5, 2)\n",
"(6, 0)\n",
"(0, 0)\n",
"(9, 0)\n",
"(3, 0)\n",
"(1, 1)\n",
"(2, 2)\n",
"(8, 2)\n",
"(4, 1)\n",
"(7, 1)\n"
]
}
],
@ -215,11 +277,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can continue mixing DF and DP operations"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -227,10 +289,23 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
" i j y\n0 -17 -17 -197000.0\n1 -13 -16 3813000.0\n0 -11 -17 5803000.0\n i j y\n2 -12 -15 4823000.0\n1 -10 -16 6813000.0\n1 -16 -16 813000.0\n i j y\n0 -8 -17 8803000.0\n2 -9 -15 7823000.0\n0 -14 -17 2803000.0\n i j y\n2 -15 -15 1823000.0\n"
" i j y\n",
"0 -17 -17 -197000.0\n",
"1 -13 -16 3813000.0\n",
"0 -11 -17 5803000.0\n",
" i j y\n",
"2 -12 -15 4823000.0\n",
"1 -10 -16 6813000.0\n",
"1 -16 -16 813000.0\n",
" i j y\n",
"0 -8 -17 8803000.0\n",
"2 -9 -15 7823000.0\n",
"0 -14 -17 2803000.0\n",
" i j y\n",
"2 -15 -15 1823000.0\n"
]
}
],
@ -245,11 +320,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Batching combines everything into a `list`; it is possible to nest `list`s. A list may have any number of DataFrames, as long as the total number of rows equals the batch size."
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -257,10 +332,21 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Iterate over DataFrame batches\n[(6, 0),(0, 0)]\n[(4, 1),(1, 1)]\n[(2, 2),(9, 0)]\n[(3, 0),(5, 2)]\n[(7, 1),(8, 2)]\nIterate over regular batches\n[(1, 1),(4, 1)]\n[(2, 2),(3, 0)]\n[(6, 0),(7, 1)]\n[(8, 2),(0, 0)]\n[(5, 2),(9, 0)]\n"
"Iterate over DataFrame batches\n",
"[(6, 0),(0, 0)]\n",
"[(4, 1),(1, 1)]\n",
"[(2, 2),(9, 0)]\n",
"[(3, 0),(5, 2)]\n",
"[(7, 1),(8, 2)]\n",
"Iterate over regular batches\n",
"[(1, 1),(4, 1)]\n",
"[(2, 2),(3, 0)]\n",
"[(6, 0),(7, 1)]\n",
"[(8, 2),(0, 0)]\n",
"[(5, 2),(9, 0)]\n"
]
}
],
@ -282,11 +368,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some details about internal storage of batched DataFrames and how they are iterated"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -294,8 +380,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Type: <class 'torch.utils.data.datapipes.iter.dataframes.DataChunkDF'>\n",
"As string: [(0, 0),(3, 0)]\n",
@ -381,15 +467,15 @@
" print('-- df batch start --')\n",
" for item in i.raw_iterator():\n",
" print(item)\n",
" print('-- df batch end --') "
" print('-- df batch end --')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`concat` should work only on DFs with the same schema; this code should produce an error "
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -407,12 +493,12 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`unbatch` of `list` with DataFrame works similarly to regular unbatch.\n",
"Note: DataFrame sizes might change"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -420,9 +506,9 @@
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "AttributeError",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
@ -445,11 +531,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`map` applied to individual rows, `nesting_level` argument used to penetrate batching"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -457,10 +543,15 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Iterate over DataFrame batches\n[(1111000, 1111000),(1112000, 1112000),(1113000, 1113000),(1114000, 1111000),(1115000, 1112000)]\n[(1116000, 1113000),(1117000, 1111000),(1118000, 1112000),(1119000, 1113000),(1120000, 1111000)]\nIterate over regular batches\n[(1111000, 0),(1112000, 1),(1113000, 2),(1114000, 0),(1115000, 1)]\n[(1116000, 2),(1117000, 0),(1118000, 1),(1119000, 2),(1120000, 0)]\n"
"Iterate over DataFrame batches\n",
"[(1111000, 1111000),(1112000, 1112000),(1113000, 1113000),(1114000, 1111000),(1115000, 1112000)]\n",
"[(1116000, 1113000),(1117000, 1111000),(1118000, 1112000),(1119000, 1113000),(1120000, 1111000)]\n",
"Iterate over regular batches\n",
"[(1111000, 0),(1112000, 1),(1113000, 2),(1114000, 0),(1115000, 1)]\n",
"[(1116000, 2),(1117000, 0),(1118000, 1),(1119000, 2),(1120000, 0)]\n"
]
}
],
@ -483,11 +574,11 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`filter` applied to individual rows, `nesting_level` argument used to penetrate batching"
],
"cell_type": "markdown",
"metadata": {}
]
},
{
"cell_type": "code",
@ -495,10 +586,15 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Iterate over DataFrame batches\n[(6, 0),(7, 1),(8, 2),(9, 0),(10, 1)]\n[(11, 2),(12, 0)]\nIterate over regular batches\n[(6, 0),(7, 1),(8, 2),(9, 0),(10, 1)]\n[(11, 2),(12, 0)]\n"
"Iterate over DataFrame batches\n",
"[(6, 0),(7, 1),(8, 2),(9, 0),(10, 1)]\n",
"[(11, 2),(12, 0)]\n",
"Iterate over regular batches\n",
"[(6, 0),(7, 1),(8, 2),(9, 0),(10, 1)]\n",
"[(11, 2),(12, 0)]\n"
]
}
],
@ -519,5 +615,26 @@
" print(i)"
]
}
]
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6.10 64-bit ('dataloader': conda)",
"name": "python3610jvsc74a57bd0eb5e09632d6ea1cbf3eb9da7e37b7cf581db5ed13074b21cc44e159dc62acdab"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large Load Diff

View File

@ -16,7 +16,9 @@
"outputs": [],
"source": [
"from torch.utils.data import IterDataPipe\n",
"from typing import Any, Iterator, List, Tuple, TypeVar, Set, Union\n",
"from typing import Any, TypeVar, Union\n",
"from collections.abc import Iterator\n",
"import sys\n",
"\n",
"T_co = TypeVar('T_co', covariant=True)"
]
@ -122,9 +124,10 @@
"metadata": {},
"outputs": [],
"source": [
"class DP(IterDataPipe[Tuple]):\n",
" def __iter__(self) -> Iterator[Tuple[int, str]]:\n",
" pass"
"class DP(IterDataPipe[tuple]):\n",
" def __iter__(self) -> Iterator[tuple[int, str]]:\n",
" pass\n",
"print(DP.type)"
]
},
{
@ -135,7 +138,8 @@
"source": [
"class DP(IterDataPipe):\n",
" def __iter__(self) -> Iterator[int]:\n",
" pass"
" pass\n",
"print(DP.type)"
]
},
{
@ -197,14 +201,14 @@
}
],
"source": [
"class DP(IterDataPipe[Tuple[T_co, str]]):\n",
" def __iter__(self) -> Iterator[Tuple[T_co, str]]:\n",
"class DP(IterDataPipe[tuple[T_co, str]]):\n",
" def __iter__(self) -> Iterator[tuple[T_co, str]]:\n",
" pass\n",
"print(DP.type)\n",
"\n",
"T = TypeVar('T', int, str) # equals to Union[int, str]\n",
"class DP(IterDataPipe[Tuple[T, str]]):\n",
" def __iter__(self) -> Iterator[Tuple[Union[int, str], str]]:\n",
"class DP(IterDataPipe[tuple[T, str]]):\n",
" def __iter__(self) -> Iterator[tuple[Union[int, str], str]]:\n",
" pass\n",
"print(DP.type)"
]
@ -242,8 +246,8 @@
}
],
"source": [
"class DP(IterDataPipe[List[int]]):\n",
" def __iter__(self) -> Iterator[List[int]]:\n",
"class DP(IterDataPipe[list[int]]):\n",
" def __iter__(self) -> Iterator[list[int]]:\n",
" pass\n",
"print_helper(DP, DP())"
]
@ -313,8 +317,7 @@
" self.dp = dp\n",
"\n",
" def __iter__(self):\n",
" for d in self.dp:\n",
" yield d"
" yield from self.dp"
]
},
{
@ -378,7 +381,7 @@
"metadata": {},
"outputs": [],
"source": [
"class Temp(IterDataPipe[Tuple[int, T_co]]):\n",
"class Temp(IterDataPipe[tuple[int, T_co]]):\n",
" def __iter__(self):\n",
" pass\n",
"dp = DP(Temp())"
@ -407,14 +410,13 @@
"source": [
"from torch.utils.data import runtime_validation, runtime_validation_disabled\n",
"\n",
"class DP(IterDataPipe[Tuple[int, T_co]]):\n",
"class DP(IterDataPipe[tuple[int, T_co]]):\n",
" def __init__(self, datasource):\n",
" self.ds = datasource\n",
"\n",
" @runtime_validation\n",
" def __iter__(self):\n",
" for d in self.ds:\n",
" yield d"
" yield from self.ds"
]
},
{
@ -608,8 +610,7 @@
" self.ds = ds\n",
"\n",
" def __iter__(self):\n",
" for d in self.ds:\n",
" yield d\n",
" yield from self.ds\n",
"dp = DP(ds).reinforce_type(int)"
]
},
@ -625,8 +626,7 @@
"\n",
" @runtime_validation\n",
" def __iter__(self):\n",
" for d in self.ds:\n",
" yield d"
" yield from self.ds"
]
},
{