Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-07 14:04:43 +08:00)

Compare commits: add_user_a...add_docs (29 commits)
| SHA1 |
|---|
| 02e0fd8111 |
| 5689dd6b8e |
| 571352d378 |
| 2418196ef4 |
| 561233cabf |
| 36b640562b |
| 0c4a202408 |
| 20396951af |
| 3c4cdd549d |
| 020e713ac8 |
| 371ef0f4a2 |
| 6efc1799c1 |
| 325810e7fc |
| 9a19171fad |
| 26fca86312 |
| 900cf9d33b |
| 154d5101a4 |
| e3d4fa692e |
| dd4e048e75 |
| 6ff4fabd9d |
| 6d4450e341 |
| aee5c2384a |
| 5b6c209bc5 |
| 258c76e4dc |
| 64397a8301 |
| cd309610c0 |
| dd8f231495 |
| 1619a3475f |
| ff0f7d6498 |
.github/workflows/benchmark.yml (vendored)

@@ -52,7 +52,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
.github/workflows/build-docker-images.yml (vendored)

@@ -97,7 +97,7 @@ jobs:
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
      group: aws-g4dn-2xlarge-cache
      group: aws-general-8-plus
    steps:
      -
        name: Set up Docker Buildx
.github/workflows/codeql.yml (vendored, new file)

@@ -0,0 +1,22 @@
---
name: CodeQL Security Analysis

on:
  push:
    branches: ["main"]
  # pull_request:
  #   branches: ["main"]
  workflow_dispatch:

jobs:
  codeql:
    name: CodeQL Analysis
    uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
    permissions:
      security-events: write
      packages: read
      actions: read
      contents: read
    with:
      languages: '["actions"]'
      queries: 'security-extended,security-and-quality'
@@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)

on:
  schedule:
    - cron: "17 2 * * *"
    - cron: "17 5 * * *"

jobs:
  run_scheduled_amd_ci:
.github/workflows/self-scheduled.yml (vendored)

@@ -102,8 +102,10 @@ jobs:
        working-directory: /transformers/tests
        run: |
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
            python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }} > folder_slices.txt
            echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
            python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
            echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
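For illustration, a minimal Python sketch (assuming the file names used in the step above) of what the rewritten step computes: the slice ids are simply the indices of the folder slices written by `split_model_tests.py`.

```python
# Sketch only: mirrors the inline `python3 -c` call in the workflow step above.
import ast

folder_slices = ast.literal_eval(open("folder_slices.txt").read())  # e.g. [["bert", "gpt2"], ["t5"]]
slice_ids = list(range(len(folder_slices)))                         # -> [0, 1]
print(slice_ids)
```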
@@ -336,7 +338,7 @@ jobs:
        working-directory: ${{ inputs.working-directory-prefix }}/
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
@@ -346,7 +348,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
.github/workflows/ssh-runner.yml (vendored)

@@ -4,7 +4,7 @@ on:
  workflow_dispatch:
    inputs:
      runner_type:
        description: 'Type of runner to test (a10 or t4)'
        description: 'Type of runner to test (a10)'
        required: true
      docker_image:
        description: 'Name of the Docker image'
@@ -36,14 +36,10 @@ jobs:
          NUM_GPUS: ${{ github.event.inputs.num_gpus }}
          RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
        run: |
          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
            echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
            echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
            echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
            echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
          else
            echo "RUNNER=" >> $GITHUB_ENV
          fi
@@ -61,8 +57,6 @@ jobs:
      group: ${{ needs.get_runner.outputs.RUNNER }}
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
      - name: Update clone
        working-directory: /transformers
@@ -106,7 +100,7 @@ jobs:
          else
            echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
          fi

      - name: Tailscale # In order to be able to SSH when a test fails
        uses: huggingface/tailscale-action@main
        with:
@@ -1,8 +1,11 @@
import hashlib
import itertools
import json
import logging
from typing import Any

from transformers.utils.import_utils import is_flash_attn_2_available


KERNELIZATION_AVAILABLE = False
try:
@@ -18,6 +21,16 @@ logger = logging.getLogger(__name__)
class BenchmarkConfig:
    """Configuration for a single benchmark scenario."""

    all_attn_implementations = [
        ("flash_attention_2", None),
        ("eager", None),
        ("sdpa", "math"),
        ("sdpa", "flash_attention"),
        ("flex_attention", None),
    ]

    all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]

    def __init__(
        self,
        warmup_iterations: int = 5,
@@ -59,6 +72,13 @@ class BenchmarkConfig:
    def check_validity(self, skip_validity_check: bool = False) -> None:
        if skip_validity_check:
            return
        # Check FA is installed
        if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
            logger.warning(
                "Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
            )
            self.attn_implementation = "sdpa"
            self.sdpa_backend = "flash_attention"
        # Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
        is_fa = self.attn_implementation == "flash_attention_2"
        is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
@@ -127,88 +147,68 @@ class BenchmarkConfig:
    )


def cross_generate_configs(
    attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
    compiled_mode: list[str | None],
    kernelized: list[bool],
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = True,
def adapt_configs(
    configs: list[BenchmarkConfig],
    warmup_iterations: int | list[int] = 5,
    measurement_iterations: int | list[int] = 20,
    batch_size: int | list[int] = 1,
    sequence_length: int | list[int] = 128,
    num_tokens_to_generate: int | list[int] = 128,
    gpu_monitoring: bool | list[bool] = True,
) -> list[BenchmarkConfig]:
    # Create kwargs common to all configs
    kwargs = {
        "warmup_iterations": warmup_iterations,
        "measurement_iterations": measurement_iterations,
        "batch_size": batch_size,
        "sequence_length": sequence_length,
        "num_tokens_to_generate": num_tokens_to_generate,
        "gpu_monitoring": gpu_monitoring,
    }
    # Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
    configs = []
    for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
        for cm in list(dict.fromkeys(compiled_mode)):
            for kernelize_on in list(dict.fromkeys(kernelized)):
                config = BenchmarkConfig(
                    attn_implementation=attn_implementation,
                    sdpa_backend=sdpa_backend,
                    compile_mode=cm,
                    kernelize=kernelize_on,
                    **kwargs,
                )
                configs.append(config)
    return configs


def generate_all_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    gpu_monitoring: bool = True,
) -> list[BenchmarkConfig]:
    all_attn_implementations = [
        ("flash_attention_2", None),
        ("eager", None),
        ("sdpa", "math"),
        ("sdpa", "flash_attention"),
        ("flex_attention", None),
    ]
    return cross_generate_configs(
        attn_impl_and_sdpa_backend=all_attn_implementations,
        compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
        kernelized=[False, KERNELIZATION_AVAILABLE],
        warmup_iterations=warmup_iterations,
        measurement_iterations=measurement_iterations,
        batch_size=batch_size,
        sequence_length=sequence_length,
        num_tokens_to_generate=num_tokens_to_generate,
        gpu_monitoring=gpu_monitoring,
    parameters = (
        x if isinstance(x, list) else [x]
        for x in [
            warmup_iterations,
            measurement_iterations,
            batch_size,
            sequence_length,
            num_tokens_to_generate,
            gpu_monitoring,
        ]
    )
    iterator = itertools.product(*parameters)

    adapted_configs = []
    for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
        for config in configs:
            config = config.to_dict()
            config["warmup_iterations"] = warmup_iters
            config["measurement_iterations"] = measurement_iters
            config["batch_size"] = bs
            config["sequence_length"] = seqlen
            config["num_tokens_to_generate"] = ntok
            config["gpu_monitoring"] = monitor
            adapted_configs.append(BenchmarkConfig.from_dict(config))
    return adapted_configs

def generate_main_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
) -> list[BenchmarkConfig]:
    # Create kwargs common to all configs
    kwargs = {
        "warmup_iterations": warmup_iterations,
        "measurement_iterations": measurement_iterations,
        "batch_size": batch_size,
        "sequence_length": sequence_length,
        "num_tokens_to_generate": num_tokens_to_generate,
    }
    return [  # TODO: test max-autotune instead of default
        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
    ]
def get_config_by_level(level: int) -> list[BenchmarkConfig]:
    configs = []
    # Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
    if level >= 3:
        for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
            # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
            compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
            for cm in compile_modes:
                for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
                    configs.append(
                        BenchmarkConfig(
                            attn_implementation=attn_implementation,
                            sdpa_backend=sdpa_backend,
                            compile_mode=cm,
                            kernelize=kernelize_on,
                        )
                    )
        return configs
    # Otherwise, we add the configs for the given level
    if level >= 0:
        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
    if level >= 1:
        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
        configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
    if level >= 2:
        configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
    return configs

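For illustration, a minimal sketch of how the two new helpers compose (the coverage level and scalar/list values below are arbitrary): `get_config_by_level` picks a base set of `BenchmarkConfig` objects and `adapt_configs` cross-products them with the requested dimensions.

```python
# Sketch only: values are illustrative, not the defaults used in CI.
from framework.benchmark_config import adapt_configs, get_config_by_level

configs = get_config_by_level(2)  # level 2: one config per attention implementation / option
configs = adapt_configs(
    configs,
    warmup_iterations=5,
    measurement_iterations=20,
    batch_size=[1, 32],                # list values are expanded via itertools.product
    sequence_length=128,
    num_tokens_to_generate=[128, 256],
    gpu_monitoring=False,
)
print(len(configs))  # len(base configs) x 2 x 2 combinations
```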
@@ -23,7 +23,7 @@ import logging
import sys
import uuid

from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
from framework.benchmark_config import adapt_configs, get_config_by_level
from framework.benchmark_runner import BenchmarkRunner


@@ -40,7 +40,14 @@ if __name__ == "__main__":
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument(
        "--level",
        type=int,
        default=1,
        help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
        " each attn implementation and option, 3: cross-generate all combinations of configs, 4: cross-generate all"
        " combinations of configs w/ all compile modes",
    )
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--branch-name", type=str, help="Git branch name")
@@ -79,64 +86,24 @@ if __name__ == "__main__":
            "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
        )

    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
        if args.cross_generate:
            benchmark_configs = generate_all_configs(
                warmup_iterations=args.warmup,
                measurement_iterations=args.iterations,
                batch_size=args.batch_size[0],
                sequence_length=args.sequence_length[0],
                num_tokens_to_generate=args.num_tokens_to_generate[0],
                gpu_monitoring=not args.no_gpu_monitoring,
            )
        else:
            benchmark_configs = generate_main_configs(
                warmup_iterations=args.warmup,
                measurement_iterations=args.iterations,
                batch_size=args.batch_size[0],
                sequence_length=args.sequence_length[0],
                num_tokens_to_generate=args.num_tokens_to_generate[0],
            )

    # Otherwise, we benchmark across all combinations of dimensions
    else:
        main_config = generate_main_configs(
            warmup_iterations=args.warmup,
            measurement_iterations=args.iterations,
            batch_size=args.batch_size[0],
            sequence_length=args.sequence_length[0],
            num_tokens_to_generate=args.num_tokens_to_generate[0],
        )[0]
        benchmark_configs = []
        for num_tokens_to_generate in args.num_tokens_to_generate:
            for sequence_length in args.sequence_length:
                for batch_size in args.batch_size:
                    cfg_dict = main_config.to_dict()
                    cfg_dict["batch_size"] = batch_size
                    cfg_dict["sequence_length"] = sequence_length
                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
                    cfg_dict.pop("name")
                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))

    runner = BenchmarkRunner(
        logger,
        args.output_dir,
        args.branch_name,
        args.commit_id,
        args.commit_message,
    # Get the configs for the given coverage level
    configs = get_config_by_level(args.level)
    # Adapt the configs to the given arguments
    configs = adapt_configs(
        configs,
        args.warmup,
        args.iterations,
        args.batch_size,
        args.sequence_length,
        args.num_tokens_to_generate,
        not args.no_gpu_monitoring,
    )

    runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
    timestamp, results = runner.run_benchmarks(
        args.model_id,
        benchmark_configs,
        args.num_tokens_to_profile,
        pretty_print_summary=True,
        args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
    )

    dataset_id = args.push_result_to_dataset
    if dataset_id is not None and len(results) > 0:
        runner.push_results_to_hub(
            dataset_id,
            results,
            timestamp,
        )
        runner.push_results_to_hub(dataset_id, results, timestamp)
@@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir

# Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1

ARG REF=main
WORKDIR /
@@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
# Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

@@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
RUN cd transformers && python3 setup.py develop

# Add fp-quant for quantization testing
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"

# Low usage or incompatible lib, will enable later on
@@ -158,6 +158,24 @@ print("Retrieval scores (query x image):")
print(scores)
```

You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.

```python
import torch
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
from transformers.utils.import_utils import is_flash_attn_2_available

model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf"  # An existing compatible checkpoint

model = ColQwen2ForRetrieval.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
)
processor = ColQwen2Processor.from_pretrained(model_name)
```

## Notes

- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
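As a small illustration (a sketch only, reusing the `scores` tensor from the snippet above), the best-matching image for each query is simply the row-wise argmax:

```python
# scores has shape (num_queries, num_images); higher means more similar
best_image_per_query = scores.argmax(dim=1)
print(best_image_per_query)
```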
@@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo

from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast


tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
image_processor = FuyuImageProcessor()
image_processor = FuyuImageProcessorFast()


processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
@@ -118,6 +118,11 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

[[autodoc]] FuyuImageProcessor
- __call__

## FuyuImageProcessorFast

[[autodoc]] FuyuImageProcessorFast
- __call__

## FuyuProcessor

[[autodoc]] FuyuProcessor
@@ -61,6 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

[[autodoc]] GLPNImageProcessor
- preprocess

## GLPNImageProcessorFast

[[autodoc]] GLPNImageProcessorFast
- preprocess

## GLPNModel

[[autodoc]] GLPNModel
@@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"

A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`.

Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
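For illustration, a minimal sketch of loading a model with pseudo-quantization enabled (the checkpoint name below is only an example):

```python
import torch
from transformers import AutoModelForCausalLM, FPQuantConfig

# Pseudo-quantization emulates FP-Quant numerics on non-Blackwell GPUs, so QuTLASS is not required.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",  # illustrative checkpoint
    quantization_config=FPQuantConfig(pseudoquantization=True),
    dtype=torch.bfloat16,
    device_map="auto",
)
```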
> [!TIP]
> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
@@ -24,8 +24,9 @@ Mask generation models are trained on large amounts of data and operate in two m

- Prompting mode: In this mode, the model takes in an image and a prompt, where a prompt can be a 2D point location (XY coordinates) in the image within an object or a bounding box surrounding an object. In prompting mode, the model only returns the mask over the object that the prompt is pointing out.
- Segment Everything mode: In segment everything, given an image, the model generates every mask in the image. To do so, a grid of points is generated and overlaid on the image for inference.
- Video inference mode: The model takes in a video and a point or box prompt on a video frame, and the prompted object is tracked throughout the video. You can find more information on video inference in the [SAM 2 docs](model_doc/sam2).

Mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam). It's a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks.
The mask generation task is supported by [Segment Anything Model (SAM)](model_doc/sam) and [Segment Anything Model 2 (SAM2)](model_doc/sam2), while video inference is supported by [Segment Anything Model 2 (SAM2)](model_doc/sam2). SAM is a powerful model that consists of a Vision Transformer-based image encoder, a prompt encoder, and a two-way transformer mask decoder. Images and prompts are encoded, and the decoder takes these embeddings and generates valid masks. Meanwhile, SAM 2 extends SAM by adding a memory module to track the masks.

<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sam.png" alt="SAM Architecture"/>
@@ -53,7 +54,7 @@ The easiest way to infer mask generation models is to use the `mask-generation`
```python
>>> from transformers import pipeline

>>> checkpoint = "facebook/sam-vit-base"
>>> checkpoint = "facebook/sam2-hiera-base-plus"
>>> mask_generator = pipeline(model=checkpoint, task="mask-generation")
```
@@ -80,20 +81,12 @@ masks = mask_generator(image, points_per_batch=128, pred_iou_thresh=0.88)

The `masks` output looks like the following:

```bash
{'masks': [array([[False, False, False, ..., True, True, True],
        [False, False, False, ..., True, True, True],
        [False, False, False, ..., True, True, True],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]),
  array([[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
 'scores': tensor([0.9972, 0.9917,
        ...,
}
{'masks': [tensor([[False, False, False, ..., True, True, True],
        [False, False, False, ..., True, True, True],
        [False, False, False, ..., True, True, True],
        ...,
        [False, False, False, ..., False, False, False], ..
 'scores': tensor([0.9874, 0.9793, 0.9780, 0.9776, ... 0.9016])}
```

We can visualize them like this:
@@ -235,3 +228,270 @@ plt.show()
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/box_inference.png" alt="Visualized Inference"/>
</div>

## Fine-tuning for Mask Generation

We will fine-tune SAM 2.1 on a small part of the MicroMat dataset for image matting. We need to install the monai library for the Dice loss, and trackio for logging masks during training.

```bash
pip install -q datasets monai trackio
```
We can now load our dataset and take a look.

```python
from datasets import load_dataset

dataset = load_dataset("merve/MicroMat-mini", split="train")
dataset
# Dataset({
#     features: ['image', 'mask', 'prompt', 'image_id', 'object_id', 'sample_idx', 'granularity',
#     'image_path', 'mask_path', 'prompt_path'], num_rows: 94
# })
```

We need the image, mask, and prompt columns. We split the dataset into train and test sets.

```python
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
val_ds = dataset["test"]
```

Let's take a look at a sample.

```python
train_ds[0]
```

```
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=2040x1356>,
 'mask': <PIL.PngImagePlugin.PngImageFile image mode=L size=2040x1356>,
 'prompt': '{"point": [[137, 1165, 1], [77, 1273, 0], [58, 1351, 0]], "bbox": [0, 701, 251, 1356]}',
 'image_id': '0034',
 'object_id': '34',
 'sample_idx': 1,
 'granularity': 'fine',
 'image_path': '/content/MicroMat-mini/img/0034.png',
 'mask_path': '/content/MicroMat-mini/mask/0034_34.png',
 'prompt_path': '/content/MicroMat-mini/prompt/0034_34.json'}
```

Prompts are stringified dictionaries, so we can extract bounding boxes like below.

```python
import json

json.loads(train_ds["prompt"][0])["bbox"]
# [0, 701, 251, 1356]
```
Let's visualize an example image, prompt, and mask.

```python
import matplotlib.pyplot as plt
import numpy as np

def show_mask(mask, ax):
    color = np.array([0.12, 0.56, 1.0, 0.6])
    mask = np.array(mask)
    h, w = mask.shape
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, 4)
    ax.imshow(mask_image)

example = train_ds[0]
image = np.array(example["image"])
ground_truth_mask = np.array(example["mask"])

fig, ax = plt.subplots()
ax.imshow(image)
show_mask(ground_truth_mask, ax)

# Draw the bounding-box prompt on top of the image
x0, y0, x1, y1 = eval(train_ds["prompt"][0])["bbox"]
ax.add_patch(
    plt.Rectangle((x0, y0), x1 - x0, y1 - y0,
                  fill=False, edgecolor="lime", linewidth=2))

ax.set_title("Ground truth mask")
ax.set_axis_off()

plt.show()
```
Now we can define our dataset class for loading the data. `SAMDataset` wraps our dataset and formats each sample the way the SAM processor expects, so instead of raw images and masks you get processed images, bounding boxes, and ground-truth masks ready for training.

By default, the processor resizes images, so on top of the images and masks it also returns the original sizes. We also need to binarize the mask, since it contains the values 0 and 255.

```python
from torch.utils.data import Dataset
import torch

class SAMDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        prompt = eval(item["prompt"])["bbox"]
        inputs = self.processor(image, input_boxes=[[prompt]], return_tensors="pt")
        inputs["ground_truth_mask"] = (np.array(item["mask"]) > 0).astype(np.float32)
        inputs["original_image_size"] = torch.tensor(image.size[::-1])

        return inputs
```
We can initialize the processor and build the dataset with it.

```python
from transformers import Sam2Processor

processor = Sam2Processor.from_pretrained("facebook/sam2.1-hiera-small")
train_dataset = SAMDataset(dataset=train_ds, processor=processor)
```

We need to define a data collator that turns ground-truth masks of varying sizes into batches of masks with the same shape. We resize them with nearest-neighbor interpolation and build batched tensors for the rest of the elements in the batch. If your masks are all the same size, feel free to skip this step.

```python
import torch.nn.functional as F

def collate_fn(batch, target_hw=(256, 256)):

    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)
    original_sizes = torch.stack([item["original_sizes"] for item in batch])
    reshaped_input_sizes = torch.stack([item["reshaped_input_sizes"] for item in batch])
    input_boxes = torch.cat([item["input_boxes"] for item in batch], dim=0)
    ground_truth_masks = torch.cat([
        F.interpolate(
            torch.as_tensor(x["ground_truth_mask"]).unsqueeze(0).unsqueeze(0).float(),
            size=target_hw,
            mode="nearest"
        )
        for x in batch
    ], dim=0).long()

    return {
        "pixel_values": pixel_values,
        "original_sizes": original_sizes,
        "reshaped_input_sizes": reshaped_input_sizes,
        "input_boxes": input_boxes,
        "ground_truth_mask": ground_truth_masks,
        "original_image_size": torch.stack([item["original_image_size"] for item in batch]),
    }

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
```

Let's take a look at what the data loader yields.

```python
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.shape)

# pixel_values torch.Size([4, 3, 1024, 1024])
# original_sizes torch.Size([4, 1, 2])
# reshaped_input_sizes torch.Size([4, 1, 2])
# input_boxes torch.Size([4, 1, 4])
# ground_truth_mask torch.Size([4, 1, 256, 256])
# original_image_size torch.Size([4, 2])
```
We will now load the model, freeze the vision encoder and the prompt encoder, and train only the mask decoder.

```python
from transformers import Sam2Model

model = Sam2Model.from_pretrained("facebook/sam2.1-hiera-small")

for name, param in model.named_parameters():
    if name.startswith("vision_encoder") or name.startswith("prompt_encoder"):
        param.requires_grad_(False)
```

We can now define the optimizer and the loss function.

```python
from torch.optim import Adam
import monai

optimizer = Adam(model.mask_decoder.parameters(), lr=1e-5, weight_decay=0)
seg_loss = monai.losses.DiceCELoss(sigmoid=True, squared_pred=True, reduction='mean')
```

We log our predictions to trackio so we can monitor how the model improves during training.

```python
from PIL import Image
import trackio
import json


@torch.no_grad()
def predict_fn(img, bbox):

    inputs = processor(images=img, input_boxes=[[bbox]], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"])[0]
    return masks

def log_eval_masks_trackio(dataset, indices, step, predict_fn, project=None, sample_cap=8):
    logs = {"eval/step": int(step)}
    for idx in indices[:sample_cap]:
        item = dataset[idx]
        img = item["image"]
        bbox = json.loads(item["prompt"])["bbox"]
        preds = predict_fn(img, bbox)
        preds = preds.squeeze(0)
        mask = (preds[0] > 0).cpu().numpy()

        overlay = np.asarray(img, dtype=np.uint8).copy()
        overlay[mask] = 0.55 * overlay[mask] + 0.45 * np.array([0, 255, 0], dtype=np.float32)
        logs[f"{idx}/overlay"] = trackio.Image(overlay, caption="overlay")

    trackio.log(logs)
```

We can now write our training loop and train!

Notice how we log our loss and evaluation masks with trackio.

```python
from tqdm import tqdm
from statistics import mean
import trackio
import torch

num_epochs = 30

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

model.train()
trackio.init(project="mask-eval")
for epoch in range(num_epochs):
    epoch_losses = []
    for batch in tqdm(train_dataloader):
        outputs = model(pixel_values=batch["pixel_values"].to(device),
                        input_boxes=batch["input_boxes"].to(device),
                        multimask_output=False)

        predicted_masks = outputs.pred_masks.squeeze(1)
        ground_truth_masks = batch["ground_truth_mask"].float().to(device)
        loss = seg_loss(predicted_masks, ground_truth_masks)

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
        epoch_losses.append(loss.item())

    log_eval_masks_trackio(dataset=val_ds, indices=[0, 3, 6, 9], step=epoch, predict_fn=predict_fn, project="mask-eval")
    print(f'Epoch: {epoch}')
    print(f'Mean loss: {mean(epoch_losses)}')
    trackio.log({"loss": mean(epoch_losses)})

trackio.finish()
```
@@ -187,7 +187,7 @@ from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
setup.py

@@ -113,7 +113,7 @@ _deps = [
    "GitPython<3.1.19",
    "hf-doc-builder>=0.3.0",
    "hf_xet",
    "huggingface-hub==1.0.0.rc6",
    "huggingface-hub>=1.0.0,<2.0",
    "importlib_metadata",
    "ipadic>=1.0.0,<2.0",
    "jinja2>=3.1.0",

@@ -23,7 +23,7 @@ deps = {
    "GitPython": "GitPython<3.1.19",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
    "hf_xet": "hf_xet",
    "huggingface-hub": "huggingface-hub==1.0.0.rc6",
    "huggingface-hub": "huggingface-hub>=1.0.0,<2.0",
    "importlib_metadata": "importlib_metadata",
    "ipadic": "ipadic>=1.0.0,<2.0",
    "jinja2": "jinja2>=3.1.0",
@@ -2192,7 +2192,7 @@ class GenerationMixin(ContinuousMixin):
        has_disk_offload = "disk" in all_model_devices
        can_compile &= not has_disk_offload

        # Finally: if the user has manually specified compilation options, but compilation is not possible, let's warn
        # If the user has manually specified compilation options, but compilation is not possible, let's warn
        # them
        if generation_config.compile_config is not None and not can_compile:
            logger.warning_once(
@@ -2200,6 +2200,18 @@ class GenerationMixin(ContinuousMixin):
                "will be skipped."
            )

        # Finally: if we can compile, disable tokenizers parallelism and check for FA2 + static cache
        os.environ["TOKENIZERS_PARALLELISM"] = "0"
        # If we use FA2 and a static cache, we cannot compile with fullgraph
        if self.config._attn_implementation == "flash_attention_2":
            # only raise warning if the user passed an explicit compile-config
            if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
                logger.warning_once(
                    "When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
                    "FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
                )
                generation_config.compile_config.fullgraph = False

        return can_compile

    def _get_deprecated_gen_repo(
@@ -2636,7 +2648,7 @@ class GenerationMixin(ContinuousMixin):
                UserWarning,
            )

        # 8. prepare logits processors and stopping criteria
        # 8. Prepare logits processors and stopping criteria
        prepared_logits_processor = self._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_length,
@@ -2843,40 +2855,21 @@ class GenerationMixin(ContinuousMixin):
        batch_size, cur_len = input_ids.shape[:2]
        this_peer_finished = False
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)

        model_forward = self.__call__
        compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
        if compile_forward:
            os.environ["TOKENIZERS_PARALLELISM"] = "0"
            # If we use FA2 and a static cache, we cannot compile with fullgraph
            if self.config._attn_implementation == "flash_attention_2":
                # only raise warning if the user passed an explicit compile-config
                if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
                    logger.warning_once(
                        "When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
                        "FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
                    )
                    generation_config.compile_config.fullgraph = False
            model_forward = self.get_compiled_call(generation_config.compile_config)
        model_forward = (
            self.get_compiled_call(generation_config.compile_config)
            if self._valid_auto_compile_criteria(model_kwargs, generation_config)
            else self.__call__
        )

        if generation_config.prefill_chunk_size is not None:
            model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
            is_prefill = False
        else:
            is_prefill = True
        prefill_consumed = False
        outputs = self._prefill(input_ids, generation_config, model_kwargs)

        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            if is_prefill:
                outputs = self(**model_inputs, return_dict=True)
                is_prefill = False
            else:
            if prefill_consumed:
                model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
                outputs = model_forward(**model_inputs, return_dict=True)

            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
            prefill_consumed = True
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
@@ -3246,7 +3239,6 @@ class GenerationMixin(ContinuousMixin):
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        """

        # 1. init beam_search values
        pad_token_id = generation_config._pad_token_tensor
        eos_token_id = generation_config._eos_token_tensor
@@ -3287,8 +3279,6 @@ class GenerationMixin(ContinuousMixin):
            dim=0,
        ).to(input_ids.device)

        model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)

        # (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there
        # are newer low-memory alternatives like the offloaded cache)
        sequential = generation_config.low_memory
@@ -3350,13 +3340,18 @@
        )
        beam_indices = running_beam_indices.detach().clone()

        prefill_consumed = False
        flat_running_sequences = input_ids
        model_outputs = self._prefill(input_ids, generation_config, model_kwargs)

        # 4. run the generation loop
        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # a. Forward current tokens, obtain the logits
            flat_running_sequences = self._flatten_beam_dim(running_sequences[:, :, :cur_len])
            model_inputs = self.prepare_inputs_for_generation(flat_running_sequences, **model_kwargs)

            model_outputs = self(**model_inputs, return_dict=True)
            if prefill_consumed:
                # a. Forward current tokens, obtain the logits
                flat_running_sequences = self._flatten_beam_dim(running_sequences[:, :, :cur_len])
                model_inputs = self.prepare_inputs_for_generation(flat_running_sequences, **model_kwargs)
                model_outputs = self(**model_inputs, return_dict=True)
            prefill_consumed = True

            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
            model_kwargs = self._update_model_kwargs_for_generation(
@@ -3839,49 +3834,51 @@ class GenerationMixin(ContinuousMixin):
        else:
            return input_ids

    def _prefill_chunking(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, **model_kwargs):
        # Even if we are not compiling the forward, flex is always compiled when used. With chunk prefill, we may
        # end up needing just a bit more graphs than the default (which is 8). Doing this avoids very cryptic warnings
        torch._dynamo.config.cache_size_limit = 64
    # TODO: v5.1: make public once API stabilized
    def _prefill(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, model_kwargs):
        if generation_config.prefill_chunk_size is None:
            model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            return self(**model_inputs, return_dict=True)
        else:  # Chunked prefill
            # Even if we are not compiling the forward, flex is always compiled when used. With chunked prefill, we may
            # end up needing just a bit more graphs than the default (which is 8). Doing this avoids very cryptic warnings
            torch._dynamo.config.cache_size_limit = 64

        chunk_size = generation_config.prefill_chunk_size
        # Only chunk up the token just before last, so that decoding is completely performed outside this function
        # (here we simply prefill the cache)
        input_chunks = torch.split(input_ids[:, :-1], chunk_size, dim=-1)
            chunk_size = generation_config.prefill_chunk_size
            input_chunks = torch.split(input_ids, chunk_size, dim=-1)

        if "past_key_values" not in model_kwargs:
            raise ValueError("Cannot use prefill chunking without a cache")
            if "past_key_values" not in model_kwargs:
                raise ValueError("Cannot use prefill chunking without a cache")

        model_forward = self.forward

        compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
        if compile_forward:
            model_forward = self.get_compiled_call(generation_config.compile_config)

        attention_mask = model_kwargs.pop("attention_mask", None)

        past_length = 0
        for input_chunk in input_chunks:
            current_length = past_length + input_chunk.shape[-1]
            # Prepare inputs
            if attention_mask is not None:
                model_kwargs["attention_mask"] = attention_mask[:, :current_length]
            model_kwargs["cache_position"] = torch.arange(
                past_length, current_length, dtype=torch.long, device=input_chunk.device
            model_forward = (
                self.get_compiled_call(generation_config.compile_config)
                if self._valid_auto_compile_criteria(model_kwargs, generation_config)
                else self.__call__
            )
            model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
            model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)

            outputs = model_forward(**model_inputs, return_dict=True)
            attention_mask = model_kwargs.pop("attention_mask", None)
            past_length = 0
            for input_chunk in input_chunks:
                current_length = past_length + input_chunk.shape[-1]
                if attention_mask is not None:
                    model_kwargs["attention_mask"] = attention_mask[:, :current_length]
                model_kwargs["cache_position"] = torch.arange(
                    past_length, current_length, dtype=torch.long, device=input_chunk.device
                )
                model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
                model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)

            model_kwargs["past_key_values"] = outputs.past_key_values
            past_length = current_length
                outputs = model_forward(**model_inputs, return_dict=True)

        model_kwargs["attention_mask"] = attention_mask
        model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
        _ = model_kwargs.pop("position_ids", None)
                model_kwargs["past_key_values"] = outputs.past_key_values
                past_length = current_length

        return model_kwargs
            model_kwargs["attention_mask"] = attention_mask
            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
            _ = model_kwargs.pop("position_ids", None)
            # Latest outputs contain next token logits
            return outputs


    def _speculative_sampling(
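For illustration, a hedged sketch of how chunked prefill is driven from the user side (the model id and sizes below are arbitrary): `prefill_chunk_size` lives on the generation config, and `_prefill` falls back to a single forward pass when it is unset.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Sketch only: any causal LM works; "gpt2" is just a small example checkpoint.
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("A fairly long prompt that we want to prefill in chunks ...", return_tensors="pt")
out = model.generate(
    **inputs,
    generation_config=GenerationConfig(
        max_new_tokens=16,
        prefill_chunk_size=8,            # prefill the cache 8 tokens at a time
        cache_implementation="static",   # chunked prefill requires a cache to fill
    ),
)
print(tok.decode(out[0], skip_special_tokens=True))
```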
@@ -362,25 +362,13 @@ class ImageProcessingMixin(PushToHubMixin):
        """
        image_processor_dict = image_processor_dict.copy()
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
        if "size" in kwargs and "size" in image_processor_dict:
            image_processor_dict["size"] = kwargs.pop("size")
        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
            image_processor_dict["crop_size"] = kwargs.pop("crop_size")

        image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
        image_processor = cls(**image_processor_dict)

        # Update image_processor with kwargs if needed
        to_remove = []
        for key, value in kwargs.items():
        # Remove kwargs that are used to initialize the image processor attributes
        for key in list(kwargs):
            if hasattr(image_processor, key):
                setattr(image_processor, key, value)
                to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)
                kwargs.pop(key)

        logger.info(f"Image processor {image_processor}")
        if return_unused_kwargs:
@@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
    input_data_format = None
    device = None
    model_input_names = ["pixel_values"]
    image_seq_length = None
    valid_kwargs = ImagesKwargs
    unused_kwargs = None

@@ -227,6 +228,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
        padding_mode: Optional[str] = "constant",
        return_mask: bool = False,
        disable_grouping: Optional[bool] = False,
        is_nested: Optional[bool] = False,
        **kwargs,
    ) -> Union[tuple["torch.Tensor", "torch.Tensor"], "torch.Tensor"]:
        """
@@ -257,7 +259,9 @@ class BaseImageProcessorFast(BaseImageProcessor):
        else:
            pad_size = get_max_height_width(images)

        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        grouped_images, grouped_images_index = group_images_by_shape(
            images, disable_grouping=disable_grouping, is_nested=is_nested
        )
        processed_images_grouped = {}
        processed_masks_grouped = {}
        for shape, stacked_images in grouped_images.items():
@@ -280,9 +284,9 @@ class BaseImageProcessorFast(BaseImageProcessor):
                stacked_masks[..., : image_size[0], : image_size[1]] = 1
                processed_masks_grouped[shape] = stacked_masks

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=is_nested)
        if return_mask:
            processed_masks = reorder_images(processed_masks_grouped, grouped_images_index)
            processed_masks = reorder_images(processed_masks_grouped, grouped_images_index, is_nested=is_nested)
            return processed_images, processed_masks

        return processed_images
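For illustration, a hedged sketch of the grouping round trip used above (tensor shapes are arbitrary, and the import path is assumed): `group_images_by_shape` batches same-shaped images so a per-group op runs once per shape, and `reorder_images` restores the original order, now also for nested lists via `is_nested=True`.

```python
import torch
# Assumed import location for these helpers inside transformers.
from transformers.image_processing_utils_fast import group_images_by_shape, reorder_images

images = [torch.rand(3, 224, 224), torch.rand(3, 256, 256), torch.rand(3, 224, 224)]
grouped, index = group_images_by_shape(images, disable_grouping=False)
processed = {shape: batch * 2.0 for shape, batch in grouped.items()}  # any per-group op
restored = reorder_images(processed, index)
assert len(restored) == len(images)
```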
@@ -305,6 +309,8 @@ class BaseImageProcessorFast(BaseImageProcessor):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
            antialias (`bool`, *optional*, defaults to `True`):
                Whether to use antialiasing.

        Returns:
            `torch.Tensor`: The resized image.
@@ -35,6 +35,10 @@ def adapt_fp_quant_config(config: FPQuantConfig):

    if config.backward_dtype == "bf16":
        backward_dtype = FPQuantDtype.BF16
    elif config.backward_dtype == "mxfp8":
        backward_dtype = FPQuantDtype.MXFP8
    elif config.backward_dtype == "mxfp4":
        backward_dtype = FPQuantDtype.MXFP4
    else:
        raise ValueError(f"Unsupported backward dtype: {config.backward_dtype}")
@@ -11,17 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.metadata
import re
from collections.abc import Callable
from functools import partial
from types import ModuleType
from typing import Optional, Union

from packaging import version as pkg_version

from ..modeling_flash_attention_utils import lazy_import_flash_attention
from ..utils import logging
from ..utils.import_utils import is_kernels_available
from .flash_attention import flash_attention_forward
@@ -67,6 +65,12 @@ try:
                    layer_name="LigerRMSNorm",
                )
            },
            "xpu": {
                Mode.INFERENCE: LayerRepository(
                    repo_id="kernels-community/rmsnorm",
                    layer_name="RMSNorm",
                )
            },
        },
        "MLP": {
            "cuda": LayerRepository(
@@ -142,7 +146,18 @@ try:
        },
    }

    register_kernel_mapping(_KERNEL_MAPPING)
    def has_key(d, key):
        return key in d or any(isinstance(v, dict) and has_key(v, key) for v in d.values())

    def register_kernel_mapping_transformers(mapping=None):
        if mapping is None:
            mapping = _KERNEL_MAPPING
        if has_key(mapping, "xpu") and not is_kernels_available(MIN_VERSION="0.10.2"):
            raise ImportError(
                "kernels uses an incompatible version. Please install the latest version with `pip install -U kernels`."
            )
        register_kernel_mapping(mapping)


except ImportError:
    _kernels_available = False
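The has_key helper above recurses through arbitrarily nested mapping dictionaries, which is what lets register_kernel_mapping_transformers detect an "xpu" entry anywhere in the kernel mapping before enforcing the minimum kernels version. A standalone illustration with a made-up mapping:

    def has_key(d, key):
        return key in d or any(isinstance(v, dict) and has_key(v, key) for v in d.values())

    mapping = {"RMSNorm": {"cuda": {"inference": "repo-a"}, "xpu": {"inference": "repo-b"}}}
    assert has_key(mapping, "xpu")
    assert not has_key(mapping, "rocm")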
@@ -221,7 +236,7 @@ def load_and_register_attn_kernel(attn_implementation: str, attention_wrapper: O

    # Load the kernel from hub
    try:
        kernel = get_kernel_wrapper(repo_id, revision=rev)
        kernel = get_kernel(repo_id, revision=rev)
    except Exception as e:
        raise ValueError(f"An error occurred while trying to load from '{repo_id}': {e}.")
    # correctly wrap the kernel
@@ -245,10 +260,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, Optional[ModuleType]]
        mapping[kernel_name] = None
        return None
    if _kernels_available:
        from kernels import get_kernel

        try:
            repo_id = _HUB_KERNEL_MAPPING[kernel_name]["repo_id"]
            version = _HUB_KERNEL_MAPPING[kernel_name].get("version", None)
            kernel = get_kernel_wrapper(repo_id, version=version)
            kernel = get_kernel(repo_id, version=version)
            mapping[kernel_name] = kernel
        except FileNotFoundError:
            mapping[kernel_name] = None
@@ -280,25 +297,11 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, Optional[ModuleType]]
    return mapping[kernel_name]


def get_kernel_wrapper(kernel_name: str, revision: Optional[str] = None, version: Optional[str] = None) -> ModuleType:
    from .. import __version__

    user_agent = {"framework": "transformers", "version": __version__, "repo_id": kernel_name}
    if _kernels_available:
        kernels_version = importlib.metadata.version("kernels")
        if pkg_version.parse(kernels_version) >= pkg_version.parse("0.10.4"):
            return get_kernel(kernel_name, revision=revision, version=version, user_agent=user_agent)
        else:
            return get_kernel(kernel_name, revision=revision)
    else:
        raise ImportError("kernels is not installed, please install it with `pip install kernels`")


__all__ = [
    "LayerRepository",
    "use_kernel_forward_from_hub",
    "register_kernel_mapping",
    "register_kernel_mapping_transformers",
    "replace_kernel_forward_from_hub",
    "lazy_load_kernel",
    "get_kernel_wrapper",
]
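get_kernel_wrapper above only forwards the user_agent keyword when the installed kernels release is recent enough to accept it. A generic sketch of that version-gating pattern; the actual get_kernel call is left as a comment since its surrounding imports and variables are assumed here:

    import importlib.metadata

    from packaging import version as pkg_version

    def supports(package: str, min_version: str) -> bool:
        """Return True if `package` is installed at `min_version` or newer."""
        try:
            installed = importlib.metadata.version(package)
        except importlib.metadata.PackageNotFoundError:
            return False
        return pkg_version.parse(installed) >= pkg_version.parse(min_version)

    # Hypothetical usage mirroring get_kernel_wrapper: only newer `kernels`
    # releases accept the extra telemetry keyword, so gate it on the version.
    # kwargs = {"revision": revision}
    # if supports("kernels", "0.10.4"):
    #     kwargs["user_agent"] = {"framework": "transformers"}
    # kernel = get_kernel(repo_id, **kwargs)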
@@ -470,10 +470,10 @@ def replace_with_mxfp4_linear(
    if quantization_config.dequantize:
        return model
    else:
        from .hub_kernels import get_kernel_wrapper
        from kernels import get_kernel

        global triton_kernels_hub
        triton_kernels_hub = get_kernel_wrapper("kernels-community/triton_kernels")
        triton_kernels_hub = get_kernel("kernels-community/triton_kernels")

    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
@@ -4033,10 +4033,14 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        if use_kernels:
            if not is_kernels_available():
                raise ValueError(
                    "Kernels are not available. To use kernels, please install kernels using `pip install kernels`"
                    "`use_kernels=True` requires kernels>=0.9.0. Please install the latest version with `pip install -U kernels`"
                )
            from kernels import use_kernel_mapping

            from .integrations.hub_kernels import register_kernel_mapping_transformers

            register_kernel_mapping_transformers()

        if kernel_config is not None and isinstance(kernel_config, KernelConfig):
            # This will make sure the mapping is valid, and the layers are registered in the model
            kernel_config.sanitize_kernel_mapping(self)
@@ -98,12 +98,12 @@ else:
        ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
        ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
        ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
        ("fuyu", ("FuyuImageProcessor", None)),
        ("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")),
        ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
        ("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
        ("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
        ("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
        ("glpn", ("GLPNImageProcessor", None)),
        ("glpn", ("GLPNImageProcessor", "GLPNImageProcessorFast")),
        ("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
        ("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
        ("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
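Each entry in this mapping pairs a model type with its (slow, fast) image-processor class names; filling in the second slot is what lets the auto class pick the fast variant when a fast processor is requested. Roughly, the lookup behaves like this stand-in (the dict below is a toy excerpt, not the real registry):

    IMAGE_PROCESSOR_NAMES = {
        "fuyu": ("FuyuImageProcessor", "FuyuImageProcessorFast"),
        "glpn": ("GLPNImageProcessor", "GLPNImageProcessorFast"),
    }

    def resolve(model_type: str, use_fast: bool) -> str:
        slow, fast = IMAGE_PROCESSOR_NAMES[model_type]
        return fast if use_fast and fast is not None else slow

    assert resolve("fuyu", use_fast=True) == "FuyuImageProcessorFast"
    assert resolve("glpn", use_fast=False) == "GLPNImageProcessor"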
@@ -39,9 +39,10 @@ from typing import Any, Optional

import torch
from huggingface_hub import snapshot_download
from peft import PeftModel
from safetensors import safe_open

from transformers import AutoConfig
from transformers import AutoConfig, AutoModel
from transformers.models.colqwen2 import ColQwen2ForRetrieval
from transformers.models.colqwen2.configuration_colqwen2 import ColQwen2Config
from transformers.utils import logging
@@ -69,7 +70,7 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> d
            original_state_dict[key] = f.get_tensor(key)

    # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict.
    if "lm_head.weight" not in original_state_dict:
    if "lm_head.weight" not in original_state_dict and "model.embed_tokens.weight" in original_state_dict:
        original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()

    return original_state_dict
@@ -124,7 +125,21 @@ def convert_colqwen2_weights_to_hf(
    config.is_composition = False

    # Load the untrained model
    model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
    vlm_name_or_path = getattr(config.vlm_config, "_name_or_path", None)
    if vlm_name_or_path and "2.5" in str(vlm_name_or_path):
        print(
            "Detected colqwen2.5 adapters in vlm_config; loading base model %s and merging PEFT weights."
            % vlm_name_or_path
        )
        base_model = AutoModel.from_pretrained(
            vlm_name_or_path,
            device_map="cpu",
            trust_remote_code=True,
        )
        peft_model = PeftModel.from_pretrained(base_model, model_id)
        model = peft_model.merge_and_unload()
    else:
        model = ColQwen2ForRetrieval(config=config).to("cpu").eval()
    print("Created model with new config and randomly initialized weights")

    # NOTE: The new model was initialized with float32 weights. We need to convert it to the desired precision.
@@ -201,6 +216,7 @@ if __name__ == "__main__":
        help="Name or path of the original VLM backbone model",
        default=None,
    )

    args = parser.parse_args()

    convert_colqwen2_weights_to_hf(
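For the colqwen2.5 branch above, the script relies on PEFT's merge path: load the backbone, attach the adapters, then fold them into the base weights. A hedged sketch of that sequence (both repository ids below are placeholders, not the ones the script actually uses):

    from peft import PeftModel
    from transformers import AutoModel

    base = AutoModel.from_pretrained("org/vlm-backbone", device_map="cpu")      # placeholder id
    peft_model = PeftModel.from_pretrained(base, "org/colqwen-adapters")        # placeholder id
    merged = peft_model.merge_and_unload()  # folds the LoRA weights into the base model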
@ -172,7 +172,6 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):
|
||||
inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
|
||||
|
||||
if pixel_values is not None:
|
||||
pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
|
||||
image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
|
||||
image_mask = (
|
||||
(input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
|
||||
|
||||
@ -359,7 +359,6 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):
|
||||
inputs_embeds = self.vlm.language_model.embed_tokens(input_ids)
|
||||
|
||||
if pixel_values is not None:
|
||||
pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
|
||||
image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
|
||||
image_mask = (
|
||||
(input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
|
||||
|
||||
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Union

@@ -204,11 +203,11 @@ class CsmGenerationMixin(GenerationMixin):
            criterion.max_length -= cur_len
        # ============================================

        model_forward = self.__call__
        compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
        if compile_forward:
            os.environ["TOKENIZERS_PARALLELISM"] = "0"
            model_forward = self.get_compiled_call(generation_config.compile_config)
        model_forward = (
            self.get_compiled_call(generation_config.compile_config)
            if self._valid_auto_compile_criteria(model_kwargs, generation_config)
            else self.__call__
        )

        is_prefill = True
        while self._has_unfinished_sequences(
@ -278,6 +278,12 @@ class DiaGenerationMixin(GenerationMixin):
|
||||
)
|
||||
generation_mode = generation_config.get_generation_mode(assistant_model)
|
||||
|
||||
if generation_mode not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
|
||||
raise ValueError(
|
||||
"Got incompatible mode for generation, should be one of greedy or sampling. "
|
||||
"Ensure that beam search is de-activated by setting `num_beams=1`."
|
||||
)
|
||||
|
||||
self._validate_model_kwargs(model_kwargs.copy())
|
||||
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
|
||||
|
||||
@ -382,26 +388,29 @@ class DiaGenerationMixin(GenerationMixin):
|
||||
# Prepare inner 2D logic in generation loop
|
||||
input_ids = input_ids.reshape(-1, input_ids.shape[-1])
|
||||
|
||||
# 10. go into different generation modes
|
||||
if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
|
||||
# 11. expand input_ids with `num_return_sequences` additional sequences per batch
|
||||
if generation_config.num_return_sequences > 1:
|
||||
raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
|
||||
model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
|
||||
# prepare model inputs
|
||||
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
||||
|
||||
# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
|
||||
return self._sample(
|
||||
input_ids,
|
||||
logits_processor=prepared_logits_processor,
|
||||
stopping_criteria=prepared_stopping_criteria,
|
||||
generation_config=generation_config,
|
||||
**generation_mode_kwargs,
|
||||
**model_kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Got incompatible mode for generation, should be one of greedy or sampling. "
|
||||
"Ensure that beam search is de-activated by setting `num_beams=1`."
|
||||
)
|
||||
# 10. Prefill
|
||||
model_inputs.update({"output_attentions": generation_config.output_attentions})
|
||||
model_inputs.update({"output_hidden_states": generation_config.output_hidden_states})
|
||||
outputs = self(**model_inputs, return_dict=True)
|
||||
|
||||
# 11. expand input_ids with `num_return_sequences` additional sequences per batch
|
||||
if generation_config.num_return_sequences > 1:
|
||||
raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
|
||||
|
||||
# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
|
||||
return self._sample(
|
||||
input_ids,
|
||||
logits_processor=prepared_logits_processor,
|
||||
stopping_criteria=prepared_stopping_criteria,
|
||||
generation_config=generation_config,
|
||||
prefill_outputs=outputs,
|
||||
**generation_mode_kwargs,
|
||||
**model_kwargs,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def generate(
|
||||
|
||||
@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_fuyu import *
|
||||
from .image_processing_fuyu import *
|
||||
from .image_processing_fuyu_fast import *
|
||||
from .modeling_fuyu import *
|
||||
from .processing_fuyu import *
|
||||
else:
|
||||
|
||||
@ -29,6 +29,7 @@ from ...image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
@ -37,6 +38,7 @@ from ...image_utils import (
|
||||
to_numpy_array,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
filter_out_non_signature_kwargs,
|
||||
@ -70,6 +72,21 @@ def make_list_of_list_of_images(
|
||||
raise ValueError("images must be a list of list of images or a list of images or an image.")
|
||||
|
||||
|
||||
class FuyuImagesKwargs(ImagesKwargs, total=False):
|
||||
r"""
|
||||
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
|
||||
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
|
||||
padding_value (`float`, *optional*, defaults to 1.0):
|
||||
The value to pad the image with.
|
||||
padding_mode (`str`, *optional*, defaults to "constant"):
|
||||
The padding mode to use when padding the image.
|
||||
"""
|
||||
|
||||
patch_size: Optional[SizeDict]
|
||||
padding_value: float
|
||||
padding_mode: str
|
||||
|
||||
|
||||
class FuyuBatchFeature(BatchFeature):
|
||||
"""
|
||||
BatchFeature class for Fuyu image processor and processor.
|
||||
@ -232,6 +249,7 @@ class FuyuImageProcessor(BaseImageProcessor):
|
||||
"image_patch_indices_per_batch",
|
||||
"image_patch_indices_per_subsequence",
|
||||
]
|
||||
valid_kwargs = FuyuImagesKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
src/transformers/models/fuyu/image_processing_fuyu_fast.py (new file, +382 lines)
@@ -0,0 +1,382 @@
# coding=utf-8
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fast Image processor class for Fuyu."""
|
||||
|
||||
import math
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ...image_processing_utils import get_size_dict
|
||||
from ...image_processing_utils_fast import (
|
||||
BaseImageProcessorFast,
|
||||
group_images_by_shape,
|
||||
reorder_images,
|
||||
)
|
||||
from ...image_utils import (
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torchvision_available,
|
||||
logging,
|
||||
requires_backends,
|
||||
)
|
||||
from .image_processing_fuyu import FuyuBatchFeature, FuyuImagesKwargs, make_list_of_list_of_images
|
||||
|
||||
|
||||
if is_torchvision_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class FuyuImageProcessorFast(BaseImageProcessorFast):
|
||||
do_resize = True
|
||||
size = {"height": 1080, "width": 1920}
|
||||
resample = PILImageResampling.BILINEAR
|
||||
do_pad = True
|
||||
padding_value = 1.0
|
||||
padding_mode = "constant"
|
||||
do_normalize = True
|
||||
image_mean = 0.5
|
||||
image_std = 0.5
|
||||
do_rescale = True
|
||||
rescale_factor = 1 / 255
|
||||
model_input_names = [
|
||||
"images",
|
||||
"image_input_ids",
|
||||
"image_patches",
|
||||
"image_patch_indices_per_batch",
|
||||
"image_patch_indices_per_subsequence",
|
||||
]
|
||||
valid_kwargs = FuyuImagesKwargs
|
||||
|
||||
def _prepare_images_structure(
|
||||
self,
|
||||
images: ImageInput,
|
||||
expected_ndims: int = 3,
|
||||
) -> ImageInput:
|
||||
images = self.fetch_images(images)
|
||||
return make_list_of_list_of_images(images)
|
||||
|
||||
    def resize(
        self,
        image: torch.Tensor,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"] = None,
        antialias: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """
        Resize an image to fit within `(size["height"], size["width"])` while maintaining aspect ratio.
        Only resizes if the image is larger than the target size.
        Args:
            image (`torch.Tensor`):
                Image to resize.
            size (`SizeDict`):
                Dictionary in the format `{"height": int, "width": int}` specifying the max size of the output image.
            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BILINEAR`.
            antialias (`bool`, *optional*, defaults to `True`):
                Whether to apply antialiasing when resizing.
        """
        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
        image_height, image_width = image.shape[-2:]
        target_height, target_width = size.height, size.width
        # Only resize if image is larger than target
        if image_width <= target_width and image_height <= target_height:
            return image
        # Calculate optimal scale factor to fit within target size
        height_scale_factor = target_height / image_height
        width_scale_factor = target_width / image_width
        optimal_scale_factor = min(height_scale_factor, width_scale_factor)

        new_height = int(image_height * optimal_scale_factor)
        new_width = int(image_width * optimal_scale_factor)

        return super().resize(
            image, SizeDict(height=new_height, width=new_width), interpolation=interpolation, antialias=antialias
        )
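The resize above only ever downscales, and it preserves aspect ratio by applying the smaller of the two scale factors: with the default 1080x1920 target, a 2160x3840 input becomes exactly 1080x1920, while a 1200x1280 input is scaled by min(1080/1200, 1920/1280) = 0.9 to 1080x1152. A small standalone check of that arithmetic (not the class method itself):

    def fit_within(h: int, w: int, max_h: int = 1080, max_w: int = 1920) -> tuple[int, int]:
        """Downscale (h, w) to fit inside (max_h, max_w), keeping aspect ratio; never upscale."""
        if h <= max_h and w <= max_w:
            return h, w
        scale = min(max_h / h, max_w / w)
        return int(h * scale), int(w * scale)

    assert fit_within(2160, 3840) == (1080, 1920)
    assert fit_within(1200, 1280) == (1080, 1152)
    assert fit_within(720, 960) == (720, 960)  # already small enough, left untouched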
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
do_rescale: bool,
|
||||
rescale_factor: float,
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
do_pad: Optional[bool],
|
||||
padding_value: Optional[float],
|
||||
padding_mode: Optional[str],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> FuyuBatchFeature:
|
||||
# Group images by size for batched resizing
|
||||
original_image_sizes = [batch_image[0].shape[-2:] for batch_image in images if batch_image]
|
||||
grouped_images, grouped_images_index = group_images_by_shape(
|
||||
images, disable_grouping=disable_grouping, is_nested=True
|
||||
)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
|
||||
resized_images_grouped[shape] = stacked_images
|
||||
resized_images = reorder_images(resized_images_grouped, grouped_images_index, is_nested=True)
|
||||
|
||||
image_sizes = [batch_image[0].shape[-2:] for batch_image in resized_images if batch_image]
|
||||
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
||||
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
||||
image_scale_factors = [
|
||||
[resized_size[0] / original_size[0]]
|
||||
for original_size, resized_size in zip(original_image_sizes, image_sizes)
|
||||
]
|
||||
if do_pad:
|
||||
resized_images = self.pad(
|
||||
resized_images,
|
||||
pad_size=size,
|
||||
fill_value=padding_value,
|
||||
padding_mode=padding_mode,
|
||||
disable_grouping=disable_grouping,
|
||||
is_nested=True,
|
||||
)
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(
|
||||
resized_images, disable_grouping=disable_grouping, is_nested=True
|
||||
)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_images_grouped[shape] = stacked_images
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index, is_nested=True)
|
||||
|
||||
return FuyuBatchFeature(
|
||||
data={
|
||||
"images": processed_images,
|
||||
"image_unpadded_heights": image_unpadded_heights,
|
||||
"image_unpadded_widths": image_unpadded_widths,
|
||||
"image_scale_factors": image_scale_factors,
|
||||
},
|
||||
tensor_type=return_tensors,
|
||||
)
|
||||
|
||||
def get_num_patches(self, image_height: int, image_width: int, patch_size: Optional[SizeDict] = None) -> int:
|
||||
"""
|
||||
Calculate number of patches required to encode an image.
|
||||
Args:
|
||||
image_height (`int`):
|
||||
Height of the image.
|
||||
image_width (`int`):
|
||||
Width of the image.
|
||||
patch_size (`SizeDict`, *optional*):
|
||||
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
|
||||
"""
|
||||
if patch_size is None:
|
||||
patch_size = SizeDict(**self.patch_size)
|
||||
patch_height, patch_width = patch_size.height, patch_size.width
|
||||
if image_height % patch_height != 0:
|
||||
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
||||
if image_width % patch_width != 0:
|
||||
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
||||
num_patches_per_dim_h = image_height // patch_height
|
||||
num_patches_per_dim_w = image_width // patch_width
|
||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||
return num_patches
|
||||
|
||||
    def patchify_image(self, image: torch.Tensor, patch_size: Optional[SizeDict] = None) -> torch.Tensor:
        """
        Convert an image into a tensor of patches using PyTorch's unfold operation.
        Args:
            image (`torch.Tensor`):
                Image to convert. Shape: [batch, channels, height, width]
            patch_size (`SizeDict`, *optional*):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        """
        requires_backends(self, ["torch"])
        if patch_size is None:
            patch_size = SizeDict(**self.patch_size)
        patch_height, patch_width = patch_size.height, patch_size.width
        batch_size, channels, _, _ = image.shape
        # Use unfold to extract patches
        unfolded_along_height = image.unfold(2, patch_height, patch_height)
        patches = unfolded_along_height.unfold(3, patch_width, patch_width)
        patches = patches.contiguous()
        # Reshape to [batch, num_patches, channels * patch_h * patch_w]
        patches = patches.view(batch_size, channels, -1, patch_height, patch_width)
        patches = patches.permute(0, 2, 3, 4, 1)
        patches = patches.reshape(batch_size, -1, channels * patch_height * patch_width)
        return patches
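With the default 1080x1920 canvas and 30x30 patches, get_num_patches gives (1080 / 30) * (1920 / 30) = 36 * 64 = 2304 patches, and patchify_image returns a tensor of shape [batch, 2304, 3 * 30 * 30]. A standalone shape check of the same unfold-based extraction (a free function, not the method above):

    import torch

    def patchify(image: torch.Tensor, ph: int, pw: int) -> torch.Tensor:
        """[batch, channels, H, W] -> [batch, num_patches, channels * ph * pw]."""
        b, c, _, _ = image.shape
        patches = image.unfold(2, ph, ph).unfold(3, pw, pw).contiguous()
        patches = patches.view(b, c, -1, ph, pw).permute(0, 2, 3, 4, 1)
        return patches.reshape(b, -1, c * ph * pw)

    x = torch.randn(1, 3, 1080, 1920)
    assert patchify(x, 30, 30).shape == (1, 36 * 64, 3 * 30 * 30)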
def preprocess_with_tokenizer_info(
|
||||
self,
|
||||
image_input: torch.Tensor,
|
||||
image_present: torch.Tensor,
|
||||
image_unpadded_h: torch.Tensor,
|
||||
image_unpadded_w: torch.Tensor,
|
||||
image_placeholder_id: int,
|
||||
image_newline_id: int,
|
||||
variable_sized: bool,
|
||||
patch_size: Optional[dict[str, int]] = None,
|
||||
) -> FuyuBatchFeature:
|
||||
"""
|
||||
Process images for model input. In particular, variable-sized images are handled here.
|
||||
|
||||
Args:
|
||||
image_input (`torch.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
|
||||
Tensor of images padded to model input size.
|
||||
image_present (`torch.Tensor` of shape [batch_size, subsequence_size, num_images]):
|
||||
Tensor of 1s and 0s indicating whether an image is present.
|
||||
image_unpadded_h (`torch.Tensor` of shape [batch_size, subsequence_size]):
|
||||
Tensor of unpadded image heights.
|
||||
image_unpadded_w (`torch.Tensor` of shape [batch_size, subsequence_size]):
|
||||
Tensor of unpadded image widths.
|
||||
image_placeholder_id (int):
|
||||
The id of the image placeholder token. Comes from an associated tokenizer.
|
||||
image_newline_id (int):
|
||||
The id of the image newline token. Comes from an associated tokenizer.
|
||||
variable_sized (bool):
|
||||
Whether to process images as variable-sized.
|
||||
patch_size (`dict[str, int]`, *optional*):
|
||||
Size of the patches.
|
||||
"""
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
if patch_size is None:
|
||||
patch_size = SizeDict(**self.patch_size)
|
||||
else:
|
||||
patch_size = SizeDict(**patch_size)
|
||||
patch_height, patch_width = patch_size.height, patch_size.width
|
||||
# Only images that are present
|
||||
images: list[list[torch.Tensor]] = []
|
||||
batch_image_patches: list[list[torch.Tensor]] = []
|
||||
# Image input ids for every subsequence, including ones with no image present
|
||||
batch_image_input_ids: list[list[torch.Tensor]] = []
|
||||
for batch_index in range(image_input.shape[0]):
|
||||
image_input_ids = []
|
||||
image_patches = []
|
||||
for subseq_index in range(image_input.shape[1]):
|
||||
if image_present[batch_index, subseq_index]:
|
||||
image = image_input[batch_index, subseq_index]
|
||||
image_height, image_width = image.shape[1], image.shape[2]
|
||||
if variable_sized:
|
||||
# Calculate new dimensions based on unpadded size
|
||||
# The min() is required here due to floating point issues
|
||||
new_h = min(
|
||||
image_height,
|
||||
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
||||
)
|
||||
new_w = min(
|
||||
image_width,
|
||||
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
||||
)
|
||||
image = image[:, :new_h, :new_w]
|
||||
image_height, image_width = new_h, new_w
|
||||
num_patches = self.get_num_patches(
|
||||
image_height=image_height, image_width=image_width, patch_size=patch_size
|
||||
)
|
||||
# Create tensor of placeholder IDs
|
||||
tensor_of_image_ids = torch.full(
|
||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||
)
|
||||
# Patchify the image
|
||||
patches = self.patchify_image(image=image.unsqueeze(0), patch_size=patch_size).squeeze(0)
|
||||
assert num_patches == patches.shape[0]
|
||||
if variable_sized:
|
||||
# Terminate each line with newline ID
|
||||
tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
|
||||
newline_ids = torch.full(
|
||||
[tensor_of_image_ids.shape[0], 1],
|
||||
image_newline_id,
|
||||
dtype=torch.int32,
|
||||
device=image_input.device,
|
||||
)
|
||||
tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
|
||||
tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
|
||||
images.append([image])
|
||||
image_input_ids.append(tensor_of_image_ids)
|
||||
image_patches.append(patches)
|
||||
else:
|
||||
image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
|
||||
batch_image_input_ids.append(image_input_ids)
|
||||
batch_image_patches.append(image_patches)
|
||||
# Create image patch indices
|
||||
image_patch_indices_per_batch: list[list[torch.Tensor]] = []
|
||||
image_patch_indices_per_subsequence: list[list[torch.Tensor]] = []
|
||||
|
||||
for sample_image_input_ids in batch_image_input_ids:
|
||||
index_offset = 0
|
||||
per_batch_indices = []
|
||||
per_subsequence_indices = []
|
||||
for subseq_image_input_ids in sample_image_input_ids:
|
||||
# Indices of image patches
|
||||
patches_mask = subseq_image_input_ids == image_placeholder_id
|
||||
num_patches = torch.count_nonzero(patches_mask)
|
||||
indices = torch.arange(num_patches, dtype=torch.int64, device=subseq_image_input_ids.device).type_as(
|
||||
subseq_image_input_ids
|
||||
)
|
||||
# Place those indices in the image input ids token stream, with -1 representing non-index tokens
|
||||
indices_in_stream_per_batch = torch.full_like(subseq_image_input_ids, -1)
|
||||
indices_in_stream_per_subsequence = torch.full_like(subseq_image_input_ids, -1)
|
||||
patches_inds = torch.nonzero(patches_mask, as_tuple=True)[0]
|
||||
|
||||
indices_in_stream_per_batch[patches_inds] = indices + index_offset
|
||||
indices_in_stream_per_subsequence[patches_inds] = indices
|
||||
|
||||
per_batch_indices.append(indices_in_stream_per_batch)
|
||||
per_subsequence_indices.append(indices_in_stream_per_subsequence)
|
||||
index_offset += num_patches
|
||||
|
||||
image_patch_indices_per_batch.append(per_batch_indices)
|
||||
image_patch_indices_per_subsequence.append(per_subsequence_indices)
|
||||
return FuyuBatchFeature(
|
||||
data={
|
||||
"images": images,
|
||||
"image_input_ids": batch_image_input_ids,
|
||||
"image_patches": batch_image_patches,
|
||||
"image_patch_indices_per_batch": image_patch_indices_per_batch,
|
||||
"image_patch_indices_per_subsequence": image_patch_indices_per_subsequence,
|
||||
}
|
||||
)
|
||||
|
||||
def _further_process_kwargs(
|
||||
self,
|
||||
patch_size: Optional[dict[str, int]] = None,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
"""
|
||||
Process Fuyu-specific kwargs before validation.
|
||||
"""
|
||||
kwargs = super()._further_process_kwargs(**kwargs)
|
||||
if patch_size is not None:
|
||||
patch_size = SizeDict(**get_size_dict(patch_size, param_name="patch_size"))
|
||||
kwargs["patch_size"] = patch_size
|
||||
return kwargs
|
||||
|
||||
|
||||
__all__ = ["FuyuImageProcessorFast"]
|
||||
@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .configuration_glpn import *
|
||||
from .feature_extraction_glpn import *
|
||||
from .image_processing_glpn import *
|
||||
from .image_processing_glpn_fast import *
|
||||
from .modeling_glpn import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
@ -39,6 +39,7 @@ from ...image_utils import (
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from ...processing_utils import ImagesKwargs
|
||||
from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends
|
||||
|
||||
|
||||
@ -49,6 +50,17 @@ if is_torch_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class GLPNImageProcessorKwargs(ImagesKwargs, total=False):
|
||||
"""
|
||||
size_divisor (`int`, *optional*, defaults to 32):
|
||||
When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest
|
||||
multiple of `size_divisor`.
|
||||
"""
|
||||
|
||||
size_divisor: int
|
||||
resample: PILImageResampling
|
||||
|
||||
|
||||
@requires(backends=("vision",))
|
||||
class GLPNImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
@ -66,9 +78,12 @@ class GLPNImageProcessor(BaseImageProcessor):
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be
|
||||
overridden by `do_rescale` in `preprocess`.
|
||||
rescale_factor (`float`, *optional*, defaults to `1 / 255`):
|
||||
The scaling factor to apply to the pixel values. Can be overridden by `rescale_factor` in `preprocess`.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
valid_kwargs = GLPNImageProcessorKwargs
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -76,12 +91,14 @@ class GLPNImageProcessor(BaseImageProcessor):
|
||||
size_divisor: int = 32,
|
||||
resample=PILImageResampling.BILINEAR,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Optional[float] = 1 / 255,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.do_resize = do_resize
|
||||
self.do_rescale = do_rescale
|
||||
self.size_divisor = size_divisor
|
||||
self.resample = resample
|
||||
self.rescale_factor = rescale_factor
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def resize(
|
||||
@ -142,6 +159,7 @@ class GLPNImageProcessor(BaseImageProcessor):
|
||||
size_divisor: Optional[int] = None,
|
||||
resample=None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
return_tensors: Optional[Union[TensorType, str]] = None,
|
||||
data_format: ChannelDimension = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
@ -181,6 +199,7 @@ class GLPNImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
size_divisor = size_divisor if size_divisor is not None else self.size_divisor
|
||||
resample = resample if resample is not None else self.resample
|
||||
|
||||
@ -217,7 +236,9 @@ class GLPNImageProcessor(BaseImageProcessor):
|
||||
]
|
||||
|
||||
if do_rescale:
|
||||
images = [self.rescale(image, scale=1 / 255, input_data_format=input_data_format) for image in images]
|
||||
images = [
|
||||
self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) for image in images
|
||||
]
|
||||
|
||||
images = [
|
||||
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
|
||||
|
||||
src/transformers/models/glpn/image_processing_glpn_fast.py (new file, +136 lines)
@@ -0,0 +1,136 @@
# coding=utf-8
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fast Image processor class for GLPN."""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
|
||||
from ...image_processing_utils import BatchFeature
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
||||
from ...image_utils import (
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
requires_backends,
|
||||
)
|
||||
from .image_processing_glpn import GLPNImageProcessorKwargs
|
||||
|
||||
|
||||
@auto_docstring
class GLPNImageProcessorFast(BaseImageProcessorFast):
    do_resize = True
    do_rescale = True
    rescale_factor = 1 / 255
    resample = PILImageResampling.BILINEAR
    size_divisor = 32
    valid_kwargs = GLPNImageProcessorKwargs

    def _validate_preprocess_kwargs(self, **kwargs):
        # pop `do_resize` to not raise an error as `size` is not None
        kwargs.pop("do_resize", None)
        return super()._validate_preprocess_kwargs(**kwargs)

    def resize(
        self,
        image: "torch.Tensor",
        size_divisor: int,
        interpolation: Optional["F.InterpolationMode"] = None,
        antialias: bool = True,
        **kwargs,
    ) -> "torch.Tensor":
        """
        Resize an image, rounding its height and width down to the closest multiple of `size_divisor`.

        Args:
            image (`torch.Tensor`):
                Image to resize.
            size_divisor (`int`):
                The output height and width are rounded down to the closest multiple of this value.
            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
            antialias (`bool`, *optional*, defaults to `True`):
                Whether to use antialiasing.

        Returns:
            `torch.Tensor`: The resized image.
        """
        height, width = image.shape[-2:]
        # Rounds the height and width down to the closest multiple of size_divisor
        new_h = height // size_divisor * size_divisor
        new_w = width // size_divisor * size_divisor
        return super().resize(
            image, SizeDict(height=new_h, width=new_w), interpolation=interpolation, antialias=antialias
        )
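GLPN does not resize to a fixed target; it only floors each side to the nearest multiple of size_divisor (32 by default), so a 480x640 image passes through unchanged while 500x650 becomes 480x640. A quick check of that rounding rule:

    def round_down(height: int, width: int, size_divisor: int = 32) -> tuple[int, int]:
        return height // size_divisor * size_divisor, width // size_divisor * size_divisor

    assert round_down(480, 640) == (480, 640)
    assert round_down(500, 650) == (480, 640)
    assert round_down(31, 640) == (0, 640)  # degenerate: sides shorter than the divisor collapse to 0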
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_resize: bool,
|
||||
size_divisor: Optional[int] = None,
|
||||
interpolation: Optional["F.InterpolationMode"] = None,
|
||||
do_rescale: bool = True,
|
||||
rescale_factor: Optional[float] = 1 / 255,
|
||||
do_normalize: bool = False,
|
||||
image_mean: Optional[Union[float, list[float]]] = None,
|
||||
image_std: Optional[Union[float, list[float]]] = None,
|
||||
disable_grouping: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
resample: Optional[PILImageResampling] = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
grouped_images, grouped_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
processed_groups = {}
|
||||
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
stacked_images = self.resize(stacked_images, size_divisor=size_divisor, interpolation=interpolation)
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_groups[shape] = stacked_images
|
||||
|
||||
processed_images = reorder_images(processed_groups, grouped_index)
|
||||
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
|
||||
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
|
||||
|
||||
def post_process_depth_estimation(self, outputs, target_sizes=None):
|
||||
"""
|
||||
Convert raw model outputs to final depth predictions.
|
||||
Mirrors slow GLPN: PyTorch interpolate w/ bicubic, align_corners=False.
|
||||
"""
|
||||
requires_backends(self, "torch")
|
||||
predicted_depth = outputs.predicted_depth
|
||||
|
||||
results = []
|
||||
target_sizes = target_sizes or [None] * predicted_depth.shape[0]
|
||||
for depth, target_size in zip(predicted_depth, target_sizes):
|
||||
if target_size is not None:
|
||||
# Add batch and channel dimensions for interpolation
|
||||
depth_4d = depth[None, None, ...]
|
||||
resized = torch.nn.functional.interpolate(
|
||||
depth_4d, size=target_size, mode="bicubic", align_corners=False
|
||||
)
|
||||
depth = resized.squeeze(0).squeeze(0)
|
||||
results.append({"predicted_depth": depth})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
__all__ = ["GLPNImageProcessorFast"]
|
||||
@@ -286,8 +286,8 @@ class Idefics3Processor(ProcessorMixin):
                f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
            )

        image_rows = inputs.pop("rows", [[0] * len(text)])
        image_cols = inputs.pop("cols", [[0] * len(text)])
        image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text])
        image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text])

        fake_image_token = self.fake_image_token
        image_token = self.image_token
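The fix above makes the fallback rows/cols line up with the number of images in each sample instead of assuming one entry per prompt. For two prompts containing 2 and 1 image tokens, the old default [[0] * len(text)] produced [[0, 0]], while the corrected default produces [[0, 0], [0]]; the example strings below are illustrative only:

    text = ["<image><image> describe both", "<image> caption this"]
    n_images_in_text = [t.count("<image>") for t in text]        # [2, 1]

    old_default = [[0] * len(text)]                              # [[0, 0]] -- one row total, wrong shape
    new_default = [[0] * n for n in n_images_in_text]            # [[0, 0], [0]] -- one row per sample

    assert new_default == [[0, 0], [0]]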
@@ -53,9 +53,9 @@ def load_cuda_kernels():
    global mra_cuda_kernel
    if not is_kernels_available():
        raise ImportError("kernels is not installed, please install it with `pip install kernels`")
    from ...integrations.hub_kernels import get_kernel_wrapper
    from kernels import get_kernel

    mra_cuda_kernel = get_kernel_wrapper("kernels-community/mra")
    mra_cuda_kernel = get_kernel("kernels-community/mra")


def sparse_max(sparse_qk_prod, indices, query_num_block, key_num_block):
@ -2109,6 +2109,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
synced_gpus: Optional[bool] = None,
|
||||
streamer: Optional["BaseStreamer"] = None,
|
||||
use_model_defaults: Optional[bool] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@ -2153,6 +2154,11 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
|
||||
streamer (`BaseStreamer`, *optional*):
|
||||
Streamer object that will be used to stream the generated sequences. Generated tokens are passed
|
||||
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
|
||||
use_model_defaults (`bool`, *optional*):
|
||||
When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
|
||||
generation configuration (`model.generation_config`), as opposed to the global defaults
|
||||
(`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
|
||||
`True`.
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
|
||||
@ -2175,13 +2181,19 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
|
||||
- [`~generation.GenerateBeamEncoderDecoderOutput`]
|
||||
"""
|
||||
# 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
|
||||
if generation_config is None:
|
||||
generation_config = self.generation_config
|
||||
generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
|
||||
generation_config, model_kwargs = self._prepare_generation_config(
|
||||
generation_config, use_model_defaults, **kwargs
|
||||
)
|
||||
generation_mode = generation_config.get_generation_mode()
|
||||
if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
|
||||
raise ValueError(
|
||||
"Got incompatible mode for generation, should be one of greedy or sampling. "
|
||||
"Ensure that beam search is de-activated by setting `num_beams=1`."
|
||||
)
|
||||
|
||||
generation_config = copy.deepcopy(generation_config)
|
||||
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
|
||||
generation_config.validate()
|
||||
self._validate_model_kwargs(model_kwargs.copy())
|
||||
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
|
||||
|
||||
if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) is tuple:
|
||||
# wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate
|
||||
@ -2281,31 +2293,26 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
|
||||
generation_config=generation_config, stopping_criteria=stopping_criteria
|
||||
)
|
||||
|
||||
if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
|
||||
# expand input_ids with `num_return_sequences` additional sequences per batch
|
||||
input_ids, model_kwargs = self._expand_inputs_for_generation(
|
||||
input_ids=input_ids,
|
||||
expand_size=generation_config.num_return_sequences,
|
||||
is_encoder_decoder=self.config.is_encoder_decoder,
|
||||
**model_kwargs,
|
||||
)
|
||||
# expand input_ids with `num_return_sequences` additional sequences per batch
|
||||
input_ids, model_kwargs = self._expand_inputs_for_generation(
|
||||
input_ids=input_ids,
|
||||
expand_size=generation_config.num_return_sequences,
|
||||
is_encoder_decoder=self.config.is_encoder_decoder,
|
||||
**model_kwargs,
|
||||
)
|
||||
|
||||
# 11. run sample
|
||||
outputs = self._sample(
|
||||
input_ids,
|
||||
logits_processor=logits_processor,
|
||||
stopping_criteria=stopping_criteria,
|
||||
generation_config=generation_config,
|
||||
synced_gpus=synced_gpus,
|
||||
streamer=streamer,
|
||||
**model_kwargs,
|
||||
)
|
||||
# 10b. prepare prefill outputs
|
||||
generation_mode_kwargs["prefill_outputs"] = self._prefill(input_ids, generation_config, model_kwargs)
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Got incompatible mode for generation, should be one of greedy or sampling. "
|
||||
"Ensure that beam search is de-activated by setting `num_beams=1`."
|
||||
)
|
||||
# 11. run sample
|
||||
outputs = self._sample(
|
||||
input_ids,
|
||||
logits_processor=logits_processor,
|
||||
stopping_criteria=stopping_criteria,
|
||||
generation_config=generation_config,
|
||||
**generation_mode_kwargs,
|
||||
**model_kwargs,
|
||||
)
|
||||
|
||||
if generation_config.return_dict_in_generate:
|
||||
output_ids = outputs.sequences
|
||||
|
||||
@@ -291,7 +291,7 @@ class Owlv2ImageProcessor(BaseImageProcessor):
        image = pad(
            image=image,
            padding=((0, size - height), (0, size - width)),
            constant_values=0.5,
            constant_values=0.0,
            data_format=data_format,
            input_data_format=input_data_format,
        )
@ -228,7 +228,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
return results
|
||||
|
||||
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
|
||||
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.0) -> "torch.Tensor":
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
"""
|
||||
@ -245,7 +245,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
disable_grouping: Optional[bool],
|
||||
constant_value: float = 0.5,
|
||||
constant_value: float = 0.0,
|
||||
**kwargs,
|
||||
) -> list["torch.Tensor"]:
|
||||
"""
|
||||
@ -351,7 +351,7 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
|
||||
if do_pad:
|
||||
processed_images = self.pad(processed_images, constant_value=0.5, disable_grouping=disable_grouping)
|
||||
processed_images = self.pad(processed_images, constant_value=0.0, disable_grouping=disable_grouping)
|
||||
|
||||
grouped_images, grouped_images_index = group_images_by_shape(
|
||||
processed_images, disable_grouping=disable_grouping
|
||||
|
||||
@ -52,7 +52,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
|
||||
crop_size = None
|
||||
do_center_crop = None
|
||||
|
||||
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor":
|
||||
def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.0) -> "torch.Tensor":
|
||||
"""
|
||||
Pad an image with zeros to the given size.
|
||||
"""
|
||||
@ -69,7 +69,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
disable_grouping: Optional[bool],
|
||||
constant_value: float = 0.5,
|
||||
constant_value: float = 0.0,
|
||||
**kwargs,
|
||||
) -> list["torch.Tensor"]:
|
||||
"""
|
||||
@ -175,7 +175,7 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
|
||||
if do_pad:
|
||||
processed_images = self.pad(processed_images, constant_value=0.5, disable_grouping=disable_grouping)
|
||||
processed_images = self.pad(processed_images, constant_value=0.0, disable_grouping=disable_grouping)
|
||||
|
||||
grouped_images, grouped_images_index = group_images_by_shape(
|
||||
processed_images, disable_grouping=disable_grouping
|
||||
|
||||
@@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
    """
    max_patches (`int`, *optional*):
        Maximum number of patches to extract.
    patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
        The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
    is_vqa (`bool`, *optional*, defaults to `False`):
        Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
        rendered onto the input images.
    header_text (`Union[list[str], str]`, *optional*):
        Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
    """

    max_patches: int
    patch_size: dict[str, int]
    is_vqa: bool
    header_text: Optional[Union[list[str], str]]
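These per-model kwargs classes are TypedDict-style containers (total=False makes every field optional) whose docstrings feed the generated documentation for the processor. A generic sketch of the pattern, independent of the transformers base class:

    from typing import Optional, TypedDict, Union

    class MyImagesKwargs(TypedDict, total=False):
        max_patches: int
        patch_size: dict[str, int]
        is_vqa: bool
        header_text: Optional[Union[list[str], str]]

    kwargs: MyImagesKwargs = {"max_patches": 2048}  # every key is optional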
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""RAG model implementation."""
|
||||
|
||||
import copy
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Union
|
||||
@ -24,7 +23,8 @@ from torch import nn
|
||||
|
||||
from ...cache_utils import Cache, EncoderDecoderCache
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...generation import GenerationConfig, GenerationMixin, LogitsProcessorList, StoppingCriteriaList
|
||||
from ...generation import GenerationConfig, GenerationMixin, GenerationMode, LogitsProcessorList, StoppingCriteriaList
|
||||
from ...generation.utils import GENERATION_MODES_MAPPING
|
||||
from ...modeling_outputs import ModelOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...utils import auto_docstring, logging
|
||||
@ -1403,6 +1403,7 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
|
||||
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
|
||||
use_model_defaults: Optional[bool] = None,
|
||||
**kwargs,
|
||||
) -> torch.LongTensor:
|
||||
"""
|
||||
@ -1461,6 +1462,11 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
|
||||
Custom stopping criteria that complement the default stopping criteria built from arguments and a
|
||||
model's config. If a stopping criteria is passed that is already created with the arguments or a
|
||||
model's config an error is thrown.
|
||||
use_model_defaults (`bool`, *optional*):
|
||||
When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
|
||||
generation configuration (`model.generation_config`), as opposed to the global defaults
|
||||
(`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
|
||||
`True`.
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model.
|
||||
@ -1471,10 +1477,24 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
|
||||
finished early due to the `eos_token_id`.
|
||||
"""
|
||||
# Handle `generation_config` and kwargs that might update it
if generation_config is None:
generation_config = self.generation_config
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
generation_config, model_kwargs = self._prepare_generation_config(
generation_config, use_model_defaults, **kwargs
)
generation_mode = generation_config.get_generation_mode()
if generation_mode not in [
GenerationMode.SAMPLE,
GenerationMode.GREEDY_SEARCH,
GenerationMode.BEAM_SEARCH,
GenerationMode.BEAM_SAMPLE,
]:
raise ValueError(
f"RAG model is not compatible with {generation_mode} generation. Please check your generation parameters."
)
# type() required to access the unbound class-level method
decoding_method = getattr(type(self), GENERATION_MODES_MAPPING[generation_mode])
self._validate_model_kwargs(model_kwargs.copy())
self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)

kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
self._prepare_special_tokens(generation_config, kwargs_has_attention_mask)

@@ -1550,7 +1570,7 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
model_kwargs["attention_mask"] = context_attention_mask
model_kwargs["n_docs"] = n_docs

pre_processor = self._get_logits_processor(
prepared_logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=context_input_ids,

@@ -1571,37 +1591,18 @@ class RagTokenForGeneration(RagPreTrainedModel, GenerationMixin):
max_cache_length=generation_config.max_length - 1,
)

if generation_config.num_beams == 1:
if generation_config.num_return_sequences > 1:
raise ValueError(
f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
" greedy search."
)
return self._sample(
input_ids,
logits_processor=pre_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=False,
streamer=None,
**model_kwargs,
)
elif generation_config.num_beams > 1:
if generation_config.num_return_sequences > generation_config.num_beams:
raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
# Prefill pass
generation_mode_kwargs["prefill_outputs"] = self._prefill(input_ids, generation_config, model_kwargs)

return self._beam_search(
input_ids,
logits_processor=pre_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=False,
**model_kwargs,
)
else:
raise ValueError(
f"`num_beams` has to be an integer strictly superior to 0 (≥ 1), but is {generation_config.num_beams}"
)
return decoding_method(
self,
input_ids,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
**generation_mode_kwargs,
**model_kwargs,
)

# Auxiliary functions for beam search
def _temporary_reorder_cache(self, past_key_values, beam_idx):
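The hunk above replaces the explicit greedy/beam branches with a single dispatch through GENERATION_MODES_MAPPING, fetching the unbound decoding method from the class and passing `self` explicitly. A minimal, self-contained sketch of that dispatch pattern (simplified names, not the actual Transformers internals):

# --- illustrative sketch, not part of the diff ---
class TinyGenerator:
    # maps a generation mode to the name of the method that implements it
    GENERATION_MODES_MAPPING = {"greedy_search": "_sample", "beam_search": "_beam_search"}

    def _sample(self, input_ids, **kwargs):
        return f"sampling from {input_ids}"

    def _beam_search(self, input_ids, **kwargs):
        return f"beam search from {input_ids}"

    def generate(self, input_ids, mode="greedy_search", **kwargs):
        if mode not in self.GENERATION_MODES_MAPPING:
            raise ValueError(f"Unsupported generation mode: {mode}")
        # type() fetches the unbound function, so `self` is passed explicitly,
        # mirroring the `decoding_method(self, ...)` call in the diff above
        decoding_method = getattr(type(self), self.GENERATION_MODES_MAPPING[mode])
        return decoding_method(self, input_ids, **kwargs)

print(TinyGenerator().generate([1, 2, 3], mode="beam_search"))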
@@ -48,9 +48,9 @@ def load_wkv_cuda_kernel(context_length):
if not is_kernels_available():
raise ImportError("kernels is not installed, please install it with `pip install kernels`")

from ...integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel

rwkv_cuda_kernel = get_kernel_wrapper("kernels-community/rwkv")
rwkv_cuda_kernel = get_kernel("kernels-community/rwkv")
rwkv_cuda_kernel.max_seq_length = context_length
@@ -172,8 +172,6 @@ class SmolVLMProcessor(ProcessorMixin):

def expand_text_with_image_tokens(self, text, image_rows, image_cols):
prompt_strings = []
image_rows = image_rows if image_rows is not None else [[0] * len(text)]
image_cols = image_cols if image_cols is not None else [[0] * len(text)]
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
image_prompt_strings = []

@@ -330,6 +328,11 @@ class SmolVLMProcessor(ProcessorMixin):
raise ValueError(
f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
)
# Set default values for image_rows and image_cols if not provided
if image_rows is None:
image_rows = [[0] * n_images for n_images in n_images_in_text]
if image_cols is None:
image_cols = [[0] * n_images for n_images in n_images_in_text]
text = self.expand_text_with_image_tokens(text, image_rows=image_rows, image_cols=image_cols)

elif videos is not None:
@@ -53,9 +53,9 @@ def load_cuda_kernels():
global lsh_cumulation
if not is_kernels_available():
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
from ...integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel

yoso = get_kernel_wrapper("kernels-community/yoso")
yoso = get_kernel("kernels-community/yoso")
lsh_cumulation = yoso.lsh_cumulation
@@ -59,6 +59,7 @@ from .base import (
get_default_model_and_revision,
load_model,
)
from .deprecated import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .depth_estimation import DepthEstimationPipeline
from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline

@@ -74,7 +75,6 @@ from .mask_generation import MaskGenerationPipeline
from .object_detection import ObjectDetectionPipeline
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .text_to_audio import TextToAudioPipeline
src/transformers/pipelines/deprecated/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
@@ -2,14 +2,14 @@ import enum
import warnings
from typing import Any

from ..generation import GenerationConfig
from ..tokenization_utils import TruncationStrategy
from ..utils import add_end_docstrings, is_torch_available, logging
from .base import Pipeline, build_pipeline_init_args
from ...generation import GenerationConfig
from ...tokenization_utils import TruncationStrategy
from ...utils import add_end_docstrings, is_torch_available, logging
from ..base import Pipeline, build_pipeline_init_args


if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
from ...models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES

logger = logging.get_logger(__name__)

@@ -77,6 +77,12 @@ class Text2TextGenerationPipeline(Pipeline):
return_name = "generated"

def __init__(self, *args, **kwargs):
if self.return_name == "generated":  # Check this isn't summarization/translation instead
logger.warning_once(
"The `Text2TextGenerationPipeline` is deprecated and no longer maintained. For most "
"purposes, we recommend using newer models with causal pipelines like "
"`TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)

self.check_model_type(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)

@@ -254,6 +260,14 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
# Used in the return key of the pipeline.
return_name = "summary"

def __init__(self, *args, **kwargs):
logger.warning_once(
"The `SummarizationPipeline` is deprecated and no longer maintained. For most "
"summarization tasks, we recommend appropriately prompting modern general-purpose LLMs "
"via pipelines like `TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)

def __call__(self, *args, **kwargs):
r"""
Summarize the text(s) given as inputs.

@@ -323,6 +337,14 @@ class TranslationPipeline(Text2TextGenerationPipeline):
# Used in the return key of the pipeline.
return_name = "translation"

def __init__(self, *args, **kwargs):
logger.warning_once(
"The `TranslationPipeline` is deprecated and no longer maintained. For most "
"translation tasks, we recommend appropriately prompting modern general-purpose LLMs "
"via pipelines like `TextGenerationPipeline` or `ImageTextToTextPipeline`."
)
super().__init__(*args, **kwargs)

def check_inputs(self, input_length: int, min_length: int, max_new_tokens: int):
"""
Removed input length check - unnecessary with max_new_tokens (previously relevant for max_length)
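The deprecation above warns once from `__init__` and keys the base-class warning on `return_name`, so the summarization and translation subclasses emit their own messages without also triggering the generic one. A rough, self-contained sketch of that guard, with the stdlib `warnings` module standing in for `logger.warning_once`:

# --- illustrative sketch, not part of the diff ---
import warnings

class DeprecatedTextToText:
    return_name = "generated"

    def __init__(self):
        # only the base task warns here; subclasses override return_name
        if self.return_name == "generated":
            warnings.warn("Text2text pipeline is deprecated; prefer a text-generation pipeline.", FutureWarning)

class DeprecatedSummarizer(DeprecatedTextToText):
    return_name = "summary"

    def __init__(self):
        warnings.warn("Summarization pipeline is deprecated.", FutureWarning)
        super().__init__()  # return_name == "summary", so the generic warning is skipped

DeprecatedSummarizer()  # emits only the summarization-specific warning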
@@ -219,6 +219,9 @@ class ImagesKwargs(TypedDict, total=False):
- `'np'`: Return NumPy `np.ndarray` objects.
disable_grouping (`bool`, *optional*):
Whether to group images by shapes when processing or not, only relevant for fast image processing.
image_seq_length (`int`, *optional*):
The number of image tokens to be used for each image in the input.
Added for backward compatibility but this should be set as a processor attribute in future models.
"""

do_convert_rgb: Optional[bool]

@@ -239,6 +242,7 @@ class ImagesKwargs(TypedDict, total=False):
device: Annotated[Optional[str], device_validator()]
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
disable_grouping: Optional[bool]
image_seq_length: Optional[int]


class VideosKwargs(TypedDict, total=False):

@@ -1366,8 +1370,8 @@ class ProcessorMixin(PushToHubMixin):
if token is not None:
kwargs["token"] = token

processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)

@classmethod
@@ -20,7 +20,7 @@ from .quantizers_utils import get_module_from_name
if TYPE_CHECKING:
from ..modeling_utils import PreTrainedModel

from ..utils import is_fp_quant_available, is_qutlass_available, is_torch_available, logging
from ..utils import is_fp_quant_available, is_qutlass_available, is_torch_available, is_torch_xpu_available, logging
from ..utils.quantization_config import QuantizationConfigMixin


@@ -45,9 +45,9 @@ class FPQuantHfQuantizer(HfQuantizer):
self.quantization_config = quantization_config

def validate_environment(self, device_map, **kwargs):
if not torch.cuda.is_available():
if not torch.cuda.is_available() and not is_torch_xpu_available():
raise NotImplementedError(
"FPQuant quantization is only supported on GPU. Please use a different quantizer."
"FPQuant quantization is only supported on GPU or Intel XPU. Please use a different quantizer."
)

if not is_qutlass_available() and not self.quantization_config.pseudoquantization:
@@ -55,9 +55,9 @@ class Mxfp4HfQuantizer(HfQuantizer):
"""Lazy import and initialize kernels only when needed"""
if self.triton_kernels_hub is None:
try:
from ..integrations.hub_kernels import get_kernel_wrapper
from kernels import get_kernel

self.triton_kernels_hub = get_kernel_wrapper("kernels-community/triton_kernels")
self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
except ImportError:
raise ImportError("kernels package is required for MXFP4 quantization")
return self.triton_kernels_hub
@@ -885,6 +885,12 @@ class TrainingArguments:
)
},
)
logging_dir: Optional[str] = field(
default=None,
metadata={
"help": "Deprecated and will be removed in v5.2. Set env var `TENSORBOARD_LOGGING_DIR` instead. TensorBoard log directory."
},
)
logging_strategy: Union[IntervalStrategy, str] = field(
default="steps",
metadata={"help": "The logging strategy to use."},

@@ -1695,6 +1701,11 @@ class TrainingArguments:
if isinstance(self.include_num_input_tokens_seen, bool):
self.include_num_input_tokens_seen = "all" if self.include_num_input_tokens_seen else "no"

if self.logging_dir is not None:
logger.warning(
"`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead."
)

def __str__(self):
self_as_dict = asdict(self)
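The change above follows the usual soft-deprecation recipe: keep the argument so existing configs still parse, warn when it is set, and point at the replacement env var. A minimal sketch of that recipe; the dataclass is illustrative, not the real TrainingArguments, and the env-var fallback is an assumption about how callers would migrate:

# --- illustrative sketch, not part of the diff ---
import os
import warnings
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class TinyTrainingArguments:
    logging_dir: Optional[str] = field(default=None)

    def __post_init__(self):
        if self.logging_dir is not None:
            warnings.warn(
                "`logging_dir` is deprecated; set the `TENSORBOARD_LOGGING_DIR` env var instead.",
                FutureWarning,
            )
            # hypothetical migration shim: mirror the old argument into the new env var
            os.environ.setdefault("TENSORBOARD_LOGGING_DIR", self.logging_dir)

TinyTrainingArguments(logging_dir="runs/exp1")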
@@ -87,6 +87,7 @@ VPTQ_MIN_VERSION = "0.0.4"
TORCHAO_MIN_VERSION = "0.4.0"
AUTOROUND_MIN_VERSION = "0.5.0"
TRITON_MIN_VERSION = "1.0.0"
KERNELS_MIN_VERSION = "0.9.0"


@lru_cache

@@ -513,8 +514,9 @@ def is_kenlm_available() -> bool:


@lru_cache
def is_kernels_available() -> bool:
return _is_package_available("kernels")
def is_kernels_available(MIN_VERSION: str = KERNELS_MIN_VERSION) -> bool:
is_available, kernels_version = _is_package_available("kernels", return_version=True)
return is_available and version.parse(kernels_version) >= version.parse(MIN_VERSION)


@lru_cache

@@ -971,13 +973,13 @@ def is_quark_available() -> bool:
@lru_cache
def is_fp_quant_available():
is_available, fp_quant_version = _is_package_available("fp_quant", return_version=True)
return is_available and version.parse(fp_quant_version) >= version.parse("0.2.0")
return is_available and version.parse(fp_quant_version) >= version.parse("0.3.2")


@lru_cache
def is_qutlass_available():
is_available, qutlass_version = _is_package_available("qutlass", return_version=True)
return is_available and version.parse(qutlass_version) >= version.parse("0.1.0")
return is_available and version.parse(qutlass_version) >= version.parse("0.2.0")


@lru_cache
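The `is_kernels_available` change above turns a pure presence check into a presence-plus-minimum-version check. A self-contained sketch of that pattern using stdlib `importlib.metadata` and the `packaging` library (the real code goes through the internal `_is_package_available` helper instead):

# --- illustrative sketch, not part of the diff ---
from functools import lru_cache
from importlib.metadata import PackageNotFoundError, version as installed_version

from packaging import version

KERNELS_MIN_VERSION = "0.9.0"

@lru_cache
def is_kernels_available(min_version: str = KERNELS_MIN_VERSION) -> bool:
    try:
        kernels_version = installed_version("kernels")
    except PackageNotFoundError:
        return False  # package not installed at all
    return version.parse(kernels_version) >= version.parse(min_version)

print(is_kernels_available())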
@@ -12,12 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from ..utils import PushToHubMixin, is_kernels_available, is_torch_available
from ..utils import PushToHubMixin, is_torch_available


if is_kernels_available():
from kernels import LayerRepository, Mode

if is_torch_available():
import torch

@@ -58,6 +55,8 @@ def infer_device(model):


def add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping):
from kernels import LayerRepository

if device not in ["cuda", "rocm", "xpu"]:
raise ValueError(f"Only cuda, rocm, and xpu devices supported, got: {device}")
repo_layer_name = repo_name.split(":")[1]

@@ -82,6 +81,8 @@ class KernelConfig(PushToHubMixin):
self.registered_layer_names = {}

def update_kernel(self, repo_id, registered_name, layer_name, device, mode, revision=None):
from kernels import LayerRepository

self.kernel_mapping[registered_name] = {
device: {
mode: LayerRepository(

@@ -204,6 +205,8 @@ class KernelConfig(PushToHubMixin):
The device is inferred from the model's parameters if not provided.
The Mode is inferred from the model's training state.
"""
from kernels import Mode

compatible_mapping = {}
for layer_name, kernel in self.kernel_mapping.items():
# Infer Mode: use Mode.TRAINING if model is training, else use Mode.INFERENCE
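The hub_kernels hunks above move every `from kernels import ...` from module level into the functions that need it, so importing the integration no longer requires the optional `kernels` package. A generic sketch of that deferred-import pattern (`fancy_kernels` is a made-up stand-in, not a real dependency):

# --- illustrative sketch, not part of the diff ---
def load_optional_kernel(repo_id: str):
    try:
        # resolved only when the helper actually runs, not at module import time
        from fancy_kernels import get_kernel  # hypothetical optional dependency
    except ImportError as exc:
        raise ImportError("Install the optional dependency first: pip install fancy_kernels") from exc
    return get_kernel(repo_id)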
@@ -1601,8 +1601,12 @@ class FPQuantConfig(QuantizationConfigMixin):
else:
raise ValueError("Only 'mxfp4' and 'nvfp4' are supported for forward_dtype for now.")

if self.backward_dtype != "bf16":
raise ValueError("Only 'bf16' is supported for backward_dtype for now.")
if self.backward_dtype not in ["bf16", "mxfp8", "mxfp4"]:
raise ValueError("Only 'bf16', 'mxfp8' and 'mxfp4' are supported for backward_dtype for now.")

if self.backward_dtype != "bf16" and self.forward_dtype != "mxfp4":
raise ValueError("Only 'mxfp4' forward is compatible with non-bf16 backwards for now.")

if self.transform_init not in ["hadamard", "identity", "gsr"]:
raise ValueError("Only 'hadamard', 'identity' and 'gsr' are supported for transform_init.")
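The relaxed check above keeps bf16 as the default backward dtype, newly admits mxfp8/mxfp4 backwards, and only allows a non-bf16 backward together with an mxfp4 forward. The combined constraint, pulled out of the config class into a plain function for illustration:

# --- illustrative sketch, not part of the diff ---
def validate_fp_quant_dtypes(forward_dtype: str, backward_dtype: str) -> None:
    if forward_dtype not in ("mxfp4", "nvfp4"):
        raise ValueError("Only 'mxfp4' and 'nvfp4' are supported for forward_dtype for now.")
    if backward_dtype not in ("bf16", "mxfp8", "mxfp4"):
        raise ValueError("Only 'bf16', 'mxfp8' and 'mxfp4' are supported for backward_dtype for now.")
    if backward_dtype != "bf16" and forward_dtype != "mxfp4":
        raise ValueError("Only 'mxfp4' forward is compatible with non-bf16 backwards for now.")

validate_fp_quant_dtypes("mxfp4", "mxfp8")  # accepted under the new rules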
tests/fsdp/test_context_parallel.py (new file, 224 lines)
@@ -0,0 +1,224 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.testing_utils import (
|
||||
TestCasePlus,
|
||||
execute_subprocess_async,
|
||||
require_accelerate,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
HfArgumentParser,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
|
||||
|
||||
class TestContextParallel(TestCasePlus):
|
||||
"""Test Trainer with Torch context parallelism enabled via accelerate's ParallelismConfig."""
|
||||
|
||||
@require_torch_multi_accelerator
|
||||
@require_accelerate
|
||||
@slow
|
||||
@run_first
|
||||
def test_cp_equivalence(self):
|
||||
"""Test that CP produces the same losses as without CP."""
|
||||
|
||||
# Shared setup
|
||||
world_size = 2
|
||||
script_path = __file__
|
||||
|
||||
# Step 1: Run with CP enabled (cp_size=world_size)
|
||||
cp_yes_output_dir = Path(self.get_auto_remove_tmp_dir()).resolve()
|
||||
cp_yes_config_path = cp_yes_output_dir / "context_parallel_config.yaml"
|
||||
cp_yes_losses_path = cp_yes_output_dir / "cp_yes_losses.json"
|
||||
|
||||
# Write config file inline (self-contained test)
|
||||
with open(cp_yes_config_path, "w") as f:
|
||||
f.write(
|
||||
f"""distributed_type: FSDP
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_version: 2
|
||||
mixed_precision: bf16
|
||||
num_processes: {world_size}
|
||||
parallelism_config:
|
||||
parallelism_config_dp_replicate_size: 1
|
||||
parallelism_config_dp_shard_size: 1
|
||||
parallelism_config_tp_size: 1
|
||||
parallelism_config_cp_size: {world_size}
|
||||
parallelism_config_cp_comm_strategy: alltoall
|
||||
"""
|
||||
)
|
||||
|
||||
cmd_cp_yes = f"""
|
||||
accelerate launch
|
||||
--config_file {cp_yes_config_path}
|
||||
{script_path}
|
||||
--output_dir {cp_yes_output_dir}
|
||||
--report_to none
|
||||
--max_steps 10
|
||||
--per_device_train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--logging_steps 1
|
||||
--remove_unused_columns False
|
||||
--seed 42
|
||||
--loss_output_file {cp_yes_losses_path}
|
||||
""".split()
|
||||
|
||||
execute_subprocess_async(cmd_cp_yes, env=self.get_env())
|
||||
|
||||
# Step 2: Run without CP (FSDP with num_processes=1, no parallelism_config)
|
||||
cp_no_output_dir = Path(self.get_auto_remove_tmp_dir()).resolve()
|
||||
cp_no_config_path = cp_no_output_dir / "context_parallel_config.yaml"
|
||||
cp_no_losses_path = cp_no_output_dir / "cp_no_losses.json"
|
||||
|
||||
# Write config file inline (self-contained test)
|
||||
with open(cp_no_config_path, "w") as f:
|
||||
f.write(
|
||||
"""distributed_type: FSDP
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
fsdp_version: 2
|
||||
mixed_precision: bf16
|
||||
num_processes: 1
|
||||
"""
|
||||
)
|
||||
|
||||
cmd_cp_no = f"""
|
||||
accelerate launch
|
||||
--config_file {cp_no_config_path}
|
||||
{script_path}
|
||||
--output_dir {cp_no_output_dir}
|
||||
--report_to none
|
||||
--max_steps 10
|
||||
--per_device_train_batch_size 1
|
||||
--gradient_accumulation_steps 1
|
||||
--logging_steps 1
|
||||
--remove_unused_columns False
|
||||
--seed 42
|
||||
--loss_output_file {cp_no_losses_path}
|
||||
""".split()
|
||||
|
||||
execute_subprocess_async(cmd_cp_no, env=self.get_env())
|
||||
|
||||
# Compare losses - should be very close since CP just splits sequence computation
|
||||
with open(cp_yes_losses_path) as f:
|
||||
cp_yes_losses = json.load(f)
|
||||
with open(cp_no_losses_path) as f:
|
||||
cp_no_losses = json.load(f)
|
||||
|
||||
assert len(cp_yes_losses) == len(cp_no_losses), (
|
||||
f"Different number of losses: CP has {len(cp_yes_losses)}, no-CP has {len(cp_no_losses)}"
|
||||
)
|
||||
|
||||
# CP should produce very similar results (small numerical differences expected)
|
||||
# The differences come from:
|
||||
# - Different gradient reduction patterns in distributed training
|
||||
# - BF16 mixed precision accumulated differences
|
||||
# - Sequence splitting and gathering in CP mode
|
||||
cp_yes_losses_tensor = torch.tensor(cp_yes_losses)
|
||||
cp_no_losses_tensor = torch.tensor(cp_no_losses)
|
||||
|
||||
# Use torch.testing.assert_close with rtol=2% and atol=0.02
|
||||
# Testing shows actual differences are typically <1.5%
|
||||
torch.testing.assert_close(
|
||||
cp_yes_losses_tensor,
|
||||
cp_no_losses_tensor,
|
||||
rtol=2e-2, # 2% relative tolerance
|
||||
atol=2e-2, # 0.02 absolute tolerance
|
||||
msg=f"CP losses {cp_yes_losses} do not match non-CP losses {cp_no_losses}",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Parse custom arguments (not TrainingArguments parameters)
|
||||
loss_output_file = None
|
||||
|
||||
if "--loss_output_file" in sys.argv:
|
||||
idx = sys.argv.index("--loss_output_file")
|
||||
loss_output_file = sys.argv[idx + 1]
|
||||
sys.argv.pop(idx)
|
||||
sys.argv.pop(idx)
|
||||
|
||||
parser = HfArgumentParser((TrainingArguments,))
|
||||
training_args = parser.parse_args_into_dataclasses()[0]
|
||||
|
||||
# Use SmolLM (small Llama-based model that works with CP)
|
||||
model_name = "HuggingFaceTB/SmolLM-135M"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
attn_implementation="sdpa", # CP requires SDPA
|
||||
)
|
||||
|
||||
# Create simple dataset: just tokenize some text
|
||||
texts = [
|
||||
"The quick brown fox jumps over the lazy dog. " * 10,
|
||||
"Hello world, this is a test sentence for training. " * 10,
|
||||
] * 4 # 8 samples total
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples, max_length=128, truncation=True, padding="max_length")
|
||||
|
||||
train_dataset = [tokenize_function(text) for text in texts]
|
||||
|
||||
# Use standard DataCollatorForLanguageModeling for causal LM
|
||||
# pad_to_multiple_of=4 ensures sequences are divisible by cp_size * 2 (for cp_size=2)
|
||||
# Trainer will automatically generate position_ids and shift_labels as needed
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
mlm=False, # Causal language modeling
|
||||
pad_to_multiple_of=4,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
# Train for a few steps
|
||||
trainer.train()
|
||||
|
||||
# Verify training completed
|
||||
assert trainer.state.global_step > 0, "Training should have completed at least one step"
|
||||
|
||||
# Save losses to file if requested (for equivalence testing)
|
||||
if loss_output_file and training_args.process_index == 0:
|
||||
losses = [log["loss"] for log in trainer.state.log_history if "loss" in log]
|
||||
with open(loss_output_file, "w") as f:
|
||||
json.dump(losses, f)
|
||||
@ -350,9 +350,9 @@ class ContinuousBatchingTest(unittest.TestCase):
|
||||
|
||||
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
|
||||
|
||||
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
|
||||
model.device
|
||||
)[0]
|
||||
inputs = tokenizer.apply_chat_template(
|
||||
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
|
||||
).to(model.device)[0]
|
||||
|
||||
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=True)
|
||||
|
||||
@ -382,9 +382,9 @@ class ContinuousBatchingTest(unittest.TestCase):
|
||||
|
||||
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
|
||||
|
||||
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
|
||||
model.device
|
||||
)[0]
|
||||
inputs = tokenizer.apply_chat_template(
|
||||
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
|
||||
).to(model.device)[0]
|
||||
|
||||
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=False)
|
||||
|
||||
@ -409,9 +409,9 @@ class ContinuousBatchingTest(unittest.TestCase):
|
||||
|
||||
messages = [{"content": "What is the Transformers library known for?", "role": "user"}]
|
||||
|
||||
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
|
||||
model.device
|
||||
)[0]
|
||||
inputs = tokenizer.apply_chat_template(
|
||||
messages, return_tensors="pt", add_generation_prompt=True, return_dict=False
|
||||
).to(model.device)[0]
|
||||
|
||||
# Non-streaming request
|
||||
request_id = manager.add_request(inputs, max_new_tokens=max_new_tokens, streaming=False)
|
||||
|
||||
@ -88,6 +88,15 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
|
||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_local_subfolder_from_repo(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||
processor.save_pretrained(f"{tmpdirname}/processor_subfolder")
|
||||
|
||||
processor = Wav2Vec2Processor.from_pretrained(tmpdirname, subfolder="processor_subfolder")
|
||||
|
||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_local_directory_from_extractor_config(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
# copy relevant files
|
||||
|
||||
@ -335,12 +335,61 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
|
||||
[15.6562, 12.2656, 20.2969],
|
||||
],
|
||||
("cuda", 8): [
|
||||
[15.0703, 8.7422, 15.0312],
|
||||
[9.5078, 16.8906, 10.6250],
|
||||
[15.6484, 12.3984, 20.4688],
|
||||
[16.2812, 8.3672, 14.5703],
|
||||
[9.4922, 17.1875, 10.3281],
|
||||
[15.0312, 11.3984, 20.1719],
|
||||
],
|
||||
}
|
||||
)
|
||||
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
|
||||
|
||||
assert torch.allclose(scores, expected_scores, atol=1e-3), f"Expected scores {expected_scores}, got {scores}"
|
||||
|
||||
@slow
|
||||
def test_model_integration_test_2(self):
|
||||
"""
|
||||
Test if the model is able to retrieve the correct pages for a small and easy dataset.
|
||||
This test uses a ColQwen2.5 checkpoint that is compatible with the ColQwen2 architecture.
|
||||
"""
|
||||
model = ColQwen2ForRetrieval.from_pretrained(
|
||||
"Sahil-Kabir/colqwen2.5-v0.2-hf",
|
||||
device_map=torch_device,
|
||||
dtype=torch.bfloat16,
|
||||
).eval()
|
||||
processor = ColQwen2Processor.from_pretrained("Sahil-Kabir/colqwen2.5-v0.2-hf", trust_remote_code=True)
|
||||
|
||||
# Load the test dataset
|
||||
ds = load_dataset("hf-internal-testing/document-visual-retrieval-test", split="test")
|
||||
|
||||
# Preprocess the examples
|
||||
batch_images = processor(images=list(ds["image"])).to(torch_device)
|
||||
batch_queries = processor(text=list(ds["query"])).to(torch_device)
|
||||
|
||||
with torch.inference_mode():
|
||||
image_embeddings = model(**batch_images).embeddings
|
||||
query_embeddings = model(**batch_queries).embeddings
|
||||
|
||||
# Compute retrieval scores
|
||||
scores = processor.score_retrieval(
|
||||
query_embeddings=query_embeddings,
|
||||
passage_embeddings=image_embeddings,
|
||||
)
|
||||
|
||||
assert scores.ndim == 2, f"Expected 2D tensor, got {scores.ndim}"
|
||||
assert scores.shape == (len(ds), len(ds)), f"Expected shape {(len(ds), len(ds))}, got {scores.shape}"
|
||||
|
||||
# Check if the maximum scores per row are in the diagonal of the matrix score
|
||||
self.assertTrue((scores.argmax(axis=1) == torch.arange(len(ds), device=scores.device)).all())
|
||||
# Further validation: fine-grained check, with a hardcoded score from the original Hf implementation.
|
||||
expectations = Expectations(
|
||||
{
|
||||
("cuda", 8): [
|
||||
[16.3750, 10.9375, 14.7500],
|
||||
[11.3750, 16.8750, 12.0625],
|
||||
[15.3125, 13.1250, 21.5000],
|
||||
]
|
||||
}
|
||||
)
|
||||
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
|
||||
|
||||
assert torch.allclose(scores, expected_scores, atol=0.15), f"Expected scores {expected_scores}, got {scores}"
|
||||
|
||||
@ -1,63 +1,466 @@
|
||||
import io
|
||||
import unittest
|
||||
|
||||
import httpx
|
||||
import numpy as np
|
||||
import pytest
|
||||
from packaging import version
|
||||
|
||||
from transformers import is_torch_available, is_vision_available
|
||||
from transformers.image_utils import SizeDict
|
||||
from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
require_torchvision,
|
||||
require_vision,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin
|
||||
|
||||
|
||||
if is_torch_available() and is_vision_available():
|
||||
import torch
|
||||
|
||||
from transformers import FuyuImageProcessor
|
||||
from transformers import FuyuImageProcessor, FuyuImageProcessorFast
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class FuyuImageProcessingTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=3,
|
||||
num_channels=3,
|
||||
image_size=18,
|
||||
min_resolution=30,
|
||||
max_resolution=400,
|
||||
do_resize=True,
|
||||
size=None,
|
||||
do_pad=True,
|
||||
do_normalize=True,
|
||||
image_mean=[0.5, 0.5, 0.5],
|
||||
image_std=[0.5, 0.5, 0.5],
|
||||
do_rescale=True,
|
||||
rescale_factor=1 / 255,
|
||||
patch_size=None,
|
||||
):
|
||||
size = size if size is not None else {"height": 180, "width": 360}
|
||||
patch_size = patch_size if patch_size is not None else {"height": 30, "width": 30}
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.num_channels = num_channels
|
||||
self.image_size = image_size
|
||||
self.min_resolution = 30
|
||||
self.max_resolution = 360
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.do_pad = do_pad
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.do_rescale = do_rescale
|
||||
self.rescale_factor = rescale_factor
|
||||
self.patch_size = patch_size
|
||||
|
||||
def prepare_image_processor_dict(self):
|
||||
return {
|
||||
"do_resize": self.do_resize,
|
||||
"size": self.size,
|
||||
"do_pad": self.do_pad,
|
||||
"do_normalize": self.do_normalize,
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_rescale": self.do_rescale,
|
||||
"rescale_factor": self.rescale_factor,
|
||||
"patch_size": self.patch_size,
|
||||
}
|
||||
|
||||
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
|
||||
"""Prepares a batch of images for testing"""
|
||||
if equal_resolution:
|
||||
image_inputs = [
|
||||
np.random.randint(
|
||||
0, 256, (self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8
|
||||
)
|
||||
for _ in range(self.batch_size)
|
||||
]
|
||||
else:
|
||||
heights = [
|
||||
h - (h % 30) for h in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)
|
||||
]
|
||||
widths = [
|
||||
w - (w % 30) for w in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)
|
||||
]
|
||||
|
||||
image_inputs = [
|
||||
np.random.randint(0, 256, (self.num_channels, height, width), dtype=np.uint8)
|
||||
for height, width in zip(heights, widths)
|
||||
]
|
||||
|
||||
if not numpify and not torchify:
|
||||
image_inputs = [Image.fromarray(np.moveaxis(img, 0, -1)) for img in image_inputs]
|
||||
|
||||
if torchify:
|
||||
image_inputs = [torch.from_numpy(img) for img in image_inputs]
|
||||
|
||||
return image_inputs
|
||||
|
||||
def expected_output_image_shape(self, images):
|
||||
return self.num_channels, self.size["height"], self.size["width"]
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
@require_torchvision
|
||||
class TestFuyuImageProcessor(unittest.TestCase):
|
||||
class FuyuImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = FuyuImageProcessor
|
||||
fast_image_processing_class = FuyuImageProcessorFast
|
||||
|
||||
# Skip tests that expect pixel_values output
|
||||
test_cast_dtype = None
|
||||
|
||||
def setUp(self):
|
||||
self.size = {"height": 160, "width": 320}
|
||||
self.processor = FuyuImageProcessor(size=self.size, padding_value=1.0)
|
||||
self.batch_size = 3
|
||||
self.channels = 3
|
||||
self.height = 300
|
||||
self.width = 300
|
||||
self.image_processor_tester = FuyuImageProcessingTester(self)
|
||||
self.image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
self.image_input = torch.rand(self.batch_size, self.channels, self.height, self.width)
|
||||
# Initialize image_processor_list (from ImageProcessingTestMixin)
|
||||
image_processor_list = []
|
||||
if self.test_slow_image_processor and self.image_processing_class:
|
||||
image_processor_list.append(self.image_processing_class)
|
||||
if self.test_fast_image_processor and self.fast_image_processing_class:
|
||||
image_processor_list.append(self.fast_image_processing_class)
|
||||
self.image_processor_list = image_processor_list
|
||||
|
||||
self.image_patch_dim_h = 30
|
||||
self.image_patch_dim_w = 30
|
||||
self.sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
self.sample_image_pil = Image.fromarray(self.sample_image)
|
||||
def test_call_pil(self):
|
||||
"""Override to handle Fuyu's custom output structure"""
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, Image.Image)
|
||||
|
||||
def test_patches(self):
|
||||
expected_num_patches = self.processor.get_num_patches(image_height=self.height, image_width=self.width)
|
||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), 1)
|
||||
|
||||
patches_final = self.processor.patchify_image(image=self.image_input)
|
||||
assert patches_final.shape[1] == expected_num_patches, (
|
||||
f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}."
|
||||
encoded_images = image_processing(image_inputs, return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
|
||||
|
||||
def test_call_numpy(self):
|
||||
"""Override to handle Fuyu's custom output structure"""
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, np.ndarray)
|
||||
|
||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), 1)
|
||||
|
||||
encoded_images = image_processing(image_inputs, return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
|
||||
|
||||
def test_call_pytorch(self):
|
||||
"""Override to handle Fuyu's custom output structure"""
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processing = image_processing_class(**self.image_processor_dict)
|
||||
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||
|
||||
for image in image_inputs:
|
||||
self.assertIsInstance(image, torch.Tensor)
|
||||
|
||||
encoded_images = image_processing(image_inputs[0], return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), 1)
|
||||
|
||||
encoded_images = image_processing(image_inputs, return_tensors="pt")
|
||||
self.assertIn("images", encoded_images)
|
||||
self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
|
||||
|
||||
def test_call_numpy_4_channels(self):
|
||||
"""Skip this test as Fuyu doesn't support arbitrary channels"""
|
||||
self.skipTest("Fuyu processor is designed for 3-channel RGB images")
|
||||
|
||||
def test_slow_fast_equivalence(self):
|
||||
"""Override to handle Fuyu's custom output structure"""
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
|
||||
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
|
||||
dummy_image = Image.open(
|
||||
io.BytesIO(
|
||||
httpx.get("http://images.cocodataset.org/val2017/000000039769.jpg", follow_redirects=True).content
|
||||
)
|
||||
)
|
||||
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
encoding_slow = image_processor_slow(dummy_image, return_tensors="pt")
|
||||
encoding_fast = image_processor_fast(dummy_image, return_tensors="pt")
|
||||
|
||||
self._assert_slow_fast_tensors_equivalence(encoding_slow.images[0][0], encoding_fast.images[0][0])
|
||||
|
||||
def test_slow_fast_equivalence_batched(self):
|
||||
"""Override to handle Fuyu's custom output structure"""
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
|
||||
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")
|
||||
|
||||
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
|
||||
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
|
||||
encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")
|
||||
|
||||
# Compare each image tensor
|
||||
for slow_img, fast_img in zip(encoding_slow.images, encoding_fast.images):
|
||||
self._assert_slow_fast_tensors_equivalence(slow_img[0], fast_img[0])
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
@require_vision
|
||||
@pytest.mark.torch_compile_test
|
||||
def test_can_compile_fast_image_processor(self):
|
||||
if self.fast_image_processing_class is None:
|
||||
self.skipTest("Skipping compilation test as fast image processor is not defined")
|
||||
if version.parse(torch.__version__) < version.parse("2.3"):
|
||||
self.skipTest(reason="This test requires torch >= 2.3 to run.")
|
||||
|
||||
torch.compiler.reset()
|
||||
input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
|
||||
image_processor = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
|
||||
|
||||
image_processor = torch.compile(image_processor, mode="reduce-overhead")
|
||||
output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
|
||||
self._assert_slow_fast_tensors_equivalence(
|
||||
output_eager.images[0][0], output_compiled.images[0][0], atol=1e-4, rtol=1e-4, mean_atol=1e-5
|
||||
)
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processor, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processor, "size"))
|
||||
self.assertTrue(hasattr(image_processor, "do_pad"))
|
||||
self.assertTrue(hasattr(image_processor, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processor, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processor, "image_std"))
|
||||
self.assertTrue(hasattr(image_processor, "do_rescale"))
|
||||
self.assertTrue(hasattr(image_processor, "rescale_factor"))
|
||||
self.assertTrue(hasattr(image_processor, "patch_size"))
|
||||
|
||||
def test_patches(self):
|
||||
"""Test that patchify_image produces the expected number of patches."""
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
batch_size = 3
|
||||
channels = 3
|
||||
height = 300
|
||||
width = 300
|
||||
image_input = torch.rand(batch_size, channels, height, width)
|
||||
|
||||
expected_num_patches = image_processor.get_num_patches(image_height=height, image_width=width)
|
||||
patches_final = image_processor.patchify_image(image=image_input)
|
||||
|
||||
self.assertEqual(patches_final.shape[1], expected_num_patches)
|
||||
|
||||
def test_patches_match_slow_fast(self):
|
||||
"""Test that fast processor produces same patches as slow processor."""
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast patch equivalence test")
|
||||
|
||||
if self.image_processing_class is None or self.fast_image_processing_class is None:
|
||||
self.skipTest(
|
||||
reason="Skipping slow/fast patch equivalence test as one of the image processors is not defined"
|
||||
)
|
||||
|
||||
batch_size = 3
|
||||
channels = 3
|
||||
height = 300
|
||||
width = 300
|
||||
image_input = torch.rand(batch_size, channels, height, width)
|
||||
|
||||
processor_slow = self.image_processing_class(**self.image_processor_dict)
|
||||
processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
patches_fast = processor_fast.patchify_image(image=image_input)
|
||||
patches_slow = processor_slow.patchify_image(image=image_input)
|
||||
|
||||
self.assertEqual(patches_fast.shape, patches_slow.shape)
|
||||
torch.testing.assert_close(patches_fast, patches_slow, rtol=1e-4, atol=1e-4)
|
||||
|
||||
def test_scale_to_target_aspect_ratio(self):
|
||||
# (h:450, w:210) fitting (160, 320) -> (160, 210*160/450)
|
||||
scaled_image = self.processor.resize(self.sample_image, size=self.size)
|
||||
self.assertEqual(scaled_image.shape[0], 160)
|
||||
self.assertEqual(scaled_image.shape[1], 74)
|
||||
"""Test that resize maintains aspect ratio correctly."""
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
|
||||
if self.test_slow_image_processor and self.image_processing_class:
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
scaled_image = image_processor.resize(sample_image, size=self.image_processor_dict["size"])
|
||||
self.assertEqual(scaled_image.shape[0], 180)
|
||||
self.assertEqual(scaled_image.shape[1], 84)
|
||||
|
||||
if self.test_fast_image_processor and self.fast_image_processing_class:
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
sample_tensor = torch.from_numpy(sample_image).permute(2, 0, 1).float()
|
||||
|
||||
size_dict = SizeDict(
|
||||
height=self.image_processor_dict["size"]["height"], width=self.image_processor_dict["size"]["width"]
|
||||
)
|
||||
scaled_image = image_processor_fast.resize(sample_tensor, size=size_dict)
|
||||
|
||||
self.assertEqual(scaled_image.shape[1], 180)
|
||||
self.assertEqual(scaled_image.shape[2], 84)
|
||||
|
||||
def test_apply_transformation_numpy(self):
|
||||
transformed_image = self.processor.preprocess(self.sample_image).images[0][0]
|
||||
self.assertEqual(transformed_image.shape[1], 160)
|
||||
self.assertEqual(transformed_image.shape[2], 320)
|
||||
"""Test preprocessing with numpy input."""
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
transformed_image = image_processor.preprocess(sample_image).images[0][0]
|
||||
self.assertEqual(transformed_image.shape[1], 180)
|
||||
self.assertEqual(transformed_image.shape[2], 360)
|
||||
|
||||
def test_apply_transformation_pil(self):
|
||||
transformed_image = self.processor.preprocess(self.sample_image_pil).images[0][0]
|
||||
self.assertEqual(transformed_image.shape[1], 160)
|
||||
self.assertEqual(transformed_image.shape[2], 320)
|
||||
"""Test preprocessing with PIL input."""
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
sample_image_pil = Image.fromarray(sample_image)
|
||||
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
transformed_image = image_processor.preprocess(sample_image_pil).images[0][0]
|
||||
self.assertEqual(transformed_image.shape[1], 180)
|
||||
self.assertEqual(transformed_image.shape[2], 360)
|
||||
|
||||
def test_preprocess_output_structure(self):
|
||||
"""Test that preprocess returns correct output structure."""
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
result = image_processor.preprocess(sample_image)
|
||||
|
||||
self.assertIn("images", result)
|
||||
self.assertIn("image_unpadded_heights", result)
|
||||
self.assertIn("image_unpadded_widths", result)
|
||||
self.assertIn("image_scale_factors", result)
|
||||
|
||||
self.assertEqual(len(result.images), 1)
|
||||
self.assertEqual(len(result.images[0]), 1)
|
||||
self.assertEqual(len(result.image_unpadded_heights), 1)
|
||||
self.assertEqual(len(result.image_unpadded_widths), 1)
|
||||
self.assertEqual(len(result.image_scale_factors), 1)
|
||||
|
||||
def test_batch_processing(self):
|
||||
"""Test processing multiple images."""
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
sample_image_pil = Image.fromarray(sample_image)
|
||||
images = [sample_image, sample_image_pil]
|
||||
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
result = image_processor.preprocess(images)
|
||||
|
||||
self.assertEqual(len(result.images), 2)
|
||||
for img in result.images:
|
||||
self.assertEqual(len(img), 1)
|
||||
if hasattr(img[0], "shape"):
|
||||
if len(img[0].shape) == 3:
|
||||
self.assertEqual(img[0].shape[1], 180)
|
||||
self.assertEqual(img[0].shape[2], 360)
|
||||
|
||||
def test_pad_image_fast(self):
|
||||
"""Test that padding works correctly for fast processor."""
|
||||
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Fast processor not available")
|
||||
|
||||
from transformers.image_utils import SizeDict
|
||||
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
small_image = torch.rand(3, 100, 100)
|
||||
size_dict = SizeDict(height=180, width=360)
|
||||
|
||||
padded = image_processor_fast.pad([small_image], pad_size=size_dict, fill_value=1.0)[0]
|
||||
self.assertEqual(padded.shape[1], 180)
|
||||
self.assertEqual(padded.shape[2], 360)
|
||||
|
||||
self.assertTrue(torch.allclose(padded[:, 100:, :], torch.ones_like(padded[:, 100:, :])))
|
||||
self.assertTrue(torch.allclose(padded[:, :, 100:], torch.ones_like(padded[:, :, 100:])))
|
||||
|
||||
def test_preprocess_with_tokenizer_info(self):
|
||||
"""Test preprocess_with_tokenizer_info functionality."""
|
||||
batch_size = 2
|
||||
subseq_size = 1
|
||||
channels = 3
|
||||
image_input = torch.rand(batch_size, subseq_size, channels, 180, 360)
|
||||
image_present = torch.ones(batch_size, subseq_size, dtype=torch.bool)
|
||||
image_unpadded_h = torch.tensor([[180], [180]])
|
||||
image_unpadded_w = torch.tensor([[360], [360]])
|
||||
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
|
||||
result = image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=image_input,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_h,
|
||||
image_unpadded_w=image_unpadded_w,
|
||||
image_placeholder_id=100,
|
||||
image_newline_id=101,
|
||||
variable_sized=True,
|
||||
)
|
||||
|
||||
# Check output structure
|
||||
self.assertIn("images", result)
|
||||
self.assertIn("image_input_ids", result)
|
||||
self.assertIn("image_patches", result)
|
||||
self.assertIn("image_patch_indices_per_batch", result)
|
||||
self.assertIn("image_patch_indices_per_subsequence", result)
|
||||
|
||||
# Check batch structure
|
||||
self.assertEqual(len(result.images), batch_size)
|
||||
self.assertEqual(len(result.image_input_ids), batch_size)
|
||||
self.assertEqual(len(result.image_patches), batch_size)
|
||||
|
||||
def test_device_handling_fast(self):
|
||||
"""Test that fast processor can handle device placement."""
|
||||
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Fast processor not available")
|
||||
|
||||
sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
result_cuda = image_processor_fast.preprocess(sample_image, device="cuda")
|
||||
self.assertEqual(result_cuda.images[0][0].device.type, "cuda")
|
||||
|
||||
result_cpu = image_processor_fast.preprocess(sample_image, device="cpu")
|
||||
self.assertEqual(result_cpu.images[0][0].device.type, "cpu")
|
||||
|
||||
def test_do_not_resize_if_smaller(self):
|
||||
"""Test that images smaller than target size are not resized."""
|
||||
if not self.test_fast_image_processor or self.fast_image_processing_class is None:
|
||||
self.skipTest(reason="Fast processor not available")
|
||||
|
||||
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
|
||||
|
||||
small_image = torch.rand(3, 100, 150)
|
||||
size_dict = SizeDict(height=180, width=360)
|
||||
|
||||
resized = image_processor_fast.resize(small_image, size=size_dict)
|
||||
|
||||
self.assertEqual(resized.shape[1], 100)
|
||||
self.assertEqual(resized.shape[2], 150)
|
||||
|
||||
@ -18,7 +18,7 @@ import unittest
|
||||
import numpy as np
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
||||
|
||||
@ -31,6 +31,9 @@ if is_vision_available():
|
||||
|
||||
from transformers import GLPNImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
from transformers import GLPNImageProcessorFast
|
||||
|
||||
|
||||
class GLPNImageProcessingTester:
|
||||
def __init__(
|
||||
@ -87,19 +90,32 @@ class GLPNImageProcessingTester:
|
||||
torchify=torchify,
|
||||
)
|
||||
|
||||
def prepare_depth_outputs(self):
|
||||
if not is_torch_available():
|
||||
return None
|
||||
depth_tensors = prepare_image_inputs(
|
||||
batch_size=self.batch_size,
|
||||
num_channels=1,
|
||||
min_resolution=self.min_resolution,
|
||||
max_resolution=self.max_resolution,
|
||||
equal_resolution=True,
|
||||
torchify=True,
|
||||
)
|
||||
depth_tensors = [depth_tensor.squeeze(0) for depth_tensor in depth_tensors]
|
||||
stacked_depth_tensors = torch.stack(depth_tensors, dim=0)
|
||||
return type("DepthOutput", (), {"predicted_depth": stacked_depth_tensors})
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processing_class = GLPNImageProcessor if is_vision_available() else None
|
||||
    fast_image_processing_class = GLPNImageProcessorFast if is_torchvision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = GLPNImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()
        self.image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
@ -115,7 +131,6 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        # Test not batched input (GLPNImageProcessor doesn't support batching)
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
@ -161,3 +176,43 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
        self.image_processing_class.num_channels = 3

    # override as glpn image processors don't support heterogeneous batching
    @require_vision
    @require_torch
    def test_slow_fast_equivalence_batched(self):
        if not self.test_slow_image_processor or not self.test_fast_image_processor:
            self.skipTest(reason="Skipping slow/fast equivalence test")

        if self.image_processing_class is None or self.fast_image_processing_class is None:
            self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined")

        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
        image_processor_slow = self.image_processing_class(**self.image_processor_dict)
        image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)

        encoding_slow = image_processor_slow(dummy_images, return_tensors="pt")
        encoding_fast = image_processor_fast(dummy_images, return_tensors="pt")

        self._assert_slow_fast_tensors_equivalence(encoding_slow.pixel_values, encoding_fast.pixel_values)

    def test_post_process_depth_equivalence(self):
        # Check that both processors produce equivalent post-processed depth maps
        if self.fast_image_processing_class is None:
            self.skipTest("TorchVision not available")

        outputs = self.image_processor_tester.prepare_depth_outputs()
        slow = self.image_processing_class(**self.image_processor_dict)
        fast = self.fast_image_processing_class(**self.image_processor_dict)

        # target_sizes simulate resized inference outputs
        target_sizes = [(240, 320)] * self.image_processor_tester.batch_size
        processed_slow = slow.post_process_depth_estimation(outputs, target_sizes=target_sizes)
        processed_fast = fast.post_process_depth_estimation(outputs, target_sizes=target_sizes)

        # Compare per-sample predicted depth tensors
        for pred_slow, pred_fast in zip(processed_slow, processed_fast):
            depth_slow = pred_slow["predicted_depth"]
            depth_fast = pred_fast["predicted_depth"]
            torch.testing.assert_close(depth_fast, depth_slow, atol=1e-1, rtol=1e-3)
            self.assertLessEqual(torch.mean(torch.abs(depth_fast.float() - depth_slow.float())).item(), 5e-3)

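For context, the slow/fast parity these new GLPN tests assert can be reproduced in a few lines outside the test harness. This is a minimal sketch, not the PR's code: it assumes GLPNImageProcessorFast (added by this diff) is importable from transformers alongside GLPNImageProcessor, and that both expose post_process_depth_estimation accepting a DepthEstimatorOutput plus target_sizes, as the test above exercises.

import torch
from transformers import GLPNImageProcessor, GLPNImageProcessorFast  # fast class assumed from this diff
from transformers.modeling_outputs import DepthEstimatorOutput

# Fabricated low-resolution depth predictions standing in for real model outputs.
outputs = DepthEstimatorOutput(predicted_depth=torch.rand(2, 60, 80))

slow = GLPNImageProcessor()
fast = GLPNImageProcessorFast()

# Resize predicted depth back to assumed original image sizes, mirroring the test above.
target_sizes = [(240, 320), (240, 320)]
processed_slow = slow.post_process_depth_estimation(outputs, target_sizes=target_sizes)
processed_fast = fast.post_process_depth_estimation(outputs, target_sizes=target_sizes)

for pred_slow, pred_fast in zip(processed_slow, processed_fast):
    # Same tolerance as the new test above.
    torch.testing.assert_close(pred_fast["predicted_depth"], pred_slow["predicted_depth"], atol=1e-1, rtol=1e-3)
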
@ -172,6 +172,7 @@ class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        if "image_processor" not in self.processor_class.attributes:
            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
        image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
        print("image_processor", image_processor)
        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)

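As a side note on the component override this Pix2Struct test tweaks, the kwargs passed to get_component are ordinary constructor arguments. A minimal sketch, assuming Pix2StructImageProcessor accepts max_patches and patch_size at init (as the call above implies):

from transformers import Pix2StructImageProcessor

# Build the image processor with the same overridden defaults used in the test above.
image_processor = Pix2StructImageProcessor(max_patches=1024, patch_size={"height": 8, "width": 8})
print(image_processor.max_patches)  # expected: 1024
print(image_processor.patch_size)   # expected: {"height": 8, "width": 8}
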
@ -36,6 +36,7 @@ from transformers import (
    is_torch_available,
)
from transformers.testing_utils import (
    Expectations,
    backend_empty_cache,
    require_flash_attn,
    require_torch,
@ -831,7 +832,14 @@ class VideoLlama3IntegrationTest(unittest.TestCase):
        torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)

        output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
        EXPECTED_DECODED_TEXT = "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress"
        # fmt: off
        EXPECTED_DECODED_TEXT = Expectations(
            {
                ("cuda", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
                ("xpu", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
            }
        ).get_expectation()
        # fmt: on

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
@ -874,11 +882,21 @@ class VideoLlama3IntegrationTest(unittest.TestCase):

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
        # fmt: off
        EXPECTED_DECODED_TEXT = Expectations(
            {
                ("cuda", None): [
                    "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
                    "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
                ],
                ("xpu", None): [
                    "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
                    "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
                ],
            }
        ).get_expectation()
        # fmt: on

        EXPECTED_DECODED_TEXT = [
            "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
            "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
        ] # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,

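The Expectations helper that this VideoLlama3 diff switches to holds one reference string per backend, and get_expectation() returns the entry matching the device the test currently runs on. A minimal sketch of the pattern, using placeholder strings rather than the real expected generations:

from transformers.testing_utils import Expectations

EXPECTED_TEXT = Expectations(
    {
        # Keys are (device_type, version); None acts as a catch-all version, as in the diff above.
        ("cuda", None): "placeholder expectation for NVIDIA GPUs",
        ("xpu", None): "placeholder expectation for Intel XPUs",
    }
).get_expectation()  # picks the entry for the currently available accelerator

print(EXPECTED_TEXT)
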
@ -22,14 +22,14 @@ from transformers.testing_utils import (
    require_accelerate,
    require_fp_quant,
    require_qutlass,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_accelerator,
    require_torch_multi_accelerator,
    slow,
    torch_device,
)


@require_torch_gpu
@require_torch_accelerator
class FPQuantConfigTest(unittest.TestCase):
    def test_to_dict(self):
        """
@ -53,7 +53,7 @@ class FPQuantConfigTest(unittest.TestCase):


@slow
@require_torch_gpu
@require_torch_accelerator
@require_fp_quant
@require_accelerate
class FPQuantBaseTest(unittest.TestCase):
@ -64,7 +64,7 @@ class FPQuantBaseTest(unittest.TestCase):

    EXPECTED_OUTPUT = "1 2 3 4 5 6"

    device_map = "cuda"
    device_map = torch_device

    @classmethod
    def getQuantizationConfig(cls):
@ -77,10 +77,10 @@ class FPQuantBaseTest(unittest.TestCase):
        Setup quantized model
        """

        quantization_config = cls.getQuantizationConfig()
        cls.quantization_config = cls.getQuantizationConfig()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
            cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
        )

    def tearDown(self):
@ -111,24 +111,25 @@ class FPQuantBaseTest(unittest.TestCase):
        output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        Simple test that checks if the quantized model is working properly with multiple accelerators.
        Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs. Or set ZE_AFFINITY_MASK=0,1
        if you have more than 2 Intel XPUs.
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = FPQuantConfig()

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
            self.model_name, device_map="auto", quantization_config=self.quantization_config
        )
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
    @require_torch_multi_accelerator
    def test_save_pretrained_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
@ -163,6 +164,13 @@ class FPQuantMXFP4Test(FPQuantBaseTest):
        return FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=False)


@require_qutlass
class FPQuantNVFP4Test(FPQuantBaseTest):
    @classmethod
    def getQuantizationConfig(cls):
        return FPQuantConfig(forward_dtype="nvfp4", pseudoquantization=False)


@require_qutlass
class FPQuantMXFP4GS128Test(FPQuantBaseTest):
    @classmethod

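For reference, the scaffolding these FPQuant test classes share reduces to loading a causal LM with an FPQuantConfig. This is a hedged sketch rather than the test file itself: the checkpoint name and prompt are placeholders, and it assumes FPQuantConfig is importable from transformers and that the fp_quant/qutlass kernels are installed.

from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig

model_name = "some-org/some-causal-lm"  # placeholder checkpoint

# Same knobs the subclasses above vary: forward_dtype in {"mxfp4", "nvfp4"}, real (not pseudo) quantization.
quantization_config = FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=False)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # shards across all visible accelerators, as in the multi-accelerator test
    quantization_config=quantization_config,
)

inputs = tokenizer("1 2 3", return_tensors="pt").to(model.device)  # made-up prompt
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0], skip_special_tokens=True))
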
@ -81,6 +81,8 @@ if __name__ == "__main__":
    for idx in range(args.num_splits):
        start = end
        end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
        model_splits.append(d[start:end])
        # Only add the slice if it is not an empty list
        if len(d[start:end]) > 0:
            model_splits.append(d[start:end])

    print(model_splits)

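The guard added above keeps empty slices out of the CI matrix. A standalone sketch of the splitting logic, with the surrounding setup reconstructed for illustration (folder names and counts are made up):

# Distribute num_jobs test folders over num_splits CI slices, giving the remainder
# to the earliest slices and skipping slices that would end up empty.
d = ["models/bert", "models/gpt2"]  # made-up folder list
num_splits = 4
num_jobs = len(d)
num_jobs_per_splits = num_jobs // num_splits

model_splits = []
start = end = 0
for idx in range(num_splits):
    start = end
    end = start + num_jobs_per_splits + (1 if idx < num_jobs % num_splits else 0)
    # Only add the slice if it is not an empty list
    if len(d[start:end]) > 0:
        model_splits.append(d[start:end])

print(model_splits)  # [['models/bert'], ['models/gpt2']] -- the two trailing empty slices are dropped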