Compare commits

..

19 Commits

Author SHA1 Message Date
090d9c4b2a Merge branch 'main' into tensor-cache 2025-01-24 12:02:45 +01:00
5ccb79c16d fixed dynamic cache 2025-01-23 16:45:28 +01:00
80b49d721b rebased 2025-01-22 17:31:39 +01:00
dc1bd15ba9 Merge branch 'main' into tensor-cache 2025-01-22 17:30:23 +01:00
338f5954b9 more reverts 2025-01-22 17:29:48 +01:00
2f4e0bc93e Update src/transformers/cache_utils.py 2025-01-22 17:18:28 +01:00
485f959f85 revert 2025-01-22 17:17:17 +01:00
2bbbbbcf97 add device and dtype setters 2025-01-22 17:15:12 +01:00
85c71b004b Merge branch 'main' into tensor-cache 2025-01-22 15:53:33 +01:00
da60604f2c fix test_cache_utils 2025-01-22 15:43:14 +01:00
6e9799c817 add clone and to 2025-01-22 15:42:43 +01:00
4950a9e3f0 extract wrapper kwargs from init signature to correctly instantate 2025-01-22 13:49:01 +01:00
b67b6eb9b2 make cache class exportable and executorch compatible 2025-01-20 18:47:30 +01:00
d269417aab fix zamba and jamba dynamic cache 2025-01-20 17:21:49 +01:00
95c1686ee0 style 2025-01-20 17:09:21 +01:00
8606594ad4 fix boolean evaluation 2025-01-20 17:08:37 +01:00
45bb39bb80 torch tensor subclassing 2025-01-20 17:01:49 +01:00
a77a94b209 unproxy cache 2025-01-20 14:43:41 +01:00
d4b631edd0 use tensor cache instead of module cache 2025-01-20 14:17:28 +01:00
329 changed files with 1600 additions and 6136 deletions

View File

@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:

View File

@ -1,4 +1,4 @@
FROM rocm/dev-ubuntu-22.04:6.2.4
FROM rocm/dev-ubuntu-22.04:6.3
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
@ -8,9 +8,11 @@ RUN apt update && \
apt clean && \
rm -rf /var/lib/apt/lists/*
RUN export PATH="${PATH:+${PATH}:}~/opt/rocm/bin"
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
RUN python3 -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

View File

@ -1,11 +1,11 @@
FROM rocm/dev-ubuntu-22.04:6.2.4
FROM rocm/dev-ubuntu-22.04:5.6
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.5.1'
ARG TORCH_VISION='0.20.0'
ARG TORCH_AUDIO='2.5.0'
ARG ROCM='6.2'
ARG PYTORCH='2.1.1'
ARG TORCH_VISION='0.16.1'
ARG TORCH_AUDIO='2.1.1'
ARG ROCM='5.6'
RUN apt update && \
apt install -y --no-install-recommends \
@ -45,4 +45,4 @@ RUN cd transformers && python3 setup.py develop
RUN python3 -c "from deepspeed.launcher.runner import main"
# Remove nvml as it is not compatible with ROCm
RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
RUN python3 -m pip uninstall py3nvml pynvml -y

View File

@ -15,10 +15,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# Install Rust for Tokenizers
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="$HOME/.cargo/bin:${PATH}"
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# Install latest release PyTorch

View File

@ -626,8 +626,6 @@
title: YOSO
- local: model_doc/zamba
title: Zamba
- local: model_doc/zamba2
title: Zamba2
title: Text models
- isExpanded: false
sections:

View File

@ -162,7 +162,7 @@ agent.run(
improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background"
Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt.
=== Agent is executing the code below:
>>> Agent is executing the code below:
image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background")
final_answer(image)
```

View File

@ -39,7 +39,7 @@ Let's make this concrete with a quick example using the `mistralai/Mistral-7B-In
... ]
>>> tokenizer.apply_chat_template(chat, tokenize=False)
"<s> [INST] Hello, how are you? [/INST] I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
```
Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of

View File

@ -231,7 +231,7 @@ to check if the text is machine-generated (outputs `True` for machine-generated
>>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
>>> detection_out = detector(out, return_dict=True)
>>> detection_out.prediction
array([ True, True])
array([True, True])
```
@ -269,7 +269,7 @@ dimension you can act upon, in addition to selecting a decoding strategy. Popula
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n']
```
### Contrastive search
@ -445,7 +445,7 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a glass of wine.']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
<Tip>
@ -461,7 +461,7 @@ If you're using a `pipeline` object, all you need to do is to pass the assistant
... model="meta-llama/Llama-3.1-8B",
... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD
... torch_dtype=torch.bfloat16
... )
>>> )
>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False)
>>> pipe_output[0]["generated_text"]
'Once upon a time, 3D printing was a niche technology that was only'
@ -488,7 +488,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are two people who are very different, but they are both very good at what they do. Alice']
['Alice and Bob, a couple of friends of mine, who are both in the same office as']
```
We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup.
@ -518,7 +518,7 @@ to ensure the new tokens include the correct prompt suffix.
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
#### Prompt Lookup
@ -547,7 +547,7 @@ If the model you're using was trained to do early exit, you can pass
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
### DoLa Decoding
@ -571,9 +571,10 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
>>> import torch
>>> from accelerate.test_utils.testing import get_backend
>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16).to(device)
>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16)
>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model.to(device)
>>> set_seed(42)
>>> text = "On what date was the Declaration of Independence officially signed?"
@ -592,7 +593,7 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
# DoLa decoding with contrasting specific layers (layers 28 and 30)
>>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
>>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
['\nIn 1891, when he was 54 years old, John Jacob Astor founded his empire. He opened a one-man business and spent the next 27 years working 10-hour days. When']
['\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2']
```
#### Understanding the `dola_layers` argument

View File

@ -385,7 +385,6 @@ Flax), PyTorch, and/or TensorFlow.
| [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ |
| [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ |
| [Zamba](model_doc/zamba) | ✅ | ❌ | ❌ |
| [Zamba2](model_doc/zamba2) | ✅ | ❌ | ❌ |
| [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ |
<!-- End table-->

View File

@ -32,32 +32,12 @@ Install 🤗 Transformers for whichever deep learning library you're working wit
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.
Create a virtual environment with [uv](https://docs.astral.sh/uv/) (refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), a fast Rust-based Python package and project manager.
```bash
uv venv my-env
source my-env/bin/activate
```
Now you're ready to install 🤗 Transformers with pip or uv.
<hfoptions id="install">
<hfoption id="uv">
```bash
uv pip install transformers
```
</hfoption>
<hfoption id="pip">
Now you're ready to install 🤗 Transformers with the following command:
```bash
pip install transformers
```
</hfoption>
</hfoptions>
For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and [TensorFlow](https://www.tensorflow.org/install/pip).
Run the command below to check if your system detects an NVIDIA GPU.

View File

@ -56,7 +56,7 @@ More concretely, key-value cache acts as a memory bank for these generative mode
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -82,13 +82,7 @@ More concretely, key-value cache acts as a memory bank for these generative mode
... cache_position = cache_position[-1:] + 1 # add one more position for the next token
>>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```
```txt
<|user|>
Hello, what's your name.
<|assistant|>
My name is Sarah.
<|
"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
```
</details>
@ -138,13 +132,17 @@ Cache quantization can be detrimental in terms of latency if the context length
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's a great way to express myself. I like the way it makes me feel, the
I like rock music because it's loud and energetic. It's a great way to express myself and rel
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
```
### Offloaded Cache
@ -233,14 +231,14 @@ For more examples with Static Cache and JIT compilation, take a look at [StaticC
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
>>> # simply pass the cache implementation="static"
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name] and I am a [Your Position] at [Your Company]. I am writing"
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
```
@ -258,7 +256,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead.
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
>>> # simply pass the cache implementation="offloaded_static"
>>> # simply pass the cache implementation="static"
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
@ -277,14 +275,14 @@ Note that you can use this cache only for models that support sliding window, e.
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
>>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16).to("cuda:0")
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
>>> # can be used by passing in cache implementation
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Yesterday I was on a rock concert and. I was so excited to see my favorite band perform live. I was so happy that I could hardly contain myself. I was jumping up and down and"
"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I"
```
### Sink Cache
@ -297,8 +295,8 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
>>> # get our cache, specify number of sink tokens and window size
@ -306,7 +304,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
>>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"This is a long story about unicorns, fairies and magic. It is a story about a young girl named Lily who discovers that she has the power to control the elements. She learns that she can"
"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
```
### Encoder-Decoder Cache
@ -334,15 +332,15 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
>>> import torch
>>> from transformers import AutoTokenizer,AutoModelForCausalLM
>>> from transformers.cache_utils import (
... DynamicCache,
... SinkCache,
... StaticCache,
... SlidingWindowCache,
... QuantoQuantizedCache,
... QuantizedCacheConfig,
... )
>>> DynamicCache,
>>> SinkCache,
>>> StaticCache,
>>> SlidingWindowCache,
>>> QuantoQuantizedCache,
>>> QuantizedCacheConfig,
>>> )
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -365,7 +363,7 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
... messages.append({"role": "assistant", "content": completion})
print(messages)
[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': "Hello, I'm AI."}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': "I'm sorry to hear that you were on a rock concert yesterday. It sounds like a fun experience, but I'm not capable of experiencing music or concerts. However, I can provide you with some information about rock music and its history. Rock music emerged in the 1950s and 1960s in the United States and Britain, and it quickly gained popularity around the world. Some of the most famous rock bands of all time include The Beatles, The Rolling Stones, Led Zeppelin, and Pink Floyd. Rock music has a distinct sound and style, with elements of blues, country, and folk music. It often features guitar solos, heavy bass lines, and drums. Rock music has had a significant impact on popular culture, influencing genres such as punk rock, heavy metal, and alternative rock."}]
[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}]
```
@ -378,7 +376,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -402,7 +400,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
... responses.append(response)
>>> print(responses)
['<s> You are a helpful assistant. Help me to write a blogpost about travelling. I am excited to share my experiences with you. I have been traveling for the past', '<s> You are a helpful assistant. What is the capital of France? \n\nAnswer: Paris is the capital of France.</s>']
['<s> You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', '<s> You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.</s>']
```
@ -416,8 +414,8 @@ this legacy format, you can seamlessly convert it to a `DynamicCache` and back.
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
>>> # `return_dict_in_generate=True` is required to return the cache. `return_legacy_cache` forces the returned cache

View File

@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
>>> prompt = "Give me a short introduction to large language model."

View File

@ -64,19 +64,18 @@ Here's how to use the model for zero-shot object detection:
>>> results = processor.post_process_grounded_object_detection(
... outputs,
... inputs.input_ids,
... box_threshold=0.4,
... threshold=0.4,
... text_threshold=0.3,
... target_sizes=[image.size[::-1]]
... target_sizes=[(image.height, image.width)]
... )
# Retrieve the first image result
>>> # Retrieve the first image result
>>> result = results[0]
>>> for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
>>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
... box = [round(x, 2) for x in box.tolist()]
... print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
Detected a cat with confidence 0.468 at location [344.78, 22.9, 637.3, 373.62]
Detected a cat with confidence 0.426 at location [11.74, 51.55, 316.51, 473.22]
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
```
## Grounded SAM

View File

@ -81,7 +81,7 @@ text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
print(text_prompt)
'<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>'
>>> "<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>"
```
This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).

View File

@ -110,13 +110,8 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
>>> from datasets import load_dataset, Audio
>>> import torch, math
>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16")
>>> device = "cuda"
>>> dtype = torch.bfloat16
>>> # prepare user input audio
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))

View File

@ -57,7 +57,10 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme
>>> outputs = model.generate(inputs, max_new_tokens=32)
>>> text = tokenizer.batch_decode(outputs)[0]
>>> print(text)
<|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some creative ideas for incorporating both fruits
<s><|user|>
Can you provide ways to eat combinations of bananas and dragonfruits?<|end|>
<|assistant|>
Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and
```
## Phi3Config

View File

@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -86,7 +86,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> # important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
>>> with torch.no_grad():
@ -95,7 +95,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
31.9% that image 0 is 'a photo of 2 cats'
```
## Resources
@ -142,7 +142,8 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
>>> inputs.to(device)
>>> with torch.no_grad():
... with torch.autocast(device):
@ -151,7 +152,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
51.3% that image 0 is 'This is a photo of 2 cats.'
```

View File

@ -1,93 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Zamba2
Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights.
This model was contributed by [pglo](https://huggingface.co/pglo).
## Model details
Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B are hybrid models combining state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively.
<img src=https://github.com/user-attachments/assets/c2cff209-b901-483c-87aa-774b82a0769f width=30% height=40% />
## Quick start
### Prerequisites
Zamba2 requires you use `transformers` version 4.48.0 or higher:
```bash
pip install transformers>=4.48.0
```
## Inference
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B")
model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba2-7B", device_map="cuda", torch_dtype=torch.bfloat16)
input_text = "What factors contributed to the fall of the Roman Empire?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
```
## Model card
The model cards can be found at:
* [Zamba2-1.2B](https://huggingface.co/Zyphra/Zamba2-1.2B)
* [Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B)
* [Zamba2-7B](https://huggingface.co/Zyphra/Zamba2-7B)
## Issues
For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba2-7B/discussions)
## License
The model weights are open-sourced via an Apache 2.0 license.
## Zamba2Config
[[autodoc]] Zamba2Config
## Zamba2Model
[[autodoc]] Zamba2Model
- forward
## Zamba2ForCausalLM
[[autodoc]] Zamba2ForCausalLM
- forward
## Zamba2ForSequenceClassification
[[autodoc]] transformers.Zamba2ForSequenceClassification
- forward

View File

@ -70,7 +70,7 @@ Alternatively, one can also perform inference using the classes:
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(inputs)
... outputs = model(pixel_values)
>>> # interpolate to original size and visualize the prediction
>>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument

View File

@ -111,7 +111,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
* [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)
* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2)
You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
@ -329,7 +328,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel)
* [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel)
* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)
* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2)
<Tip>

View File

@ -305,7 +305,10 @@ There are two types of language modeling:
... for pred in preds
... ]
>>> preds
[{'score': 0.224, 'token': 3944, 'token_str': ' tool', 'sequence': 'Hugging Face is a community-based open-source tool for machine learning.'}]
[{'score': 0.2236,
'token': 1761,
'token_str': ' platform',
'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}]
```
## Multimodal

View File

@ -80,7 +80,7 @@ Run inference with decoder-only models with the `text-generation` pipeline:
>>> prompt = "Hello, I'm a language model"
>>> generator(prompt, max_length = 30)
[{'generated_text': "Hello, I'm a language model. Not a programming language at all: it's pretty simple.\n\nWhen I write a function, I mean"}]
[{'generated_text': "Hello, I'm a language model programmer so you can use some of my stuff. But you also need some sort of a C program to run."}]
```
To run inference with an encoder-decoder, use the `text2text-generation` pipeline:
@ -258,7 +258,7 @@ also be a suitable location for instructions. Typically, it's better to place th
>>> for seq in sequences:
... print(f"{seq['generated_text']}")
"Permaculture is an ecological design method that mimics natural ecosystems' diversity, functionality, and resilience using modern technology and indigenous knowledge. It aims to help"
Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. It is based on traditional knowledge and scientific understanding.
```
#### Question answering
@ -284,7 +284,7 @@ the leading word or phrase (`"Answer:"`) to nudge the model to start generating
>>> for seq in sequences:
... print(f"Result: {seq['generated_text']}")
"Result: Modern tools are used, such as immersion blenders"
Result: Modern tools often used to make gazpacho include
```
#### Reasoning
@ -309,7 +309,7 @@ Let's try if we can make a model reason about a simple arithmetics task with a b
>>> for seq in sequences:
... print(f"Result: {seq['generated_text']}")
Result:
There are a total of 50 students in the class (5 groups x 4 students per group = 20 groups, and
There are a total of 5 groups, so there are 5 x 4=20 students in the class.
```
Correct! Let's increase the complexity a little and see if we can still get away with a basic prompt:

View File

@ -271,10 +271,6 @@ class DataTrainingArguments:
)
},
)
use_fast: Optional[bool] = field(
default=True,
metadata={"help": "Use a fast torchvision-base image processor if it is supported for a given model."},
)
@dataclass
@ -431,7 +427,6 @@ def main():
size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
do_pad=True,
pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
use_fast=data_args.use_fast,
**common_pretrained_args,
)

View File

@ -256,12 +256,6 @@ def parse_args():
default=1333,
help="Image longest size will be resized to this value, then image will be padded to square.",
)
parser.add_argument(
"--use_fast",
type=bool,
default=True,
help="Use a fast torchvision-base image processor if it is supported for a given model.",
)
parser.add_argument(
"--cache_dir",
type=str,
@ -488,7 +482,6 @@ def main():
size={"max_height": args.image_square_size, "max_width": args.image_square_size},
do_pad=True,
pad_size={"height": args.image_square_size, "width": args.image_square_size},
use_fast=args.use_fast,
**common_pretrained_args,
)

View File

@ -889,7 +889,6 @@ _import_structure = {
"models.yolos": ["YolosConfig"],
"models.yoso": ["YosoConfig"],
"models.zamba": ["ZambaConfig"],
"models.zamba2": ["Zamba2Config"],
"models.zoedepth": ["ZoeDepthConfig"],
"onnx": [],
"pipelines": [
@ -3990,14 +3989,6 @@ else:
"ZambaPreTrainedModel",
]
)
_import_structure["models.zamba2"].extend(
[
"Zamba2ForCausalLM",
"Zamba2ForSequenceClassification",
"Zamba2Model",
"Zamba2PreTrainedModel",
]
)
_import_structure["models.zoedepth"].extend(
[
"ZoeDepthForDepthEstimation",
@ -6013,7 +6004,6 @@ if TYPE_CHECKING:
from .models.yolos import YolosConfig
from .models.yoso import YosoConfig
from .models.zamba import ZambaConfig
from .models.zamba2 import Zamba2Config
from .models.zoedepth import ZoeDepthConfig
# Pipelines
@ -8552,12 +8542,6 @@ if TYPE_CHECKING:
ZambaModel,
ZambaPreTrainedModel,
)
from .models.zamba2 import (
Zamba2ForCausalLM,
Zamba2ForSequenceClassification,
Zamba2Model,
Zamba2PreTrainedModel,
)
from .models.zoedepth import (
ZoeDepthForDepthEstimation,
ZoeDepthPreTrainedModel,

View File

@ -146,7 +146,7 @@ def chroma_filter_bank(
sampling_rate: int,
tuning: float = 0.0,
power: Optional[float] = 2.0,
weighting_parameters: Optional[Tuple[float, float]] = (5.0, 2.0),
weighting_parameters: Optional[Tuple[float]] = (5.0, 2),
start_at_c_chroma: Optional[bool] = True,
):
"""
@ -165,7 +165,7 @@ def chroma_filter_bank(
Tuning deviation from A440 in fractions of a chroma bin.
power (`float`, *optional*, defaults to 2.0):
If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
weighting_parameters (`Tuple[float]`, *optional*, defaults to `(5., 2.)`):
If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
the second element being the Gaussian half-width.
start_at_c_chroma (`float`, *optional*, defaults to `True`):

View File

@ -1,5 +1,6 @@
import copy
import importlib.metadata
import inspect
import json
import os
from dataclasses import dataclass
@ -9,12 +10,7 @@ import torch
from packaging import version
from .configuration_utils import PretrainedConfig
from .utils import (
is_hqq_available,
is_optimum_quanto_available,
is_torchdynamo_compiling,
logging,
)
from .utils import is_hqq_available, is_optimum_quanto_available, logging
from .utils.deprecation import deprecate_kwarg
@ -24,13 +20,82 @@ if is_hqq_available():
logger = logging.get_logger(__name__)
class Cache(torch.nn.Module):
class Cache(torch.Tensor):
"""
Base, abstract class for all caches. The actual data structure is specific to each subclass.
"""
def __init__(self):
super().__init__()
@staticmethod
def __new__(cls, *args, **kwargs):
# We use a tensor wrapper to allow for torch script tracing when using the cache as an input in a forward method
wrapper_kwargs = {}
init_signature = inspect.signature(cls.__init__)
init_arguments = list(init_signature.parameters.keys())
init_defaults = {
k: v.default for k, v in init_signature.parameters.items() if v.default is not inspect.Parameter.empty
}
for argument in ["dtype", "device"]:
if argument in init_arguments:
arg_idx = init_arguments.index(argument)
if len(args) > arg_idx and args[arg_idx] is not None:
wrapper_kwargs[argument] = args[arg_idx]
elif kwargs.get(argument, None) is not None:
wrapper_kwargs[argument] = kwargs[argument]
elif init_defaults[argument] is not None:
wrapper_kwargs[argument] = init_defaults[argument]
if "cache_config" in init_arguments:
cache_config_idx = init_arguments.index("cache_config")
if len(args) > cache_config_idx and args[cache_config_idx] is not None:
wrapper_kwargs["device"] = args[cache_config_idx].device
elif kwargs.get("cache_config", None) is not None:
wrapper_kwargs["device"] = kwargs["cache_config"].device
elif init_defaults["cache_config"] is not None:
wrapper_kwargs["device"] = init_defaults["cache_config"].device
self = torch.Tensor._make_wrapper_subclass(cls, (), **wrapper_kwargs, requires_grad=False)
# we create a dummy empty tensor for generic tensor flattening/unflattening
self._empty_tensor = torch.tensor([], **wrapper_kwargs, requires_grad=False)
return self
@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs):
assert (
func.__name__ in cls.__dict__
), f"Class {cls.__name__} is a tensor wrapper and does not implement method {func.__name__}"
return getattr(cls, func.__name__)(*args, **kwargs)
def __repr__(self):
return f"{self.__class__.__name__}()"
def __bool__(self):
# in many places, past_key_values is checked for not being None using `if past_key_values:`
# I think `if past_key_values is not None:` should be used instead
return self is not None # True
def to(self, *args, **kwargs):
# originals
wrapper_kwargs = {"dtype": getattr(self, "dtype", None), "device": getattr(self, "device", None)}
# overrides
for arg in list(args) + list(kwargs.values()):
if isinstance(arg, (torch.device, str, int)):
wrapper_kwargs["device"] = arg
elif isinstance(arg, torch.dtype):
wrapper_kwargs["dtype"] = arg
# new wrapper
new_self = torch.Tensor._make_wrapper_subclass(self.__class__, (), **wrapper_kwargs)
new_self.__dict__ = {k: v for k, v in self.__dict__.items() if k not in ["device", "dtype"]}
return new_self
def clone(self):
wrapper_kwargs = {"dtype": getattr(self, "dtype", None), "device": getattr(self, "device", None)}
new_self = torch.Tensor._make_wrapper_subclass(self.__class__, (), **wrapper_kwargs, requires_grad=False)
new_self.__dict__ = copy.deepcopy(self.__dict__)
return new_self
def update(
self,
@ -304,7 +369,7 @@ class StaticCacheConfig(CacheConfig):
cache_implementation = "static"
def __init__(self, batch_size: int, max_cache_len: int, device="cpu"):
def __init__(self, batch_size: int, max_cache_len: int, device: Union[str, torch.device] = torch.device("cpu")):
self.batch_size = batch_size
self.max_cache_len = max_cache_len
self.device = device
@ -361,6 +426,16 @@ class DynamicCache(Cache):
```
"""
def __tensor_flatten__(self):
return ["_empty_tensor"], {"_seen_tokens": self._seen_tokens}
@staticmethod
def __tensor_unflatten__(inner_tensors, meta, _, __):
cache = DynamicCache()
cache._seen_tokens = meta["_seen_tokens"]
cache._empty_tensor = inner_tensors["_empty_tensor"]
return cache
@deprecate_kwarg("num_hidden_layers", version="4.47.0")
def __init__(self, num_hidden_layers: Optional[int] = None) -> None:
super().__init__()
@ -448,7 +523,7 @@ class DynamicCache(Cache):
or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it
or len(self.key_cache[layer_idx]) == 0 # the layer has no cache
)
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else torch.tensor(0)
return layer_seq_length
def get_max_cache_shape(self) -> Optional[int]:
@ -675,9 +750,6 @@ class QuantizedCache(DynamicCache):
self.axis_key = cache_config.axis_key
self.axis_value = cache_config.axis_value
self.compute_dtype = cache_config.compute_dtype
self.device = cache_config.device
super().__init__()
def update(
self,
@ -777,7 +849,7 @@ class QuantoQuantizedCache(QuantizedCache):
raise ImportError(
f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}."
)
from optimum.quanto import MaxOptimizer, qint2, qint4
from optimum.quanto import MaxOptimizer, qint2, qint4 # type: ignore
if self.nbits not in [2, 4]:
raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}")
@ -796,7 +868,7 @@ class QuantoQuantizedCache(QuantizedCache):
def _quantize(self, tensor, axis):
# We have two different API since in optimum-quanto, we don't use AffineQuantizer anymore
if is_optimum_quanto_available():
from optimum.quanto import quantize_weight
from optimum.quanto import quantize_weight # type: ignore
scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
@ -1105,7 +1177,7 @@ class StaticCache(Cache):
config: PretrainedConfig,
batch_size: int = None,
max_cache_len: int = None,
device: torch.device = None,
device: Union[torch.device, str] = torch.device("meta"),
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
@ -1116,7 +1188,6 @@ class StaticCache(Cache):
f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
"v4.49. Use the more precisely named 'max_batch_size' argument instead."
)
self.max_batch_size = batch_size or max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
@ -1125,8 +1196,6 @@ class StaticCache(Cache):
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
)
self.dtype = dtype
self.device = torch.device(device) if device is not None else torch.device("meta")
self.num_key_value_heads = (
config.num_attention_heads
if getattr(config, "num_key_value_heads", None) is None
@ -1144,18 +1213,10 @@ class StaticCache(Cache):
layer_device = self.device
new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
# Notes:
# 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
# breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
# it is not needed anyway)
# 2. `torch.export()` requires mutations to be registered as buffers.
if not is_torchdynamo_compiling():
self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
new_layer_key_cache = getattr(self, f"key_cache_{idx}")
new_layer_value_cache = getattr(self, f"value_cache_{idx}")
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
# Note: `mark_static_address` is used to tag the cache as a fixed data pointer,
# preventing compiled graph breaks when updating the cache.
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
self.value_cache.append(new_layer_value_cache)
@ -1304,7 +1365,7 @@ class SlidingWindowCache(StaticCache):
config: PretrainedConfig,
batch_size: int = None,
max_cache_len: int = None,
device: torch.device = None,
device: Union[torch.device, str] = torch.device("meta"),
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
@ -1619,7 +1680,7 @@ class HybridCache(Cache):
config: PretrainedConfig,
batch_size: int = None,
max_cache_len: int = None,
device: Union[torch.device, str] = None,
device: Union[torch.device, str] = torch.device("meta"),
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
@ -1648,7 +1709,6 @@ class HybridCache(Cache):
config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
)
self.device = torch.device(device) if device is not None else torch.device("meta")
layer_switch = config.sliding_window_pattern if hasattr(config, "sliding_window_pattern") else 2 # 2 is for BC
self.is_sliding = torch.tensor(
[bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)], dtype=torch.bool
@ -1781,7 +1841,7 @@ class HybridCache(Cache):
return self.max_batch_size
class MambaCache:
class MambaCache(Cache):
"""
Cache for mamba model which does not have attention mechanism and key value states.
@ -1838,7 +1898,7 @@ class MambaCache:
config: PretrainedConfig,
batch_size: int = None,
dtype: torch.dtype = torch.float16,
device: Optional[Union[torch.device, str]] = None,
device: Union[torch.device, str] = torch.device("meta"),
max_batch_size: Optional[int] = None,
):
if batch_size is not None:
@ -1846,12 +1906,10 @@ class MambaCache:
f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
"v4.49. Use the more precisely named 'max_batch_size' argument instead."
)
self.dtype = dtype
self.max_batch_size = batch_size or max_batch_size
self.intermediate_size = config.intermediate_size
self.ssm_state_size = config.state_size
self.conv_kernel_size = config.conv_kernel
self.device = torch.device(device) if device is not None else torch.device("meta")
self.conv_states: List[torch.Tensor] = []
self.ssm_states: List[torch.Tensor] = []
@ -1981,17 +2039,14 @@ class OffloadedStaticCache(StaticCache):
config: PretrainedConfig,
max_batch_size: int,
max_cache_len: Optional[int],
device: Union[str, torch.device],
dtype: Optional[torch.dtype] = None,
device: Union[torch.device, str] = torch.device("meta"),
dtype: torch.dtype = torch.float32,
offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super(Cache, self).__init__()
self.max_batch_size = max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
self.device = torch.device(device) if layer_device_map is None else torch.device(layer_device_map[0])
self.offload_device = torch.device(offload_device)
self.dtype = dtype if dtype is not None else torch.float32
# Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
head_dim = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
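The diff above turns the `Cache` base class into a `torch.Tensor` wrapper subclass: `__new__` builds a storage-less tensor through `torch.Tensor._make_wrapper_subclass` that only carries `dtype`/`device` metadata, `__torch_dispatch__` rejects tensor ops the cache does not implement, and `__tensor_flatten__`/`__tensor_unflatten__` let `torch.export`/dynamo trace the object like a regular tensor input. A minimal, self-contained sketch of that pattern (an illustrative `TensorWrapper`, not the transformers implementation) could look roughly like this:
```python
import torch


class TensorWrapper(torch.Tensor):
    """Toy storage-less tensor subclass carrying only dtype/device metadata."""

    @staticmethod
    def __new__(cls, device=None, dtype=None):
        # shape () and no real storage: the wrapper only records metadata
        self = torch.Tensor._make_wrapper_subclass(
            cls, (), dtype=dtype or torch.float32, device=device or "cpu", requires_grad=False
        )
        # dummy inner tensor so the flatten/unflatten protocol has something to carry
        self._empty_tensor = torch.tensor([], dtype=self.dtype, device=self.device)
        return self

    # traceable-subclass protocol used by torch.compile / torch.export
    def __tensor_flatten__(self):
        return ["_empty_tensor"], {}

    @staticmethod
    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
        inner = inner_tensors["_empty_tensor"]
        obj = TensorWrapper(device=inner.device, dtype=inner.dtype)
        obj._empty_tensor = inner
        return obj

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        # refuse tensor ops the wrapper does not explicitly implement
        raise NotImplementedError(f"{cls.__name__} does not implement {func}")


w = TensorWrapper(device="cpu", dtype=torch.float16)
print(w.dtype, w.device)  # torch.float16 cpu
```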

View File

@ -731,6 +731,7 @@ class GenerationMixin:
key != "cache_position"
and dict_to_expand[key] is not None
and isinstance(dict_to_expand[key], torch.Tensor)
and not isinstance(dict_to_expand[key], Cache)
):
dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
return dict_to_expand
@ -4519,13 +4520,13 @@ def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int =
"""
if data is None:
return [None] * (full_batch_size // split_size)
if isinstance(data, torch.Tensor):
return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
# New cache format
elif isinstance(data, DynamicCache) or (
isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
):
return data.batch_split(full_batch_size, split_size, num_hidden_layers)
if isinstance(data, torch.Tensor):
return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
elif isinstance(data, tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0], tuple):
@ -4632,13 +4633,13 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf
"""
if any(data is None for data in data):
return None
if isinstance(data[0], torch.Tensor):
return torch.cat(data, dim=0)
# New cache format
elif isinstance(data[0], DynamicCache):
if isinstance(data[0], DynamicCache):
return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
elif isinstance(data[0], EncoderDecoderCache):
return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
elif isinstance(data[0], torch.Tensor):
return torch.cat(data, dim=0)
elif isinstance(data[0], tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0][0], tuple):
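Because the cache classes now subclass `torch.Tensor`, the order of the `isinstance` checks reshuffled above matters: the generic tensor branch would also match a cache, so the cache branches have to run first. A tiny sketch of the pitfall, using a hypothetical `TensorBackedCache` stand-in rather than the real classes:
```python
import torch


class TensorBackedCache(torch.Tensor):
    """Hypothetical stand-in for a cache that is also a tensor subclass."""

    @staticmethod
    def __new__(cls):
        # storage-less wrapper, enough for isinstance checks
        return torch.Tensor._make_wrapper_subclass(cls, (), dtype=torch.float32, device="cpu")


def describe(data):
    # the cache check must come before the generic tensor check,
    # otherwise every cache would be treated as a plain tensor
    if isinstance(data, TensorBackedCache):
        return "cache"
    if isinstance(data, torch.Tensor):
        return "tensor"
    return "other"


print(describe(TensorBackedCache()))  # cache
print(describe(torch.zeros(2)))       # tensor
```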

View File

@ -16,10 +16,7 @@ from ..utils.import_utils import is_torch_available
if is_torch_available():
from transformers import (
PreTrainedModel,
StaticCache,
)
from transformers import PreTrainedModel, StaticCache
from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_3
@ -68,6 +65,8 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
)
self.model = model
self.is_causal = any("CausalLM" in arch for arch in self.model.config.architectures)
self.static_cache = StaticCache(
config=self.model.config,
batch_size=self.model.generation_config.cache_config.batch_size,
@ -75,14 +74,13 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
dtype=self.model.dtype,
device=self.model.generation_config.cache_config.device,
)
self.is_causal = any("CausalLM" in arch for arch in self.model.config.architectures)
for i in range(len(self.static_cache.key_cache)):
self.register_buffer(f"key_cache_{i}", self.static_cache.key_cache[i], persistent=False)
self.register_buffer(f"value_cache_{i}", self.static_cache.value_cache[i], persistent=False)
if self.is_causal:
causal_mask = torch.tril(
torch.ones(
self.static_cache.max_cache_len,
self.static_cache.max_cache_len,
dtype=torch.bool,
)
torch.ones(self.static_cache.max_cache_len, self.static_cache.max_cache_len, dtype=torch.bool)
)
self.register_buffer("mask", causal_mask, persistent=False)
@ -108,15 +106,20 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
ensuring that the exported model can be executed in `ExecuTorch` out-of-the-box.
"""
_, seqlen = input_ids.shape
attn_mask = self.mask[cache_position, :seqlen] if self.is_causal else None
position_ids = cache_position.unsqueeze(0)
past_key_values = self.static_cache
outs = self.model(
input_ids=input_ids,
attention_mask=attn_mask,
position_ids=cache_position.unsqueeze(0),
position_ids=position_ids,
past_key_values=past_key_values,
cache_position=cache_position,
past_key_values=self.static_cache,
use_cache=True,
)
return outs.logits
@staticmethod
@ -143,7 +146,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
prompt_token_len = prompt_token_ids.shape[-1]
max_generation_length = prompt_token_len + max_new_tokens
for buffer_name, buffer in exported_program.named_buffers():
if buffer_name.startswith("static_cache.key_cache"):
if buffer_name.startswith("key_cache"):
max_cache_len = buffer.shape[2]
max_generation_length = min(max_generation_length, max_cache_len)
break
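The rewritten export wrapper above registers the static cache tensors directly on the module, so they surface in the exported program under flat buffer names such as `key_cache_0` instead of `static_cache.key_cache_0`, and the maximum cache length is read back from the buffer shape. A hedged, toy-module sketch of that lookup (assuming a recent PyTorch with `torch.export`; `ToyCacheModule` is made up for illustration, not the transformers wrapper):
```python
import torch
from torch.export import export


class ToyCacheModule(torch.nn.Module):
    """Toy module registering cache buffers under flat names."""

    def __init__(self, max_cache_len=8, head_dim=4):
        super().__init__()
        self.register_buffer("key_cache_0", torch.zeros(1, 1, max_cache_len, head_dim), persistent=False)
        self.register_buffer("value_cache_0", torch.zeros(1, 1, max_cache_len, head_dim), persistent=False)

    def forward(self, x):
        # touch the cache so it is kept in the exported graph
        return x + self.key_cache_0.sum()


exported_program = export(ToyCacheModule(), (torch.zeros(2, 2),))
for buffer_name, buffer in exported_program.named_buffers():
    if buffer_name.startswith("key_cache"):
        print(buffer_name, "-> max_cache_len =", buffer.shape[2])
        break
```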

View File

@ -1294,7 +1294,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# This flag signal that the model can be used as an efficient backend in TGI and vLLM
# In practice, it means that they support attention interface functions, fully pass the kwargs
# through all modules up to the Attention layer, can slice logits with Tensor, and have a default TP plan
# through all modules up to the Attention layer, and can slice logits with Tensor
_supports_attention_backend = False
@property
@ -4037,9 +4037,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
sub_config = getattr(config, sub_config_key)
sub_config.torch_dtype = torch_dtype
elif isinstance(torch_dtype, torch.dtype):
for sub_config_key in config.sub_configs.keys():
sub_config = getattr(config, sub_config_key)
sub_config.torch_dtype = torch_dtype
pass
elif isinstance(torch_dtype, dict):
for key, curr_dtype in torch_dtype.items():
if hasattr(config, key):

View File

@ -303,6 +303,5 @@ from . import (
yolos,
yoso,
zamba,
zamba2,
zoedepth,
)

View File

@ -580,7 +580,7 @@ class _BaseAutoModelClass:
model_class ([`PreTrainedModel`]):
The model to register.
"""
if hasattr(model_class, "config_class") and model_class.config_class.__name__ != config_class.__name__:
if hasattr(model_class, "config_class") and str(model_class.config_class) != str(config_class):
raise ValueError(
"The model class you are passing has a `config_class` attribute that is not consistent with the "
f"config class you passed (model has {model_class.config_class} and you passed {config_class}. Fix "

View File

@ -335,7 +335,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("yolos", "YolosConfig"),
("yoso", "YosoConfig"),
("zamba", "ZambaConfig"),
("zamba2", "Zamba2Config"),
("zoedepth", "ZoeDepthConfig"),
]
)
@ -681,7 +680,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("yolos", "YOLOS"),
("yoso", "YOSO"),
("zamba", "Zamba"),
("zamba2", "Zamba2"),
("zoedepth", "ZoeDepth"),
]
)

View File

@ -303,7 +303,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("yolos", "YolosModel"),
("yoso", "YosoModel"),
("zamba", "ZambaModel"),
("zamba2", "Zamba2Model"),
]
)
@ -578,7 +577,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("xlnet", "XLNetLMHeadModel"),
("xmod", "XmodForCausalLM"),
("zamba", "ZambaForCausalLM"),
("zamba2", "Zamba2ForCausalLM"),
]
)
@ -1057,7 +1055,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("xmod", "XmodForSequenceClassification"),
("yoso", "YosoForSequenceClassification"),
("zamba", "ZambaForSequenceClassification"),
("zamba2", "Zamba2ForSequenceClassification"),
]
)

View File

@ -583,13 +583,6 @@ else:
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"zamba2",
(
"LlamaTokenizer" if is_sentencepiece_available() else None,
"LlamaTokenizerFast" if is_tokenizers_available() else None,
),
),
]
)

View File

@ -139,15 +139,6 @@ class CohereConfig(PretrainedConfig):
model_type = "cohere"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -139,15 +139,6 @@ class Cohere2Config(PretrainedConfig):
model_type = "cohere2"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -163,15 +163,6 @@ class Cohere2Config(PretrainedConfig):
model_type = "cohere2"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -57,7 +57,7 @@ class DbrxAttentionConfig(PretrainedConfig):
self.kv_n_heads = kv_n_heads
self.rope_theta = rope_theta
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:
@ -109,7 +109,7 @@ class DbrxFFNConfig(PretrainedConfig):
self.moe_loss_weight = moe_loss_weight
self.moe_normalize_expert_weights = moe_normalize_expert_weights
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:

View File

@ -675,8 +675,6 @@ class DbrxExperts(nn.Module):
v1_chunked = [v1.squeeze(dim=0) for v1 in v1_chunked]
w2_chunked = [w2.squeeze(dim=0) for w2 in w2_chunked]
for expert_idx in range(0, self.moe_num_experts):
# (This cause torch.compile to fail with `torch._dynamo.exc.Unsupported: dynamic shape operator: aten.nonzero.default`)
# (set torch._dynamo.config.capture_dynamic_output_shape_ops = True may help but not tested)
topk_idx, token_idx = torch.where(expert_mask[expert_idx])
if token_idx.shape[0] == 0:
continue
@ -833,6 +831,7 @@ class DbrxPreTrainedModel(PreTrainedModel):
_supports_sdpa = True
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
def _init_weights(self, module: nn.Module):
std = self.config.initializer_range

View File

@ -139,11 +139,6 @@ class DPTImageProcessor(BaseImageProcessor):
size_divisor (`int`, *optional*):
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
DINOv2 paper, which uses the model in combination with DPT.
do_reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
`preprocess` method.
"""
model_input_names = ["pixel_values"]
@ -162,7 +157,6 @@ class DPTImageProcessor(BaseImageProcessor):
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = False,
size_divisor: int = None,
do_reduce_labels: bool = False,
**kwargs,
) -> None:
super().__init__(**kwargs)
@ -180,7 +174,6 @@ class DPTImageProcessor(BaseImageProcessor):
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_pad = do_pad
self.size_divisor = size_divisor
self.do_reduce_labels = do_reduce_labels
def resize(
self,
@ -282,160 +275,10 @@ class DPTImageProcessor(BaseImageProcessor):
return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format)
# Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
def reduce_label(self, label: ImageInput) -> np.ndarray:
label = to_numpy_array(label)
# Avoid using underflow conversion
label[label == 0] = 255
label = label - 1
label[label == 254] = 255
return label
def _preprocess(
self,
image: ImageInput,
do_reduce_labels: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
keep_aspect_ratio: bool = None,
ensure_multiple_of: int = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = None,
size_divisor: int = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
if do_reduce_labels:
image = self.reduce_label(image)
if do_resize:
image = self.resize(
image=image,
size=size,
resample=resample,
keep_aspect_ratio=keep_aspect_ratio,
ensure_multiple_of=ensure_multiple_of,
input_data_format=input_data_format,
)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
if do_pad:
image = self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
return image
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
keep_aspect_ratio: bool = None,
ensure_multiple_of: int = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = None,
size_divisor: int = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single image."""
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if do_rescale and is_scaled_image(image):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(image)
image = self._preprocess(
image,
do_reduce_labels=False,
do_resize=do_resize,
size=size,
resample=resample,
keep_aspect_ratio=keep_aspect_ratio,
ensure_multiple_of=ensure_multiple_of,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_pad=do_pad,
size_divisor=size_divisor,
input_data_format=input_data_format,
)
if data_format is not None:
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def _preprocess_segmentation_map(
self,
segmentation_map: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
keep_aspect_ratio: bool = None,
ensure_multiple_of: int = None,
do_reduce_labels: bool = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""Preprocesses a single segmentation map."""
# All transformations expect numpy arrays.
segmentation_map = to_numpy_array(segmentation_map)
# Add an axis to the segmentation maps for transformations.
if segmentation_map.ndim == 2:
segmentation_map = segmentation_map[None, ...]
added_dimension = True
input_data_format = ChannelDimension.FIRST
else:
added_dimension = False
if input_data_format is None:
input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
segmentation_map = self._preprocess(
image=segmentation_map,
do_reduce_labels=do_reduce_labels,
do_resize=do_resize,
size=size,
resample=resample,
keep_aspect_ratio=keep_aspect_ratio,
ensure_multiple_of=ensure_multiple_of,
do_normalize=False,
do_rescale=False,
input_data_format=input_data_format,
)
# Remove extra axis if added
if added_dimension:
segmentation_map = np.squeeze(segmentation_map, axis=0)
segmentation_map = segmentation_map.astype(np.int64)
return segmentation_map
# Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.__call__
def __call__(self, images, segmentation_maps=None, **kwargs):
# Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both
# be passed in as positional arguments.
return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
@filter_out_non_signature_kwargs()
def preprocess(
self,
images: ImageInput,
segmentation_maps: Optional[ImageInput] = None,
do_resize: bool = None,
size: int = None,
keep_aspect_ratio: bool = None,
@ -448,7 +291,6 @@ class DPTImageProcessor(BaseImageProcessor):
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = None,
size_divisor: int = None,
do_reduce_labels: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@ -460,8 +302,6 @@ class DPTImageProcessor(BaseImageProcessor):
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
segmentation_maps (`ImageInput`, *optional*):
Segmentation map to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@ -486,10 +326,6 @@ class DPTImageProcessor(BaseImageProcessor):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
@ -521,13 +357,9 @@ class DPTImageProcessor(BaseImageProcessor):
image_std = image_std if image_std is not None else self.image_std
do_pad = do_pad if do_pad is not None else self.do_pad
size_divisor = size_divisor if size_divisor is not None else self.size_divisor
do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels
images = make_list_of_images(images)
if segmentation_maps is not None:
segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
@ -545,47 +377,55 @@ class DPTImageProcessor(BaseImageProcessor):
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
images = [
self._preprocess_image(
image=img,
do_resize=do_resize,
do_rescale=do_rescale,
do_normalize=do_normalize,
do_pad=do_pad,
size=size,
resample=resample,
keep_aspect_ratio=keep_aspect_ratio,
ensure_multiple_of=ensure_multiple_of,
rescale_factor=rescale_factor,
image_mean=image_mean,
image_std=image_std,
size_divisor=size_divisor,
data_format=data_format,
input_data_format=input_data_format,
if do_rescale and is_scaled_image(images[0]):
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
for img in images
]
data = {"pixel_values": images}
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if segmentation_maps is not None:
segmentation_maps = [
self._preprocess_segmentation_map(
segmentation_map=segmentation_map,
do_reduce_labels=do_reduce_labels,
do_resize=do_resize,
if do_resize:
images = [
self.resize(
image=image,
size=size,
resample=resample,
keep_aspect_ratio=keep_aspect_ratio,
ensure_multiple_of=ensure_multiple_of,
input_data_format=input_data_format,
)
for segmentation_map in segmentation_maps
for image in images
]
data["labels"] = segmentation_maps
if do_rescale:
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_normalize:
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
if do_pad:
images = [
self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
for image in images
]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
# Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
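The rewritten `preprocess` above applies resize, rescale, normalize, and optional padding directly over the image list. A minimal usage sketch with a dummy image (argument values are illustrative):

```python
import numpy as np
from transformers import DPTImageProcessor

processor = DPTImageProcessor(do_pad=True, size_divisor=32)
image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # dummy HxWxC uint8 image
batch = processor(images=image, return_tensors="pt")
print(batch["pixel_values"].shape)  # typically (1, 3, 384, 384) with the default size
```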

View File

@ -93,15 +93,6 @@ class GemmaConfig(PretrainedConfig):
model_type = "gemma"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -117,15 +117,6 @@ class GemmaConfig(PretrainedConfig):
model_type = "gemma"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -97,15 +97,6 @@ class Gemma2Config(PretrainedConfig):
model_type = "gemma2"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -123,15 +123,6 @@ class Gemma2Config(PretrainedConfig):
model_type = "gemma2"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -85,14 +85,6 @@ class GlmConfig(PretrainedConfig):
model_type = "glm"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
"layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
}
def __init__(
self,

View File

@ -330,8 +330,6 @@ class GraniteMoeTopKGating(nn.Module):
) # [num_tokens, num_experts]
gates = zeros.scatter(1, top_k_indices, 1) # [num_tokens, num_experts]
expert_size = gates.long().sum(0) # [num_experts,]
# (This cause torch.compile to fail with `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`)
# (and `DataDependentOutputException`)
expert_size = expert_size.tolist()
# sort and group input tokens according to expert assignment
@ -843,6 +841,7 @@ class GraniteMoePreTrainedModel(PreTrainedModel):
_supports_sdpa = True
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
@ -1156,6 +1155,8 @@ class GraniteMoeModel(GraniteMoePreTrainedModel):
if attention_mask is not None and attention_mask.dim() == 4:
# in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
if attention_mask.max() != 0:
raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
causal_mask = attention_mask
else:
causal_mask = torch.full(

View File

@ -86,15 +86,6 @@ class HeliumConfig(PretrainedConfig):
model_type = "helium"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -868,7 +868,7 @@ class IdeficsGatedCrossAttentionLayer(nn.Module):
)
hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
# Fill in zeros for cross_attention hidden_states of tokens attending to no images
hidden_states = hidden_states.masked_fill((cross_attention_gate == 0)[:, :, None], 0.0)
hidden_states[cross_attention_gate == 0] = hidden_states[cross_attention_gate == 0].fill_(0)
hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
# Fully Connected
@ -917,6 +917,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
_no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
_supports_sdpa = True
_supports_cache_class = True
_supports_static_cache = True
def _init_weights(self, module):
# important: this ported version of Idefics isn't meant for training from scratch - only
@ -1154,7 +1155,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
elif position_ids is None:
position_ids = cache_position.unsqueeze(0)
if sum([x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]]) != 2:
if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2:
raise ValueError(
"Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
)
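The two gating lines above are behaviourally equivalent ways of zeroing out hidden states for tokens that attend to no image; `masked_fill` with a broadcast mask avoids data-dependent boolean indexing. A small sketch with toy shapes:

```python
import torch

hidden_states = torch.randn(2, 3, 4)                                      # (batch, seq, hidden)
cross_attention_gate = torch.tensor([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0]])   # (batch, seq)

a = hidden_states.clone()
a[cross_attention_gate == 0] = 0.0                                        # boolean-index fill

b = hidden_states.masked_fill((cross_attention_gate == 0)[:, :, None], 0.0)  # broadcast mask

print(torch.equal(a, b))  # True
```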

View File

@ -215,7 +215,6 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
def __init__(self, config, batch_size, dtype=torch.float16, device=None):
super().__init__()
self.dtype = dtype
self.layers_block_type = config.layers_block_type
self.has_previous_state = False # only used by mamba
intermediate_size = config.mamba_expand * config.hidden_size

View File

@ -216,8 +216,6 @@ class JetMoeTopKGating(nn.Module):
) # [num_tokens, num_experts]
gates = zeros.scatter(1, top_k_indices, 1) # [num_tokens, num_experts]
expert_size = gates.long().sum(0) # [num_experts,]
# (This cause torch.compile to fail with `torch._dynamo.exc.Unsupported: Backend compiler failed with a fake tensor exception at`)
# (and `DataDependentOutputException`)
expert_size = expert_size.tolist()
# sort and group input tokens according to expert assignment
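Both top-k gating hunks (GraniteMoe above and JetMoe here) touch the same pattern: a scatter-built one-hot routing mask followed by a `.tolist()` call that the removed comments flag as problematic for `torch.compile`. A toy sketch of the pattern with made-up sizes:

```python
import torch

num_tokens, num_experts, top_k = 4, 3, 2          # illustrative values
logits = torch.randn(num_tokens, num_experts)

top_k_logits, top_k_indices = logits.topk(top_k, dim=1)
zeros = torch.zeros_like(logits)
gates = zeros.scatter(1, top_k_indices, 1)        # one-hot routing mask [num_tokens, num_experts]
expert_size = gates.long().sum(0)                 # tokens routed to each expert

print(expert_size.tolist())                       # the data-dependent step mentioned in the removed comments
```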

View File

@ -109,16 +109,6 @@ class MixtralConfig(PretrainedConfig):
model_type = "mixtral"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.block_sparse_moe.gate": "colwise_rep", # we need to replicate here to correctly route experts
"layers.*.block_sparse_moe.experts.*.w1": "colwise",
"layers.*.block_sparse_moe.experts.*.w2": "rowwise",
"layers.*.block_sparse_moe.experts.*.w3": "colwise",
}
def __init__(
self,

View File

@ -106,15 +106,6 @@ class OlmoConfig(PretrainedConfig):
model_type = "olmo"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -89,15 +89,6 @@ class Olmo2Config(PretrainedConfig):
model_type = "olmo2"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.v_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -100,15 +100,6 @@ class Olmo2Config(OlmoConfig):
"""
model_type = "olmo2"
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.v_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -138,14 +138,6 @@ class PhiConfig(PretrainedConfig):
model_type = "phi"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.dense": "rowwise",
"layers.*.mlp.fc1": "colwise",
"layers.*.mlp.fc2": "rowwise",
}
def __init__(
self,

View File

@ -107,12 +107,6 @@ class Phi3Config(PretrainedConfig):
model_type = "phi3"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.qkv_proj": "colwise_rep", # we need to replicate here due to the slicing of qkv
"layers.*.self_attn.o_proj": "rowwise_rep", # we need to replicate here due to the slicing of qkv
"layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation
"layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation
}
def __init__(
self,

View File

@ -183,7 +183,7 @@ def convert_wav2vec2_bert_checkpoint(
with torch.no_grad():
outputs = hf_wav2vec(**inputs)
torch.testing.assert_close(original_output, outputs.last_hidden_state, rtol=5e-3, atol=5e-3)
torch.testing.assert_close(original_output, outputs.last_hidden_state, atol=5e-3, rtol=5e-3)
if __name__ == "__main__":

View File

@ -129,7 +129,6 @@ class ZambaHybridDynamicCache(DynamicCache):
"""
def __init__(self, config, batch_size, dtype=torch.float16, device=None):
self.dtype = dtype
self.layers_block_type = config.layers_block_type
self.has_previous_state = False # only used by mamba
self.intermediate_size = config.mamba_expand * config.hidden_size
@ -139,9 +138,7 @@ class ZambaHybridDynamicCache(DynamicCache):
self.conv_states = []
self.ssm_states = []
self.transformer_layers = []
self._modules = {}
self._parameters = {}
self._buffers = {}
for i in range(config.num_hidden_layers):
self.conv_states += [
torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype)
@ -272,6 +269,7 @@ class ZambaAttention(nn.Module):
layer_idx: int,
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[ZambaHybridDynamicCache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
@ -620,9 +618,11 @@ class ZambaAttentionDecoderLayer(nn.Module):
original_hidden_states: torch.Tensor,
layer_idx: int,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[ZambaHybridDynamicCache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
@ -635,6 +635,7 @@ class ZambaAttentionDecoderLayer(nn.Module):
layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, sequence_length)` where padding elements are indicated by 0.
position_ids (`torch.LongTensor`, *optional*): token positions of shape `(batch, seq_len)`. Used for positional encodings.
past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@ -651,9 +652,11 @@ class ZambaAttentionDecoderLayer(nn.Module):
hidden_states=hidden_states,
layer_idx=layer_idx,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
# feed-forward (MLP)
@ -682,12 +685,12 @@ class ZambaMambaDecoderLayer(nn.Module):
layer_idx: int = None,
attention_mask: Optional[torch.Tensor] = None,
causal_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[ZambaHybridDynamicCache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
transformer_hidden_states: Optional[torch.Tensor] = None,
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
@ -750,6 +753,7 @@ class ZambaHybridLayer(nn.Module):
layer_idx: int = None,
attention_mask: Optional[torch.Tensor] = None,
causal_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[ZambaHybridDynamicCache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
@ -779,6 +783,7 @@ class ZambaHybridLayer(nn.Module):
original_hidden_states=original_hidden_states,
layer_idx=layer_idx,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
@ -796,6 +801,7 @@ class ZambaHybridLayer(nn.Module):
hidden_states,
transformer_hidden_states=transformer_hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
@ -1099,6 +1105,7 @@ class ZambaModel(ZambaPreTrainedModel):
layer_idx,
attention_mask,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
@ -1111,6 +1118,7 @@ class ZambaModel(ZambaPreTrainedModel):
layer_idx=layer_idx,
attention_mask=attention_mask,
causal_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,

View File

@ -1,27 +0,0 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_zamba2 import *
from .modeling_zamba2 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -1,236 +0,0 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/zamba2/modular_zamba2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_zamba2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...configuration_utils import PretrainedConfig
class Zamba2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Zamba2Model`]. It is used to instantiate a
Zamba2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Zamba2 model.
[Zyphra/Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Zamba2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Zamba2Model`]
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with.
hidden_size (`int`, *optional*, defaults to 2560):
Dimension of the hidden representations.
num_hidden_layers (`int`, *optional*, defaults to 54):
Number of hidden layers in the model.
layers_block_type (`list`, *optional*):
List of layer types, which can be either "mamba" or "hybrid".
mamba_d_state (`int`, *optional*, defaults to 64): shape of the state space latents.
mamba_d_conv (`int`, *optional*, defaults to 4): Size of the convolution kernel.
mamba_expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
mamba_ngroups (`int`, *optional*, defaults to 1):
Number of groups for the evolution matrices of mamba 2.
time_step_min (`float`, *optional*, defaults to 0.001):
Minimum `time_step` used to bound `dt_proj.bias`.
time_step_max (`float`, *optional*, defaults to 0.1):
Maximum `time_step` used to bound `dt_proj.bias`.
time_step_floor (`float`, *optional*, defaults to 0.0001):
Minimum clamping value of the `dt_proj.bias` layer initialization.
time_step_limit (`tuple`, *optional*):
Accepted range of time step values.
n_mamba_heads (`int`, *optional*, defaults to 8):
Number of heads for the evolution matrices of mamba 2.
use_conv_bias (`bool`, *optional*, defaults to `True`):
Whether or not to use bias in the convolution layer of the mixer block.
chunk_size (`int`, *optional*, defaults to 256):
Size of the chunks that will comprise the sequence.
add_bias_linear (`bool`, *optional*, defaults to `False`):
Flag indicating whether or not to use bias in various layers
intermediate_size (`int`, *optional*, defaults to 4 * hidden_size):
Dimension of the MLP representations.
hidden_act (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the MLP.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA), otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf).
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
num_mem_blocks (`int`, *optional*, defaults to 1):
Number of unshared transformer blocks.
use_shared_attention_adapter (`bool`, *optional*, defaults to `False`):
If True, unshared adapters (formally the same as LoRA but used in the base model) will be added to the q, k, v projectors in the shared attention layers.
adapter_rank (`int`, *optional*, defaults to 128):
Rank of the adapter in the shared MLP and shared attention layers.
use_mem_rope (`bool`, *optional*, defaults to `False`):
If True, includes RoPE in the shared attention layers.
rope_theta (`float`, *optional*, defaults to `10000.0`):
The base period of the RoPE embeddings.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
sequence may use a lot of memory, so setting `num_logits_to_keep=1` will reduce the memory footprint
significantly.
pad_token_id (`int`, *optional*, defaults to 0):
The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 1):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
use_long_context (`bool`, *optional*, defaults to `False`):
Activates the context-extended version of Zamba by modifying RoPE.
```python
>>> from transformers import Zamba2Model, Zamba2Config
>>> # Initializing a Zamba2-2.7B style configuration
>>> configuration = Zamba2Config()
>>> # Initializing a model from the Zamba2-2.7B style configuration
>>> model = Zamba2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "zamba2"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
max_position_embeddings=4096,
hidden_size=2560,
num_hidden_layers=54,
layers_block_type=None,
mamba_d_state=64,
mamba_d_conv=4,
mamba_expand=2,
mamba_ngroups=1,
time_step_min=0.001,
time_step_max=0.1,
time_step_floor=1e-4,
time_step_limit=None,
n_mamba_heads=8,
use_conv_bias=True,
chunk_size=256,
add_bias_linear=False,
intermediate_size=None,
hidden_act="gelu",
num_attention_heads=32,
num_key_value_heads=None,
attention_dropout=0.0,
num_mem_blocks=1,
use_shared_attention_adapter=False,
adapter_rank=128,
use_mem_rope=False,
rope_theta=10000,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=True,
num_logits_to_keep=1,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
use_long_context=False,
**kwargs,
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
if intermediate_size is None:
self.intermediate_size = 4 * hidden_size
else:
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_mem_blocks = num_mem_blocks
self.attention_hidden_size = 2 * hidden_size
self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads
self.attention_dropout = attention_dropout
self.use_mem_rope = use_mem_rope
self.use_long_context = use_long_context
if use_mem_rope and use_long_context:
a = 8
rope_theta = rope_theta * a ** (self.attention_head_dim / (self.attention_head_dim - 2))
self.rope_theta = rope_theta
self.mamba_d_state = mamba_d_state
self.mamba_d_conv = mamba_d_conv
self.mamba_expand = mamba_expand
self.add_bias_linear = add_bias_linear
self.mamba_ngroups = mamba_ngroups
self.n_mamba_heads = n_mamba_heads
self.mamba_headdim = int(mamba_expand * hidden_size) // n_mamba_heads
self.use_conv_bias = use_conv_bias
self.chunk_size = chunk_size
self.time_step_limit = time_step_limit
self.use_shared_attention_adapter = use_shared_attention_adapter
self.adapter_rank = adapter_rank
self.time_step_min = time_step_min
self.time_step_max = time_step_max
self.time_step_floor = time_step_floor
if use_long_context:
self.max_position_embeddings = 16384
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.num_attention_heads = num_attention_heads
self.kv_channels = self.hidden_size // self.num_attention_heads
self.num_query_groups = self.num_attention_heads
# Below, "mamba" stands for mamba layer, "hybrid" stands for hybrid layer (composed by a shared transformer followed by mamba layer)
if layers_block_type is None:
self.layers_block_type = (
["mamba"]
+ (["mamba"] * 5 + ["hybrid"]) * 7
+ ["mamba"] * 4
+ ["hybrid"]
+ ["mamba"] * 3
+ ["hybrid"]
+ ["mamba"] * 2
)
else:
self.layers_block_type = layers_block_type
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.num_logits_to_keep = num_logits_to_keep
self.hybrid_layer_ids = [index for index, type in enumerate(self.layers_block_type) if type == "hybrid"]
__all__ = ["Zamba2Config"]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1148,9 +1148,6 @@ class Pipeline(_ScikitCompat, PushToHubMixin):
elif self.device.type == "musa":
with torch.musa.device(self.device):
yield
elif self.device.type == "xpu":
with torch.xpu.device(self.device):
yield
else:
yield

View File

@ -189,9 +189,9 @@ class ImageClassificationPipeline(Pipeline):
def postprocess(self, model_outputs, function_to_apply=None, top_k=5):
if function_to_apply is None:
if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
if self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels == 1:
function_to_apply = ClassificationFunction.SIGMOID
elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels > 1:
function_to_apply = ClassificationFunction.SOFTMAX
elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
function_to_apply = self.model.config.function_to_apply

View File

@ -358,7 +358,5 @@ def translate_to_torch_parallel_style(style: str):
return RowwiseParallel()
elif style == "colwise_rep":
return ColwiseParallel(output_layouts=Replicate())
elif style == "rowwise_rep":
return RowwiseParallel(input_layouts=Replicate())
else:
raise ValueError(f"Unsupported parallel style value: {style}")

View File

@ -1435,7 +1435,6 @@ def set_model_tester_for_less_flaky_test(test_case):
# TODO (if possible): Avoid exceptional cases
exceptional_classes = [
"ZambaModelTester",
"Zamba2ModelTester",
"RwkvModelTester",
"AriaVisionText2TextModelTester",
"GPTNeoModelTester",

View File

@ -10576,34 +10576,6 @@ class ZambaPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
class Zamba2ForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Zamba2ForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Zamba2Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Zamba2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ZoeDepthForDepthEstimation(metaclass=DummyObject):
_backends = ["torch"]

View File

@ -35,7 +35,7 @@ from torch.fx._symbolic_trace import is_fx_tracing
from torch.fx.proxy import ParameterProxy
from .. import logging
from ..cache_utils import Cache, DynamicCache, SinkCache, StaticCache
from ..cache_utils import Cache
from ..modeling_utils import PretrainedConfig, PreTrainedModel
from ..models.auto import get_values
from ..models.auto.modeling_auto import (
@ -811,40 +811,40 @@ def _proxies_to_metas(v):
return v
def create_cache_proxy_factory_fn(orig_cache_cls: Type[Cache]) -> Callable[[Node], HFCacheProxy]:
def cache_proxy_factory_fn(n: Node) -> HFCacheProxy:
global _CURRENT_TRACER
if not isinstance(_CURRENT_TRACER, HFTracer):
raise RuntimeError("Cannot create HFCacheProxy because there is no HFTracer currently tracing.")
cache_proxy = HFCacheProxy(n, _CURRENT_TRACER)
cache_proxy.install_orig_cache_cls(orig_cache_cls)
return cache_proxy
# def create_cache_proxy_factory_fn(orig_cache_cls: Type[Cache]) -> Callable[[Node], HFCacheProxy]:
# def cache_proxy_factory_fn(n: Node) -> HFCacheProxy:
# global _CURRENT_TRACER
# if not isinstance(_CURRENT_TRACER, HFTracer):
# raise RuntimeError("Cannot create HFCacheProxy because there is no HFTracer currently tracing.")
# cache_proxy = HFCacheProxy(n, _CURRENT_TRACER)
# cache_proxy.install_orig_cache_cls(orig_cache_cls)
# return cache_proxy
return cache_proxy_factory_fn
# return cache_proxy_factory_fn
# Proxyable equivalent of the cache classes defined in `transformers.cache_utils`.
ProxyableCache = HFProxyableClassMeta(
"ProxyableCache", (Cache,), {}, proxy_factory_fn=create_cache_proxy_factory_fn(Cache)
)
ProxyableDynamicCache = HFProxyableClassMeta(
"ProxyableDynamicCache",
(DynamicCache,),
{},
proxy_factory_fn=create_cache_proxy_factory_fn(DynamicCache),
)
ProxyableSinkCache = HFProxyableClassMeta(
"ProxyableSinkCache",
(SinkCache,),
{},
proxy_factory_fn=create_cache_proxy_factory_fn(SinkCache),
)
ProxyableStaticCache = HFProxyableClassMeta(
"ProxyableStaticCache",
(StaticCache,),
{},
proxy_factory_fn=create_cache_proxy_factory_fn(StaticCache),
)
# # Proxyable equivalent of the cache classes defined in `transformers.cache_utils`.
# ProxyableCache = HFProxyableClassMeta(
# "ProxyableCache", (Cache,), {}, proxy_factory_fn=create_cache_proxy_factory_fn(Cache)
# )
# ProxyableDynamicCache = HFProxyableClassMeta(
# "ProxyableDynamicCache",
# (DynamicCache,),
# {},
# proxy_factory_fn=create_cache_proxy_factory_fn(DynamicCache),
# )
# ProxyableSinkCache = HFProxyableClassMeta(
# "ProxyableSinkCache",
# (SinkCache,),
# {},
# proxy_factory_fn=create_cache_proxy_factory_fn(SinkCache),
# )
# ProxyableStaticCache = HFProxyableClassMeta(
# "ProxyableStaticCache",
# (StaticCache,),
# {},
# proxy_factory_fn=create_cache_proxy_factory_fn(StaticCache),
# )
def _generate_random_int(low: int = 10, high: int = 20, forbidden_values: Optional[List[int]] = None):
@ -879,10 +879,10 @@ class HFTracer(Tracer):
"tril",
]
_CLASSES_TO_PATCH = {
Cache: ProxyableCache,
DynamicCache: ProxyableDynamicCache,
SinkCache: ProxyableSinkCache,
StaticCache: ProxyableStaticCache,
# Cache: ProxyableCache,
# DynamicCache: ProxyableDynamicCache,
# SinkCache: ProxyableSinkCache,
# StaticCache: ProxyableStaticCache,
}
supported_archs = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel)

View File

@ -47,7 +47,7 @@ class AgentAudioTests(unittest.TestCase):
path = str(agent_type.to_string())
# Ensure that the tensor and the agent_type's tensor are the same
torch.testing.assert_close(tensor, agent_type.to_raw(), rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(tensor, agent_type.to_raw(), atol=1e-4))
del agent_type
@ -56,7 +56,7 @@ class AgentAudioTests(unittest.TestCase):
# Ensure that the file contains the same value as the original tensor
new_tensor, _ = sf.read(path)
torch.testing.assert_close(tensor, torch.tensor(new_tensor), rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(tensor, torch.tensor(new_tensor), atol=1e-4))
def test_from_string(self):
tensor = torch.rand(12, dtype=torch.float64) - 0.5
@ -65,7 +65,7 @@ class AgentAudioTests(unittest.TestCase):
agent_type = AgentAudio(path)
torch.testing.assert_close(tensor, agent_type.to_raw(), rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(tensor, agent_type.to_raw(), atol=1e-4))
self.assertEqual(agent_type.to_string(), path)
@ -78,7 +78,7 @@ class AgentImageTests(unittest.TestCase):
path = str(agent_type.to_string())
# Ensure that the tensor and the agent_type's tensor are the same
torch.testing.assert_close(tensor, agent_type._tensor, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(tensor, agent_type._tensor, atol=1e-4))
self.assertIsInstance(agent_type.to_raw(), Image.Image)

View File

@ -66,7 +66,7 @@ class BetterTransformerIntegrationTest(unittest.TestCase):
)
output_from_pretrained = model_reloaded.generate(**inp)
torch.testing.assert_close(output, output_from_pretrained)
self.assertTrue(torch.allclose(output, output_from_pretrained))
def test_error_save_pretrained(self):
r"""

View File

@ -360,14 +360,14 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
model.config.max_position_embeddings, model.config.rotary_dim
)
self.assertFalse(torch.allclose(good_deepspeed_sin_cos, bad_deepspeed_sin_cos))
torch.testing.assert_close(good_torch_sin_cos, good_deepspeed_sin_cos.cpu())
self.assertTrue(torch.allclose(good_torch_sin_cos, good_deepspeed_sin_cos.cpu()))
# Finally, we can see that the incorrect pattern is okay on vanilla torch, demonstrating that this issue is
# exclusive to DeepSpeed
bad_torch_sin_cos = bad_deepspeed_create_sinusoidal_positions(
model.config.max_position_embeddings, model.config.rotary_dim
)
torch.testing.assert_close(bad_torch_sin_cos, good_torch_sin_cos)
self.assertTrue(torch.allclose(bad_torch_sin_cos, good_torch_sin_cos))
class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):

View File

@ -166,8 +166,8 @@ class LogitsProcessorTest(unittest.TestCase):
processed_scores = temp_dist_warper_smoother(input_ids, scores)
# uniform distribution stays uniform
torch.testing.assert_close(probs[0, :], warped_prob_sharp[0, :], rtol=1e-3, atol=1e-3)
torch.testing.assert_close(probs[0, :], warped_prob_smooth[0, :], rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3))
self.assertTrue(torch.allclose(probs[0, :], warped_prob_smooth[0, :], atol=1e-3))
# sharp peaks get higher, valleys get lower
self.assertLess(probs[1, :].max(), warped_prob_sharp[1, :].max())
@ -288,7 +288,7 @@ class LogitsProcessorTest(unittest.TestCase):
EXPECTED_FILTERED_DIST = torch.tensor(
[[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]], device=torch_device, dtype=torch.float
)
torch.testing.assert_close(filtered_dist, EXPECTED_FILTERED_DIST, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3))
# processor should not change logits in-place
self.assertFalse(torch.all(top_p_warp(input_ids, dist) == dist))
@ -335,7 +335,7 @@ class LogitsProcessorTest(unittest.TestCase):
device=torch_device,
dtype=torch.float,
)
torch.testing.assert_close(filtered_dist, EXPECTED_FILTERED_DIST, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3))
# processor should not change logits in-place
self.assertFalse(torch.all(min_p_warp(input_ids, dist) == dist))
@ -372,7 +372,7 @@ class LogitsProcessorTest(unittest.TestCase):
EXPECTED_FILTERED_DIST = torch.tensor(
[[0.97, 0.0, 0.0, 0.0], [0.0, 0.2, 0.2, 0.2]], device=torch_device, dtype=torch.float
)
torch.testing.assert_close(filtered_dist, EXPECTED_FILTERED_DIST, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3))
# processor should not change logits in-place
self.assertFalse(torch.all(typical_warp(input_ids, dist) == dist))
@ -422,7 +422,7 @@ class LogitsProcessorTest(unittest.TestCase):
EXPECTED_FILTERED_DIST = torch.tensor(
[[0.87, 0, 0, 0], [0.4, 0.299, 0.101, 0.2]], device=torch_device, dtype=torch.float
)
torch.testing.assert_close(filtered_dist, EXPECTED_FILTERED_DIST, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3))
# processor should not change logits in-place
self.assertFalse(torch.all(epsilon_warp(input_ids, dist) == dist))
@ -462,7 +462,7 @@ class LogitsProcessorTest(unittest.TestCase):
EXPECTED_FILTERED_DIST = torch.tensor(
[[0.0, 0.1, 0.8, 0.1], [0.0, 0.0, 0.9, 0.0]], device=torch_device, dtype=torch.float
)
torch.testing.assert_close(filtered_dist, EXPECTED_FILTERED_DIST, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3))
# processor should not change logits in-place
self.assertFalse(torch.all(eta_warp(input_ids, dist) == dist))
@ -599,7 +599,7 @@ class LogitsProcessorTest(unittest.TestCase):
# check edge case
no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=[[4]], eos_token_id=eos_token_id)
filtered_scores = no_bad_words_dist_proc(input_ids, scores)
torch.testing.assert_close(scores, filtered_scores, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(scores, filtered_scores, atol=1e-3))
def test_bias_dist_processor(self):
vocab_size = 5
@ -674,7 +674,7 @@ class LogitsProcessorTest(unittest.TestCase):
scores_comp = processor(input_ids, scores_comp)
# scores should be equal
torch.testing.assert_close(scores, scores_comp, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(scores, scores_comp, atol=1e-3))
# input_ids should never be changed
self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist())

View File

@ -1531,7 +1531,7 @@ class GenerationTesterMixin:
next_logits_with_padding = model(**model_kwargs).logits[:, -1, :]
# They should result in very similar logits
torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5)
torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, atol=1e-5, rtol=1e-5)
@pytest.mark.generate
def test_past_key_values_format(self):
@ -2279,7 +2279,6 @@ class GenerationTesterMixin:
"mamba",
"xlnet",
"zamba",
"zamba2",
)
has_standard_cache = not any(
model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache
@ -2709,7 +2708,7 @@ class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMi
transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices)
transition_scores_sum = transition_scores.sum(-1)
torch.testing.assert_close(transition_scores_sum, outputs.sequences_scores, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(transition_scores_sum, outputs.sequences_scores, atol=1e-3))
def test_beam_search_low_memory(self):
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

View File

@ -350,7 +350,7 @@ class AlbertModelIntegrationTest(unittest.TestCase):
[[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]]
)
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
@slow
def test_export(self):

View File

@ -651,4 +651,4 @@ class AlignModelIntegrationTest(unittest.TestCase):
torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
expected_logits = torch.tensor([[9.7093, 3.4679]], device=torch_device)
torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))

View File

@ -612,7 +612,7 @@ class AltCLIPModelIntegrationTest(unittest.TestCase):
probs = outputs.logits_per_image.softmax(dim=1)
expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device)
torch.testing.assert_close(probs, expected_probs, rtol=5e-3, atol=5e-3)
self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3))
@slow
def test_inference_interpolate_pos_encoding(self):
@ -651,6 +651,6 @@ class AltCLIPModelIntegrationTest(unittest.TestCase):
[[-0.3589, -0.5939, 0.3534], [0.4346, 0.1647, 0.7071], [1.1404, -0.4716, 0.1664]]
).to(torch_device)
torch.testing.assert_close(
outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)

View File

@ -239,7 +239,7 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
torch.testing.assert_close(out_embeds, out_ids)
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"

View File

@ -174,7 +174,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
feature_extractor = ASTFeatureExtractor()
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
self.assertEqual(input_values.shape, (1, 1024, 128))
torch.testing.assert_close(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

View File

@ -266,4 +266,4 @@ class ASTModelIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]).to(torch_device)
torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))

View File

@ -445,7 +445,7 @@ class AutoformerModelIntegrationTests(unittest.TestCase):
expected_slice = torch.tensor(
[[0.3593, -1.3398, 0.6330], [0.2279, 1.5396, -0.1792], [0.0450, 1.3225, -0.2335]], device=torch_device
)
torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE)
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
def test_inference_head(self):
model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly").to(torch_device)
@ -463,7 +463,7 @@ class AutoformerModelIntegrationTests(unittest.TestCase):
expected_slice = torch.tensor(
[[-0.0734, -0.9036, 0.8358], [4.7186, 2.4113, 1.9581], [1.7953, 2.3558, 1.2970]], device=torch_device
)
torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE)
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
def test_seq_to_seq_generation(self):
model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly").to(torch_device)
@ -481,4 +481,4 @@ class AutoformerModelIntegrationTests(unittest.TestCase):
expected_slice = torch.tensor([3130.6763, 4056.5293, 7053.0786], device=torch_device)
mean_prediction = outputs.sequences.mean(dim=1)
torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1)
self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))

View File

@ -312,11 +312,11 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
for name, param in model.named_parameters():
if param.requires_grad:
if "A_log" in name:
A = torch.arange(1, config.mamba_n_heads + 1, dtype=torch.float32)
torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
A = torch.arange(1, config.mamba_n_heads + 1, dtype=torch.float32)[None, :]
self.assertTrue(torch.allclose(param.data, torch.log(A), atol=1e-5, rtol=1e-5))
elif "D" in name:
D = torch.ones(config.mamba_n_heads, dtype=torch.float32)
torch.testing.assert_close(param.data, D, rtol=1e-5, atol=1e-5)
self.assertTrue(torch.allclose(param.data, D, atol=1e-5, rtol=1e-5))
else:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
@ -482,7 +482,7 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
next_logits_with_padding = model(**model_kwargs).logits[:, -1, :]
# They should result in very similar logits
torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5)
torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, atol=1e-5, rtol=1e-1)
@slow

View File

@ -599,7 +599,7 @@ class BarkSemanticModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Te
with torch.no_grad():
out_embeds = model(**inputs)[0]
torch.testing.assert_close(out_embeds, out_ids)
self.assertTrue(torch.allclose(out_embeds, out_ids))
@require_torch_fp16
def test_generate_fp16(self):
@ -688,7 +688,7 @@ class BarkCoarseModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
with torch.no_grad():
out_embeds = model(**inputs)[0]
torch.testing.assert_close(out_embeds, out_ids)
self.assertTrue(torch.allclose(out_embeds, out_ids))
@require_torch_fp16
def test_generate_fp16(self):
@ -1252,8 +1252,8 @@ class BarkModelIntegrationTests(unittest.TestCase):
self.assertEqual(tuple(audio_lengths), (output1.shape[1], output2.shape[1]))
# then assert almost equal
torch.testing.assert_close(outputs[0, : audio_lengths[0]], output1.squeeze(), rtol=2e-3, atol=2e-3)
torch.testing.assert_close(outputs[1, : audio_lengths[1]], output2.squeeze(), rtol=2e-3, atol=2e-3)
self.assertTrue(torch.allclose(outputs[0, : audio_lengths[0]], output1.squeeze(), atol=2e-3))
self.assertTrue(torch.allclose(outputs[1, : audio_lengths[1]], output2.squeeze(), atol=2e-3))
# now test single input with return_output_lengths = True
outputs, _ = self.model.generate(**s1, **args, return_output_lengths=True)

View File

@ -887,7 +887,7 @@ class BartModelIntegrationTests(unittest.TestCase):
expected_slice = torch.tensor(
[[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
@slow
def test_base_mask_filling(self):

View File

@ -634,7 +634,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
[[-3.2437, 0.5072, -13.9174], [-3.2456, 0.4948, -13.9401], [-3.2033, 0.5121, -13.8550]]
).to(torch_device)
torch.testing.assert_close(logits[bool_masked_pos][:3, :3], expected_slice, rtol=1e-2, atol=1e-2)
self.assertTrue(torch.allclose(logits[bool_masked_pos][:3, :3], expected_slice, atol=1e-2))
@slow
def test_inference_image_classification_head_imagenet_1k(self):
@ -655,7 +655,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor([-1.2385, -1.0987, -1.0108]).to(torch_device)
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4))
expected_class_idx = 281
self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
@ -681,7 +681,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor([1.6881, -0.2787, 0.5901]).to(torch_device)
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4))
expected_class_idx = 2396
self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
@ -727,7 +727,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
device=torch_device,
)
torch.testing.assert_close(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-4))
@slow
def test_post_processing_semantic_segmentation(self):

View File

@ -682,7 +682,7 @@ class BertModelIntegrationTest(unittest.TestCase):
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]])
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
@slow
def test_inference_no_head_relative_embedding_key(self):
@ -697,7 +697,7 @@ class BertModelIntegrationTest(unittest.TestCase):
[[[0.0756, 0.3142, -0.5128], [0.3761, 0.3462, -0.5477], [0.2052, 0.3760, -0.1240]]]
)
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
@slow
def test_inference_no_head_relative_embedding_key_query(self):
@ -712,7 +712,7 @@ class BertModelIntegrationTest(unittest.TestCase):
[[[0.6496, 0.3784, 0.8203], [0.8148, 0.5656, 0.2636], [-0.0681, 0.5597, 0.7045]]]
)
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
def test_sdpa_ignored_mask(self):
pkv = []

View File

@ -319,7 +319,7 @@ class BertGenerationEncoderIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor(
[[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@require_torch
@ -335,4 +335,4 @@ class BertGenerationDecoderIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor(
[[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))

View File

@ -674,12 +674,12 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
device=torch_device,
)
torch.testing.assert_close(
prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
self.assertTrue(
torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4)
)
expected_seq_relationship_logits = torch.tensor([[46.9465, 47.9517]], device=torch_device)
torch.testing.assert_close(seq_relationship_logits, expected_seq_relationship_logits, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4))
def test_inference_full_pretraining(self):
model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="original_full")
@ -703,12 +703,12 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
],
device=torch_device,
)
torch.testing.assert_close(
prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
self.assertTrue(
torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4)
)
expected_seq_relationship_logits = torch.tensor([[41.4503, 41.2406]], device=torch_device)
torch.testing.assert_close(seq_relationship_logits, expected_seq_relationship_logits, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4))
def test_block_sparse_attention_probs(self):
"""
@ -773,7 +773,7 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
cl = torch.einsum("bhqk,bhkd->bhqd", attention_probs, value_layer)
cl = cl.view(context_layer.size())
torch.testing.assert_close(context_layer, cl, rtol=0.001, atol=0.001)
self.assertTrue(torch.allclose(context_layer, cl, atol=0.001))
def test_block_sparse_context_layer(self):
model = BigBirdModel.from_pretrained(
@ -822,7 +822,7 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
context_layer = context_layer[0]
self.assertEqual(context_layer.shape, torch.Size((1, 128, 768)))
torch.testing.assert_close(context_layer[0, 64:78, 300:310], targeted_cl, rtol=0.0001, atol=0.0001)
self.assertTrue(torch.allclose(context_layer[0, 64:78, 300:310], targeted_cl, atol=0.0001))
def test_tokenizer_inference(self):
tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
@ -871,7 +871,7 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
device=torch_device,
)
torch.testing.assert_close(prediction[0, 52:64, 320:324], expected_prediction, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(prediction[0, 52:64, 320:324], expected_prediction, atol=1e-4))
def test_inference_question_answering(self):
tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-base-trivia-itc")
@ -923,8 +923,8 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
)
# fmt: on
torch.testing.assert_close(start_logits[:, 64:96], target_start_logits, rtol=1e-4, atol=1e-4)
torch.testing.assert_close(end_logits[:, 64:96], target_end_logits, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(start_logits[:, 64:96], target_start_logits, atol=1e-4))
self.assertTrue(torch.allclose(end_logits[:, 64:96], target_end_logits, atol=1e-4))
input_ids = inputs["input_ids"].tolist()
answer = [
@ -966,4 +966,4 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
# fmt: on
self.assertEqual(output.shape, torch.Size((1, 241, 768)))
torch.testing.assert_close(output[0, 64:78, 300:310], target, rtol=0.0001, atol=0.0001)
self.assertTrue(torch.allclose(output[0, 64:78, 300:310], target, atol=0.0001))

View File

@ -418,12 +418,12 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
with torch.no_grad():
logits_single_first = model(input_ids=input_ids[:1, :-chunk_length], labels=labels[:1]).logits
torch.testing.assert_close(logits_batched[0, -3:], logits_single_first[0, -3:], rtol=tolerance, atol=tolerance)
self.assertTrue(torch.allclose(logits_batched[0, -3:], logits_single_first[0, -3:], atol=tolerance))
with torch.no_grad():
logits_single_second = model(input_ids=input_ids[1:], labels=labels[1:, :-4]).logits
torch.testing.assert_close(logits_batched[1, :3], logits_single_second[0, :3], rtol=tolerance, atol=tolerance)
self.assertTrue(torch.allclose(logits_batched[1, :3], logits_single_second[0, :3], atol=tolerance))
def test_auto_padding(self):
ids = [[7, 6, 9] * 65]
@ -445,7 +445,7 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
"logits"
]
torch.testing.assert_close(output1, output2, rtol=1e-5, atol=1e-5)
self.assertTrue(torch.allclose(output1, output2, atol=1e-5))
def test_for_change_to_full_attn(self):
self.model_tester.seq_length = 9
@ -462,7 +462,7 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
model.load_state_dict(state_dict)
outputs2 = model(**input_dict)["logits"]
torch.testing.assert_close(outputs1, outputs2, rtol=1e-5, atol=1e-5)
self.assertTrue(torch.allclose(outputs1, outputs2, atol=1e-5))
@unittest.skip(
reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
@ -523,8 +523,8 @@ class BigBirdPegasusModelIntegrationTests(unittest.TestCase):
)
# fmt: on
torch.testing.assert_close(
prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
self.assertTrue(
torch.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4)
)
def test_inference_full_attn(self):
@ -544,8 +544,8 @@ class BigBirdPegasusModelIntegrationTests(unittest.TestCase):
device=torch_device,
)
# fmt: on
torch.testing.assert_close(
prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
self.assertTrue(
torch.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4)
)
def test_seq_to_seq_generation(self):

View File

@ -432,7 +432,7 @@ class BioGptModelIntegrationTest(unittest.TestCase):
[[[-9.5236, -9.8918, 10.4557], [-11.0469, -9.6423, 8.1022], [-8.8664, -7.8826, 5.5325]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_biogpt_generation(self):

View File

@ -296,7 +296,7 @@ class BitModelIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor([[-0.6526, -0.5263, -1.4398]]).to(torch_device)
torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
@require_torch

View File

@ -1431,5 +1431,5 @@ class BlipModelIntegrationTest(unittest.TestCase):
expected_scores = torch.Tensor([[0.0029, 0.9971]])
torch.testing.assert_close(torch.nn.Softmax()(out_itm[0].cpu()), expected_scores, rtol=1e-3, atol=1e-3)
torch.testing.assert_close(out[0].cpu(), torch.Tensor([[0.5162]]), rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(torch.nn.Softmax()(out_itm[0].cpu()), expected_scores, rtol=1e-3, atol=1e-3))
self.assertTrue(torch.allclose(out[0].cpu(), torch.Tensor([[0.5162]]), rtol=1e-3, atol=1e-3))

View File

@ -901,7 +901,7 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
next_logits_with_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :]
# They should result in very similar logits
torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5)
self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
@unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present")
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@ -2215,8 +2215,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
# verify
expected_scores = torch.Tensor([[0.0238, 0.9762]])
torch.testing.assert_close(torch.nn.Softmax()(out_itm[0].cpu()), expected_scores, rtol=1e-3, atol=1e-3)
torch.testing.assert_close(out[0].cpu(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3)
self.assertTrue(torch.allclose(torch.nn.Softmax()(out_itm[0].cpu()), expected_scores, rtol=1e-3, atol=1e-3))
self.assertTrue(torch.allclose(out[0].cpu(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3))
@require_torch_accelerator
@require_torch_fp16
@ -2235,8 +2235,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
# verify
expected_scores = torch.Tensor([[0.0239, 0.9761]])
torch.testing.assert_close(torch.nn.Softmax()(out_itm[0].cpu().float()), expected_scores, rtol=1e-3, atol=1e-3)
torch.testing.assert_close(out[0].cpu().float(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3)
self.assertTrue(
torch.allclose(torch.nn.Softmax()(out_itm[0].cpu().float()), expected_scores, rtol=1e-3, atol=1e-3)
)
self.assertTrue(torch.allclose(out[0].cpu().float(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3))
@require_torch_accelerator
@require_torch_fp16

View File

@ -689,4 +689,4 @@ class BridgeTowerModelTrainingTest(unittest.TestCase):
[[-0.6518, 0.4978, -0.4544], [-2.6672, -0.0843, -0.4210], [-2.4510, -0.1002, -0.3458]]
).to(torch_device)
torch.testing.assert_close(outputs.image_features[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))

View File

@ -452,4 +452,4 @@ class BrosModelIntegrationTest(unittest.TestCase):
).to(torch_device)
torch.set_printoptions(sci_mode=False)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))

View File

@ -60,7 +60,7 @@ class CamembertModelIntegrationTest(unittest.TestCase):
# camembert.eval()
# expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
@require_torch_sdpa
@ -81,4 +81,4 @@ class CamembertModelIntegrationTest(unittest.TestCase):
with torch.no_grad():
output = model(input_ids)["last_hidden_state"].detach()
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))

View File

@ -562,7 +562,7 @@ class CanineModelIntegrationTest(unittest.TestCase):
]
)
torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-2, atol=1e-2)
self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-2))
# verify pooled output
expected_shape = torch.Size((1, 768))
@ -570,4 +570,4 @@ class CanineModelIntegrationTest(unittest.TestCase):
expected_slice = torch.tensor([-0.884311497, -0.529064834, 0.723164916])
torch.testing.assert_close(outputs.pooler_output[0, :3], expected_slice, rtol=1e-2, atol=1e-2)
self.assertTrue(torch.allclose(outputs.pooler_output[0, :3], expected_slice, atol=1e-2))

View File

@ -320,7 +320,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
# Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
# maximum sequence length, so the outputs for the short input should match.
if scaling_type == "dynamic":
torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5)
self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
else:
self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))

Some files were not shown because too many files have changed in this diff.
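
Most of the hunks above toggle between two equivalent tolerance-based assertion idioms in the test suite: the older self.assertTrue(torch.allclose(...)) pattern and the torch.testing.assert_close(...) helper. The sketch below is not taken from the diff; it is a minimal, self-contained illustration of the two forms (the test class name and tensor values are invented for the example). The practical difference is in the failure report: assertTrue only prints "False is not true", while torch.testing.assert_close raises an error that includes the greatest absolute and relative mismatch.

# Illustrative sketch only; not part of the diff above.
import unittest

import torch


class ToleranceAssertionExample(unittest.TestCase):
    def test_close_tensors(self):
        expected = torch.tensor([1.0, 2.0, 3.0])
        actual = expected + 1e-4  # small numerical drift, as with real model outputs

        # Older idiom: boolean check wrapped in assertTrue. On failure the
        # message is just "False is not true", with no tensor diff shown.
        self.assertTrue(torch.allclose(actual, expected, atol=1e-3))

        # assert_close raises an AssertionError that reports the greatest
        # absolute and relative differences; rtol and atol must be given together.
        torch.testing.assert_close(actual, expected, rtol=1e-3, atol=1e-3)


if __name__ == "__main__":
    unittest.main()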