[GPT OSS] Refactor the tests as they were not properly checking the outputs (#40288)

* it was long overdue!

* use the official kernel

* more permissive

* update the kernel as well

* mmm should it be this?

* up pu

* fixup

* Update test_modeling_gpt_oss.py

* style

* start with 20b
Author: Arthur
Date: 2025-08-20 16:47:41 +02:00
Committed by: GitHub
Parent: 3b7230124b
Commit: 4977ec2ae8
2 changed files with 246 additions and 405 deletions


@@ -1,346 +1,122 @@
[
{
"quantized": true,
"model": "120b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
]
},
{
"quantized": true,
"model": "120b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
]
},
{
"quantized": true,
"model": "120b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "120b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "120b",
"kernels": false,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
]
},
{
"quantized": true,
"model": "120b",
"kernels": false,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
]
},
{
"quantized": true,
"model": "120b",
"kernels": true,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "120b",
"kernels": true,
"attn_impl": "eager",
"mode": "train",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "20b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": true,
"model": "20b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": true,
"model": "20b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "20b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "20b",
"kernels": false,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": true,
"model": "20b",
"kernels": false,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": true,
"model": "20b",
"kernels": true,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
"Did not work"
]
},
{
"quantized": true,
"model": "20b",
"kernels": true,
"attn_impl": "eager",
"mode": "train",
"outputs": [
"Did not work"
]
},
{
"quantized": false,
"model": "120b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
]
},
{
"quantized": false,
"model": "120b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
]
},
{
"quantized": false,
"model": "120b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
]
},
{
"quantized": false,
"model": "120b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
]
},
{
"quantized": false,
"model": "120b",
"kernels": false,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
]
},
{
"quantized": false,
"model": "120b",
"kernels": false,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
]
},
{
"quantized": false,
"model": "120b",
"kernels": true,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
]
},
{
"quantized": false,
"model": "120b",
"kernels": true,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
]
},
{
"quantized": false,
"model": "20b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": false,
"model": "20b",
"kernels": false,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
]
},
{
"quantized": false,
"model": "20b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": false,
"model": "20b",
"kernels": true,
"attn_impl": "ft-hf-o-c/vllm-flash-attn3",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
]
},
{
"quantized": false,
"model": "20b",
"kernels": false,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": false,
"model": "20b",
"kernels": false,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
]
},
{
"quantized": false,
"model": "20b",
"kernels": true,
"attn_impl": "eager",
"mode": "eval",
"outputs": [
".....Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
]
},
{
"quantized": false,
"model": "20b",
"kernels": true,
"attn_impl": "eager",
"mode": "train",
"outputs": [
".....Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
]
}
]
{
"quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
],
"quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
],
"quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Did not work"
],
"quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Did not work"
],
"quantized=true|model=120b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
],
"quantized=true|model=120b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
"How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
],
"quantized=true|model=120b|kernels=true|attn_impl=eager|mode=eval": [
"Did not work"
],
"quantized=true|model=120b|kernels=true|attn_impl=eager|mode=train": [
"Did not work"
],
"quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too.\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Did not work"
],
"quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Did not work"
],
"quantized=true|model=20b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=true|model=20b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=true|model=20b|kernels=true|attn_impl=eager|mode=eval": [
"Did not work"
],
"quantized=true|model=20b|kernels=true|attn_impl=eager|mode=train": [
"Did not work"
],
"quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
],
"quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
],
"quantized=false|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
],
"quantized=false|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
],
"quantized=false|model=120b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
],
"quantized=false|model=120b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
],
"quantized=false|model=120b|kernels=true|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
"How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
],
"quantized=false|model=120b|kernels=true|attn_impl=eager|mode=train": [
"Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
"How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
],
"quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
],
"quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
"Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
"How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
],
"quantized=false|model=20b|kernels=false|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=false|model=20b|kernels=false|attn_impl=eager|mode=train": [
"Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
],
"quantized=false|model=20b|kernels=true|attn_impl=eager|mode=eval": [
"Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
"How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
],
"quantized=false|model=20b|kernels=true|attn_impl=eager|mode=train": [
"Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
"How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
]
}
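
For reference, the lookup keys in the JSON above are produced by the `generate_config_key` helper added further down in this commit. A minimal sketch of building a key and fetching its expected outputs (the JSON filename here is illustrative; the tests load RESULTS_PATH):

import json

def generate_config_key(quantized, model, kernels, attn_impl, mode):
    # Booleans are lowercased and fields joined with "|", so each
    # (quantized, model, kernels, attn_impl, mode) combination maps to one entry.
    return f"quantized={str(quantized).lower()}|model={model}|kernels={str(kernels).lower()}|attn_impl={attn_impl}|mode={mode}"

with open("gpt_oss_expected_outputs.json") as f:  # illustrative filename
    expected_results = json.load(f)

key = generate_config_key(True, "120b", False, "kernels-community/vllm-flash-attn3", "eval")
# key == "quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval"
expected_outputs = expected_results[key]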


@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch GptOss model."""
import difflib
import inspect
import json
import os
@@ -194,6 +195,10 @@ def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import torch_device
def generate_config_key(quantized, model, kernels, attn_impl, mode):
"""Generate a key for the restructured integration test results."""
return f"quantized={str(quantized).lower()}|model={model}|kernels={str(kernels).lower()}|attn_impl={attn_impl}|mode={mode}"
input_text = [
"Roses are red, violets",
"How are you? Tell me the name of the president of",
@@ -204,7 +209,7 @@ def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
kernels = kernels.lower() == "true"
# Distributed model loading
model_id = f"/fsx/vb/new-oai/gpt-oss-{model_size}-trfs"
model_id = f"openai/gpt-oss-{model_size}"
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype="auto",
@@ -219,26 +224,54 @@ def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_texts = tokenizer.batch_decode(output, skip_special_tokens=False)
# Only rank 0 writes results
# Only rank 0 writes results and validates against expected outputs
if int(os.environ.get("RANK", "0")) == 0:
result_entry = {
"quantized": quantized,
"model": model_size,
"kernels": kernels,
"attn_impl": attn_impl,
"mode": mode,
"outputs": output_texts,
}
# Generate key to look up expected outputs
key = generate_config_key(quantized, model_size, kernels, attn_impl, mode)
# Load expected outputs from restructured JSON
if os.path.exists(RESULTS_PATH):
with open(RESULTS_PATH, "r") as f:
results = json.load(f)
else:
results = []
results.append(result_entry)
expected_results = json.load(f)
with open(RESULTS_PATH, "w") as f:
json.dump(results, f, indent=2)
# Check if we have expected results for this configuration
if key in expected_results:
expected_outputs = expected_results[key]
# Compare actual outputs with expected outputs
assert len(output_texts) == len(expected_outputs), f"Output length mismatch for {key}"
for i, (actual, expected) in enumerate(zip(output_texts, expected_outputs)):
actual_stripped = actual.strip()
expected_stripped = expected.strip()
# Make lengths match by taking minimum length to be resilient to generation differences
min_length = min(len(actual_stripped), len(expected_stripped))
actual_truncated = actual_stripped[:min_length]
expected_truncated = expected_stripped[:min_length]
if actual_truncated != expected_truncated:
diff = "\n".join(
difflib.unified_diff(
expected_truncated.splitlines(keepends=True),
actual_truncated.splitlines(keepends=True),
fromfile=f"expected[{i}]",
tofile=f"actual[{i}]",
lineterm="",
)
)
raise AssertionError(
f"Output mismatch at index {i} for {key}:\n"
f"Expected: '{expected_stripped}'\n"
f"Actual: '{actual_stripped}'\n"
f"Diff (truncated to min length {min_length}):\n{diff}"
)
print(f"✓ Outputs match expected results for {key}")
else:
print(f"Warning: No expected results found for configuration: {key}")
else:
print(f"Warning: Results file {RESULTS_PATH} not found")
@slow
@@ -249,6 +282,11 @@ class GptOssIntegrationTest(unittest.TestCase):
"How are you? Tell me the name of the president of",
]
@staticmethod
def generate_config_key(quantized, model, kernels, attn_impl, mode):
"""Generate a key for the restructured integration test results."""
return f"quantized={str(quantized).lower()}|model={model}|kernels={str(kernels).lower()}|attn_impl={attn_impl}|mode={mode}"
def setUp(self):
cleanup(torch_device, gc_collect=True)
@@ -271,7 +309,7 @@ class GptOssIntegrationTest(unittest.TestCase):
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
return output_text
# ------------------------
@@ -320,38 +358,38 @@ if __name__ == "__main__":
# Shared parameterization
# ------------------------
PARAMETERS = [
(False, "120b", False, "eager", "eval"),
(False, "120b", False, "eager", "train"),
(False, "120b", False, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(False, "120b", False, "ft-hf-o-c/vllm-flash-attn3", "train"),
(False, "120b", True, "eager", "eval"),
(False, "120b", True, "eager", "train"),
(False, "120b", True, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(False, "120b", True, "ft-hf-o-c/vllm-flash-attn3", "train"),
(True, "120b", False, "eager", "eval"),
(True, "120b", False, "eager", "train"),
(True, "120b", False, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(True, "120b", False, "ft-hf-o-c/vllm-flash-attn3", "train"),
(True, "120b", True, "eager", "eval"),
(True, "120b", True, "eager", "train"),
(True, "120b", True, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(True, "120b", True, "ft-hf-o-c/vllm-flash-attn3", "train"),
(False, "20b", False, "eager", "eval"),
(False, "20b", False, "eager", "train"),
(False, "20b", False, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(False, "20b", False, "ft-hf-o-c/vllm-flash-attn3", "train"),
(False, "20b", False, "kernels-community/vllm-flash-attn3", "eval"),
(False, "20b", False, "kernels-community/vllm-flash-attn3", "train"),
(False, "20b", True, "eager", "eval"),
(False, "20b", True, "eager", "train"),
(False, "20b", True, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(False, "20b", True, "ft-hf-o-c/vllm-flash-attn3", "train"),
(False, "20b", True, "kernels-community/vllm-flash-attn3", "eval"),
(False, "20b", True, "kernels-community/vllm-flash-attn3", "train"),
(True, "20b", False, "eager", "eval"),
(True, "20b", False, "eager", "train"),
(True, "20b", False, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(True, "20b", False, "ft-hf-o-c/vllm-flash-attn3", "train"),
(True, "20b", False, "kernels-community/vllm-flash-attn3", "eval"),
(True, "20b", False, "kernels-community/vllm-flash-attn3", "train"),
(True, "20b", True, "eager", "eval"),
(True, "20b", True, "eager", "train"),
(True, "20b", True, "ft-hf-o-c/vllm-flash-attn3", "eval"),
(True, "20b", True, "ft-hf-o-c/vllm-flash-attn3", "train"),
(True, "20b", True, "kernels-community/vllm-flash-attn3", "eval"),
(True, "20b", True, "kernels-community/vllm-flash-attn3", "train"),
(False, "120b", False, "eager", "eval"),
(False, "120b", False, "eager", "train"),
(False, "120b", False, "kernels-community/vllm-flash-attn3", "eval"),
(False, "120b", False, "kernels-community/vllm-flash-attn3", "train"),
(False, "120b", True, "eager", "eval"),
(False, "120b", True, "eager", "train"),
(False, "120b", True, "kernels-community/vllm-flash-attn3", "eval"),
(False, "120b", True, "kernels-community/vllm-flash-attn3", "train"),
(True, "120b", False, "eager", "eval"),
(True, "120b", False, "eager", "train"),
(True, "120b", False, "kernels-community/vllm-flash-attn3", "eval"),
(True, "120b", False, "kernels-community/vllm-flash-attn3", "train"),
(True, "120b", True, "eager", "eval"),
(True, "120b", True, "eager", "train"),
(True, "120b", True, "kernels-community/vllm-flash-attn3", "eval"),
(True, "120b", True, "kernels-community/vllm-flash-attn3", "train"),
]
# ------------------------
@@ -360,7 +398,7 @@ if __name__ == "__main__":
@parameterized.expand(PARAMETERS)
@require_read_token
def test_model_outputs(self, quantized, model, kernels, attn_impl, mode):
model_id = f"/fsx/vb/new-oai/gpt-oss-{model}-trfs"
model_id = f"openai/gpt-oss-{model}"
output_texts = self.load_and_forward(
model_id,
attn_impl,
@@ -368,23 +406,50 @@ if __name__ == "__main__":
use_kernels=kernels,
)
result_entry = {
"quantized": quantized,
"model": model,
"kernels": kernels,
"attn_impl": attn_impl,
"mode": mode,
"outputs": output_texts,
}
# Generate key to look up expected outputs
key = self.generate_config_key(quantized, model, kernels, attn_impl, mode)
# Load expected outputs from restructured JSON
if os.path.exists(RESULTS_PATH):
with open(RESULTS_PATH, "r") as f:
results = json.load(f)
else:
results = []
results.append(result_entry)
with open(RESULTS_PATH, "w") as f:
json.dump(results, f, indent=2)
expected_results = json.load(f)
# Check if we have expected results for this configuration
if key in expected_results:
expected_outputs = expected_results[key]
# Compare actual outputs with expected outputs
self.assertEqual(len(output_texts), len(expected_outputs), f"Output length mismatch for {key}")
for i, (actual, expected) in enumerate(zip(output_texts, expected_outputs)):
actual_stripped = actual.strip()
expected_stripped = expected.strip()
# Make lengths match by taking minimum length to be resilient to generation differences
min_length = min(len(actual_stripped), len(expected_stripped))
actual_truncated = actual_stripped[:min_length]
expected_truncated = expected_stripped[:min_length]
if actual_truncated != expected_truncated:
diff = "\n".join(
difflib.unified_diff(
expected_truncated.splitlines(keepends=True),
actual_truncated.splitlines(keepends=True),
fromfile=f"expected[{i}]",
tofile=f"actual[{i}]",
lineterm="",
)
)
self.fail(
f"Output mismatch at index {i} for {key}:\n"
f"Expected: '{expected_stripped}'\n"
f"Actual: '{actual_stripped}'\n"
f"Diff (truncated to min length {min_length}):\n{diff}"
)
else:
# If no expected results exist, this is a new configuration
# We could optionally add it to the results file here
print(f"Warning: No expected results found for configuration: {key}")
self.assertIsInstance(output_texts, list)
self.assertTrue(all(isinstance(x, str) for x in output_texts))
@@ -409,7 +474,7 @@ if __name__ == "__main__":
if quantized:
self.skipTest("Training test for quantized models is not supported.")
model_id = f"/fsx/vb/new-oai/gpt-oss-{model}-trfs"
model_id = f"openai/gpt-oss-{model}"
model_obj = AutoModelForCausalLM.from_pretrained(
model_id,
@@ -471,7 +536,7 @@ if __name__ == "__main__":
]
)
model_id = "/fsx/vb/new-oai/gpt-oss-20b-trfs"
model_id = "openai/gpt-oss-20b"
model = AutoModelForCausalLM.from_pretrained(
model_id,
@@ -537,7 +602,7 @@ I am a language model, not a human being"""
]
)
model_id = "/fsx/vb/new-oai/gpt-oss-120b-trfs"
model_id = "openai/gpt-oss-120b"
model = AutoModelForCausalLM.from_pretrained(
model_id,
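
To run these integration checks locally, something along these lines should work. The test-file path is an assumption about the repository layout, and RUN_SLOW=1 is the standard switch the transformers test suite uses to enable tests marked @slow:

import os
import pytest

os.environ["RUN_SLOW"] = "1"  # transformers skips @slow-marked tests unless this is set
raise SystemExit(pytest.main([
    "tests/models/gpt_oss/test_modeling_gpt_oss.py",  # assumed path
    "-k", "test_model_outputs",
]))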