Compare commits

...

1 Commits

Author SHA1 Message Date
aa7e793a8e Update (base update)
[ghstack-poisoned]
2025-08-27 19:17:52 -07:00
10 changed files with 426 additions and 34 deletions

View File

@ -0,0 +1,223 @@
from __future__ import annotations
import logging
import os
import textwrap
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import TYPE_CHECKING
from cli.lib.common.utils import get_wheels
from jinja2 import Template
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
logger = logging.getLogger(__name__)
# ---- Template (title + per-command failures) ----
# Failures grouped per test command; the `if section.failures` filter in the
# outer loop skips commands that had no failures.
_TPL_FAIL_BY_CMD = Template(
textwrap.dedent("""\
## {{ title }}
{%- for section in sections if section.failures %}
### Test Command: {{ section.label }}
{%- for f in section.failures %}
- {{ f }}
{%- endfor %}
{%- endfor %}
""")
)
# Arbitrary text rendered inside a fenced code block, with an optional
# language tag for syntax highlighting.
_TPL_CONTENT = Template(
textwrap.dedent("""\
## {{ title }}
```{{ lang }}
{{ content }}
```
""")
)
# Bullet list of wheel items ({pkg, relpath} mappings); Jinja's for/else
# emits the placeholder line when `items` is empty.
_TPL_LIST_ITEMS = Template(
textwrap.dedent("""\
## {{ title }}
{% for it in items %}
- {{ it.pkg }}: {{ it.relpath }}
{% else %}
_(no item found)_
{% endfor %}
""")
)
# Generic Markdown table: header row from `cols`, one row per mapping in
# `rows`, empty cell for missing keys; renders "_(no data)_" when rows is
# empty.
_TPL_TABLE = Template(
textwrap.dedent("""\
{%- if rows %}
| {{ cols | join(' | ') }} |
|{%- for _ in cols %} --- |{%- endfor %}
{%- for r in rows %}
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
{%- endfor %}
{%- else %}
_(no data)_
{%- endif %}
""")
)
def gh_summary_path() -> Path | None:
    """Location of the GitHub step summary file, or None outside Actions."""
    raw = os.environ.get("GITHUB_STEP_SUMMARY")
    if not raw:
        return None
    return Path(raw)
def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
    """
    Write Markdown to the GitHub Step Summary file when GITHUB_STEP_SUMMARY is set.

    Args:
        md: Markdown text; it is dedented and normalized to end in one newline.
        append_content: if True (default), append to the file; else overwrite it.

    Returns:
        True if written successfully (in GitHub Actions environment),
        False if skipped (e.g., running locally where the variable is not set).
    """
    summary_file = gh_summary_path()
    if not summary_file:
        logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
        return False

    normalized = textwrap.dedent(md).strip() + "\n"
    open_mode = "a" if append_content else "w"
    with summary_file.open(open_mode, encoding="utf-8") as fh:
        fh.write(normalized)
    return True
def md_heading(text: str, level: int = 2) -> str:
    """Render *text* as a Markdown heading, clamping the level to 1-6."""
    clamped = min(max(level, 1), 6)
    return "#" * clamped + f" {text}\n"
def md_details(summary: str, content: str) -> str:
    """Wrap *content* in a collapsible Markdown <details> block."""
    parts = [
        "<details>",
        f"<summary>{summary}</summary>",
        "",
        content,
        "",
        "</details>",
    ]
    return "\n".join(parts) + "\n"
def summarize_content_from_file(
    output_dir: Path,
    freeze_file: str,
    title: str = "Content from file",
    code_lang: str = "",  # e.g. "text" or "ini"
) -> bool:
    """Render a file under *output_dir* into the GH step summary as a code block.

    Returns False when the file does not exist; otherwise the result of the
    summary write (False outside GitHub Actions).
    """
    source = Path(output_dir) / freeze_file
    if not source.exists():
        return False
    body = source.read_text(encoding="utf-8").strip()
    return write_gh_step_summary(render_content(body, title=title, lang=code_lang))
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
    """Write a Markdown list of wheel files found under *path* to the GH summary.

    Returns False when no wheels are found; otherwise the result of the
    summary write (False outside GitHub Actions).
    """
    wheels = get_wheels(path, max_depth=max_depth)
    if not wheels:
        return False
    return write_gh_step_summary(render_list(wheels, title=title))
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
    """
    Render a list of mappings as a Markdown table using the Jinja template.

    Columns are the union of all keys in first-seen order. The original set
    comprehension made column order depend on string hash randomization and
    thus vary between runs; dict.fromkeys de-duplicates while keeping a
    deterministic, insertion-ordered result.

    Missing keys render as empty cells; an empty *rows* renders "_(no data)_".
    """
    rows = list(rows)
    cols = list(dict.fromkeys(k for r in rows for k in r))
    md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
    return md
def render_list(
    items: Iterable[str],
    *,
    title: str = "List",
) -> str:
    """Render *items* as a titled Markdown bullet list via _TPL_LIST_ITEMS."""
    return _TPL_LIST_ITEMS.render(title=title, items=items)
def render_content(
    content: str,
    *,
    title: str = "Content",
    lang: str = "text",
) -> str:
    """Render *content* as a titled, fenced Markdown code block."""
    return _TPL_CONTENT.render(title=title, content=content, lang=lang)
def summarize_failures_by_test_command(
    xml_and_labels: Iterable[tuple[str | Path, str]],
    *,
    title: str = "Pytest Failures by Test Command",
    dedupe_within_command: bool = True,
) -> bool:
    """
    Render a single Markdown block summarizing failures grouped by test command.

    Args:
        xml_and_labels: pairs of (junit XML path, human-readable command label).
        title: heading for the whole block.
        dedupe_within_command: drop duplicate failure names within one command.

    Returns:
        True if anything was written, False otherwise.
    """
    sections: list[dict] = []
    for xml_path, label in xml_and_labels:
        report = Path(xml_path)
        if not report.exists():
            # Missing report files are skipped silently.
            continue
        failures = _parse_failed(report)
        if dedupe_within_command:
            failures = sorted(set(failures))
        # Empty sections are kept here; the template filters them out.
        sections.append({"label": label, "failures": failures})

    # Nothing collected, or every command passed: write nothing.
    if not any(s["failures"] for s in sections):
        return False

    md = _TPL_FAIL_BY_CMD.render(title=title, sections=sections).rstrip() + "\n"
    return write_gh_step_summary(md)
def _to_name_from_testcase(tc: ET.Element) -> str:
name = tc.attrib.get("name", "")
file_attr = tc.attrib.get("file")
if file_attr:
return f"{file_attr}:{name}"
classname = tc.attrib.get("classname", "")
parts = classname.split(".") if classname else []
if len(parts) >= 1:
mod_parts = parts[:-1] if len(parts) >= 2 else parts
mod_path = "/".join(mod_parts) + ".py" if mod_parts else "unknown.py"
return f"{mod_path}:{name}"
return f"unknown.py:{name or 'unknown_test'}"
def _parse_failed(xml_path: Path) -> list[str]:
    """Collect identifiers of failed or errored testcases from a JUnit XML file."""
    if not xml_path.exists():
        return []
    root = ET.parse(xml_path).getroot()
    # A testcase failed when it carries a <failure> or <error> child element.
    return [
        _to_name_from_testcase(tc)
        for tc in root.iter("testcase")
        if any(child.tag in ("failure", "error") for child in tc)
    ]

View File

@ -45,7 +45,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
# Checkout pinned commit
commit = get_post_build_pinned_commit(target)
logger.info("Checking out pinned commit %s", commit)
logger.info("Checking out pinned %s commit %s", target, commit)
r.git.checkout(commit)
# Update submodules if requested
@ -55,7 +55,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
sm.update(init=True, recursive=True, progress=PrintProgress())
logger.info("Successfully cloned %s", target)
return r
return r, commit
except GitCommandError as e:
logger.error("Git operation failed: %s", e)

View File

@ -4,7 +4,7 @@ import shlex
import shutil
import sys
from collections.abc import Iterable
from importlib.metadata import PackageNotFoundError, version
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
from typing import Optional, Union
from cli.lib.common.utils import run_command

View File

@ -4,12 +4,16 @@ General Utility helpers for CLI tasks.
import logging
import os
import secrets
import shlex
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Optional
from cli.lib.common.path_helper import ensure_dir_exists
logger = logging.getLogger(__name__)
@ -115,3 +119,63 @@ def working_directory(path: str):
yield
finally:
os.chdir(prev_cwd)
def get_wheels(
    output_dir: Path,
    max_depth: Optional[int] = None,
) -> list[dict[str, str]]:
    """Find wheel files under *output_dir*.

    Args:
        output_dir: directory to search recursively.
        max_depth: if given, ignore files nested more than this many
            directories below *output_dir* (0 = top level only).

    Returns:
        A list of {"pkg": <distribution name>, "relpath": <path relative to
        output_dir>} dicts; empty when the directory does not exist.
        (The original annotation claimed ``list[str]`` but dicts were
        returned — fixed to match the actual value.)
    """
    root = Path(output_dir)
    if not root.exists():
        return []
    items: list[dict[str, str]] = []
    for dirpath, _, filenames in os.walk(root):
        depth = len(Path(dirpath).relative_to(root).parts)
        if max_depth is not None and depth > max_depth:
            continue
        for fname in sorted(filenames):
            if fname.endswith(".whl"):
                # Wheel filenames are "<dist>-<version>-...", so the first
                # dash-separated field is the package name.
                pkg = fname.split("-")[0]
                relpath = str((Path(dirpath) / fname).relative_to(root))
                items.append({"pkg": pkg, "relpath": relpath})
    return items
def attach_junitxml_if_pytest(
    cmd: str,
    dir: Optional[Path],
    prefix: str,
    *,
    ensure_unique: bool = False,
    resolve_xml: bool = False,
) -> tuple[str, Optional[Path]]:
    """
    Append --junitxml=<path> to a pytest command string.

    The XML filename is <prefix>[_<hex8>]_junit_pytest.xml.

    Args:
        cmd: command line; left untouched unless it contains "pytest".
        dir: target folder (created if needed); None skips the attachment.
        prefix: filename prefix for the report.
        ensure_unique: append an 8-character random hex suffix to the prefix.
        resolve_xml: resolve the XML path to an absolute path (otherwise it
            stays relative to *dir* — the original docstring wrongly promised
            an absolute path unconditionally).

    Returns:
        (amended_cmd, xml_path), or (cmd, None) when nothing was attached.
    """
    # Not a pytest invocation, or no report folder requested: pass through.
    if "pytest" not in cmd or dir is None:
        return cmd, None

    ensure_dir_exists(dir)
    file_name_prefix = prefix  # was a pointless f"{prefix}"
    if ensure_unique:
        file_name_prefix += f"_{unique_hex(8)}"
    xml_path = dir / f"{file_name_prefix}_junit_pytest.xml"
    if resolve_xml:
        xml_path = xml_path.resolve()
    return f"{cmd} --junitxml={xml_path.as_posix()}", xml_path
def unique_hex(length: int = 8) -> str:
    """Return `length` random lowercase hex characters."""
    n_bytes = (length + 1) // 2  # each byte yields two hex digits
    return secrets.token_hex(n_bytes)[:length]

View File

@ -1,13 +1,33 @@
import logging
from typing import Any
import os
import textwrap
from pathlib import Path
from typing import Any, Optional
from cli.lib.common.gh_summary import write_gh_step_summary
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.pip_helper import pip_install_packages
from cli.lib.common.utils import run_command, temp_environ, working_directory
from cli.lib.common.utils import (
attach_junitxml_if_pytest,
run_command,
temp_environ,
working_directory,
)
from jinja2 import Template
logger = logging.getLogger(__name__)
# Markdown block for the step summary: links the vllm commit and, when
# torch_sha is provided (GITHUB_SHA in CI), the PyTorch commit on GitHub.
_TPL_VLLM_INFO = Template(
textwrap.dedent("""\
## Vllm against Pytorch CI Test Summary
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
{%- if torch_sha %}
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
{%- endif %}
""")
)
def sample_vllm_test_library():
"""
@ -172,6 +192,9 @@ def run_test_plan(
tests_map: dict[str, Any],
shard_id: int = 0,
num_shards: int = 0,
*,
test_summary_path: Optional[Path] = None,
test_summary_result: Optional[list[tuple[str, str]]] = None,
):
"""
a method to run list of tests based on the test plan.
@ -184,7 +207,6 @@ def run_test_plan(
tests = tests_map[test_plan]
pkgs = tests.get("package_install", [])
title = tests.get("title", "unknown test")
is_parallel = check_parallelism(tests, title, shard_id, num_shards)
if is_parallel:
title = title.replace("%N", f"{shard_id}/{num_shards}")
@ -198,7 +220,15 @@ def run_test_plan(
temp_environ(tests.get("env_vars", {})),
):
failures = []
for step in tests["steps"]:
for idx, step in enumerate(tests["steps"]):
# generate xml report for each test for test summary if needed
step, xml_file_path = attach_junitxml_if_pytest(
cmd=step, dir=test_summary_path, prefix=f"{test_plan}_{idx}"
)
if xml_file_path and xml_file_path.exists() and test_summary_result:
test_summary_result.append((title, str(xml_file_path)))
else:
logger.info("No test report will be generate for %s", step)
logger.info("Running step: %s", step)
if is_parallel:
step = replace_buildkite_placeholders(step, shard_id, num_shards)
@ -214,12 +244,13 @@ def run_test_plan(
def clone_vllm(dst: str = "vllm"):
clone_external_repo(
_, commit = clone_external_repo(
target="vllm",
repo="https://github.com/vllm-project/vllm.git",
dst=dst,
update_submodules=True,
)
return commit
def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
@ -230,3 +261,12 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) ->
for k in sorted(mapping, key=len, reverse=True):
step = step.replace(k, mapping[k])
return step
def summarize_build_info(vllm_commit: str) -> bool:
    """Write the vllm/torch commit info block to the GitHub step summary.

    The torch commit is read from GITHUB_SHA (set by GitHub Actions).
    Returns the result of the summary write (False outside Actions).
    """
    torch_sha = os.getenv("GITHUB_SHA")
    rendered = _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha)
    return write_gh_step_summary(rendered.strip() + "\n")

View File

@ -13,6 +13,11 @@ from cli.lib.common.envs_helper import (
env_str_field,
with_params_help,
)
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_content_from_file,
summarize_wheels,
)
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
@ -21,7 +26,7 @@ from cli.lib.common.path_helper import (
is_path_exist,
)
from cli.lib.common.utils import run_command
from cli.lib.core.vllm.lib import clone_vllm
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
logger = logging.getLogger(__name__)
@ -153,18 +158,43 @@ class VllmBuildRunner(BaseRunner):
"""
inputs = VllmBuildParameters()
logger.info("Running vllm build with inputs: %s", inputs)
clone_vllm()
vllm_commit = clone_vllm()
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
ensure_dir_exists(inputs.output_dir)
# make sure the output dir to store the build artifacts exist
ensure_dir_exists(Path(inputs.output_dir))
cmd = self._generate_docker_build_cmd(inputs)
logger.info("Running docker build: \n %s", cmd)
run_command(cmd, cwd="vllm", env=os.environ.copy())
try:
run_command(cmd, cwd="vllm", env=os.environ.copy())
finally:
self.genearte_vllm_build_summary(vllm_commit, inputs)
def genearte_vllm_build_summary(
    self, vllm_commit: str, inputs: VllmBuildParameters
):
    """Write the vllm build summary to the GitHub step summary.

    Covers the build info (vllm/torch commits), the pip-freeze snapshot
    emitted by the docker build, and the torch/vllm wheel artifact listings.
    No-op when GITHUB_STEP_SUMMARY is not set (i.e. outside GitHub Actions).
    """
    # NOTE(review): method name is misspelled ("genearte" -> "generate");
    # kept as-is because the caller invokes it by this exact name.
    if not gh_summary_path():
        # logger.info returns None, so this doubles as an early return.
        return logger.info("Skipping, not detect GH Summary env var....")
    logger.info("Generate GH Summary ...")
    # summarize vllm build info
    summarize_build_info(vllm_commit)
    # summarize vllm build artifacts
    vllm_artifact_dir = inputs.output_dir / "wheels"
    # build_summary.txt is presumably the pip-freeze file produced during the
    # docker build and exported alongside the wheels — confirm against the
    # Dockerfile if it moves.
    summarize_content_from_file(
        vllm_artifact_dir,
        "build_summary.txt",
        title="Vllm build env pip package summary",
    )
    summarize_wheels(
        inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
    )
    summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
if not inputs.use_torch_whl:

View File

@ -11,6 +11,11 @@ from typing import Any
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
from cli.lib.common.gh_summary import (
gh_summary_path,
summarize_failures_by_test_command,
write_gh_step_summary,
)
from cli.lib.common.path_helper import copy, remove_dir
from cli.lib.common.pip_helper import (
pip_install_first_match,
@ -18,8 +23,8 @@ from cli.lib.common.pip_helper import (
pkg_exists,
run_python,
)
from cli.lib.common.utils import run_command, working_directory
from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library
from cli.lib.common.utils import ensure_dir_exists, run_command, working_directory
from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library, summarize_build_info
logger = logging.getLogger(__name__)
@ -91,33 +96,54 @@ class VllmTestRunner(BaseRunner):
logger.info("Display VllmTestParameters %s", params)
self._set_envs(params)
clone_vllm(dst=self.work_directory)
vllm_commit = clone_vllm(dst=self.work_directory)
with working_directory(self.work_directory):
remove_dir(Path("vllm"))
self._install_wheels(params)
self._install_dependencies()
# verify the torches are not overridden by test dependencies
check_versions()
return vllm_commit
def run(self):
"""
main function to run vllm test
"""
self.prepare()
with working_directory(self.work_directory):
if self.test_type == TestInpuType.TEST_PLAN:
if self.num_shards > 1:
run_test_plan(
self.test_plan,
"vllm",
sample_vllm_test_library(),
self.shard_id,
self.num_shards,
)
vllm_commit = self.prepare()
# prepare test summary
test_summary_path = Path("tmp_pytest_report").resolve()
ensure_dir_exists(test_summary_path)
test_summary_result = []
try:
with working_directory(self.work_directory):
if self.test_type == TestInpuType.TEST_PLAN:
if self.num_shards > 1:
run_test_plan(
self.test_plan,
"vllm",
sample_vllm_test_library(),
self.shard_id,
self.num_shards,
test_summary_path=test_summary_path,
test_summary_result=test_summary_result,
)
else:
run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
else:
raise ValueError(f"Unknown test type {self.test_type}")
raise ValueError(f"Unknown test type {self.test_type}")
except Exception as e:
logger.error("Failed to run vllm test: %s", e)
finally:
self.vllm_test_gh_summary(vllm_commit, test_summary_result)
def vllm_test_gh_summary(
    self, vllm_commit: str, test_summary_results: list[tuple[str, str]]
):
    """Write the vllm test summary (build info plus per-command pytest
    failures) to the GitHub step summary.

    Args:
        vllm_commit: the vllm commit SHA the tests ran against.
        test_summary_results: (test title, junit XML path) pairs collected
            while running the test plan.

    No-op when GITHUB_STEP_SUMMARY is not set (outside GitHub Actions).
    """
    if not gh_summary_path():
        # logger.info returns None, so this doubles as an early return.
        return logger.info("Skipping, not detect GH Summary env var....")
    logger.info("Generate GH Summary ...")
    summarize_build_info(vllm_commit)
    summarize_failures_by_test_command(test_summary_results)
def _install_wheels(self, params: VllmTestParameters):
logger.info("Running vllm test with inputs: %s", params)
@ -220,6 +246,8 @@ def preprocess_test_in(
target_path = Path(target_file)
lines = target_path.read_text().splitlines()
pkgs_to_add = []
# Remove lines starting with the package names (==, @, >=) — case-insensitive
pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE)
kept_lines = [line for line in lines if not pattern.match(line)]
@ -236,7 +264,11 @@ def preprocess_test_in(
]
# Write back: header_lines + blank + kept_lines
out = "\n".join(header_lines + [""] + kept_lines) + "\n"
out_lines = header_lines + [""] + kept_lines
if pkgs_to_add:
out_lines += [""] + pkgs_to_add
out = "\n".join(out_lines) + "\n"
target_path.write_text(out)
logger.info("[INFO] Updated %s", target_file)

View File

@ -48,6 +48,7 @@ runs:
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
shell: bash
run: |
set -euo pipefail

View File

@ -176,6 +176,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
@ -358,7 +359,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
@ -376,6 +377,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
################### VLLM INSTALLED IMAGE ####################
@ -433,4 +435,5 @@ FROM scratch as export-wheels
# Just copy the wheels we prepared in previous stages
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python

View File

@ -6,8 +6,7 @@ on:
- ciflow/vllm/*
workflow_dispatch:
schedule:
# Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
- cron: '0 0,12 * * *'
- cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC)
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}