diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index a37c4c9ba..2270f3245 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -49,7 +49,7 @@ jobs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v4
       - uses: dorny/paths-filter@v3
         id: filter
         with:
@@ -130,9 +130,9 @@ jobs:
           verbose: true
 
   e2e:
-    needs: [lint, changes]
+    needs: [changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
-    if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
+    if: ${{ github.event_name == 'pull_request' && needs.changes.outputs.e2e_tracker == 'true' }}
     strategy:
       max-parallel: 2
       matrix:
@@ -160,7 +160,7 @@ jobs:
           apt install git -y
 
       - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
 
       - name: Install system dependencies
         run: |
@@ -168,7 +168,7 @@ jobs:
           apt-get -y install gcc g++ cmake libnuma-dev
 
       - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
           ref: ${{ matrix.vllm_version }}
@@ -192,7 +192,7 @@ jobs:
           VLLM_USE_MODELSCOPE: True
         run: |
           pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          # pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           pytest -sv tests/e2e/singlecard/test_camem.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
@@ -242,7 +242,7 @@ jobs:
           apt install git -y
 
       - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
 
       - name: Install system dependencies
         run: |
@@ -250,7 +250,7 @@ jobs:
           apt-get -y install gcc g++ cmake libnuma-dev
 
       - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
           ref: ${{ matrix.vllm_version }}
@@ -273,7 +273,7 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
-          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+          # pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
           # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
           # To avoid oom, we need to run the test in a single process.
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
index 99a565bde..7cf36a9f0 100644
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -29,7 +29,7 @@ import argparse
 
 from vllm.assets.audio import AudioAsset
 try:
-    import librosa
+    import librosa # type: ignore
 except ImportError:
     raise Exception("Can't import librosa, please ensure it's installed")
 
diff --git a/tests/e2e/singlecard/sample/test_rejection_sampler.py b/tests/e2e/singlecard/sample/test_rejection_sampler.py
index 2a3312028..3774b7205 100644
--- a/tests/e2e/singlecard/sample/test_rejection_sampler.py
+++ b/tests/e2e/singlecard/sample/test_rejection_sampler.py
@@ -4,7 +4,7 @@ from typing import Any, Optional
 import pytest
 import torch
 import torch.nn.functional as F
-from vllm.v1.sample.logits_processor import LogitsProcessorManager
+from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 
@@ -66,7 +66,7 @@ def create_sampling_metadata(
                             output_token_ids=[],
                             allowed_token_ids_mask=None,
                             bad_words_token_ids={},
-                            logitsprocs=LogitsProcessorManager())
+                            logitsprocs=LogitsProcessors())
 
 
 ########################### Tests for Greedy Sampling ###################
diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py
index e8fe7ab6b..ab593414e 100644
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -9,6 +9,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionState,
                                                 AscendMetadata,
                                                 CommonAttentionState)
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 
 
 class TestAscendAttentionBackend(TestBase):
@@ -67,8 +68,12 @@ class TestAscendAttentionBackend(TestBase):
 class TestAscendAttentionMetadataBuilder(TestBase):
 
     def setUp(self):
-        self.mock_runner = MagicMock()
-        self.builder = AscendAttentionMetadataBuilder(self.mock_runner)
+        self.mock_vllm_config = MagicMock()
+        self.mock_vllm_config.model_config.max_model_len = 640
+        self.mock_vllm_config.cache_config.block_size = 64
+        self.mock_device = 'cpu:0'
+        self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config,
+                                                      self.mock_device)
 
     def test_reorder_batch(self):
         mock_input_batch = MagicMock()
@@ -86,31 +91,28 @@ class TestAscendAttentionMetadataBuilder(TestBase):
     def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
                                     mock_npu_format_cast,
                                     mock_ascend_metadata):
-        num_reqs = 2
-        num_actual_tokens = 10
-        max_query_len = 5
-
-        self.mock_runner.input_batch.block_table = [MagicMock()]
-        self.mock_runner.input_batch.block_table[
-            0].get_device_tensor.return_value = torch.zeros((10, 10))
-        self.mock_runner.max_num_blocks_per_req = 10
-        self.mock_runner.query_lens = torch.tensor([3, 4])
-        self.mock_runner.seq_lens_cpu = torch.tensor([5, 6])
-        self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
-        self.mock_runner.device = 'cpu:0'
-        self.mock_runner.attn_mask = torch.ones((10, 10))
-        self.mock_runner.attn_state = AscendAttentionState.PrefillNoCache
-        self.mock_runner.query_start_loc_cpu = torch.tensor([0, 3, 7])
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=torch.tensor([0, 3, 7]),
+            query_start_loc_cpu=torch.tensor([0, 3, 7]),
+            seq_lens_cpu=torch.tensor([5, 6]),
+            num_reqs=2,
+            num_actual_tokens=10,
+            max_query_len=5,
+            decode_token_per_req=torch.tensor([1, 1]),
+            block_table_tensor=torch.zeros((10, 10)),
+            slot_mapping_cpu=torch.tensor(range(20)),
+            actual_seq_lengths_q=torch.tensor([0, 1]),
+            positions=torch.tensor([10, 10]),
+            attn_mask=torch.ones((10, 10)),
+            spec_attn_mask=None,
+            attn_state=AscendAttentionState.PrefillNoCache)
 
         mock_nz_tensor = MagicMock()
+        mock_model = MagicMock()
         mock_nd_to_nz_2d.return_value = mock_nz_tensor
         mock_npu_format_cast.return_value = mock_nz_tensor
 
-        self.builder.build(
-            num_reqs,
-            num_actual_tokens,
-            max_query_len,
-        )
+        self.builder.build(common_attn_metadata, mock_model)
 
     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('torch_npu.npu_format_cast')
@@ -120,51 +122,53 @@ class TestAscendAttentionMetadataBuilder(TestBase):
     def test_build_chunked_prefill(self, mock_ascend_attention_state,
                                    mock_is_310p, mock_nd_to_nz_spec,
                                    mock_npu_format_cast, mock_ascend_metadata):
-        num_reqs = 3
-        num_actual_tokens = 15
-        max_query_len = 6
-
-        self.mock_runner.input_batch.block_table = [MagicMock()]
-        self.mock_runner.input_batch.block_table[
-            0].get_device_tensor.return_value = torch.zeros((10, 10))
-        self.mock_runner.max_num_blocks_per_req = 10
-        self.mock_runner.query_lens = torch.tensor([2, 3, 4])
-        self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
-        self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
-        self.mock_runner.device = 'cpu:0'
-        self.mock_runner.attn_mask = torch.ones((15, 15))
-        self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
-        self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=torch.tensor([0, 2, 5, 9]),
+            query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
+            seq_lens_cpu=torch.tensor([4, 5, 6]),
+            num_reqs=3,
+            num_actual_tokens=15,
+            max_query_len=6,
+            decode_token_per_req=torch.tensor([1, 1, 1]),
+            block_table_tensor=torch.zeros((10, 10)),
+            slot_mapping_cpu=torch.tensor(range(20)),
+            actual_seq_lengths_q=torch.tensor([0, 1, 2]),
+            positions=torch.tensor([10, 10]),
+            attn_mask=torch.ones((15, 15)),
+            spec_attn_mask=None,
+            attn_state=AscendAttentionState.ChunkedPrefill)
 
         mock_ascend_attention_state = MagicMock()
         mock_ascend_attention_state.PrefillNoCache = 0
 
         mock_nz_tensor = MagicMock()
+        mock_model = MagicMock()
         mock_nd_to_nz_spec.return_value = mock_nz_tensor
         mock_npu_format_cast.return_value = mock_nz_tensor
 
-        self.builder.build(num_reqs, num_actual_tokens, max_query_len)
+        self.builder.build(common_attn_metadata, mock_model)
 
     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
     def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata):
-        num_reqs = 3
-        num_actual_tokens = 15
-        max_query_len = 6
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=torch.tensor([0, 2, 5, 9]),
+            query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
+            seq_lens_cpu=torch.tensor([4, 5, 6]),
+            num_reqs=3,
+            num_actual_tokens=15,
+            max_query_len=6,
+            decode_token_per_req=torch.tensor([1, 1, 1]),
+            block_table_tensor=torch.zeros((10, 10)),
+            slot_mapping_cpu=torch.tensor(range(20)),
+            actual_seq_lengths_q=torch.tensor([0, 1, 2]),
+            positions=torch.tensor([10, 10]),
+            attn_mask=torch.ones((15, 15)),
+            spec_attn_mask=None,
+            attn_state=AscendAttentionState.ChunkedPrefill)
+        mock_model = MagicMock()
 
-        self.mock_runner.input_batch.block_table = [MagicMock()]
-        self.mock_runner.input_batch.block_table[
-            0].get_device_tensor.return_value = torch.zeros((10, 10))
-        self.mock_runner.max_num_blocks_per_req = 10
-        self.mock_runner.query_lens = torch.tensor([2, 3, 4])
-        self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
-        self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
-        self.mock_runner.device = 'cpu:0'
-        self.mock_runner.attn_mask = torch.ones((15, 15))
-        self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
-        self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])
-
-        self.builder.build(num_reqs, num_actual_tokens, max_query_len)
+        self.builder.build(common_attn_metadata, mock_model)
 
 
 class TestAscendAttentionBackendImpl(TestBase):
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
index 497b7b53a..be2a7d897 100644
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -1,6 +1,5 @@
 from unittest.mock import MagicMock, patch
 
-import numpy as np
 import torch
 from vllm.distributed.parallel_state import GroupCoordinator
 from vllm.model_executor.layers.linear import LinearBase
@@ -12,6 +11,7 @@ from vllm_ascend.attention.mla_v1 import (AscendMLABackend,
                                           AscendMLAImpl, AscendMLAMetadata,
                                           AscendMLAMetadataBuilder,
                                           AscendMLAPrefillMetadata)
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 
 
 class TestAscendMLABackend(TestBase):
@@ -178,40 +178,41 @@ class TestAscendMLAMetadata(TestBase):
 class TestAscendMLAMetadataBuilder(TestBase):
 
     def test_ascend_mla_metadata_builder_default(self):
-        runner = MagicMock()
-        runner.scheduler_config = MagicMock()
-        runner.model_config = MagicMock()
-        runner.scheduler_config.max_num_seqs = 4
-        runner.model_config.max_model_len = 1024
-        runner.model_config.get_head_size.return_value = 64
-        runner.model_config.dtype = torch.float16
-        runner.chunked_prefill_enabled = False
-        runner.device = "cpu"
-        runner.block_size = 16
-        runner.decode_token_per_req = 1
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.model_config.get_head_size.return_value = 64
+        mock_vllm_config.model_config.dtype = torch.float16
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.max_num_seqs = 4
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
 
         ascend_config = MagicMock()
         ascend_config.torchair_graph_config = MagicMock()
         ascend_config.torchair_graph_config.enabled = True
         with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
                    return_value=ascend_config):
-            builder = AscendMLAMetadataBuilder(runner)
+            builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
 
-            self.assertEqual(builder.runner, runner)
-            self.assertEqual(builder.block_size, runner.block_size)
-            self.assertEqual(builder.chunked_prefill_enabled,
-                             runner.chunked_prefill_enabled)
+            self.assertEqual(builder.block_size,
+                             mock_vllm_config.cache_config.block_size)
+            self.assertEqual(
+                builder.chunked_prefill_enabled,
+                mock_vllm_config.scheduler_config.chunked_prefill_enabled)
             self.assertEqual(builder.torchair_graph_enabled, True)
 
     @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
     def test_reorder_batch_with_torchair_graph(self, ascend_config):
-        runner = MagicMock()
-        runner.chunked_prefill_enabled = False
-        runner.decode_token_per_req = 1
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.max_num_seqs = 4
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
         ascend_config.torchair_graph_config = MagicMock()
         ascend_config.torchair_graph_config.enabled = True
 
-        builder = AscendMLAMetadataBuilder(runner)
+        builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
 
         input_batch = MagicMock()
         input_batch.req_ids = [0, 1, 2, 3]
@@ -230,22 +231,23 @@ class TestAscendMLAMetadataBuilder(TestBase):
         modified = builder.reorder_batch(input_batch, scheduler_output)
 
         self.assertFalse(modified)
-        self.assertEqual(builder._num_decodes, 4)
-        self.assertEqual(builder._num_prefills, 0)
-        self.assertEqual(builder._num_decode_tokens, 7)
-        self.assertEqual(builder._num_prefill_tokens, 0)
         input_batch.swap_states.assert_not_called()
 
     def test_reorder_batch_without_torchair_graph(self):
         ascend_config = MagicMock()
-        runner = MagicMock()
-        runner.chunked_prefill_enabled = False
-        runner.decode_token_per_req = 1
         ascend_config.torchair_graph_config = MagicMock()
         ascend_config.torchair_graph_config.enabled = False
+
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.max_num_seqs = 4
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
+
         with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
                    return_value=ascend_config):
-            builder = AscendMLAMetadataBuilder(runner)
+            builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
 
         input_batch = MagicMock()
         input_batch.req_ids = [0, 1, 2, 3]
@@ -264,10 +266,6 @@ class TestAscendMLAMetadataBuilder(TestBase):
         modified = builder.reorder_batch(input_batch, scheduler_output)
 
         self.assertTrue(modified)
-        self.assertEqual(builder._num_decodes, 2)
-        self.assertEqual(builder._num_prefills, 2)
-        self.assertEqual(builder._num_decode_tokens, 2)
-        self.assertEqual(builder._num_prefill_tokens, 5)
         input_batch.swap_states.assert_called_once_with(1, 2)
 
     @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
@@ -275,11 +273,13 @@ class TestAscendMLAMetadataBuilder(TestBase):
         ascend_config = MagicMock()
         mock_ascend_config.return_value = ascend_config
         ascend_config.torchair_graph_config.enabled = False
-        runner = MagicMock()
-        runner.graph_block_tables = torch.zeros((8, 64), dtype=torch.int32)
-        runner.chunked_prefill_enabled = False
-        runner.decode_token_per_req = 1
-        builder = AscendMLAMetadataBuilder(runner=runner)
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
+
+        builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
         block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
 
         result = builder._get_graph_runner_block_tables(3, block_tables)
@@ -292,11 +292,13 @@ class TestAscendMLAMetadataBuilder(TestBase):
         ascend_config = MagicMock()
         mock_ascend_config.return_value = ascend_config
         ascend_config.torchair_graph_config.enabled = False
-        runner = MagicMock()
-        runner.graph_block_tables = torch.zeros((8, 4), dtype=torch.int32)
-        runner.chunked_prefill_enabled = False
-        runner.decode_token_per_req = 1
-        builder = AscendMLAMetadataBuilder(runner=runner)
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 64
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
+
+        builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
         block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
 
         result = builder._get_graph_runner_block_tables(3, block_tables)
@@ -310,11 +312,13 @@ class TestAscendMLAMetadataBuilder(TestBase):
         ascend_config = MagicMock()
         mock_ascend_config.return_value = ascend_config
         ascend_config.torchair_graph_config.enabled = False
-        runner = MagicMock()
-        runner.graph_block_tables = np.zeros((8, 64), dtype=np.int32)
-        runner.chunked_prefill_enabled = False
-        runner.decode_token_per_req = 1
-        builder = AscendMLAMetadataBuilder(runner=runner)
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_device = 'cpu'
+
+        builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
 
         block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
 
@@ -329,38 +333,45 @@ class TestAscendMLAMetadataBuilder(TestBase):
         ascend_config = MagicMock()
         mock_ascend_config.return_value = ascend_config
         ascend_config.torchair_graph_config.enabled = False
-        runner = MagicMock()
-        runner.model_config = MagicMock()
-        runner.device = "cpu"
-        runner.graph_block_tables = torch.zeros((8, 64), dtype=torch.int32)
-        runner.model_config.get_head_size.return_value = 64
-        runner.chunked_prefill_enabled = False
-        runner.attn_mask = torch.zeros((1, 1), dtype=torch.bool)
-        runner.spec_attn_mask = torch.zeros((1, 1), dtype=torch.bool)
-        runner.dtype = torch.float16
-        runner.decode_token_per_req = 1
 
-        builder = AscendMLAMetadataBuilder(runner=runner,
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.model_config.max_model_len = 1024
+        mock_vllm_config.cache_config.block_size = 16
+        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_vllm_config.get_head_size.return_value = 64
+        mock_vllm_config.model_config.dtype = torch.float16
+        mock_device = 'cpu'
+
+        builder = AscendMLAMetadataBuilder(mock_vllm_config,
+                                           mock_device,
                                            metadata_cls=AscendMLAMetadata)
         builder.rope_dim = 64
 
         with patch.object(builder,
                           "_get_graph_runner_block_tables",
                           side_effect=lambda x, y: y):
-            metadata = builder.build_torchair_graph_dummy(3, 3)
+            common_attn_metadata = TorchairCommonAttentionMetadata(
+                num_reqs=3,
+                num_actual_tokens=3,
+                decode_token_per_req=1,
+                actual_seq_lengths_q=[0, 1, 2],
+                attn_mask=torch.zeros((1, 1), dtype=torch.bool),
+                spec_attn_mask=torch.zeros((1, 1), dtype=torch.bool),
+            )
+            metadata = builder.build_torchair_graph_dummy(common_attn_metadata)
 
         sin_golden = torch.ones(3,
                                 1,
                                 1,
                                 64,
-                                dtype=runner.dtype,
-                                device=runner.device)
+                                dtype=torch.float16,
+                                device=mock_device)
         cos_golden = torch.ones(3,
                                 1,
                                 1,
                                 64,
-                                dtype=runner.dtype,
-                                device=runner.device)
+                                dtype=torch.float16,
+                                device=mock_device)
 
         self.assertIsInstance(metadata, AscendMLAMetadata)
         self.assertEqual(metadata.num_input_tokens, 3)
diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py
index b572629f6..3c55a3751 100644
--- a/tests/ut/core/test_scheduler.py
+++ b/tests/ut/core/test_scheduler.py
@@ -11,7 +11,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec)
-from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
@@ -68,7 +68,6 @@ def make_output(scheduler):
             for i, req in enumerate(scheduler.running)
         },
         sampled_token_ids=[[1000]] * len(scheduler.running),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[])
@@ -296,7 +295,6 @@ class TestAscendScheduler(TestBase):
             },
             sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
                                ],  # First request hits EOS, second continues
-            spec_token_ids=None,
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -352,7 +350,6 @@ class TestAscendScheduler(TestBase):
             },
             sampled_token_ids=[[10, 42, 12],
                                [13, 14]],  # First request hits stop token
-            spec_token_ids=None,
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -407,7 +404,6 @@ class TestAscendScheduler(TestBase):
             },
             sampled_token_ids=[[10, 11, 12],
                                [13]],  # First request exceeds max_tokens
-            spec_token_ids=None,
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -451,7 +447,6 @@ class TestAscendScheduler(TestBase):
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
             sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
-            spec_token_ids=None,
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -509,7 +504,6 @@ class TestAscendScheduler(TestBase):
                 req_ids=[requests[0].request_id],
                 req_id_to_index={requests[0].request_id: 0},
                 sampled_token_ids=[[0]],
-                spec_token_ids=None,
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -526,7 +520,6 @@ class TestAscendScheduler(TestBase):
                 req_ids=[requests[1].request_id],
                 req_id_to_index={requests[1].request_id: 0},
                 sampled_token_ids=[[0]],
-                spec_token_ids=None,
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -586,13 +579,14 @@ class TestAscendScheduler(TestBase):
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
                 sampled_token_ids=[[0] for _ in range(len(requests))],
-                spec_token_ids=spec_tokens,
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
+            draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
 
             engine_core_outputs = scheduler.update_from_output(
                 output, model_runner_output)
+            scheduler.update_draft_token_ids(draft_token_ids)
 
             for i in range(len(requests)):
                 running_req = scheduler.running[i]
@@ -633,7 +627,6 @@ class TestAscendScheduler(TestBase):
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
                 sampled_token_ids=output_tokens,
-                spec_token_ids=None,
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -674,10 +667,6 @@ class TestAscendScheduler(TestBase):
         self.assertEqual(
             len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
                 num_cached_block), 0)
-        self.assertEqual(len(scheduler.kv_cache_manager.req_to_block_hashes),
-                         0)
-        self.assertEqual(len(scheduler.kv_cache_manager.req_to_block_hashes),
-                         0)
         num_free_blocks = (scheduler.kv_cache_manager.block_pool.
                            free_block_queue.num_free_blocks)
         self.assertEqual(
diff --git a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py
index 867dafb29..c9b889155 100644
--- a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py
+++ b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py
@@ -42,7 +42,8 @@ def test_basic_lifecycle():
 
     request = create_request(request_id=1,
                              num_tokens=NUM_TOKENS,
-                             do_remote_prefill=True)
+                             do_remote_prefill=True,
+                             block_size=BLOCK_SIZE)
 
     scheduler.add_request(request)
     request_id = request.request_id
diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py
index e696a7692..dd96c6b0c 100644
--- a/tests/ut/kv_connector/utils.py
+++ b/tests/ut/kv_connector/utils.py
@@ -10,6 +10,8 @@ import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
+from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
+                                         init_none_hash)
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec)
@@ -39,7 +41,6 @@ def assert_scheduler_empty(scheduler: Scheduler):
     # KVCache Manager.
     assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
                req_to_blocks) == 0
-    assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
     assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0].
                num_cached_block) == 0
     num_free_blocks = (
@@ -118,6 +119,9 @@ def create_scheduler(
     )
 
 
+_none_hash_initialized = False
+
+
 def create_request(
     request_id: int,
     num_tokens: int = 10,
@@ -126,8 +130,15 @@ def create_request(
     do_remote_prefill: bool = False,
     use_all_1s_for_prompt_tokens: bool = False,
     num_remote_blocks: int = 3,
+    block_size: int = 16,
 ) -> Request:
     """Make dummy request for testing."""
+    global _none_hash_initialized
+    if not _none_hash_initialized:
+        init_none_hash(hash)
+        _none_hash_initialized = True
+
+    block_hasher = get_request_block_hasher(block_size, hash)
 
     kv_transfer_params: Optional[dict[str, Any]] = None
 
@@ -164,6 +175,7 @@ def create_request(
             "pooling_params": []
         } if not vllm_version_is("0.9.1") else {}),
         eos_token_id=EOS_TOKEN_ID,
+        block_hasher=block_hasher,
     )
     req.kv_transfer_params = kv_transfer_params
     return req
@@ -196,7 +208,6 @@ def create_model_runner_output(
         req_ids=req_ids,
         req_id_to_index=req_id_to_index,
         sampled_token_ids=sampled_token_ids,
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py
index ec3037f2f..42370ebe5 100644
--- a/tests/ut/ops/test_fused_ops.py
+++ b/tests/ut/ops/test_fused_ops.py
@@ -184,6 +184,11 @@ class MockQuantMethod(nn.Module):
 
 
 class MockFusedMoEMethod(FusedMoEMethodBase):
+    # TODO(bnell): also pass quant_config?
+    moe = MagicMock()
+
+    def __init__(self):
+        super().__init__(self.moe)
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py
index 940d07f2e..67436a344 100644
--- a/tests/ut/test_platform.py
+++ b/tests/ut/test_platform.py
@@ -536,10 +536,10 @@ class TestNPUPlatform(TestBase):
         mock_config = MagicMock(spec=ModelConfig)
         self.assertTrue(self.platform.supports_v1(mock_config))
 
-    def test_get_piecewise_backend_cls_returns_correct_value(self):
+    def test_get_static_graph_wrapper_cls_returns_correct_value(self):
         self.assertEqual(
-            self.platform.get_piecewise_backend_cls(),
-            "vllm_ascend.compilation.piecewise_backend.NPUPiecewiseBackend",
+            self.platform.get_static_graph_wrapper_cls(),
+            "vllm_ascend.compilation.acl_graph.ACLGraphWrapper",
         )
 
     @patch("torch.distributed.is_hccl_available", return_value=True)
diff --git a/tests/ut/worker/test_input_batch.py b/tests/ut/worker/test_input_batch.py
index 685cf174a..3914f96d2 100644
--- a/tests/ut/worker/test_input_batch.py
+++ b/tests/ut/worker/test_input_batch.py
@@ -1,161 +1,371 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+import inspect
+from collections.abc import Sequence
+from typing import Optional
+
 import numpy as np
+import pytest
 import torch
 from vllm.sampling_params import SamplingParams
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.v1.pool.metadata import PoolingMetadata
+from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.worker.block_table import MultiGroupBlockTable
+from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
 
-from tests.ut.base import TestBase
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
+VOCAB_SIZE = 1024
+NUM_OUTPUT_TOKENS = 20
+MAX_PROMPT_SIZE = 100
+MAX_NUM_PROMPT_TOKENS = 64
 
-def mock_cached_request_state(req_id="1", prompt=[1, 2, 3], output=[4, 5, 6]):
-    return CachedRequestState(
-        req_id=req_id,
-        prompt_token_ids=prompt,
-        mm_kwargs=[],
-        mm_positions=[],
-        sampling_params=SamplingParams(),
-        pooling_params=None,
-        generator=None,
-        block_ids=([], ),
-        num_computed_tokens=0,
-        output_token_ids=output,
+
+def _compare_objs(obj1,
+                  obj2,
+                  skip: Sequence = ("logitsprocs", "batch_update_builder")):
+    attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
+    attr_names = set([
+        a[0] for a in attrs
+        if not (a[0].startswith('__') and a[0].endswith('__'))
+    ])
+    for attr_name in attr_names:
+        if attr_name in skip:
+            continue
+
+        a = getattr(obj1, attr_name)
+        b = getattr(obj2, attr_name)
+
+        is_same = False
+        if isinstance(a, torch.Tensor):
+            if (a.numel() == 0 or b.numel() == 0):
+                is_same = (a.numel() == 0 and b.numel() == 0)
+            elif torch.allclose(a, b):
+                is_same = True
+        elif isinstance(a, np.ndarray):
+            if np.allclose(a, b):
+                is_same = True
+        elif isinstance(a, MultiGroupBlockTable):
+            for a_i, b_i in zip(a.block_tables, b.block_tables):
+                _compare_objs(a_i, b_i)
+            is_same = True
+        elif isinstance(a, (BlockTable, SamplingMetadata, PoolingMetadata)):
+            _compare_objs(a, b)
+            is_same = True  # if we make it here must be same
+        elif a == b:
+            is_same = True
+        assert is_same, f"Attribute {attr_name} is different"\
+            f" in {obj1} and {obj2}: {a} != {b}"
+
+
+def _remove_requests(input_batch: InputBatch, batch_size: int,
+                     reqs: list[CachedRequestState]) -> set[str]:
+    """
+    Remove some requests randomly from the batch and returns
+    set of request removed
+    """
+
+    num_reqs_to_remove = np.random.randint(0, batch_size)
+    req_indices_to_remove: set[int] = set()
+    for _ in range(num_reqs_to_remove):
+        req_index_to_remove = np.random.randint(0, batch_size)
+        req_indices_to_remove.add(req_index_to_remove)
+
+    req_ids_to_remove: set[str] = set()
+    for index in req_indices_to_remove:
+        input_batch.remove_request(reqs[index].req_id)
+        req_ids_to_remove.add(reqs[index].req_id)
+    return req_ids_to_remove
+
+
+def _construct_expected_sampling_metadata(
+    reqs: list[CachedRequestState],
+    req_ids_retained: set[int],
+    req_id_index_in_input_batch: dict[str, int],
+    device: torch.device,
+) -> SamplingMetadata:
+    """
+    Constructs and returns the expected SamplingMetadata for this
+    batch.
+    """
+    num_reqs = len(req_ids_retained)
+    output_token_ids: list[list[int]] = [list() for _ in range(num_reqs)]
+    prompt_token_ids: list[list[int]] = [list() for _ in range(num_reqs)]
+    presence_penalties = [0.0 for _ in range(num_reqs)]
+    frequency_penalties = [0.0 for _ in range(num_reqs)]
+    repetition_penalties = [1.0 for _ in range(num_reqs)]
+    top_k = [0 for _ in range(num_reqs)]
+    top_p = [0.0 for _ in range(num_reqs)]
+    temperature = [0.0 for _ in range(num_reqs)]
+    min_tokens = {}
+    logit_bias = [None] * num_reqs
+    allowed_token_ids_mask = torch.zeros(num_reqs,
+                                         VOCAB_SIZE,
+                                         dtype=torch.bool,
+                                         device=device)
+    bad_words_token_ids = {}
+    for req in reqs:
+        if req.req_id not in req_ids_retained:
+            continue
+        index_in_input_batch = req_id_index_in_input_batch[req.req_id]
+        output_token_ids[index_in_input_batch] = req.output_token_ids
+        prompt_token_ids[index_in_input_batch] = req.prompt_token_ids
+        presence_penalties[
+            index_in_input_batch] = req.sampling_params.presence_penalty
+        frequency_penalties[index_in_input_batch] = (
+            req.sampling_params.frequency_penalty)
+        repetition_penalties[index_in_input_batch] = (
+            req.sampling_params.repetition_penalty)
+        top_k[index_in_input_batch] = req.sampling_params.top_k
+        top_p[index_in_input_batch] = req.sampling_params.top_p
+        temperature[index_in_input_batch] = req.sampling_params.temperature
+        min_tokens[index_in_input_batch] = (
+            req.sampling_params.min_tokens,
+            req.sampling_params.all_stop_token_ids)
+        logit_bias[index_in_input_batch] = req.sampling_params.logit_bias
+        if req.sampling_params.allowed_token_ids:
+            allowed_token_ids_mask[index_in_input_batch][
+                req.sampling_params.allowed_token_ids] = True
+        if req.sampling_params.bad_words_token_ids:
+            bad_words_token_ids[
+                index_in_input_batch] = req.sampling_params.bad_words_token_ids
+
+    return SamplingMetadata(
+        temperature=torch.tensor(temperature, dtype=torch.float,
+                                 device=device),
+        all_greedy=False,
+        all_random=True,
+        top_p=None if all(x == 1.0 for x in top_p) else torch.tensor(
+            top_p, dtype=torch.float, device=device),
+        top_k=None if all(x == 0 for x in top_k) else torch.tensor(
+            top_k, dtype=torch.int, device=device),
+        generators={},
+        max_num_logprobs=0,
+        prompt_token_ids=make_tensor_with_pad(
+            prompt_token_ids,
+            pad=VOCAB_SIZE,
+            device=torch.device(device),
+            dtype=torch.int64,
+        ),
+        frequency_penalties=torch.tensor(frequency_penalties,
+                                         dtype=torch.float,
+                                         device=device),
+        presence_penalties=torch.tensor(presence_penalties,
+                                        dtype=torch.float,
+                                        device=device),
+        repetition_penalties=torch.tensor(repetition_penalties,
+                                          dtype=torch.float,
+                                          device=device),
+        output_token_ids=output_token_ids,
+        no_penalties=(all(x == 0 for x in presence_penalties)
+                      and all(x == 0 for x in frequency_penalties)
+                      and all(x == 1 for x in repetition_penalties)),
+        allowed_token_ids_mask=allowed_token_ids_mask,
+        bad_words_token_ids=bad_words_token_ids,
+        logitsprocs=LogitsProcessors(),
     )
 
 
-class TestInputBatch(TestBase):
+def _create_sampling_params():
+    return SamplingParams(
+        top_k=np.random.randint(1, 10),
+        top_p=np.random.uniform(0.0, 1.0),
+        presence_penalty=np.random.uniform(-2.0, 2.0),
+        repetition_penalty=np.random.uniform(0.0, 2.0),
+        frequency_penalty=np.random.uniform(-2.0, 2.0),
+        min_tokens=np.random.randint(1, 10),
+        stop_token_ids=[
+            np.random.randint(0, VOCAB_SIZE)
+            for _ in range(np.random.randint(10))
+        ],
+        logit_bias={0: np.random.uniform(-3.0, 3.0)},
+    )
 
-    def setUp(self):
-        self.max_num_reqs = 10
-        self.max_model_len = 32
-        self.max_num_batched_tokens = 132
-        self.vocab_size = 1000
-        self.device = torch.device("cpu")
-        self.block_sizes = [128]
 
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
-            max_num_batched_tokens=self.max_num_batched_tokens,
-            device=self.device,
-            pin_memory=False,
-            vocab_size=self.vocab_size,
-            block_sizes=self.block_sizes,
-        )
-        self.cached_request_state = mock_cached_request_state()
+def _construct_cached_request_state(req_id_suffix: int):
+    prompt_token_ids = [
+        np.random.randint(0, VOCAB_SIZE)
+        for _ in range(np.random.randint(0, MAX_PROMPT_SIZE))
+    ]
+    output_token_ids = [
+        np.random.randint(0, VOCAB_SIZE)
+        for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS))
+    ]
+    return CachedRequestState(
+        req_id=f"req_id_{req_id_suffix}",
+        prompt_token_ids=prompt_token_ids,
+        sampling_params=_create_sampling_params(),
+        pooling_params=None,
+        mm_kwargs=[],
+        mm_positions=[],
+        block_ids=([], ),
+        generator=None,
+        num_computed_tokens=len(output_token_ids),
+        output_token_ids=output_token_ids,
+    )
 
-    def test_shapes_and_defaults(self):
-        # torch tensor shape assertions
-        self.assertEqual(self.input_batch.token_ids_cpu_tensor.shape,
-                         (self.max_num_reqs, self.max_model_len))
-        self.assertEqual(self.input_batch.temperature.shape,
-                         (self.max_num_reqs, ))
-        self.assertEqual(self.input_batch.top_k.shape, (self.max_num_reqs, ))
-        self.assertEqual(self.input_batch.min_p_cpu_tensor.shape,
-                         (self.max_num_reqs, ))
 
-        # numpy shape assertions
-        self.assertEqual(self.input_batch.token_ids_cpu.shape,
-                         (self.max_num_reqs, self.max_model_len))
-        self.assertEqual(self.input_batch.num_tokens.shape,
-                         (self.max_num_reqs, ))
-        self.assertEqual(self.input_batch.num_tokens.shape,
-                         (self.max_num_reqs, ))
+@pytest.mark.parametrize("device", ["cpu"])
+@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
+def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
+    """
+    Tests the logic for managing sampling metadata in the InputBatch.
 
-        # type assertions
-        self.assertIsInstance(self.input_batch.greedy_reqs, set)
-        self.assertIsInstance(self.input_batch.req_id_to_index, dict)
-        self.assertIsInstance(self.input_batch.sampling_metadata,
-                              SamplingMetadata)
-        self.assertIsInstance(self.input_batch.block_table,
-                              MultiGroupBlockTable)
-        self.assertIsNone(self.input_batch.allowed_token_ids_mask)
-        self.assertIsNone(self.input_batch.allowed_token_ids_mask_cpu_tensor)
+    This test involves adding a set of requests to the InputBatch,
+    followed by removing a subset of them. Afterward, the batch is compacted,
+    and the `make_sampling_metadata` method is invoked on the batch. The
+    output of `make_sampling_metadata` is then compared against the expected
+    results to ensure correctness.
 
-    def test_add_request(self):
-        # case1: add a new req
-        self.input_batch.add_request(self.cached_request_state)
-        self.assertIn(self.cached_request_state.req_id,
-                      self.input_batch.req_id_to_index)
-        req_index = self.input_batch.req_id_to_index[
-            self.cached_request_state.req_id]
-        self.assertEqual(self.input_batch.num_prompt_tokens[req_index],
-                         len(self.cached_request_state.prompt_token_ids))
-        self.assertEqual(self.input_batch.num_tokens[req_index],
-                         self.cached_request_state.num_tokens)
+    Note: Ignore logits processor logic, which is tested separately
+    """
+    input_batch: InputBatch = InputBatch(
+        max_num_reqs=batch_size,
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        device=torch.device(device),
+        pin_memory=is_pin_memory_available(),
+        vocab_size=1024,
+        block_sizes=[1],
+    )
+    reqs: list[CachedRequestState] = []
+    req_id_reqs = {}
+    req_id_output_token_ids = {}
 
-        # case2: add an existing req, maybe need update
-        self.cached_request_state.output_token_ids.extend([7, 8, 9])
-        self.cached_request_state.num_computed_tokens += 3
-        cached_index = self.input_batch.req_id_to_index[
-            self.cached_request_state.req_id]
-        self.input_batch.add_request(self.cached_request_state, cached_index)
-        # check if this index in the input_batch is updated
-        # This np arrat "token_ids_cpu" should be filled with prompt_token_ids + output_token_ids
-        self.assertTrue(
-            np.all(self.input_batch.token_ids_cpu[
-                cached_index, :self.cached_request_state.num_tokens]),
-            msg=f"Token IDs at index {cached_index} did not update correctly.")
+    # Add requests
+    for req_index in range(batch_size):
+        req: CachedRequestState = _construct_cached_request_state(req_index)
+        assigned_req_index = input_batch.add_request(req)
+        assert req_index == assigned_req_index
+        reqs.append(req)
+        req_id_reqs[req.req_id] = req
+        req_id_output_token_ids[req.req_id] = req.output_token_ids
 
-        # case3: add req that greater than max_num_reqs
-        with self.assertRaises(AssertionError):
-            self.input_batch.add_request(self.cached_request_state,
-                                         req_index=self.max_num_reqs)
+    # Remove some requests
+    req_ids_to_remove = _remove_requests(input_batch, batch_size, reqs)
+    req_ids_retained = set(req_id_reqs.keys()) - req_ids_to_remove
 
-        # case4: add req that out of max_model_len
-        long_prompt = list(range(self.max_model_len + 1))
-        long_request = mock_cached_request_state(req_id="2",
-                                                 prompt=long_prompt,
-                                                 output=[10])
-        with self.assertRaises(ValueError) as cm:
-            self.input_batch.add_request(long_request)
-        self.assertIn("could not broadcast", str(cm.exception))
+    # Compact the input batch
+    input_batch.condense()
 
-    def test_remove_request(self):
-        self.input_batch.add_request(self.cached_request_state)
-        req_index = self.input_batch.remove_request(
-            self.cached_request_state.req_id)
-        self.assertIsNotNone(req_index)
-        self.assertNotIn(self.cached_request_state.req_id,
-                         self.input_batch.req_id_to_index)
-        self.assertIsNone(self.input_batch._req_ids[req_index])
+    # Generate the sampling metadata
+    sampling_metadata = input_batch._make_sampling_metadata()
 
-    def test_condense(self):
-        # Let's say we have some requests like below
-        # Index     Req ID
-        #   0         1
-        #   1         2
-        #   2         3
-        #   3         4
-        for i in range(4):
-            request = mock_cached_request_state(req_id=str(i + 1))
-            self.input_batch.add_request(request)
-        removed_req_indices = []
-        id_to_remove = ["2", "4"]  # IDs to remove
-        for req_id in id_to_remove:
-            removed_index = self.input_batch.remove_request(req_id)
-            if removed_index is not None:
-                removed_req_indices.append(removed_index)
-        self.assertEqual(len(removed_req_indices), len(id_to_remove))
-        self.input_batch.condense(sorted(removed_req_indices, reverse=True))
+    # Create expected output.
+    expected_sampling_metadata = _construct_expected_sampling_metadata(
+        reqs,
+        req_ids_retained,
+        input_batch.req_id_to_index,
+        device=torch.device(device))
 
-        # Check if the remaining requests are condensed correctly
-        indices = [
-            self.input_batch.req_id_to_index[req_id] for req_id in ["1", "3"]
-        ]
-        self.assertTrue(all(idx < self.input_batch.num_reqs
-                            for idx in indices))
+    def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool:
+        return (t1 is None
+                and t2 is None) or (t1 is not None and t2 is not None
+                                    and torch.allclose(t1, t2))
 
-        for i in range(self.input_batch.num_reqs):
-            self.assertIsNotNone(self.input_batch._req_ids[i])
-        for i in range(self.input_batch.num_reqs,
-                       len(self.input_batch._req_ids)):
-            self.assertIsNone(self.input_batch._req_ids[i])
+    # Assert the actual and expected output.
+    assert torch.allclose(expected_sampling_metadata.temperature,
+                          sampling_metadata.temperature)
+    assert same(expected_sampling_metadata.top_p, sampling_metadata.top_p)
+    assert same(expected_sampling_metadata.top_k, sampling_metadata.top_k)
+    assert torch.allclose(
+        expected_sampling_metadata.frequency_penalties,
+        sampling_metadata.frequency_penalties,
+    )
+    assert torch.allclose(
+        expected_sampling_metadata.presence_penalties,
+        sampling_metadata.presence_penalties,
+    )
+    assert torch.allclose(
+        expected_sampling_metadata.repetition_penalties,
+        sampling_metadata.repetition_penalties,
+    )
+    assert torch.allclose(expected_sampling_metadata.prompt_token_ids,
+                          sampling_metadata.prompt_token_ids)
+    assert (expected_sampling_metadata.output_token_ids ==
+            sampling_metadata.output_token_ids)
+    assert expected_sampling_metadata.no_penalties == \
+           sampling_metadata.no_penalties
+    if sampling_metadata.allowed_token_ids_mask:
+        assert torch.allclose(
+            expected_sampling_metadata.allowed_token_ids_mask,
+            sampling_metadata.allowed_token_ids_mask)
+    assert expected_sampling_metadata.bad_words_token_ids == \
+        sampling_metadata.bad_words_token_ids
 
-        for req_id in ["1", "3"]:
-            idx = self.input_batch.req_id_to_index[req_id]
-            tokens = self.input_batch.token_ids_cpu[idx]
-            self.assertTrue(
-                tokens.any(),
-                f"Tokens at index {idx} for req {req_id} should not be all zero"
-            )
+
+@pytest.mark.parametrize("device", ["cpu"])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize("swap_list", [((0, 1), )])
+def test_swap_states_in_input_batch(device: str, batch_size: int,
+                                    swap_list: list):
+    """
+    Tests the logic for managing sampling metadata in the InputBatch.
+
+    This test involves adding a set of requests to the InputBatch,
+    followed by removing a subset of them. Afterward, the batch is compacted,
+    and the `make_sampling_metadata` method is invoked on the batch. The
+    output of `make_sampling_metadata` is then compared against the expected
+    results to ensure correctness.
+
+    Note: Ignore logits processor logic, which is tested separately
+    """
+    input_batch: InputBatch = InputBatch(
+        max_num_reqs=batch_size,
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        device=torch.device(device),
+        pin_memory=is_pin_memory_available(),
+        vocab_size=1024,
+        block_sizes=[1],
+    )
+    ref_input_batch: InputBatch = InputBatch(
+        max_num_reqs=batch_size,
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        device=torch.device(device),
+        pin_memory=is_pin_memory_available(),
+        vocab_size=1024,
+        block_sizes=[1],
+    )
+
+    reqs: list[CachedRequestState] = []
+    req_id_reqs = {}
+    req_id_output_token_ids = {}
+    # Add requests
+    for req_index in range(batch_size):
+        req: CachedRequestState = _construct_cached_request_state(req_index)
+        assigned_req_index = input_batch.add_request(req)
+        assert assigned_req_index == req_index
+        reqs.append(req)
+        req_id_reqs[req.req_id] = req
+        req_id_output_token_ids[req.req_id] = req.output_token_ids
+
+    reordered_reqs = reqs.copy()
+    for swap_pair in swap_list:
+        reordered_reqs[swap_pair[0]], reordered_reqs[swap_pair[1]] = \
+            reordered_reqs[swap_pair[1]], reordered_reqs[swap_pair[0]]
+        input_batch.swap_states(swap_pair[0], swap_pair[1])
+
+    for req_index in range(batch_size):
+        req = reordered_reqs[req_index]
+        assigned_req_index = ref_input_batch.add_request(req)
+        assert assigned_req_index == req_index
+
+    input_batch.refresh_metadata()
+    ref_input_batch.refresh_metadata()
+
+    _compare_objs(input_batch, ref_input_batch)
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index e607517eb..700fafdca 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -4,10 +4,11 @@ from enum import Enum
 from typing import Any, Optional
 
 import torch
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import (get_dp_group, get_ep_group,
                               get_tensor_model_parallel_world_size)
-from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.forward_context import (BatchDescriptor, get_forward_context,
+                                  set_forward_context)
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.moe_comm_method import MoECommMethod
@@ -48,26 +49,31 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
 
 @contextmanager
 def set_ascend_forward_context(
-    attn_metadata: Any,
-    vllm_config: VllmConfig,
-    virtual_engine: int = 0,
-    num_tokens: Optional[int] = None,
-    num_tokens_across_dp: Optional[torch.Tensor] = None,
-    with_prefill: bool = True,
-    in_profile_run: bool = False,
-    reserved_mc2_mask: Optional[torch.Tensor] = None,
-    moe_comm_method: Optional[MoECommMethod] = None,
-    num_actual_tokens: Optional[int] = None,
-):
+        attn_metadata: Any,
+        vllm_config: VllmConfig,
+        virtual_engine: int = 0,
+        num_tokens: Optional[int] = None,
+        num_tokens_across_dp: Optional[torch.Tensor] = None,
+        with_prefill: bool = True,
+        in_profile_run: bool = False,
+        reserved_mc2_mask: Optional[torch.Tensor] = None,
+        moe_comm_method: Optional[MoECommMethod] = None,
+        num_actual_tokens: Optional[int] = None,
+        aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
+        batch_descriptor: Optional[BatchDescriptor] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     We add some additional param into forward_context.
     """
-    with set_forward_context(attn_metadata,
-                             vllm_config,
-                             virtual_engine=virtual_engine,
-                             num_tokens=num_tokens,
-                             num_tokens_across_dp=num_tokens_across_dp):
+    with set_forward_context(
+            attn_metadata,
+            vllm_config,
+            virtual_engine=virtual_engine,
+            num_tokens=num_tokens,
+            num_tokens_across_dp=num_tokens_across_dp,
+            cudagraph_runtime_mode=aclgraph_runtime_mode,
+            batch_descriptor=batch_descriptor,
+    ):
         forward_context = get_forward_context()
         forward_context.moe_comm_method = moe_comm_method
         forward_context.with_prefill = with_prefill
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 81a337563..87d698509 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -20,14 +20,17 @@ from enum import Enum
 from typing import List, Optional, Tuple, Type
 
 import torch
+import torch.nn as nn
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.config import VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
-from vllm.utils import direct_register_custom_op
+from vllm.utils import cdiv, direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
 
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d, nd_to_nz_spec)
@@ -157,35 +160,49 @@ class AscendMetadata:
 
 class AscendAttentionMetadataBuilder:
 
-    def __init__(self, runner):
-        self.runner = runner
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.device = device
+        self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
+                                           vllm_config.cache_config.block_size)
 
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         return False
 
-    def build(self,
-              num_reqs,
-              num_actual_tokens,
-              max_query_len,
-              enable_dbo_across_dp: bool = False,
-              is_only_prefill: bool = False,
-              *args,
-              **kwargs):
+    def build(
+        self,
+        common_attn_metadata: AscendCommonAttentionMetadata,
+        model: nn.Module,
+    ):
+        num_reqs = common_attn_metadata.num_reqs
+        num_actual_tokens = common_attn_metadata.num_actual_tokens
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:
+                                                                       num_reqs
+                                                                       + 1]
 
-        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
-        )
-        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+        block_table = common_attn_metadata.block_table_tensor
+        block_table[:num_reqs, :self.max_num_blocks_per_req] = (
             block_table[:num_reqs])
 
-        query_lens = self.runner.query_lens
-        seq_lens = self.runner.seq_lens_cpu[:num_reqs]
-        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
-            self.runner.device, non_blocking=True)
-        attn_mask = self.runner.attn_mask
-        attn_state = self.runner.attn_state
-        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
+        query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
+        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
+                                                             num_actual_tokens].to(
+                                                                 self.device,
+                                                                 non_blocking=
+                                                                 True)
+        attn_mask = common_attn_metadata.attn_mask
+        attn_state = common_attn_metadata.attn_state
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:
+                                                                       num_reqs
+                                                                       + 1]
+        query_start_loc = query_start_loc_cpu.to(self.device,
                                                  non_blocking=True)
 
         if is_310p():
@@ -204,12 +221,12 @@ class AscendAttentionMetadataBuilder:
             query_start_loc=query_start_loc,
             query_lens=query_lens,
             seq_lens=seq_lens,
-            max_query_len=max_query_len,
+            max_query_len=common_attn_metadata.max_query_len,
             slot_mapping=slot_mapping,
             attn_mask=attn_mask,
             attn_state=attn_state,
-            enable_dbo_across_dp=enable_dbo_across_dp,
-            is_only_prefill=is_only_prefill)
+            enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp,
+            is_only_prefill=common_attn_metadata.is_only_prefill)
         return attn_metadata
 
 
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index a52d117b4..72a2d4f78 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
 
 import numpy as np
 import torch
+import torch.nn as nn
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
-from vllm.config import get_current_vllm_config
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
@@ -17,11 +18,14 @@ from vllm.utils import cdiv, round_down
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
+                                         split_decodes_and_prefills)
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
+                                        npu_stream_switch, npu_wait_tensor)
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
@@ -172,20 +176,24 @@ class AscendMLAMetadataBuilder:
 
     # _attn_mask_builder = None
     def __init__(self,
-                 runner,
+                 vllm_config: VllmConfig,
+                 device: torch.device,
                  metadata_cls: Optional[AscendMLAMetadata] = None):
         self.metadata_cls: Optional[AscendMLAMetadata] = metadata_cls \
             if metadata_cls is not None else AscendMLAMetadata  # type: ignore
-        self.runner = runner
-        scheduler_config = runner.scheduler_config
-        model_config = runner.model_config
-        self.block_size = runner.block_size
-        self.chunked_prefill_enabled = runner.chunked_prefill_enabled
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.device = device
+        scheduler_config = vllm_config.scheduler_config
+        self.block_size = vllm_config.cache_config.block_size
+        self.max_blocks = (vllm_config.model_config.max_model_len +
+                           self.block_size - 1) // self.block_size
+        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
             self.chunked_prefill_workspace_size = min(
                 # Max sure there is enough for 8 full length request or at least
                 # 4 pages of cache per request
-                max(8 * model_config.max_model_len,
+                max(8 * self.model_config.max_model_len,
                     4 * scheduler_config.max_num_seqs * self.block_size),
                 # For long-context models try not to over-allocate limiting
                 # kv-cache space, limiting it to 64k tokens,
@@ -200,13 +208,13 @@ class AscendMLAMetadataBuilder:
                 scheduler_config.max_num_seqs * self.block_size
             self.chunked_prefill_workspace = torch.empty(
                 (self.chunked_prefill_workspace_size,
-                 model_config.get_head_size()),
-                dtype=model_config.dtype,
-                device=runner.device,
+                 self.model_config.get_head_size()),
+                dtype=self.model_config.dtype,
+                device=device,
             )
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-        self.rope_dim = self.runner.model_config.hf_text_config.qk_rope_head_dim
+        self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
         self.cos_cache = None
         self.sin_cache = None
 
@@ -220,8 +228,6 @@ class AscendMLAMetadataBuilder:
         # better naming here)
         decodes = []
         prefills = []
-        num_decode_tokens = 0
-        num_prefill_tokens = 0
 
         for i, req_id in enumerate(input_batch.req_ids):
             num_tokens = scheduler_output.num_scheduled_tokens[req_id]
@@ -231,18 +237,14 @@ class AscendMLAMetadataBuilder:
             if self.torchair_graph_enabled:
                 if num_tokens - num_spec_tokens == 1:
                     decodes.append(i)
-                    num_decode_tokens += num_tokens
                 else:
                     prefills.append(i)
-                    num_prefill_tokens += num_tokens
             # For eager mode we treat spec decoding as chunked prefill.
             else:
                 if num_tokens == 1:
                     decodes.append(i)
-                    num_decode_tokens += num_tokens
                 else:
                     prefills.append(i)
-                    num_prefill_tokens += num_tokens
 
         # We hope that this is fairly minimal since decodes
         # should be around for a number of iterations so hopefully they are
@@ -273,26 +275,15 @@ class AscendMLAMetadataBuilder:
         # Save for next `build` call
         # TODO(lucas): this is a bit of a hack, we should probably have a
         # better way of doing this
-        self._num_decodes = num_decodes
-        self._num_prefills = num_prefills
-        self._num_decode_tokens = num_decode_tokens
-        self._num_prefill_tokens = num_prefill_tokens
-
         return modified_batch
 
     def _get_graph_runner_block_tables(
             self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor:
+        max_blocks = self.max_blocks
 
-        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
-        assert max_batch_size >= num_seqs, f"max_batch_size: {max_batch_size} should be bigger than cur_num_seqs: {num_seqs}"
-
-        if isinstance(self.runner.graph_block_tables, np.ndarray):
-            graph_block_tables = torch.zeros((max_batch_size, max_blocks),
-                                             dtype=block_tables.dtype,
-                                             device=block_tables.device)
-        else:
-            graph_block_tables = self.runner.graph_block_tables.to(
-                device=block_tables.device, dtype=block_tables.dtype)
+        graph_block_tables = torch.zeros((num_seqs, max_blocks),
+                                         dtype=block_tables.dtype,
+                                         device=block_tables.device)
 
         num_blocks = block_tables.size(1)
         if num_blocks <= max_blocks:
@@ -304,18 +295,20 @@ class AscendMLAMetadataBuilder:
                                max_blocks] = block_tables[:num_seqs, :
                                                           max_blocks]
 
-        return graph_block_tables[:num_seqs, :max_blocks]
+        return graph_block_tables[:, :max_blocks]
 
     def build_torchair_graph_dummy(
-            self, num_reqs: int, num_actual_tokens: int) -> AscendMLAMetadata:
-        device = self.runner.device
-        _, max_blocks = self.runner.graph_block_tables.shape
-        block_table = torch.zeros((num_reqs, max_blocks),
+        self,
+        common_attn_metadata: TorchairCommonAttentionMetadata,
+    ) -> AscendMLAMetadata:
+        device = self.device
+        num_reqs = common_attn_metadata.num_reqs
+        block_table = torch.zeros((num_reqs, self.max_blocks),
                                   dtype=torch.int32,
                                   device=device)
         block_table = self._get_graph_runner_block_tables(
             num_reqs, block_table)
-        num_tokens = num_reqs * self.runner.decode_token_per_req
+        num_tokens = num_reqs * common_attn_metadata.decode_token_per_req
         seq_lens = torch.zeros(num_reqs, dtype=torch.int32, device=device)
         seq_lens_list = [0] * num_reqs
         input_positions = torch.zeros(num_tokens,
@@ -333,16 +326,16 @@ class AscendMLAMetadataBuilder:
                          1,
                          1,
                          self.rope_dim,
-                         dtype=self.runner.dtype,
+                         dtype=self.model_config.dtype,
                          device=device)
         cos = torch.ones(num_tokens,
                          1,
                          1,
                          self.rope_dim,
-                         dtype=self.runner.dtype,
+                         dtype=self.model_config.dtype,
                          device=device)
-        if self.runner.speculative_config is not None and\
-            self.runner.speculative_config.method == 'deepseek_mtp':
+        if self.vllm_config.speculative_config is not None and\
+            self.vllm_config.speculative_config.method == 'deepseek_mtp':
             attn_state = AscendAttentionState.SpecDecoding
             num_decode_tokens = 2
         else:
@@ -354,20 +347,21 @@ class AscendMLAMetadataBuilder:
             seq_lens=seq_lens,
             seq_lens_list=seq_lens_list,
             max_seq_lens=1,
-            attn_mask=self.runner.spec_attn_mask,
-            actual_seq_lengths_q=self.runner.actual_seq_lengths_q[:num_reqs],
+            attn_mask=common_attn_metadata.spec_attn_mask,
+            actual_seq_lengths_q=common_attn_metadata.
+            actual_seq_lengths_q[:num_reqs],
             sin=sin,
             cos=cos,
         )
         return self.metadata_cls(  # type: ignore
-            num_input_tokens=num_actual_tokens,
-            num_actual_tokens=num_actual_tokens,
+            num_input_tokens=common_attn_metadata.num_actual_tokens,
+            num_actual_tokens=common_attn_metadata.num_actual_tokens,
             slot_mapping=slot_mapping,
-            head_dim=self.runner.model_config.get_head_size(),
+            head_dim=self.model_config.get_head_size(),
             num_decodes=1,
             num_decode_tokens=num_decode_tokens,
             num_prefills=0,
-            attn_mask=self.runner.attn_mask,
+            attn_mask=common_attn_metadata.attn_mask,
             attn_state=attn_state,
             prefill=None,
             decode=decode_metadata,
@@ -378,58 +372,68 @@ class AscendMLAMetadataBuilder:
 
     def build(
         self,
-        num_reqs: int,
-        num_actual_tokens: int,
-        max_query_len: int,
-        graph_pad_size: int = -1,
-        query_start_loc: torch.Tensor = None,
-        enable_dbo_across_dp: bool = False,
-        *args,
-        **kwargs,
+        common_attn_metadata: AscendCommonAttentionMetadata,
+        model: nn.Module,
     ) -> AscendMLAMetadata:
-        assert self._num_decodes + self._num_prefills == num_reqs
+        num_reqs = common_attn_metadata.num_reqs
+        num_actual_tokens = common_attn_metadata.num_actual_tokens
+        query_start_loc = common_attn_metadata.query_start_loc
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        if self.torchair_graph_enabled and common_attn_metadata.attn_state in [
+                AscendAttentionState.DecodeOnly,
+                AscendAttentionState.SpecDecoding
+        ]:
+            decode_threshold = common_attn_metadata.decode_token_per_req
+        else:
+            # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding
+            decode_threshold = 1
+        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
+            split_decodes_and_prefills(common_attn_metadata, decode_threshold=decode_threshold)
+        assert num_decodes + num_prefills == num_reqs
+        assert num_decode_tokens + num_prefill_tokens == num_actual_tokens
 
         # Note(simon): be careful about the CPU <> GPU memory movement in this
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
-        device = self.runner.device
+        device = self.device
 
-        block_table = (self.runner.input_batch.block_table[0].
-                       get_device_tensor()[:num_reqs])
-        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
-            device, non_blocking=True)
-        input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
-            device, non_blocking=True).long()
+        block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
+        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
+                                                             num_actual_tokens].to(
+                                                                 device,
+                                                                 non_blocking=
+                                                                 True)
+        input_positions = common_attn_metadata.positions[:
+                                                         num_actual_tokens].long(
+                                                         )
 
-        seq_lens_cpu = self.runner.seq_lens_cpu[:num_reqs]
-        query_lens = seq_lens_cpu - self.runner.input_batch.num_computed_tokens_cpu_tensor[:
-                                                                                           num_reqs]
-        seq_lens = seq_lens_cpu
-        max_query_len = query_lens.max().item()
-        max_seq_lens = seq_lens.max().item()
         if self.cos_cache is None:
-            self.cos_cache = self.runner.get_model(
-            ).model.layers[0].self_attn.rotary_emb.cos_cached
-            self.sin_cache = self.runner.get_model(
-            ).model.layers[0].self_attn.rotary_emb.sin_cached
-        if self.cos_cache.dtype != self.runner.dtype:  # type: ignore
+            self.cos_cache = model.model.layers[
+                0].self_attn.rotary_emb.cos_cached
+            self.sin_cache = model.model.layers[
+                0].self_attn.rotary_emb.sin_cached
+        if self.cos_cache.dtype != self.model_config.dtype:  # type: ignore
             self.cos_cache = self.cos_cache.to(  # type: ignore
-                self.runner.dtype)  # type: ignore
+                self.model_config.dtype)  # type: ignore
             self.sin_cache = self.sin_cache.to(  # type: ignore
-                self.runner.dtype)  # type: ignore
+                self.model_config.dtype)  # type: ignore
+
+        query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        query_lens = query_seq_lens_cpu[:num_reqs]
+        seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
+        num_computed_tokens_cpu = (seq_lens - query_lens)
 
         prefill_metadata = None
         chunked_context_metadata = None
-        if self._num_prefills > 0:
-            reqs_start = self._num_decodes  # prefill_start
-            tokens_start = self._num_decode_tokens
+        if num_prefills > 0:
+            reqs_start = num_decodes  # prefill_start
+            tokens_start = num_decode_tokens
             max_query_len = query_lens[tokens_start:].max().item()
             max_seq_lens = seq_lens[tokens_start:].max().item()
             prefill_query_start_loc = query_start_loc[
                 reqs_start:] - query_start_loc[reqs_start]
 
-            context_lens_cpu = self.runner.input_batch.num_computed_tokens_cpu_tensor[
-                reqs_start:num_reqs]
+            context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
             max_context_len_cpu = context_lens_cpu.max().item()
             num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
             if self.chunked_prefill_enabled and max_context_len_cpu > 0:
@@ -441,12 +445,12 @@ class AscendMLAMetadataBuilder:
                 assert max_context_chunk > 0
                 num_chunks = cdiv(max_context_len_cpu, max_context_chunk)
                 chunk_starts = torch.arange(num_chunks, dtype=torch.int32) \
-                    .unsqueeze(1).expand(-1, self._num_prefills) * max_context_chunk
+                    .unsqueeze(1).expand(-1, num_prefills) * max_context_chunk
                 chunk_ends = torch.min(context_lens_cpu.unsqueeze(0),
                                        chunk_starts + max_context_chunk)
                 chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0)
                 cu_seq_lens_cpu = torch.zeros(num_chunks,
-                                              self._num_prefills + 1,
+                                              num_prefills + 1,
                                               dtype=torch.int32,
                                               pin_memory=True)
                 torch.cumsum(chunk_seq_lens,
@@ -470,7 +474,7 @@ class AscendMLAMetadataBuilder:
                 prefill_input_positions].unsqueeze(  # type: ignore
                     1).unsqueeze(2)
             prefill_metadata = AscendMLAPrefillMetadata(
-                attn_mask=self.runner.attn_mask,
+                attn_mask=common_attn_metadata.attn_mask,
                 query_lens=query_lens[tokens_start:],
                 seq_lens=seq_lens,
                 context_lens=seq_lens[tokens_start:],
@@ -485,14 +489,15 @@ class AscendMLAMetadataBuilder:
             )
 
         decode_metadata = None
+        graph_pad_size = common_attn_metadata.graph_pad_size
         use_torchair_graph = graph_pad_size != -1
-        if self._num_decodes > 0:
+        if num_decodes > 0:
             actual_seq_lengths_q = query_start_loc[1:].tolist()
-            max_seq_lens = seq_lens[:self._num_decodes].max().item()
-            seq_lens = seq_lens[:self._num_decode_tokens]
-            input_positions = input_positions[:self._num_decode_tokens]
-            block_table = block_table[:self._num_decode_tokens, ...]
-            if use_torchair_graph and self.runner.attn_state in [
+            max_seq_lens = seq_lens[:num_decodes].max().item()
+            seq_lens = seq_lens[:num_decode_tokens]
+            input_positions = input_positions[:num_decode_tokens]
+            block_table = block_table[:num_decode_tokens, ...]
+            if use_torchair_graph and common_attn_metadata.attn_state in [
                     AscendAttentionState.DecodeOnly,
                     AscendAttentionState.SpecDecoding
             ]:
@@ -500,10 +505,10 @@ class AscendMLAMetadataBuilder:
                 num_token_pad_size = 0
                 if graph_pad_size != 0:
                     pad_value = 0
-                    num_token_pad_size = graph_pad_size - self._num_decode_tokens
+                    num_token_pad_size = graph_pad_size - num_decode_tokens
                     num_reqs_pad_size = (
-                        graph_pad_size // self.runner.decode_token_per_req -
-                        num_reqs)
+                        graph_pad_size //
+                        common_attn_metadata.decode_token_per_req - num_reqs)
                     padded_seq_lens = seq_lens.tolist(
                     ) + [pad_value] * num_reqs_pad_size
                 else:
@@ -531,14 +536,14 @@ class AscendMLAMetadataBuilder:
                 input_positions = torch.cat(
                     [input_positions, position_padding])
                 actual_seq_lengths_q = query_start_loc[1:].tolist(
-                ) + self.runner.actual_seq_lengths_q[num_reqs:num_reqs +
-                                                     num_reqs_pad_size]
+                ) + common_attn_metadata.actual_seq_lengths_q[
+                    num_reqs:num_reqs + num_reqs_pad_size]
             else:
                 seq_lens_list = seq_lens.tolist()
             # mtp torchair + PD scenario, last element of actual_seq_lengths_q must equal to batch_size(num_tokens)
             batch_size = slot_mapping.size(0)
             if actual_seq_lengths_q[-1] != batch_size \
-                and self.runner.attn_state == AscendAttentionState.SpecDecoding:
+                and common_attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
                 actual_seq_lengths_q[-1] = batch_size
 
             cos = self.cos_cache[input_positions].unsqueeze(  # type: ignore
@@ -552,7 +557,7 @@ class AscendMLAMetadataBuilder:
                 seq_lens=seq_lens,
                 seq_lens_list=seq_lens_list,
                 max_seq_lens=max_seq_lens,
-                attn_mask=self.runner.spec_attn_mask,
+                attn_mask=common_attn_metadata.spec_attn_mask,
                 actual_seq_lengths_q=actual_seq_lengths_q,
                 sin=sin,
                 cos=cos)
@@ -561,18 +566,18 @@ class AscendMLAMetadataBuilder:
             num_actual_tokens=num_actual_tokens,
             query_lens=query_lens.tolist(),
             slot_mapping=slot_mapping,
-            head_dim=self.runner.model_config.get_head_size(),
-            num_decodes=self._num_decodes,
-            num_decode_tokens=self._num_decode_tokens,
-            num_prefills=self._num_prefills,
-            attn_mask=self.runner.attn_mask,
-            attn_state=self.runner.attn_state,
+            head_dim=self.model_config.get_head_size(),
+            num_decodes=num_decodes,
+            num_decode_tokens=num_decode_tokens,
+            num_prefills=num_prefills,
+            attn_mask=common_attn_metadata.attn_mask,
+            attn_state=common_attn_metadata.attn_state,
             prefill=prefill_metadata,
             decode=decode_metadata,
             query_start_loc=query_start_loc,
             block_tables=block_table,
             seq_lens=seq_lens,
-            enable_dbo_across_dp=enable_dbo_across_dp,
+            enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp,
         )
 
 
diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py
new file mode 100644
index 000000000..2ef537ff0
--- /dev/null
+++ b/vllm_ascend/attention/utils.py
@@ -0,0 +1,95 @@
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+
+
+@dataclass
+class AscendCommonAttentionMetadata:
+    """
+    Per-batch attention metadata, shared across layers and backends.
+    AttentionMetadataBuilder instances use it to construct per-layer metadata.
+    
+    For many of the tensors we keep both GPU and CPU versions.
+    """
+
+    query_start_loc: torch.Tensor
+    query_start_loc_cpu: torch.Tensor
+    """(batch_size + 1,), the start location of each request in query Tensor"""
+
+    seq_lens_cpu: torch.Tensor
+    """(batch_size,), the length of each request including both computed tokens
+    and newly scheduled tokens"""
+
+    num_reqs: int
+    """Number of requests"""
+    num_actual_tokens: int
+    """Total number of tokens in batch"""
+
+    max_query_len: int
+    """Max token number of request in batch"""
+
+    decode_token_per_req: int
+    """decode token number per request"""
+
+    block_table_tensor: torch.Tensor
+
+    slot_mapping_cpu: torch.Tensor
+
+    actual_seq_lengths_q: list[int]
+
+    positions: torch.Tensor = None
+
+    attn_mask: torch.Tensor = None
+
+    spec_attn_mask: torch.Tensor = None
+
+    attn_state: Any = None
+
+    enable_dbo_across_dp: bool = False
+
+    is_only_prefill: bool = False
+
+    graph_pad_size: int = -1
+
+
+def split_decodes_and_prefills(
+    common_attn_metadata: AscendCommonAttentionMetadata,
+    decode_threshold: int = 1,
+) -> tuple[int, int, int, int]:
+    """
+    Assuming a reordered batch, finds the boundary between prefill and decode
+    requests.
+
+    Args:
+        common_attn_metadata: AscendCommonAttentionMetadata object containing the
+            batch metadata.
+        decode_threshold: The maximum query length to be considered a decode.
+
+    Returns:
+        num_decodes: The number of decode requests.
+        num_prefills: The number of prefill requests.
+        num_decode_tokens: The number of tokens in the decode requests.
+        num_prefill_tokens: The number of tokens in the prefill requests.
+    """
+    max_query_len = common_attn_metadata.max_query_len
+    num_reqs = common_attn_metadata.num_reqs
+    num_tokens = common_attn_metadata.num_actual_tokens
+    query_start_loc = common_attn_metadata.query_start_loc_cpu
+
+    if max_query_len <= decode_threshold:
+        return num_reqs, 0, num_tokens, 0
+
+    query_lens = query_start_loc[1:] - query_start_loc[:-1]
+    is_prefill = query_lens > decode_threshold
+    if not torch.any(is_prefill):
+        return num_reqs, 0, num_tokens, 0
+
+    first_prefill = is_prefill.int().argmax(dim=-1).item()
+    assert torch.all(query_lens[first_prefill:] >= decode_threshold)
+    assert torch.all(query_lens[:first_prefill] <= decode_threshold)
+    num_decodes = first_prefill
+    num_prefills = num_reqs - num_decodes
+    num_decode_tokens = query_start_loc[first_prefill].item()
+    num_prefill_tokens = num_tokens - num_decode_tokens
+    return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens)
diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
new file mode 100644
index 000000000..6f187e2bd
--- /dev/null
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import dataclasses
+from contextlib import ExitStack
+from typing import Any, Callable, Optional
+from unittest.mock import patch
+
+import torch
+import vllm.envs as envs
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.cuda_graph import CUDAGraphOptions
+from vllm.compilation.monitor import validate_cudagraph_capturing_enabled
+from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.forward_context import BatchDescriptor, get_forward_context
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import weak_ref_tensors
+
+logger = init_logger(__name__)
+
+
+@dataclasses.dataclass
+class ACLGraphEntry:
+    batch_descriptor: BatchDescriptor
+    aclgraph: Optional[torch.npu.NPUGraph] = None
+    output: Optional[Any] = None
+
+    # for aclgraph debugging, track the input addresses
+    # during capture, and check if they are the same during replay
+    input_addresses: Optional[list[int]] = None
+
+
+class ACLGraphWrapper:
+    """Wraps a runnable to add acl graph capturing and replaying ability. And
+    provide attribute access to the underlying `runnable` via `__getattr__`.
+
+    The workflow of this wrapper in the aclgraph dispatching is as follows:
+    1. At initialization, a runtime mode is assigned to the wrapper (FULL or
+    PIECEWISE). 
+    2. At runtime, the wrapper receives a runtime_mode and a 
+    batch_descriptor(key) from the forward context and blindly trust them
+    for aclgraph dispatching. 
+    3. If runtime_mode is NONE or runtime_mode does not match the mode of the
+    wrapper, just call the runnable directly.
+    4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
+    the wrapper will perform aclgraph capture(if key does not exist, create
+    a new entry and cache it) or replay (if key exists in the cache).
+
+    Note: ACLGraphWrapper does not store persistent buffers or copy any
+    runtime inputs into that buffers for replay. We assume implementing them
+    is done outside of the wrapper. That is because we do not make any 
+    assumption on the dynamic shape (batch size) of the runtime inputs, as a
+    trade-off for staying orthogonal to compilation logic. Nevertheless, 
+    tracing and checking the input addresses to be consistent during replay is
+    guaranteed when VLLM_LOGGING_LEVEL == "DEBUG".
+    """
+
+    def __init__(self,
+                 runnable: Callable,
+                 vllm_config: VllmConfig,
+                 runtime_mode: CUDAGraphMode,
+                 graph_pool: Any = None,
+                 cudagraph_options: Optional[CUDAGraphOptions] = None):
+        self.runnable = runnable
+        self.vllm_config = vllm_config
+        self.graph_pool = graph_pool
+        self.runtime_mode = runtime_mode
+        self.compilation_config = vllm_config.compilation_config
+
+        self.first_run_finished = False
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
+        # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't
+        # need to initialize a ACLGraphWrapper.
+        assert self.runtime_mode != CUDAGraphMode.NONE
+        if self.graph_pool is None:
+            self.graph_pool = current_platform.get_global_graph_pool()
+
+        if cudagraph_options is None:
+            cudagraph_options = CUDAGraphOptions()
+        self.aclgraph_options = cudagraph_options
+        # the entries for different batch descriptors that we need to capture
+        # aclgraphs for.
+        self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry]\
+                                                                        = {}
+
+    def __getattr__(self, key: str):
+        # allow accessing the attributes of the runnable.
+        if hasattr(self.runnable, key):
+            return getattr(self.runnable, key)
+        raise AttributeError(f"Attribute {key} not exists in the runnable of "
+                             f"aclgraph wrapper: {self.runnable}")
+
+    def unwrap(self) -> Callable:
+        # in case we need to access the original runnable.
+        return self.runnable
+
+    def __call__(self, *args, **kwargs):
+        forward_context = get_forward_context()
+        batch_descriptor = forward_context.batch_descriptor
+        aclgraph_runtime_mode = forward_context.cudagraph_runtime_mode
+
+        if aclgraph_runtime_mode == CUDAGraphMode.NONE or \
+                            aclgraph_runtime_mode != self.runtime_mode:
+            # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
+            # running without aclgraphs.
+            # We do not trigger capture/replay if the runtime mode is not
+            # matches. This enables properly dispatching to the correct
+            # CUDAGraphWrapper when nesting multiple instances with different
+            # runtime modes.
+            return self.runnable(*args, **kwargs)
+
+        if batch_descriptor not in self.concrete_aclgraph_entries:
+            # create a new entry for this batch descriptor
+            self.concrete_aclgraph_entries[batch_descriptor] = \
+                ACLGraphEntry(batch_descriptor=batch_descriptor)
+
+        entry = self.concrete_aclgraph_entries[batch_descriptor]
+
+        if entry.aclgraph is None:
+            if self.aclgraph_options.debug_log_enable:
+                # Since we capture aclgraph for many different shapes and
+                # capturing is fast, we don't need to log it for every
+                # shape. E.g. we only log it for the first subgraph in
+                # piecewise mode.
+                logger.debug("Capturing a aclgraph on (%s,%s)",
+                             self.runtime_mode.name, entry.batch_descriptor)
+            # validate that aclgraph capturing is legal at this point.
+            validate_cudagraph_capturing_enabled()
+
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
+            aclgraph = torch.npu.NPUGraph()
+
+            with ExitStack() as stack:
+                if self.aclgraph_options.gc_disable:
+                    # during every model forward for piecewise aclgraph
+                    # mode, we will capture many pieces of aclgraphs
+                    # (roughly one per layer). running gc again and again
+                    # across layers will make the aclgraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.npu.empty_cache", lambda: None))
+
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.npu.graph(aclgraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's aclgraph pool
+                    output = self.runnable(*args, **kwargs)
+                    if self.aclgraph_options.weak_ref_output:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph in piecewise aclgraph mode, because
+                        # the output of the last graph will not be used by
+                        # any other acl graph.
+                        output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.aclgraph = aclgraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during acl graph capture
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                f"Input addresses for aclgraphs are different "
+                f"during replay. Expected {entry.input_addresses}, "
+                f"got {new_input_addresses}")
+
+        entry.aclgraph.replay()
+        return entry.output
diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py
deleted file mode 100644
index d0160f6d4..000000000
--- a/vllm_ascend/compilation/piecewise_backend.py
+++ /dev/null
@@ -1,225 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/vllm/vllm/compilation/cuda_piecewise_backend.py
-#
-
-import dataclasses
-from contextlib import ExitStack
-from typing import Any, Callable, Dict, List, Optional, Set
-from unittest.mock import patch
-
-import torch
-import torch.fx as fx
-import vllm.envs as envs_vllm
-from vllm.compilation.backends import VllmBackend
-from vllm.compilation.counter import compilation_counter
-from vllm.compilation.monitor import end_monitoring_torch_compile
-from vllm.config import VllmConfig
-from vllm.logger import logger
-from vllm.utils import weak_ref_tensors
-
-
-@dataclasses.dataclass
-class ConcreteSizeEntry:
-    runtime_shape: int
-    need_to_compile: bool  # the size is in compile_sizes
-    use_aclgraph: bool  # the size is in cudagraph_capture_sizes
-
-    compiled: bool = False
-    runnable: Callable = None  # type: ignore
-    num_finished_warmup: int = 0
-    aclgraph: Optional[torch.npu.NPUGraph] = None
-    output: Optional[Any] = None
-
-    # for aclgraph debugging, track the input addresses
-    # during capture, and check if they are the same during replay
-    input_addresses: Optional[List[int]] = None
-
-
-class NPUPiecewiseBackend:
-
-    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
-                 graph_pool: Any, piecewise_compile_index: int,
-                 total_piecewise_compiles: int, sym_shape_indices: List[int],
-                 compiled_graph_for_general_shape: Callable,
-                 vllm_backend: VllmBackend):
-        """
-        The backend for piecewise compilation.
-        It mainly handles the compilation and aclgraph capturing.
-
-        We will compile `self.graph` once for the general shape,
-        and then compile for different shapes specified in
-        `compilation_config.compile_sizes`.
-
-        Independently, we will capture aclgraph for different shapes.
-
-        If a shape needs both compilation and aclgraph, we will
-        compile it first, and then capture aclgraph.
-        """
-        self.graph = graph
-        self.vllm_config = vllm_config
-        self.compilation_config = vllm_config.compilation_config
-        self.graph_pool = graph_pool
-        self.piecewise_compile_index = piecewise_compile_index
-        self.total_piecewise_compiles = total_piecewise_compiles
-        self.vllm_backend = vllm_backend
-
-        self.is_first_graph = piecewise_compile_index == 0
-        self.is_last_graph = (
-            piecewise_compile_index == total_piecewise_compiles - 1)
-
-        self.compile_sizes: Set[int] = set(
-            self.compilation_config.compile_sizes)
-        self.aclgraph_capture_sizes: Set[int] = set(
-            self.compilation_config.cudagraph_capture_sizes
-        ) if self.compilation_config.use_cudagraph else set()
-
-        self.first_run_finished = False
-
-        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
-
-        self.sym_shape_indices = sym_shape_indices
-
-        self.is_debugging_mode = envs_vllm.VLLM_LOGGING_LEVEL == "DEBUG"
-
-        # the entries for different shapes that we need to either
-        # compile or capture aclgraph
-        self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
-
-        # to_be_compiled_sizes tracks the remaining sizes to compile,
-        # and updates during the compilation process, so we need to copy it
-        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
-        for shape in self.compile_sizes.union(self.aclgraph_capture_sizes):
-            self.concrete_size_entries[shape] = ConcreteSizeEntry(
-                runtime_shape=shape,
-                need_to_compile=shape in self.compile_sizes,
-                use_aclgraph=shape in self.aclgraph_capture_sizes,
-            )
-
-    def check_for_ending_compilation(self):
-        if self.is_last_graph and not self.to_be_compiled_sizes:
-            # no specific sizes to compile
-            # save the hash of the inductor graph for the next run
-            self.vllm_backend.compiler_manager.save_to_file()
-            end_monitoring_torch_compile(self.vllm_config)
-
-    def __call__(self, *args) -> Any:
-        if not self.first_run_finished:
-            self.first_run_finished = True
-            self.check_for_ending_compilation()
-            return self.compiled_graph_for_general_shape(*args)
-
-        runtime_shape = args[self.sym_shape_indices[0]]
-        if runtime_shape not in self.concrete_size_entries:
-            # we don't need to do anything for this shape
-            return self.compiled_graph_for_general_shape(*args)
-
-        entry = self.concrete_size_entries[runtime_shape]
-
-        if entry.runnable is None:
-            entry.runnable = self.compiled_graph_for_general_shape
-
-        if entry.need_to_compile and not entry.compiled:
-            entry.compiled = True
-            self.to_be_compiled_sizes.remove(runtime_shape)
-            # args are real arguments
-            entry.runnable = self.vllm_backend.compiler_manager.compile(
-                self.graph,
-                args,
-                self.compilation_config.inductor_compile_config,
-                self.compilation_config,
-                graph_index=self.piecewise_compile_index,
-                num_graphs=self.total_piecewise_compiles,
-                runtime_shape=runtime_shape)
-
-            # finished compilations for all required shapes
-            if self.is_last_graph and not self.to_be_compiled_sizes:
-                self.check_for_ending_compilation()
-
-        if not entry.use_aclgraph:
-            return entry.runnable(*args)
-
-        if entry.aclgraph is None:
-            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
-                entry.num_finished_warmup += 1
-                if self.is_first_graph:
-                    logger.debug(
-                        "Warming up %s/%s for shape %s",
-                        entry.num_finished_warmup,
-                        self.compilation_config.cudagraph_num_of_warmups,
-                        runtime_shape)
-                return entry.runnable(*args)
-
-            if self.is_first_graph:
-                # Since we capture aclgraph for many different shapes and
-                # capturing is fast, we don't need to log it for every shape.
-                # We only log it in the debug mode.
-                logger.debug("Capturing a aclgraph for shape %s",
-                             runtime_shape)
-
-            input_addresses = [
-                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
-            ]
-            entry.input_addresses = input_addresses
-            aclgraph = torch.npu.NPUGraph()
-
-            with ExitStack() as stack:
-                if not self.is_first_graph:
-                    # during every model forward, we will capture
-                    # many pieces of aclgraphs (roughly one per layer).
-                    # running gc again and again across layers will
-                    # make the aclgraph capture very slow.
-                    # therefore, we only run gc for the first graph,
-                    # and disable gc for the rest of the graphs.
-                    stack.enter_context(patch("gc.collect", lambda: None))
-                    stack.enter_context(
-                        patch("torch.npu.empty_cache", lambda: None))
-
-                # mind-exploding: carefully manage the reference and memory.
-                with torch.npu.graph(aclgraph, pool=self.graph_pool):
-                    # `output` is managed by pytorch's aclgraph pool
-                    output = entry.runnable(*args)
-                    if self.is_last_graph:
-                        # by converting it to weak ref,
-                        # the original `output` will immediately be released
-                        # to save memory. It is only safe to do this for
-                        # the last graph, because the output of the last graph
-                        # will not be used by any other npu aclgraph.
-                        output = weak_ref_tensors(output)
-
-            # here we always use weak ref for the output
-            # to save memory
-            entry.output = weak_ref_tensors(output)
-            entry.aclgraph = aclgraph
-            compilation_counter.num_cudagraph_captured += 1
-
-            # important: we need to return the output, rather than
-            # the weak ref of the output, so that pytorch can correctly
-            # manage the memory during npu aclgraph capture
-            return output
-
-        if self.is_debugging_mode:
-            # check if the input addresses are the same
-            new_input_addresses = [
-                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
-            ]
-            assert new_input_addresses == entry.input_addresses, (
-                "Input addresses for aclgraphs are different during replay."
-                f" Expected {entry.input_addresses}, got {new_input_addresses}"
-            )
-
-        entry.aclgraph.replay()
-        return entry.output
diff --git a/vllm_ascend/lora/punica_wrapper/lora_ops.py b/vllm_ascend/lora/punica_wrapper/lora_ops.py
index a8ff21d74..e8bf8ad97 100644
--- a/vllm_ascend/lora/punica_wrapper/lora_ops.py
+++ b/vllm_ascend/lora/punica_wrapper/lora_ops.py
@@ -52,14 +52,9 @@ def bgmv_expand_slice(inputs: torch.Tensor,
                       slice_offset: int,
                       slice_size: int,
                       add_inputs: bool = True):
-    return torch.ops._C.bgmv_expand(
-        inputs,
-        lora_b_weights,
-        lora_indices_tensor,
-        output_tensor,
-        slice_offset,
-        slice_size
-    )
+    return torch.ops._C.bgmv_expand(inputs, lora_b_weights,
+                                    lora_indices_tensor, output_tensor,
+                                    slice_offset, slice_size)
 
 
 def sgmv_shrink(
@@ -74,8 +69,9 @@ def sgmv_shrink(
     token_nums: int,
     scaling: float,
 ):
-    return torch.ops._C.sgmv_shrink(inputs, lora_a_weights, lora_indices_tensor,
-                                    seq_len_tensor, output_tensor, scaling)
+    return torch.ops._C.sgmv_shrink(inputs, lora_a_weights,
+                                    lora_indices_tensor, seq_len_tensor,
+                                    output_tensor, scaling)
 
 
 def sgmv_expand(inputs: torch.Tensor,
@@ -111,12 +107,6 @@ def sgmv_expand_slice(inputs: torch.Tensor,
                       slice_offset: int,
                       slice_size: int,
                       add_inputs: bool = False):
-    return torch.ops._C.sgmv_expand(
-            inputs,
-            lora_b_weights,
-            lora_indices_tensor,
-            seq_len_tensor,
-            output_tensor,
-            slice_offset,
-            slice_size
-    )
+    return torch.ops._C.sgmv_expand(inputs, lora_b_weights,
+                                    lora_indices_tensor, seq_len_tensor,
+                                    output_tensor, slice_offset, slice_size)
diff --git a/vllm_ascend/meta_registration.py b/vllm_ascend/meta_registration.py
index f292e6142..47c775887 100644
--- a/vllm_ascend/meta_registration.py
+++ b/vllm_ascend/meta_registration.py
@@ -80,23 +80,18 @@ def get_masked_input_and_mask_meta(input: torch.Tensor,
 
     return masked_input, mask
 
-def bgmv_expand_meta(x: torch.Tensor,
-                                   weight: torch.Tensor,
-                                   indices: torch.Tensor,
-                                   y: torch.Tensor,
-                                   slice_offset: int,
-                                   slice_size: int):
+
+def bgmv_expand_meta(x: torch.Tensor, weight: torch.Tensor,
+                     indices: torch.Tensor, y: torch.Tensor, slice_offset: int,
+                     slice_size: int):
 
     y_out = torch.empty_like(y)
     return y_out
 
-def sgmv_expand_meta(x: torch.Tensor,
-                                   weight: torch.Tensor,
-                                   lora_indices: torch.Tensor,
-                                   seq_len: torch.Tensor,
-                                   y: torch.Tensor,
-                                   slice_offset: int,
-                                   slice_size: int):
+
+def sgmv_expand_meta(x: torch.Tensor, weight: torch.Tensor,
+                     lora_indices: torch.Tensor, seq_len: torch.Tensor,
+                     y: torch.Tensor, slice_offset: int, slice_size: int):
 
     y_out = torch.empty_like(y)
     return y_out
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 0299accc6..27b922b96 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -139,33 +139,53 @@ class NPUPlatform(Platform):
             enforce_eager = getattr(model_config, "enforce_eager", False)
 
         check_ascend_config(vllm_config, enforce_eager)
+        from vllm.config.compilation import CUDAGraphMode
 
+        # TODO(cmq): update the post init in vllmconfig
+        # if cudagraph_mode is not explicitly set by users, set default value
+        if envs_vllm.VLLM_USE_V1 and compilation_config.level \
+            == CompilationLevel.PIECEWISE:
+            compilation_config.cudagraph_mode = \
+                CUDAGraphMode.PIECEWISE
+        else:
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+        vllm_config._set_cudagraph_sizes()
+
+        # TODO(cmq): update the compilation level config to be determined by CUDAGraphMode
         if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
             logger.info("Compilation disabled, using eager mode by default")
             compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         elif compilation_config.level != CompilationLevel.PIECEWISE:
             logger.warning(
                 "NPU does not support %s compilation level. Setting level to NO_COMPILATION",
                 compilation_config.level)
             compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         elif ascend_config.torchair_graph_config.enabled:
             logger.info(
                 "Torchair compilation enabled on NPU. Setting level to NO_COMPILATION"
             )
             compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         elif parallel_config.distributed_executor_backend == "ray":
             logger.warning(
                 "Ray distributed executor backend is not compatible with ACL Graph mode "
                 "right now. Setting level to NO_COMPILATION")
             compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
         else:
             logger.info(
                 "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
                 "using only ACL Graph mode")
+            if envs_vllm.VLLM_USE_V1 and \
+                compilation_config.level == CompilationLevel.PIECEWISE:
+                compilation_config.set_splitting_ops_for_v1()
             compilation_config.use_inductor = False
             compilation_config.splitting_ops.extend(
                 ["vllm.unified_ascend_attention_with_output"])
             update_aclgraph_sizes(vllm_config)
+            compilation_config.cudagraph_num_of_warmups = 1
 
         if parallel_config and parallel_config.worker_cls == "auto":
             if ascend_config.torchair_graph_config.enabled:
@@ -249,11 +269,11 @@ class NPUPlatform(Platform):
         return True
 
     @classmethod
-    def get_piecewise_backend_cls(cls) -> str:
+    def get_static_graph_wrapper_cls(cls) -> str:
         """
         Get piecewise backend class for piecewise graph.
         """
-        return "vllm_ascend.compilation.piecewise_backend.NPUPiecewiseBackend"  # noqa
+        return "vllm_ascend.compilation.acl_graph.ACLGraphWrapper"  # noqa
 
     @classmethod
     def stateless_init_device_torch_dist_pg(
diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py
index a3fda6103..97123fa6b 100644
--- a/vllm_ascend/torchair/torchair_attention.py
+++ b/vllm_ascend/torchair/torchair_attention.py
@@ -20,15 +20,20 @@ from typing import List, Optional, Tuple, Type
 
 import numpy as np
 import torch
+import torch.nn as nn
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer,
                                               AttentionType)
 from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.config import VllmConfig
+from vllm.utils import cdiv
 
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata)
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
 
@@ -91,22 +96,26 @@ class AscendTorchairMetadata(AscendMetadata):
 
 class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
 
-    def __init__(self, runner):
-        super().__init__(runner)
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        super().__init__(vllm_config, device)
+        self.max_num_blocks_per_req = cdiv(
+            self.model_config.max_model_len,
+            self.vllm_config.cache_config.block_size)
+        self.max_blocks = (self.model_config.max_model_len +
+                           self.vllm_config.cache_config.block_size -
+                           1) // self.vllm_config.cache_config.block_size
 
     def _get_graph_runner_block_tables(
             self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor:
+        max_blocks = self.max_blocks
 
-        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
-        assert max_batch_size >= num_seqs, f"max_batch_size: {max_batch_size} should be bigger than cur_num_seqs: {num_seqs}"
-
-        if isinstance(self.runner.graph_block_tables, np.ndarray):
-            graph_block_tables = torch.zeros((max_batch_size, max_blocks),
-                                             dtype=block_tables.dtype,
-                                             device=block_tables.device)
-        else:
-            graph_block_tables = self.runner.graph_block_tables.to(
-                device=block_tables.device, dtype=block_tables.dtype)
+        graph_block_tables = torch.zeros((num_seqs, max_blocks),
+                                         dtype=block_tables.dtype,
+                                         device=block_tables.device)
 
         num_blocks = block_tables.size(1)
         if num_blocks <= max_blocks:
@@ -118,14 +127,14 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
                                max_blocks] = block_tables[:num_seqs, :
                                                           max_blocks]
 
-        return graph_block_tables[:num_seqs, :max_blocks]
+        return graph_block_tables[:, :max_blocks]
 
     def build_torchair_graph_dummy(
-            self, num_reqs: int,
-            num_actual_tokens: int) -> AscendTorchairMetadata:
-        device = self.runner.device
-        _, max_blocks = self.runner.graph_block_tables.shape
-        block_table = torch.zeros((num_reqs, max_blocks),
+        self, common_attn_metadata: TorchairCommonAttentionMetadata
+    ) -> AscendTorchairMetadata:
+        device = self.device
+        num_reqs = common_attn_metadata.num_reqs
+        block_table = torch.zeros((num_reqs, self.max_blocks),
                                   dtype=torch.int32,
                                   device=device)
         block_table = self._get_graph_runner_block_tables(
@@ -150,7 +159,7 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
                                                max_seq_lens=1)
 
         attn_metadata = AscendTorchairMetadata(
-            num_actual_tokens=num_actual_tokens,
+            num_actual_tokens=common_attn_metadata.num_actual_tokens,
             block_tables=block_table,
             query_lens=0,
             query_start_loc=query_start_loc,
@@ -160,52 +169,50 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
             decode=decode_metadata)
         return attn_metadata
 
-    def build(self,
-              num_reqs,
-              num_actual_tokens,
-              max_query_len,
-              enable_dbo_across_dp: bool = False,
-              is_only_prefill: bool = False,
-              *args,
-              **kwargs):
+    def build(
+        self,
+        common_attn_metadata: AscendCommonAttentionMetadata,
+        model: nn.Module,
+    ):
+        num_reqs = common_attn_metadata.num_reqs
+        num_actual_tokens = common_attn_metadata.num_actual_tokens
 
-        if 'graph_pad_size' in kwargs:
-            graph_pad_size = kwargs['graph_pad_size']
-        else:
-            graph_pad_size = -1  # default value
-
-        device = self.runner.device
-
-        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
-        )
-        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+        block_table = common_attn_metadata.block_table_tensor
+        block_table[:num_reqs, :self.max_num_blocks_per_req] = (
             block_table[:num_reqs])
 
-        query_lens = self.runner.query_lens
-        seq_lens = self.runner.seq_lens_cpu[:num_reqs]
-        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
-            self.runner.device, non_blocking=True)
-        attn_mask = self.runner.attn_mask
+        seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
+        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
+                                                             num_actual_tokens].to(
+                                                                 self.device,
+                                                                 non_blocking=
+                                                                 True)
+        attn_mask = common_attn_metadata.attn_mask
 
-        attn_state = self.runner.attn_state
+        attn_state = common_attn_metadata.attn_state
         if is_310p() and attn_state == AscendAttentionState.PrefillNoCache:
             mask_nz = nd_to_nz_2d(attn_mask)
             attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), 29)
 
-        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:
+                                                                       num_reqs
+                                                                       + 1]
+        query_start_loc = query_start_loc_cpu.to(self.device,
                                                  non_blocking=True)
-        input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
-            device, non_blocking=True).long()
+        query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        input_positions = common_attn_metadata.positions[:
+                                                         num_actual_tokens].long(
+                                                         )
 
         decode_metadata = None
+        graph_pad_size = common_attn_metadata.graph_pad_size
         use_torchair_graph = graph_pad_size > -1
-        if self.runner.attn_state in [
+        if common_attn_metadata.attn_state in [
                 AscendAttentionState.DecodeOnly,
         ]:
             max_seq_lens = seq_lens.max().item()
             num_seqs = len(seq_lens)
-            if use_torchair_graph and self.runner.attn_state in [
+            if use_torchair_graph and common_attn_metadata.attn_state in [
                     AscendAttentionState.DecodeOnly,
             ]:
                 num_reqs_pad_size = 0
@@ -214,8 +221,8 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
                     pad_value = 0
                     num_token_pad_size = graph_pad_size - num_actual_tokens
                     num_reqs_pad_size = (
-                        graph_pad_size // self.runner.decode_token_per_req -
-                        num_reqs)
+                        graph_pad_size //
+                        common_attn_metadata.decode_token_per_req - num_reqs)
                 pad_value = 1
                 padded_seq_lens = seq_lens.tolist() + [pad_value
                                                        ] * num_reqs_pad_size
@@ -255,11 +262,11 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
             query_start_loc=query_start_loc,
             query_lens=query_lens,
             seq_lens=seq_lens,
-            max_query_len=max_query_len,
+            max_query_len=common_attn_metadata.max_query_len,
             slot_mapping=slot_mapping,
             attn_mask=attn_mask,
             attn_state=attn_state,
-            enable_dbo_across_dp=enable_dbo_across_dp)
+            enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp)
         return attn_metadata
 
 
diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py
index a07304b96..62b1e1e6f 100644
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -26,7 +26,8 @@ from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
+                                        check_torchair_cache_exist,
                                         register_torchair_model,
                                         write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
@@ -71,8 +72,16 @@ class NPUTorchairModelRunner(NPUModelRunner):
         # NOTE: If torchair graph mode and not with_prefill,
         # we can't skip_attn, it will cause graph recompile.
         if not with_prefill:
+            common_attn_metadata = TorchairCommonAttentionMetadata(
+                num_reqs=num_reqs,
+                num_actual_tokens=1,
+                actual_seq_lengths_q=self.actual_seq_lengths_q,
+                attn_mask=self.attn_mask,
+                spec_attn_mask=self.spec_attn_mask,
+                decode_token_per_req=self.decode_token_per_req,
+            )
             attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
-                num_reqs=num_reqs, num_actual_tokens=1)
+                common_attn_metadata)
         else:
             attn_metadata = super()._build_attention_metadata(
                 with_prefill, num_reqs, skip_attn)
diff --git a/vllm_ascend/torchair/utils.py b/vllm_ascend/torchair/utils.py
index 0a94494cb..cdc4ba3a1 100644
--- a/vllm_ascend/torchair/utils.py
+++ b/vllm_ascend/torchair/utils.py
@@ -2,6 +2,7 @@ import fcntl
 import os
 import shutil
 from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
 
 import torch
 
@@ -20,6 +21,32 @@ TORCHAIR_CACHE_DIR = os.getenv(
     'TORCHAIR_CACHE_HOME', os.path.join(os.getcwd(), TORCHAIR_CACHE_PATH_NAME))
 
 
+@dataclass
+class TorchairCommonAttentionMetadata:
+    """
+    Per-batch attention metadata, shared across layers and backends.
+    AttentionMetadataBuilder instances use it to construct per-layer metadata.
+    
+    For many of the tensors we keep both GPU and CPU versions.
+    """
+
+    num_reqs: int
+    """Number of requests"""
+
+    num_actual_tokens: int
+    """Total number of tokens in batch"""
+
+    decode_token_per_req: int
+
+    actual_seq_lengths_q: list[int]
+
+    attn_mask: torch.Tensor = None
+
+    spec_attn_mask: torch.Tensor = None
+
+    graph_pad_size: int = -1
+
+
 @contextmanager
 def _file_lock(file_descriptor, lock_type):
     fcntl.flock(file_descriptor, lock_type)
diff --git a/vllm_ascend/worker/eagle_proposer_v1.py b/vllm_ascend/worker/eagle_proposer_v1.py
index 18fb9fda8..895649327 100644
--- a/vllm_ascend/worker/eagle_proposer_v1.py
+++ b/vllm_ascend/worker/eagle_proposer_v1.py
@@ -16,6 +16,7 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 
 PADDING_SLOT_ID = -1
 
@@ -125,12 +126,27 @@ class EagleProposer:
         query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
         max_query_len = query_lens.max().item()
 
-        # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        attn_metadata = self.runner.attn_metadata_builder.build(
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=self.runner.query_start_loc[:batch_size + 1],
+            query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size +
+                                                                1],
+            seq_lens_cpu=self.runner.seq_lens_cpu,
+            max_query_len=max_query_len,
             num_reqs=batch_size,
             num_actual_tokens=num_tokens,
-            max_query_len=max_query_len,
+            actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
+            block_table_tensor=self.runner.input_batch.block_table[0].
+            get_device_tensor(),
+            slot_mapping_cpu=target_slot_mapping,
+            positions=target_positions,
+            attn_mask=self.runner.attn_mask,
+            spec_attn_mask=self.runner.spec_attn_mask,
+            attn_state=self.runner.attn_state,
+            decode_token_per_req=self.runner.decode_token_per_req,
         )
+        # FIXME(woosuk): The below two ops cause synchronization. Optimize.
+        attn_metadata = self.runner.attn_metadata_builder.build(
+            common_attn_metadata, self.runner.model)
         if self.use_cuda_graph and \
             num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index f4450d352..f4169cf7a 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -23,7 +23,6 @@ import math
 import os
 import time
 import types
-import weakref
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union, cast
@@ -34,16 +33,21 @@ import torch
 import torch._dynamo.cache_size
 import torch.distributed as dist
 import torch.nn as nn
+from tqdm import tqdm  # type: ignore
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.layer import Attention
-from vllm.config import CompilationLevel, VllmConfig
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.monitor import set_cudagraph_capturing_enabled
+from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dp_group, get_pp_group,
-                                             get_tp_group)
-from vllm.forward_context import DPMetadata, get_forward_context
+                                             get_tp_group,
+                                             is_global_first_rank)
+from vllm.forward_context import (BatchDescriptor, DPMetadata,
+                                  get_forward_context)
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
@@ -55,15 +59,17 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
-from vllm.sequence import IntermediateTensors
-from vllm.tasks import GenerationTask, SupportedTask
+from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LazyLoader, cdiv)
+                        LazyLoader, cdiv, is_pin_memory_available)
+from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
-                             ModelRunnerOutput)
+from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
+                             LogprobsTensors, ModelRunnerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
+from vllm.v1.sample.logits_processor import build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
@@ -79,6 +85,8 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
                                                 AscendMetadata)
 from vllm_ascend.attention.mla_v1 import AscendMLAMetadata
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
 from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
                                                      DummyCommImpl,
                                                      MoECommMethod)
@@ -154,8 +162,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
+        self.compilation_config = vllm_config.compilation_config
+        self.load_config = vllm_config.load_config
         self.lora_config = vllm_config.lora_config
         self.parallel_config = vllm_config.parallel_config
+        self.pin_memory = is_pin_memory_available()
         self.scheduler_config = vllm_config.scheduler_config
         self.speculative_config = vllm_config.speculative_config
         self.block_size = vllm_config.cache_config.block_size
@@ -215,7 +226,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             use_mla=self.model_config.use_mla,
         )
         self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
-            weakref.proxy(self))
+            vllm_config, device)
         self.attn_mask_builder = AttentionMaskBuilder(
             min(self.model_config.max_model_len,
                 int(os.getenv("PAGED_ATTENTION_MASK_LEN", 10000))), self.dtype)
@@ -228,13 +239,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.drafter: Optional[Union[NgramProposer, EagleProposer,
                                      MtpProposer]] = None
         self.actual_seq_lengths_q = []
-        self.spec_token_num = 0
         self.decode_token_per_req = 1
         if self.speculative_config:
             self.use_spec_decode = True
-            self.spec_token_num = self.speculative_config.num_speculative_tokens
-            assert self.spec_token_num > 0
-            self.decode_token_per_req = 1 + self.spec_token_num
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            assert spec_token_num > 0
+            self.decode_token_per_req = 1 + spec_token_num
             self.actual_seq_lengths_q = [
                 len for len in
                 range(self.decode_token_per_req, self.max_num_tokens +
@@ -331,13 +341,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                         pin_memory=True)
         self.seq_lens_np = self.seq_lens_cpu.numpy()
 
-        self.use_aclgraph = (self.vllm_config.compilation_config.level
-                             == CompilationLevel.PIECEWISE
-                             and not self.model_config.enforce_eager and
-                             not ascend_config.torchair_graph_config.enabled)
+        self.use_aclgraph = (
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and self.compilation_config.level == CompilationLevel.PIECEWISE
+            and not self.model_config.enforce_eager
+            and not ascend_config.torchair_graph_config.enabled)
         self.aclgraph_batch_sizes = list(
-            reversed(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes))
+            reversed(self.compilation_config.cudagraph_capture_sizes))
+
+        self.uniform_decode_query_len = 1 if not self.speculative_config else \
+            1 + self.speculative_config.num_speculative_tokens
+        # aclgraph dispatcher for runtime aclgraph dispatching.
+        self.aclgraph_dispatcher = CudagraphDispatcher(self.vllm_config)
+        # Cached outputs.
+        self._draft_token_ids: Optional[Union[list[list[int]],
+                                              torch.Tensor]] = None
 
         self.new_kv_cache_bytes = -1
         self.torchair_compiled_model = None  # type: ignore
@@ -405,12 +423,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             )
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
-        """Update the cached states and the persistent batch with the scheduler
-        output.
-
-        The SamplingMetadata is updated and copied to the NPU if there is a
-        new/resumed/paused/finished request in the batch.
-        """
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
@@ -421,11 +433,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # then resubmitted with the same ID. In this case, we treat them as two
         # distinct requests - clearing the cached states for the first request
         # and handling the second as a new request.
-        removed_req_indices: List[int] = []
         for req_id in scheduler_output.finished_req_ids:
-            req_index = self.input_batch.remove_request(req_id)
-            if req_index is not None:
-                removed_req_indices.append(req_index)
+            self.input_batch.remove_request(req_id)
 
         # Free the cached encoder outputs.
         for req_id, input_id in scheduler_output.free_encoder_input_ids:
@@ -448,16 +457,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # have low request overlap (e.g., alternating between two distinct
         # sets of requests), this optimization becomes very inefficient.
         for req_id in unscheduled_req_ids:
-            req_index = self.input_batch.remove_request(req_id)
-            assert req_index is not None
-            removed_req_indices.append(req_index)
+            self.input_batch.remove_request(req_id)
 
-        req_ids_to_add: List[str] = []
+        req_ids_to_add: list[str] = []
         # Add new requests to the cached states.
         for new_req_data in scheduler_output.scheduled_new_reqs:
             req_id = new_req_data.req_id
             sampling_params = new_req_data.sampling_params
             pooling_params = new_req_data.pooling_params
+
             if sampling_params and \
                 sampling_params.sampling_type == SamplingType.RANDOM_SEED:
                 generator = torch.Generator(device=self.device)
@@ -468,7 +476,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             if pooling_params:
                 assert (task := pooling_params.task) is not None, (
                     "You did not set `task` in the API")
-                model = cast(VllmModelForPooling, self.model)
+                model = cast(VllmModelForPooling, self.get_model())
                 to_update = model.pooler.get_pooling_updates(task)
                 to_update.apply(pooling_params)
 
@@ -478,7 +486,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 mm_kwargs=new_req_data.mm_kwargs,
                 mm_positions=new_req_data.mm_positions,
                 sampling_params=sampling_params,
-                pooling_params=new_req_data.pooling_params,
+                pooling_params=pooling_params,
                 generator=generator,
                 block_ids=new_req_data.block_ids,
                 num_computed_tokens=new_req_data.num_computed_tokens,
@@ -493,9 +501,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 second_per_grid_ts = []
                 audio_feature_lengths = []
                 use_audio_in_video = False
-
-                for item in self.requests[req_id].mm_kwargs:
-                    mm_input = item.require_data()
+                for mm_item in self.requests[req_id].mm_kwargs:
+                    mm_input = mm_item.get_data()
                     if mm_input.get("image_grid_thw") is not None:
                         image_grid_thw.append(
                             mm_input["image_grid_thw"].tolist())
@@ -528,19 +535,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_ids_to_add.append(req_id)
 
         # Update the states of the running/resumed requests.
-        req_data = scheduler_output.scheduled_cached_reqs
         is_last_rank = get_pp_group().is_last_rank
+        req_data = scheduler_output.scheduled_cached_reqs
         for i, req_id in enumerate(req_data.req_ids):
             req_state = self.requests[req_id]
             num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
             resumed_from_preemption = req_data.resumed_from_preemption[i]
 
+            # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
+
             if not is_last_rank:
+                # When using PP, the scheduler sends the sampled tokens back,
+                # because there's no direct communication between the first-
+                # stage worker and the last-stage worker.
                 new_token_ids = req_data.new_token_ids[i]
                 # Add the sampled token(s) from the previous step (if any).
-                # This doesn't include "unverified" tokens like spec decode tokens.
+                # This doesn't include "unverified" tokens like spec tokens.
                 num_new_tokens = (num_computed_tokens + len(new_token_ids) -
                                   req_state.num_tokens)
                 if num_new_tokens == 1:
@@ -549,11 +561,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 elif num_new_tokens > 0:
                     req_state.output_token_ids.extend(
                         new_token_ids[-num_new_tokens:])
+
             # Update the block IDs.
             if not resumed_from_preemption:
                 # Append the new blocks to the existing block IDs.
-                for block_ids, new_ids in zip(  # type: ignore[call-overload]
-                        req_state.block_ids, new_block_ids):
+                for block_ids, new_ids in zip(req_state.block_ids,
+                                              new_block_ids):
                     block_ids.extend(new_ids)
             else:
                 # The request is resumed from preemption.
@@ -571,9 +584,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # Update the persistent batch.
             self.input_batch.num_computed_tokens_cpu[req_index] = (
                 num_computed_tokens)
-
             self.input_batch.block_table.append_row(new_block_ids, req_index)
 
+            # For the last rank, we don't need to update the token_ids_cpu
+            # because the sampled tokens are already cached.
             if not is_last_rank:
                 # Add new_token_ids to token_ids_cpu.
                 start_token_index = num_computed_tokens
@@ -583,9 +597,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     start_token_index:end_token_index] = new_token_ids
                 self.input_batch.num_tokens_no_spec[
                     req_index] = end_token_index
+                self.input_batch.num_tokens[req_index] = end_token_index
+
             # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, ())
+            spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
                 start_index = self.input_batch.num_tokens_no_spec[req_index]
@@ -595,39 +611,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
                 self.input_batch.num_tokens[req_index] += num_spec_tokens
 
-        # Check if the batch has changed. If not, we can skip copying the
-        # sampling metadata from CPU to GPU.
-        batch_changed = len(removed_req_indices) > 0 or len(req_ids_to_add) > 0
-
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
-        removed_req_indices.sort(reverse=True)
         for req_id in req_ids_to_add:
             req_state = self.requests[req_id]
-            if removed_req_indices:
-                # Fill the empty index.
-                req_index = removed_req_indices.pop()
-            else:
-                # Append to the end.
-                req_index = None
-            self.input_batch.add_request(req_state, req_index)
-            spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, ())
-            if spec_token_ids:
-                req_index = self.input_batch.num_reqs - 1
-                start_index = len(req_state.prompt_token_ids) + len(
-                    req_state.output_token_ids)
-                end_token_index = start_index + len(spec_token_ids)
-                self.input_batch.token_ids_cpu[
-                    req_index, start_index:end_token_index] = spec_token_ids
-                self.input_batch.num_tokens[req_index] = end_token_index
+            self.input_batch.add_request(req_state)
 
-        # Condense the batched states if there are empty indices.
-        if removed_req_indices:
-            self.input_batch.condense(removed_req_indices)
+        # Condense the batched states if there are gaps left by removed requests
+        self.input_batch.condense()
 
-        if batch_changed:
-            self.input_batch.refresh_sampling_metadata()
+        # Refresh batch metadata with any pending updates.
+        self.input_batch.refresh_metadata()
 
     def _get_forward_metadata_across_dp(
             self, num_tokens: int, with_prefill: bool,
@@ -798,17 +792,34 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
-            attn_metadata_i = self.attn_metadata_builder.build(
+            common_attn_metadata = AscendCommonAttentionMetadata(
+                query_start_loc=self.query_start_loc[:num_reqs + 1],
+                query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
+                seq_lens_cpu=self.seq_lens_cpu,
                 num_reqs=num_reqs,
-                num_actual_tokens=total_num_scheduled_tokens,
                 max_query_len=max_num_scheduled_tokens,
+                num_actual_tokens=total_num_scheduled_tokens,
+                actual_seq_lengths_q=self.actual_seq_lengths_q,
+                block_table_tensor=self.input_batch.block_table[0].
+                get_device_tensor(),
+                slot_mapping_cpu=self.slot_mapping_cpu,
+                positions=self.positions,
+                attn_mask=self.attn_mask,
+                spec_attn_mask=self.spec_attn_mask,
+                attn_state=self.attn_state,
+                decode_token_per_req=self.decode_token_per_req,
             )
+            attn_metadata_i = self.attn_metadata_builder.build(
+                common_attn_metadata, self.get_model())
             for layer_name in kv_cache_group_spec.layer_names:
                 attn_metadata[layer_name] = attn_metadata_i
 
         return attn_metadata
 
     def get_model(self) -> nn.Module:
+        # get raw model out of the aclgraph wrapper.
+        if isinstance(self.model, ACLGraphWrapper):
+            return self.model.unwrap()
         return self.model
 
     def get_supported_generation_tasks(self) -> "list[GenerationTask]":
@@ -1063,11 +1074,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             num_input_tokens)
         num_input_tokens += num_pad
 
-        modified_batch = self.attn_metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
-        if modified_batch:
-            self.input_batch.refresh_sampling_metadata()
-
+        self.attn_metadata_builder.reorder_batch(self.input_batch,
+                                                 scheduler_output)
         # OPTIMIZATION: Start copying the block table first.
         # This way, we can overlap the copy with the following CPU operations.
         self.input_batch.block_table.commit_block_table(num_reqs)
@@ -1168,8 +1176,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             attn_state=attn_state)
         self.attn_state = attn_state  # type: ignore
 
-        extra_builder_kwargs = {}
-
         self.query_start_loc_np[0] = 0
         self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
         self.query_start_loc[:num_reqs + 1].copy_(
@@ -1186,45 +1192,44 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         ]
 
         is_only_prefill = bool(np.all(num_valid_tokens != 1))
-        extra_builder_kwargs['is_only_prefill'] = is_only_prefill
 
         enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(),
                                               attn_state,
                                               total_num_scheduled_tokens)
 
-        enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(),
-                                              attn_state,
-                                              total_num_scheduled_tokens)
         (padded_num_tokens_across_dp, num_tokens_across_dp, with_prefill,
          enable_dbo) = self._get_forward_metadata_across_dp_and_pad(
              total_num_scheduled_tokens, with_prefill, enable_dbo)
-        extra_builder_kwargs['enable_dbo_across_dp'] = enable_dbo
         self.with_prefill = with_prefill
         self.num_tokens_across_dp = num_tokens_across_dp
         if self.torchair_graph_enabled and not with_prefill:
             self.graph_pad_size = padded_num_tokens_across_dp
-            extra_builder_kwargs[
-                'graph_pad_size'] = self.graph_pad_size  # type: ignore
         else:
             self.graph_pad_size = -1
-
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=self.query_start_loc[:num_reqs + 1],
+            query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
+            seq_lens_cpu=self.seq_lens_cpu,
+            num_reqs=num_reqs,
+            num_actual_tokens=total_num_scheduled_tokens,
+            actual_seq_lengths_q=self.actual_seq_lengths_q,
+            block_table_tensor=self.input_batch.block_table[0].
+            get_device_tensor(),
+            slot_mapping_cpu=self.slot_mapping_cpu,
+            positions=self.positions,
+            attn_mask=self.attn_mask,
+            spec_attn_mask=self.spec_attn_mask,
+            attn_state=self.attn_state,
+            enable_dbo_across_dp=enable_dbo,
+            is_only_prefill=is_only_prefill,
+            max_query_len=max_num_scheduled_tokens,
+            graph_pad_size=self.graph_pad_size,
+            decode_token_per_req=self.decode_token_per_req,
+        )
+        attn_metadata = self.attn_metadata_builder.build(
+            common_attn_metadata, self.model)
         if self.vllm_config.model_config.use_mla:
-            extra_builder_kwargs[
-                "query_start_loc"] = self.query_start_loc[:num_reqs + 1]
-            attn_metadata = self.attn_metadata_builder.build(  # type: ignore
-                num_reqs=num_reqs,
-                num_actual_tokens=total_num_scheduled_tokens,
-                max_query_len=max_num_scheduled_tokens,
-                **extra_builder_kwargs,
-            )
             attn_metadata.num_input_tokens = num_input_tokens
-        else:
-            attn_metadata = self.attn_metadata_builder.build(  # type: ignore
-                num_reqs=num_reqs,
-                num_actual_tokens=total_num_scheduled_tokens,
-                max_query_len=max_num_scheduled_tokens,
-                **extra_builder_kwargs,
-            )
 
         # Prepare input_ids
         token_indices = (positions_np +
@@ -1534,7 +1539,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         )
         return logits.to(self.device).to(logits_dtype)
 
-    def _get_spec_token_ids(
+    def propose_draft_token_ids(
         self,
         valid_sampled_token_ids: list[list[int]],
         sampling_metadata: SamplingMetadata,
@@ -1549,23 +1554,23 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     ) -> Optional[list[list[int]]]:
         if not self.use_spec_decode:
             # Speculative decoding is not enabled.
-            spec_token_ids = None
+            draft_token_ids = None
         elif self.speculative_config.method == "ngram":
-            spec_token_ids = self._generate_ngram_token_ids(
+            draft_token_ids = self._generate_ngram_token_ids(
                 valid_sampled_token_ids)
         elif self.speculative_config.method == "eagle":
             raise NotImplementedError("Eagle Is Not Supported Yet.")
         elif self.speculative_config.method == "eagle3":
-            spec_token_ids = self._generate_eagle3_token_ids(
+            draft_token_ids = self._generate_eagle3_token_ids(
                 valid_sampled_token_ids, sampling_metadata, scheduler_output,
                 spec_decode_metadata, positions, num_scheduled_tokens,
                 hidden_states, aux_hidden_states)
         elif self.speculative_config.method == 'deepseek_mtp':
-            spec_token_ids = self._generate_mtp_token_ids(
+            draft_token_ids = self._generate_mtp_token_ids(
                 valid_sampled_token_ids, sampling_metadata, scheduler_output,
                 spec_decode_metadata, positions, num_scheduled_tokens,
                 hidden_states, attn_metadata)
-        return spec_token_ids
+        return draft_token_ids
 
     def _pool(
         self,
@@ -1606,7 +1611,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_ids=self.input_batch.req_ids,
             req_id_to_index=self.input_batch.req_id_to_index,
             sampled_token_ids=[],
-            spec_token_ids=None,
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=pooler_output,
@@ -1785,17 +1789,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 req_state = self.requests[req_id]
                 req_state.output_token_ids.extend(sampled_ids)
 
-            spec_token_ids = self._get_spec_token_ids(
-                valid_sampled_token_ids,
-                sampling_metadata,
-                scheduler_output,
-                spec_decode_metadata,
-                positions,
-                num_scheduled_tokens,
-                hidden_states,
-                attn_metadata,
-                aux_hidden_states,
-            )
+            if self.speculative_config:
+                self._draft_token_ids = self.propose_draft_token_ids(
+                    valid_sampled_token_ids,
+                    sampling_metadata,
+                    scheduler_output,
+                    spec_decode_metadata,
+                    positions,
+                    num_scheduled_tokens,
+                    hidden_states,
+                    attn_metadata,
+                    aux_hidden_states,
+                )
 
             if has_kv_transfer_group():
                 get_kv_transfer_group().clear_connector_metadata()
@@ -1806,7 +1811,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             req_ids=self.input_batch.req_ids,
             req_id_to_index=self.input_batch.req_id_to_index,
             sampled_token_ids=valid_sampled_token_ids,
-            spec_token_ids=spec_token_ids,
             logprobs=logprobs_lists,
             prompt_logprobs_dict=prompt_logprobs_dict,
             pooler_output=[],
@@ -1825,6 +1829,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
         return model_runner_output
 
+    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+        if self._draft_token_ids is None:
+            return None
+        req_ids = self.input_batch.req_ids
+        if isinstance(self._draft_token_ids, torch.Tensor):
+            draft_token_ids = self._draft_token_ids.tolist()
+        else:
+            draft_token_ids = self._draft_token_ids
+        self._draft_token_ids = None
+        return DraftTokenIds(req_ids, draft_token_ids)
+
     def kv_connector_no_forward(
             self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
         with set_ascend_forward_context(None, self.vllm_config):
@@ -1898,30 +1913,66 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     def _dummy_run(
         self,
         num_tokens: int,
-        skip_attn: bool = True,
         with_prefill: bool = False,
         is_torchair_compile: bool = False,
         moe_comm_method: Type[MoECommMethod] = DummyCommImpl,
+        aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
+        force_attention: bool = False,
+        uniform_decode: bool = False,
     ) -> torch.Tensor:
+        # only support eager mode and piecewise graph now
+        assert aclgraph_runtime_mode in {
+            CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE
+        }
+        if force_attention:
+            raise RuntimeError(
+                "Capturing attention in aclgraph is unexpected, because full graph is not supported now"
+            )
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp, with_prefill,
          _) = self._get_forward_metadata_across_dp_and_pad(
              num_tokens, with_prefill, False)
 
+        # If cudagraph_mode.decode_mode() == FULL and
+        # cudagraph_mode.seperate_routine(). This means that we are using
+        # different graphs and/or modes for mixed prefill-decode batches vs.
+        # uniform decode batches. A uniform decode batch means that all
+        # requests have identical query length, except a potential virtual
+        # request (shorter) in the batch account for padding.
+        # Uniform decode batch could either be common pure decode, where
+        # max_query_len == 1, or speculative decode, where
+        # max_query_len == 1 + num_spec_decode_tokens.
+
+        # When setting max_query_len = 1, we switch to and capture the optimized
+        # routine of FA2 for pure decode, i.e., Flashdecode + an optimization
+        # for GQA/MQA.
+        max_query_len = self.uniform_decode_query_len if uniform_decode else \
+                                                                num_tokens
+
+        max_num_reqs = self.scheduler_config.max_num_seqs
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
         # has num_tokens in total.
         assert num_tokens <= self.scheduler_config.max_num_batched_tokens
         max_num_reqs = self.scheduler_config.max_num_seqs
-        if with_prefill:
-            num_reqs = num_tokens
+        if uniform_decode:
+            num_reqs = cdiv(num_tokens, max_query_len)
+            assert num_reqs <= max_num_reqs, \
+                "Do not capture num_reqs > max_num_reqs for uniform batch"
+            num_scheduled_tokens_list = [max_query_len] * num_reqs
+            if num_tokens % max_query_len != 0:
+                num_scheduled_tokens_list[-1] = num_tokens % max_query_len
         else:
-            num_reqs = (num_tokens + self.decode_token_per_req -
-                        1) // self.decode_token_per_req
-        num_reqs = min(num_reqs, max_num_reqs)
-        min_tokens_per_req = num_tokens // num_reqs
-        num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
-        num_scheduled_tokens_list[-1] += num_tokens % num_reqs
+            if with_prefill:
+                num_reqs = num_tokens
+            else:
+                num_reqs = (num_tokens + self.decode_token_per_req -
+                            1) // self.decode_token_per_req
+            num_reqs = min(num_reqs, max_num_reqs)
+            min_tokens_per_req = num_tokens // num_reqs
+            num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
+            num_scheduled_tokens_list[-1] += num_tokens % num_reqs
         assert sum(num_scheduled_tokens_list) == num_tokens
         assert len(num_scheduled_tokens_list) == num_reqs
         num_scheduled_tokens = np.array(num_scheduled_tokens_list,
@@ -1931,8 +1982,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if self.is_kv_producer:
             with_prefill = True
 
-        attn_metadata = self._build_attention_metadata(with_prefill, num_reqs,
-                                                       skip_attn)
+        attn_metadata = self._build_attention_metadata(with_prefill,
+                                                       num_reqs,
+                                                       skip_attn=True)
 
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
@@ -1961,6 +2013,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     k: v[:num_tokens]
                     for k, v in self.intermediate_tensors.items()
                 })
+            if aclgraph_runtime_mode == CUDAGraphMode.NONE:
+                batch_descriptor = None
+            else:
+                # filter out the valid batch descriptor
+                _cg_mode, batch_descriptor = \
+                    self.aclgraph_dispatcher.dispatch(
+                        BatchDescriptor(num_tokens=num_tokens,
+                                        uniform_decode=uniform_decode))
+                # sanity check
+                assert aclgraph_runtime_mode == _cg_mode, (
+                    f"Aclgraph runtime mode mismatch at dummy_run. "
+                    f"Expected {_cg_mode}, but got {aclgraph_runtime_mode}.")
 
             with set_ascend_forward_context(
                     attn_metadata,
@@ -1973,7 +2037,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     moe_comm_method=moe_comm_method(
                         self.device, self.dtype, self.model_config.hf_config),
                     num_actual_tokens=0,
-            ):
+                    aclgraph_runtime_mode=aclgraph_runtime_mode,
+                    batch_descriptor=batch_descriptor):
                 hidden_states = self._generate_dummy_run_hidden_states(
                     with_prefill, is_torchair_compile, input_ids, positions,
                     attn_metadata, num_tokens, intermediate_tensors,
@@ -1983,7 +2048,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 self.drafter.dummy_run(
                     num_tokens=num_tokens,
                     with_prefill=with_prefill,
-                    skip_attn=skip_attn,
+                    skip_attn=True,
                     num_reqs=num_reqs,
                     num_tokens_across_dp=num_tokens_across_dp)
 
@@ -2026,53 +2091,71 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.encoder_cache.clear()
         gc.collect()
 
-    @torch.inference_mode()
-    def _dummy_pooler_run(
+    def _dummy_pooler_run_task(
         self,
         hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-
+        task: PoolingTask,
+    ) -> PoolerOutput:
         num_tokens = hidden_states.shape[0]
         max_num_reqs = self.scheduler_config.max_num_seqs
         num_reqs = min(num_tokens, max_num_reqs)
         min_tokens_per_req = num_tokens // num_reqs
         num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
         num_scheduled_tokens_list[-1] += num_tokens % num_reqs
+        assert sum(num_scheduled_tokens_list) == num_tokens
+        assert len(num_scheduled_tokens_list) == num_reqs
 
         hidden_states_list = list(
             torch.split(hidden_states, num_scheduled_tokens_list))
-
         req_num_tokens = num_tokens // num_reqs
 
-        model = cast(VllmModelForPooling, self.model)
-        dummy_task = self.get_supported_pooling_tasks()[0]
-        dummy_pooling_params = PoolingParams(task=dummy_task)
+        dummy_prompt_lens = torch.tensor(
+            [h.shape[0] for h in hidden_states_list],
+            device=self.device,
+        )
+        dummy_token_ids = torch.zeros((num_reqs, req_num_tokens),
+                                      dtype=torch.int32,
+                                      device=self.device)
 
-        to_update = model.pooler.get_pooling_updates(dummy_task)
+        model = cast(VllmModelForPooling, self.get_model())
+        dummy_pooling_params = PoolingParams(task=task)
+        to_update = model.pooler.get_pooling_updates(task)
         to_update.apply(dummy_pooling_params)
 
         dummy_metadata = PoolingMetadata(
-            prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
-                                     device=self.device),
-            prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
-                                         dtype=torch.int32,
-                                         device=self.device),
-            pooling_params=[dummy_pooling_params] * num_reqs)
+            prompt_lens=dummy_prompt_lens,
+            prompt_token_ids=dummy_token_ids,
+            pooling_params=[dummy_pooling_params] * num_reqs,
+        )
 
         try:
-            pooler_output = model.pooler(hidden_states=hidden_states_list,
-                                         pooling_metadata=dummy_metadata)
+            return model.pooler(hidden_states=hidden_states_list,
+                                pooling_metadata=dummy_metadata)
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 raise RuntimeError(
-                    "NPU out of memory occurred when warming up pooler with "
-                    f"{num_reqs} dummy requests. Please try lowering "
-                    "`max_num_seqs` or `gpu_memory_utilization` when "
+                    "NPU out of memory occurred when warming up pooler "
+                    f"({task=}) with {num_reqs} dummy requests. Please try "
+                    "lowering `max_num_seqs` or `gpu_memory_utilization` when "
                     "initializing the engine.") from e
             else:
                 raise e
 
-        return pooler_output
+    @torch.inference_mode()
+    def _dummy_pooler_run(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> PoolerOutput:
+        # Find the task that has the largest output for subsequent steps
+        output_size = dict[PoolingTask, float]()
+        for task in self.get_supported_pooling_tasks():
+            # Run a full batch with each task to ensure none of them OOMs
+            output = self._dummy_pooler_run_task(hidden_states, task)
+            output_size[task] = output.get_data_nbytes()
+            del output  # Allow GC
+
+        max_task = max(output_size.items(), key=lambda x: x[1])[0]
+        return self._dummy_pooler_run_task(hidden_states, max_task)
 
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
@@ -2199,10 +2282,15 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             max_model_len=self.model_config.max_model_len,
             max_num_batched_tokens=self.max_num_tokens,
             device=self.device,
-            pin_memory=True,
+            pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[self.block_size],
             is_spec_decode=bool(self.vllm_config.speculative_config),
+            logitsprocs=build_logitsprocs(
+                self.vllm_config, self.device, self.pin_memory,
+                self.is_pooling_model,
+                self.vllm_config.model_config.logits_processors),
+            is_pooling_model=self.is_pooling_model,
         )
 
         kv_cache_sizes = {}
@@ -2315,10 +2403,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     # KV cache specs.
                     raise ValueError("Unknown KV cache spec type.")
 
-        bind_kv_cache(
-            kv_caches,
-            self.vllm_config.compilation_config.static_forward_context,
-            self.kv_caches)
+        bind_kv_cache(kv_caches,
+                      self.compilation_config.static_forward_context,
+                      self.kv_caches)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -2332,7 +2419,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             format. Layers that do not need KV cache are not included.
         """
 
-        forward_ctx = self.vllm_config.compilation_config.static_forward_context
+        forward_ctx = self.compilation_config.static_forward_context
         use_mla = self.vllm_config.model_config.use_mla
         kv_cache_spec: dict[str, KVCacheSpec] = {}
         for layer_name, attn_module in forward_ctx.items():
@@ -2361,30 +2448,82 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
         return kv_cache_spec
 
+    def initialize_aclgraph_capture(self) -> None:
+        # TODO: Add check of AttentionCGSupport and cudagraph_mode.decode_mode when full graph is supported
+        # Trigger aclgraph dispatching keys initialization here (after
+        # initializing attn backends).
+        self.aclgraph_dispatcher.initialize_cudagraph_keys(
+            self.compilation_config.cudagraph_mode,
+            self.uniform_decode_query_len)
+
+    def _capture_aclgraphs(self, compilation_cases: list[int],
+                           aclgraph_runtime_mode: CUDAGraphMode,
+                           uniform_decode: bool):
+        assert aclgraph_runtime_mode != CUDAGraphMode.NONE and \
+            aclgraph_runtime_mode in [CUDAGraphMode.PIECEWISE]
+
+        # Only rank 0 should print progress bar during capture
+        if is_global_first_rank():
+            compilation_cases = tqdm(
+                compilation_cases,
+                disable=not self.load_config.use_tqdm_on_load,
+                desc="Capturing ACL graphs ({}, {})".format(
+                    "decode" if uniform_decode else "mixed prefill-decode",
+                    aclgraph_runtime_mode.name))
+        # We skip EPLB here since we don't want to record dummy metrics
+        for num_tokens in compilation_cases:
+            for _ in range(self.compilation_config.cudagraph_num_of_warmups):
+                # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
+                # But be careful, warm up with `NONE`is orthogonal to
+                # if we want to warm up attention or not. This is
+                # different from the case where `FULL` implies capture
+                # attention while `PIECEWISE` implies no attention.
+                force_attention = (aclgraph_runtime_mode == CUDAGraphMode.FULL)
+                self._dummy_run(num_tokens,
+                                aclgraph_runtime_mode=CUDAGraphMode.NONE,
+                                force_attention=force_attention,
+                                uniform_decode=uniform_decode,
+                                moe_comm_method=self.moe_comm_method)
+            self._dummy_run(num_tokens,
+                            aclgraph_runtime_mode=aclgraph_runtime_mode,
+                            uniform_decode=uniform_decode,
+                            moe_comm_method=self.moe_comm_method)
+
     def _capture_model(self):
         if not self.use_aclgraph:
-            logger.info("Skipping NPU graph capture for eager mode.")
+            logger.warning(
+                "Skipping ACL graph capture. To turn on ACL graph capture, "
+                "ensure `aclraph_mode` was not manually set to `NONE`")
             return
+        else:
+            self.initialize_aclgraph_capture()
+
+        set_cudagraph_capturing_enabled(True)
         # Trigger ACL graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
-            skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
-            for num_tokens in reversed(self.aclgraph_batch_sizes):
-                for _ in range(self.vllm_config.compilation_config.
-                               cudagraph_num_of_warmups):
-                    self._dummy_run(
-                        num_tokens,
-                        skip_attn=skip_attn,
-                        moe_comm_method=self.moe_comm_method,
-                    )
-                self._dummy_run(
-                    num_tokens,
-                    skip_attn=skip_attn,
-                    moe_comm_method=self.moe_comm_method,
-                )
+            aclgraph_mode = self.compilation_config.cudagraph_mode
+            if aclgraph_mode.mixed_mode() != CUDAGraphMode.NONE:
+                aclgraph_runtime_mode = aclgraph_mode.mixed_mode()
+
+                compilation_cases = list(reversed(self.aclgraph_batch_sizes))
+                self._capture_aclgraphs(
+                    compilation_cases,
+                    aclgraph_runtime_mode=aclgraph_runtime_mode,
+                    uniform_decode=False)
+
+        # Disable aclgraph capturing globally, so any unexpected aclgraph
+        # capturing will be detected and raise an error after here.
+        # Note: We don't put it into graph_capture context manager because
+        # we may doing lazy capturing in future that still allows capturing
+        # after here.
+        set_cudagraph_capturing_enabled(False)
 
     def capture_model(self) -> None:
+
+        compilation_counter.num_gpu_runner_capture_triggers += 1
+
         start_time = time.perf_counter()
         start_free_npu_memory = torch.npu.mem_get_info()[0]
 
diff --git a/vllm_ascend/worker/mtp_proposer_v1.py b/vllm_ascend/worker/mtp_proposer_v1.py
index f4597de23..949314303 100644
--- a/vllm_ascend/worker/mtp_proposer_v1.py
+++ b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -16,7 +16,9 @@ from vllm.v1.sample.metadata import SamplingMetadata
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.utils import ProfileExecuteDuration
 
 
@@ -88,7 +90,7 @@ class MtpProposer:
 
             # FIXME(woosuk): Avoid synchronization.
             num_tokens = cu_num_tokens[-1].item()
-            token_indices = torch.empty(
+            token_indices = torch.zeros(
                 num_tokens,
                 dtype=torch.int32,
                 device=cu_num_tokens.device,
@@ -136,9 +138,6 @@ class MtpProposer:
         # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
         if token_indices is not None and self.runner.torchair_graph_enabled:
             last_token_indices = token_indices
-        else:
-            seq_lens = target_positions[last_token_indices] + 1
-            seq_lens = seq_lens.cpu()
 
         self.input_ids[last_token_indices] = next_token_ids
 
@@ -155,23 +154,36 @@ class MtpProposer:
         #     input_batch=self.runner.input_batch,
         #     scheduler_output=self.runner.scheduler_output,
         # )
-        extra_builder_kwargs = {}
-
         is_running_torchair = self.runner.torchair_graph_enabled and \
             not self.runner.with_prefill
 
         if is_running_torchair:
-            extra_builder_kwargs['graph_pad_size'] = self.runner.graph_pad_size
             num_input_tokens = self.runner.graph_pad_size
         else:
             num_input_tokens = num_tokens
 
-        attn_metadata = self.runner.attn_metadata_builder.build(
+        seq_lens = target_positions[last_token_indices] + 1
+        seq_lens = seq_lens.int()
+        common_attn_metadata = AscendCommonAttentionMetadata(
+            query_start_loc=cu_num_tokens[:batch_size + 1],
+            query_start_loc_cpu=cu_num_tokens[:batch_size + 1].cpu(),
+            seq_lens_cpu=seq_lens.cpu(),
             num_reqs=batch_size,
             num_actual_tokens=num_tokens,
             max_query_len=max_query_len,
-            query_start_loc=cu_num_tokens,
-            **extra_builder_kwargs)
+            actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
+            block_table_tensor=self.runner.input_batch.block_table[0].
+            get_device_tensor(),
+            slot_mapping_cpu=target_slot_mapping,
+            positions=target_positions,
+            attn_mask=self.runner.attn_mask,
+            spec_attn_mask=self.runner.spec_attn_mask,
+            attn_state=self.runner.attn_state,
+            graph_pad_size=self.runner.graph_pad_size,
+            decode_token_per_req=self.runner.decode_token_per_req,
+        )
+        attn_metadata = self.runner.attn_metadata_builder.build(
+            common_attn_metadata, self.runner.get_model())
 
         self.positions[:num_tokens] = target_positions
         self.hidden_states[:num_tokens] = target_hidden_states
@@ -281,8 +293,16 @@ class MtpProposer:
         if skip_attn:
             attn_metadata = None
         else:
+            common_attn_metadata = TorchairCommonAttentionMetadata(
+                num_reqs=num_reqs,
+                num_actual_tokens=1,
+                actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
+                attn_mask=self.runner.attn_mask,
+                spec_attn_mask=self.runner.spec_attn_mask,
+                decode_token_per_req=self.runner.decode_token_per_req,
+            )
             attn_metadata = self.runner.attn_metadata_builder.build_torchair_graph_dummy(
-                num_reqs=num_reqs, num_actual_tokens=1)
+                common_attn_metadata)
 
         input_ids = self.input_ids[:num_tokens]
         positions = self.positions[:num_tokens]
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 9b8132cc8..380dde446 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -22,28 +22,30 @@ from typing import Optional, cast
 
 import numpy as np
 import torch
+from typing_extensions import deprecated
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
+from vllm.multimodal.inputs import (MultiModalKwargs, MultiModalKwargsItem,
+                                    PlaceholderRange)
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.pool.metadata import PoolingMetadata
-from vllm.v1.sample.logits_processor import init_builtin_logitsprocs
+from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
+                                             LogitsProcessors,
+                                             MoveDirectionality)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 
-_SAMPLING_EPS = 1e-5
-
 
 @dataclass
 class CachedRequestState:
 
     req_id: str
     prompt_token_ids: list[int]
-    mm_kwargs: list[MultiModalKwargs]
+    mm_kwargs: list[MultiModalKwargsItem]
     mm_positions: list[PlaceholderRange]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@@ -65,6 +67,13 @@ class CachedRequestState:
     def num_tokens(self) -> int:
         return self.num_prompt_tokens + len(self.output_token_ids)
 
+    # Temporary back-compatibility for plugins that define model runner
+    @property
+    @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
+                "removed in v0.13. Please use `mm_kwargs` instead.")
+    def mm_inputs(self) -> list[MultiModalKwargs]:
+        return [MultiModalKwargs([item]) for item in self.mm_kwargs]
+
     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:
             return self.prompt_token_ids[idx]
@@ -83,8 +92,11 @@ class InputBatch:
         pin_memory: bool,
         vocab_size: int,
         block_sizes: list[int],  # The block_size of each kv cache group
+        logitsprocs: Optional[LogitsProcessors] = None,
         is_spec_decode: bool = False,
+        is_pooling_model: bool = False,
     ):
+        self.is_pooling_model = is_pooling_model
         self.is_spec_decode = is_spec_decode
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -164,16 +176,6 @@ class InputBatch:
         # IDs of requests which do not support spec decoding
         self.spec_decode_unsupported_reqs: set[str] = set()
 
-        self.min_p = torch.empty((max_num_reqs, ),
-                                 dtype=torch.float32,
-                                 device=device)
-        self.min_p_cpu_tensor = torch.empty((max_num_reqs, ),
-                                            dtype=torch.float32,
-                                            device="cpu",
-                                            pin_memory=pin_memory)
-        self.min_p_cpu = self.min_p_cpu_tensor.numpy()
-        self.min_p_reqs: set[str] = set()
-
         # Frequency penalty related data structures
         self.frequency_penalties = torch.empty((max_num_reqs, ),
                                                dtype=torch.float,
@@ -212,9 +214,6 @@ class InputBatch:
             self.repetition_penalties_cpu_tensor.numpy()
         self.repetition_penalties_reqs: set[str] = set()
 
-        # req_index -> (min_tokens, stop_token_ids)
-        self.min_tokens: dict[int, tuple[int, set[int]]] = {}
-
         # lora related
         self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
                                              dtype=np.int32)
@@ -234,8 +233,12 @@ class InputBatch:
         # To accumulate prompt logprobs tensor chunks across prefill steps.
         self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
 
-        self.logit_bias: list[Optional[dict[int,
-                                            float]]] = [None] * max_num_reqs
+        # Internal representation of per-step batch state changes, used for
+        # reordering persistent batch and generating logitsprocs batch state
+        # updates. Should reset each step.
+        self.batch_update_builder = BatchUpdateBuilder()
+
+        # TODO convert this to LogitsProcessor
         self.has_allowed_token_ids: set[str] = set()
         # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
         # the value is False. Since we use masked_fill_ to set -inf.
@@ -244,18 +247,15 @@ class InputBatch:
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
+
         self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
                                                           dtype=bool)
 
         self.req_output_token_ids: list[Optional[list[int]]] = []
 
-        # Define logits processors.
-        # TODO(andy): logits processor list should be extensible via engine
-        # constructor argument; for now the list is fixed.
-        self.logitsprocs = init_builtin_logitsprocs(
-            pin_memory_available=pin_memory,
-            max_num_reqs=max_num_reqs + 1,
-            device=device)
+        # Store provided logitsprocs. If none are provided, initialize empty
+        # data structure
+        self.logitsprocs = logitsprocs or LogitsProcessors()
 
         # This is updated each time the batch constituents change.
         self.sampling_metadata = self._make_sampling_metadata()
@@ -268,14 +268,35 @@ class InputBatch:
         # while performing state updates to the batch.
         return cast(list[str], self._req_ids)
 
+    def _register_add_request(self, request: "CachedRequestState") -> int:
+        """Track add-request operations for logits processors.
+        Not applicable to pooling models.
+        """
+
+        # Detailed added request metadata is only required for non-pooling
+        # models, to support logitsprocs
+        assert request.sampling_params
+
+        # Fill the next empty index if there is one.
+        if (new_req_index := self.batch_update_builder.pop_removed()) is None:
+            # Append to end otherwise.
+            new_req_index = self.num_reqs
+
+        assert new_req_index < self.max_num_reqs
+        self.batch_update_builder.added.append(
+            (new_req_index, request.sampling_params, request.prompt_token_ids,
+             request.output_token_ids))
+        return new_req_index
+
     def add_request(
         self,
         request: "CachedRequestState",
-        req_index: Optional[int] = None,
-    ) -> None:
-        if req_index is None:
+    ) -> int:
+        if not self.is_pooling_model:
+            # New request index bookkeeping for autoregressive models.
+            req_index = self._register_add_request(request)
+        else:
             req_index = self.num_reqs
-        assert req_index < self.max_num_reqs
 
         req_id = request.req_id
         if req_index == len(self._req_ids):
@@ -306,8 +327,8 @@ class InputBatch:
         self.block_table.add_row(request.block_ids, req_index)
 
         if sampling_params := request.sampling_params:
-            if self.is_spec_decode and is_spec_decode_unsupported(
-                    sampling_params):
+            if (self.is_spec_decode
+                    and is_spec_decode_unsupported(sampling_params)):
                 self.spec_decode_unsupported_reqs.add(req_id)
             if sampling_params.sampling_type == SamplingType.GREEDY:
                 # Avoid later division by zero.
@@ -326,11 +347,8 @@ class InputBatch:
             else:
                 top_k = self.vocab_size
             self.top_k_cpu[req_index] = top_k
-            self.min_p_cpu[req_index] = sampling_params.min_p
             self.frequency_penalties_cpu[
                 req_index] = sampling_params.frequency_penalty
-            if sampling_params.min_p > _SAMPLING_EPS:
-                self.min_p_reqs.add(req_id)
             if sampling_params.frequency_penalty != 0.0:
                 self.frequency_penalties_reqs.add(req_id)
             self.presence_penalties_cpu[
@@ -341,10 +359,6 @@ class InputBatch:
                 req_index] = sampling_params.repetition_penalty
             if sampling_params.repetition_penalty != 1.0:
                 self.repetition_penalties_reqs.add(req_id)
-            if sampling_params.min_tokens:
-                self.min_tokens[req_index] = (
-                    sampling_params.min_tokens,
-                    sampling_params.all_stop_token_ids)
 
             # NOTE(woosuk): self.generators should not include the requests that
             # do not have their own generator.
@@ -352,12 +366,12 @@ class InputBatch:
                 self.generators[req_index] = request.generator
 
             if sampling_params.logprobs is not None:
-                self.num_logprobs[req_id] = sampling_params.logprobs
+                self.num_logprobs[req_id] = (self.vocab_size
+                                             if sampling_params.logprobs == -1
+                                             else sampling_params.logprobs)
             if sampling_params.prompt_logprobs is not None:
                 self.num_prompt_logprobs[
                     req_id] = sampling_params.prompt_logprobs
-            if sampling_params.logit_bias is not None:
-                self.logit_bias[req_index] = sampling_params.logit_bias
 
             if sampling_params.allowed_token_ids:
                 self.has_allowed_token_ids.add(req_id)
@@ -402,12 +416,25 @@ class InputBatch:
             # No LoRA
             self.request_lora_mapping[req_index] = 0
 
+        return req_index
+
     def remove_request(self, req_id: str) -> Optional[int]:
-        """This method must always be followed by a call to condense()."""
+        """This method must always be followed by a call to condense().
+
+        Args:
+          req_id: request to remove
+
+        Returns:
+          Removed request index, or `None` if `req_id` not recognized
+        """
 
         req_index = self.req_id_to_index.pop(req_id, None)
         if req_index is None:
             return None
+        if not self.is_pooling_model:
+            # Autoregressive models require bookkeeping of removed requests to
+            # support logitsprocs.
+            self.batch_update_builder.removed_append(req_index)
         self._req_ids[req_index] = None
         self.req_output_token_ids[req_index] = None
 
@@ -415,12 +442,10 @@ class InputBatch:
         self.random_reqs.discard(req_id)
         self.top_p_reqs.discard(req_id)
         self.top_k_reqs.discard(req_id)
-        self.min_p_reqs.discard(req_id)
-        self.min_tokens.pop(req_index, None)
+        self.spec_decode_unsupported_reqs.discard(req_id)
         self.frequency_penalties_reqs.discard(req_id)
         self.presence_penalties_reqs.discard(req_id)
         self.repetition_penalties_reqs.discard(req_id)
-        self.spec_decode_unsupported_reqs.discard(req_id)
         self.generators.pop(req_index, None)
         self.num_logprobs.pop(req_id, None)
         self.num_prompt_logprobs.pop(req_id, None)
@@ -435,7 +460,6 @@ class InputBatch:
                 self.lora_id_to_lora_request.pop(lora_id)
             self.request_lora_mapping[req_index] = 0
 
-        self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
             # False means we don't fill with -inf.
@@ -445,6 +469,10 @@ class InputBatch:
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:
+        # For autoregressive models, track detailed request reordering info
+        # to support logitsprocs
+        self.batch_update_builder.moved.append(
+            (i1, i2, MoveDirectionality.SWAP))
         old_id_i1 = self._req_ids[i1]
         old_id_i2 = self._req_ids[i2]
         self._req_ids[i1], self._req_ids[i2] =\
@@ -474,8 +502,6 @@ class InputBatch:
             self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
         self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\
             self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
-        self.min_p_cpu[i1], self.min_p_cpu[i2] =\
-            self.min_p_cpu[i2], self.min_p_cpu[i1]
 
         # NOTE: the following is unsafe
         # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
@@ -487,13 +513,10 @@ class InputBatch:
         self.token_ids_cpu[i2, ...] = tmp
 
         swap_dict_values(self.generators, i1, i2)
-        swap_dict_values(self.min_tokens, i1, i2)
         swap_dict_values(self.bad_words_token_ids, i1, i2)
 
         self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\
             self.request_lora_mapping[i2], self.request_lora_mapping[i1]
-        self.logit_bias[i1], self.logit_bias[i2] =\
-            self.logit_bias[i2], self.logit_bias[i1]
 
         if self.allowed_token_ids_mask_cpu_tensor is not None:
             self.allowed_token_ids_mask_cpu_tensor[i1], \
@@ -502,13 +525,31 @@ class InputBatch:
                     self.allowed_token_ids_mask_cpu_tensor[i1]
         self.block_table.swap_row(i1, i2)
 
-    def condense(self, empty_req_indices: list[int]) -> None:
-        """Move non-empty requests down into lower, empty indices.
-        
+    def condense(self) -> None:
+        """Slide non-empty requests down into lower, empty indices.
+
+        Any consecutive empty indices at the very end of the list are not
+        filled.
+
         Args:
-          empty_req_indices: empty batch indices, sorted descending.
+          empty_req_indices: empty indices which may be filled.
+
+        Returns:
+          swaps: list of (from,to) swap tuples for moved requests
+          empty_req_indices: indices not filled by condensation
         """
         num_reqs = self.num_reqs
+
+        if self.is_pooling_model:
+            # Will be contiguous in pooling case, just trim the lists.
+            del self._req_ids[num_reqs:]
+            del self.req_output_token_ids[num_reqs:]
+            return
+
+        if not (empty_req_indices := self.batch_update_builder.removed):
+            # All removed requests were replaced by added requests, or else no
+            # requests were removed at all. No condense() needed
+            return
         if num_reqs == 0:
             # The batched states are empty.
             self._req_ids.clear()
@@ -524,11 +565,19 @@ class InputBatch:
                 last_req_index -= 1
 
             # Find the smallest empty index.
-            empty_index = empty_req_indices.pop()
+            empty_index = self.batch_update_builder.peek_removed()
+            assert empty_index is not None
             if empty_index >= last_req_index:
                 break
 
-            # Swap the states.
+            # Move active request down into empty request
+            # index.
+            self.batch_update_builder.pop_removed()
+            # Autoregressive models require detailed tracking of condense
+            # operations to support logitsprocs
+            self.batch_update_builder.moved.append(
+                (last_req_index, empty_index,
+                 MoveDirectionality.UNIDIRECTIONAL))
             req_id = self._req_ids[last_req_index]
             output_token_ids = self.req_output_token_ids[last_req_index]
             assert req_id is not None
@@ -559,20 +608,14 @@ class InputBatch:
                 empty_index] = self.presence_penalties_cpu[last_req_index]
             self.repetition_penalties_cpu[
                 empty_index] = self.repetition_penalties_cpu[last_req_index]
-            self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
 
-            min_token = self.min_tokens.pop(last_req_index, None)
-            if min_token is not None:
-                self.min_tokens[empty_index] = min_token
-
             self.request_lora_mapping[empty_index] = self.request_lora_mapping[
                 last_req_index]
 
-            self.logit_bias[empty_index] = self.logit_bias[last_req_index]
-
+            # TODO convert these to LogitsProcessors
             if self.allowed_token_ids_mask_cpu_tensor is not None:
                 self.allowed_token_ids_mask_cpu_tensor[
                     empty_index] = self.allowed_token_ids_mask_cpu_tensor[
@@ -582,15 +625,30 @@ class InputBatch:
                 last_req_index, None)
             if bad_words_token_ids is not None:
                 self.bad_words_token_ids[empty_index] = bad_words_token_ids
+
             # Decrement last_req_index since it is now empty.
             last_req_index -= 1
 
         # Trim lists to the batch size.
-        del self._req_ids[self.num_reqs:]
-        del self.req_output_token_ids[self.num_reqs:]
+        del self._req_ids[num_reqs:]
+        del self.req_output_token_ids[num_reqs:]
 
-    def refresh_sampling_metadata(self):
-        self.sampling_metadata = self._make_sampling_metadata()
+    def refresh_metadata(self):
+        """Apply any batch updates to sampling metadata."""
+
+        if self.is_pooling_model:
+            # Batch changes every step for pooling models.
+            self.sampling_metadata = self._make_sampling_metadata()
+            return
+
+        # For non-pooling models - generate and apply logitsprocs update;
+        # reset batch update tracking.
+        # Update sampling metadata if batch state is changed.
+        batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
+        for logit_proc in self.logitsprocs.all:
+            logit_proc.update_state(batch_update)
+        if batch_update:
+            self.sampling_metadata = self._make_sampling_metadata()
 
     def _make_sampling_metadata(self) -> SamplingMetadata:
         num_reqs = self.num_reqs
@@ -603,8 +661,6 @@ class InputBatch:
             copy_slice(self.top_p_cpu_tensor, self.top_p, num_reqs)
         if not self.no_top_k:
             copy_slice(self.top_k_cpu_tensor, self.top_k, num_reqs)
-        if not self.no_min_p:
-            copy_slice(self.min_p_cpu_tensor, self.min_p, num_reqs)
 
         if not self.no_penalties:
             # Since syncing these tensors is expensive only copy them
@@ -735,10 +791,6 @@ class InputBatch:
     def no_top_k(self) -> bool:
         return len(self.top_k_reqs) == 0
 
-    @property
-    def no_min_p(self) -> bool:
-        return len(self.min_p_reqs) == 0
-
     @property
     def no_penalties(self) -> bool:
         return (len(self.presence_penalties_reqs) == 0
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 4e75a7da1..8bd11d37d 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -236,7 +236,9 @@ class NPUWorker(WorkerBase):
             self.model_runner.load_model()
 
     def compile_or_warm_up_model(self) -> None:
-        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        # Note: need to adapt for graph mode.
+        warmup_sizes = (self.vllm_config.compilation_config.compile_sizes
+                        or []).copy()
         if not self.model_config.enforce_eager:
             warmup_sizes = [
                 x for x in warmup_sizes if x not in