Merged
Commits
29 commits
352c7a0
update gpu block size based on xattn
rnwang04 Sep 24, 2025
51d0018
update gpu block size based on xattn
rnwang04 Sep 25, 2025
eded411
merge
rnwang04 Sep 25, 2025
ac7a454
add missing GenerationConfig
rnwang04 Sep 26, 2025
e6ba90a
fix use_sparse_attention=False
rnwang04 Oct 7, 2025
4101008
Merge branch 'master' into pa_block_xattn
ceciliapeng2011 Oct 11, 2025
9dcb2da
update GenerationConfig based on comments
rnwang04 Oct 13, 2025
c5c67ba
refactor: get gpu block_size from value_cache.
ceciliapeng2011 Oct 13, 2025
9ccf083
update solution for use_sparse_attention=False based on comments
rnwang04 Oct 13, 2025
79fd027
Merge branch 'pa_block_xattn' of https://github.com/rnwang04/openvino…
rnwang04 Oct 13, 2025
c96728c
Merge branch 'master' into pa_block_xattn
rnwang04 Oct 13, 2025
9c27fc1
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 14, 2025
9ab91e1
add log to show if XAttention is actually ON/OFF.
ceciliapeng2011 Oct 15, 2025
526ba22
fix
ceciliapeng2011 Oct 15, 2025
7bd2aaf
wwb support xattention
wgzintel Oct 15, 2025
9c752b9
remove blank line
wgzintel Oct 15, 2025
13053a9
refactoring the code
wgzintel Oct 16, 2025
8d481e8
Merge pull request #2 from wgzintel/guozhong/wwb_support_xattention
rnwang04 Oct 16, 2025
a740969
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 19, 2025
3cdda53
Code format update
peterchen-intel Oct 20, 2025
516bff5
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 22, 2025
9f46be6
refactor based on copilot review
ceciliapeng2011 Oct 22, 2025
77f7961
fix lint error
ceciliapeng2011 Oct 22, 2025
66950d5
fix lint error
ceciliapeng2011 Oct 22, 2025
237cb5c
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 25, 2025
e5bd53e
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 26, 2025
5d1e2e6
Merge branch 'master' into pa_block_xattn
peterchen-intel Oct 27, 2025
2782171
Merge branch 'master' into pa_block_xattn
ceciliapeng2011 Oct 28, 2025
4669a25
remove other changes, only keep gpu block size
rnwang04 Oct 28, 2025
10 changes: 8 additions & 2 deletions src/cpp/src/continuous_batching/cache_manager.hpp
@@ -45,8 +45,7 @@ class CacheManager {
"Continuous batching: execution device is expected to be single CPU / single GPU / multi GPUs");
m_device = execution_devices[0];
// set block_size depending on device
const size_t cpu_block_size = 32, gpu_block_size = 16;
m_block_size = all_gpu_device ? gpu_block_size : cpu_block_size;
const size_t cpu_block_size = 32, gpu_block_size = 16, gpu_block_size_xattn = 256;

if (all_gpu_device) {
m_context = m_request.get_compiled_model().get_context();
@@ -75,6 +74,13 @@ class CacheManager {
}
}

bool has_xattention = false;
if ((m_key_shapes[0][2].get_length() == m_value_shapes[0][2].get_length()) &&
(m_key_shapes[0][3].get_length() == m_value_shapes[0][3].get_length()) &&
(m_key_shapes[0][2].get_length() == gpu_block_size_xattn)) {
has_xattention = true;
}
m_block_size = all_gpu_device ? ( has_xattention ? gpu_block_size_xattn : gpu_block_size ) : cpu_block_size;
m_num_decoder_layers = m_value_precisions.size();
OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in a LLM model");
}
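For reference, the block-size selection added above boils down to a simple rule: CPU keeps a block size of 32; GPU defaults to 16, but when the key and value cache shapes match in dimensions 2 and 3 and dimension 2 equals 256, the model is treated as XAttention-enabled and the GPU block size is raised to 256. A minimal Python sketch of that rule, for illustration only (the helper name and example shapes are invented; the dimension indices simply mirror the C++ check above):

CPU_BLOCK_SIZE = 32
GPU_BLOCK_SIZE = 16
GPU_BLOCK_SIZE_XATTN = 256

def pick_block_size(all_gpu_device, key_shape, value_shape):
    # key_shape / value_shape: per-layer KV-cache shapes; dims 2 and 3 are the
    # two dimensions the C++ check compares against each other and against 256.
    has_xattention = (
        key_shape[2] == value_shape[2]
        and key_shape[3] == value_shape[3]
        and key_shape[2] == GPU_BLOCK_SIZE_XATTN
    )
    if not all_gpu_device:
        return CPU_BLOCK_SIZE
    return GPU_BLOCK_SIZE_XATTN if has_xattention else GPU_BLOCK_SIZE

# XAttention-shaped GPU caches select the large block size:
assert pick_block_size(True, [0, 8, 256, 64], [0, 8, 256, 64]) == 256
# A regular GPU cache keeps the default of 16:
assert pick_block_size(True, [0, 8, 16, 64], [0, 8, 16, 64]) == 16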
12 changes: 8 additions & 4 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -190,10 +190,14 @@ def get_scheduler_config_genai(config_data, config_name="CB config"):
sparse_attention_kwargs = user_config.pop('sparse_attention_config')
if "mode" in sparse_attention_kwargs.keys():
sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, sparse_attention_kwargs["mode"])

scheduler_config.use_sparse_attention = True
scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
log.info("Sparse Attention mode ON")
if 'use_sparse_attention' in user_config.keys():
use_sparse_attention = user_config.pop('use_sparse_attention')
else:
use_sparse_attention = None
if use_sparse_attention != False:
scheduler_config.use_sparse_attention = True
scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
log.info("Sparse Attention mode ON")

for param, value in user_config.items():
setattr(scheduler_config, param, value)
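With this change, the config handled by get_scheduler_config_genai() can carry an explicit use_sparse_attention flag next to sparse_attention_config, and sparse attention stays enabled unless that flag is set to false. A rough sketch of the two cases (key names are taken from this diff; the XATTENTION mode name and the exact nesting of these keys inside a full benchmark config are assumptions):

# Sparse attention enabled (flag absent, or anything other than False):
cb_config_on = {
    "sparse_attention_config": {"mode": "XATTENTION"},
}

# Sparse attention explicitly disabled even though a sparse_attention_config is present:
cb_config_off = {
    "sparse_attention_config": {"mode": "XATTENTION"},
    "use_sparse_attention": False,
}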
1 change: 1 addition & 0 deletions tools/llm_bench/task/text_generation.py
@@ -315,6 +315,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
if args['infer_count'] is not None:
out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
log.info(out_str)
from openvino_genai import GenerationConfig
gen_config = model.get_generation_config() if hasattr(model, 'get_generation_config') else GenerationConfig()
gen_config.max_new_tokens = max_gen_tokens
# llama-3-8b-instruct's generation_config.json has 4096 max_length.
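The added import backs the fallback on the following line: when the benchmarked model object does not expose get_generation_config(), a default GenerationConfig is constructed instead of the lookup failing because the name was never imported. A minimal sketch of that pattern (the helper name is invented for illustration):

from openvino_genai import GenerationConfig

def resolve_generation_config(model, max_gen_tokens):
    # Prefer the model-provided generation config; otherwise start from defaults.
    gen_config = model.get_generation_config() if hasattr(model, 'get_generation_config') else GenerationConfig()
    gen_config.max_new_tokens = max_gen_tokens
    return gen_config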