18 changes: 15 additions & 3 deletions src/cpp/src/continuous_batching/cache_manager.hpp
@@ -7,7 +7,7 @@
 #include <list>
 
 #include "openvino/runtime/tensor.hpp"
-
+#include "utils.hpp"
 namespace ov::genai {
 
 class CacheManager {
@@ -45,8 +45,7 @@ class CacheManager {
"Continuous batching: execution device is expected to be single CPU / single GPU / multi GPUs");
m_device = execution_devices[0];
// set block_size depending on device
const size_t cpu_block_size = 32, gpu_block_size = 16;
m_block_size = all_gpu_device ? gpu_block_size : cpu_block_size;
const size_t cpu_block_size = 32, gpu_block_size = 16, gpu_block_size_xattn = 256;
Copilot AI (Oct 16, 2025):
The variable gpu_block_size_xattn is defined here, but the m_block_size assignment was moved to line 89, leaving gpu_block_size_xattn defined far from its only usage point. Consider moving the constant definition closer to where it is used (before line 89), or defining it as a class constant for better maintainability.

         if (all_gpu_device) {
             m_context = m_request.get_compiled_model().get_context();
@@ -75,6 +74,19 @@ class CacheManager {
             }
         }
 
+        bool has_xattention = false;
+        if (all_gpu_device) {
+            if (m_value_shapes[0][2].get_length() == gpu_block_size_xattn) {
+                has_xattention = true;
+            }
+            if (utils::env_setup_for_print_debug_info()) {
+                if (has_xattention)
+                    std::cout << "[XAttention]: ENABLED on GPU device." << std::endl;
+                else
+                    std::cout << "[XAttention]: DISABLED on GPU device." << std::endl;
+            }
+        }
+        m_block_size = all_gpu_device ? ( has_xattention ? gpu_block_size_xattn : gpu_block_size ) : cpu_block_size;
Copilot AI (Oct 16, 2025):
[nitpick] The nested ternary operator makes this line difficult to read. Consider using an if-else statement for better clarity, especially since this is a critical configuration decision.

Suggested change:
-        m_block_size = all_gpu_device ? ( has_xattention ? gpu_block_size_xattn : gpu_block_size ) : cpu_block_size;
+        if (all_gpu_device) {
+            if (has_xattention) {
+                m_block_size = gpu_block_size_xattn;
+            } else {
+                m_block_size = gpu_block_size;
+            }
+        } else {
+            m_block_size = cpu_block_size;
+        }

         m_num_decoder_layers = m_value_precisions.size();
         OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in a LLM model");
     }
10 changes: 6 additions & 4 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -190,10 +190,12 @@ def get_scheduler_config_genai(config_data, config_name="CB config"):
             sparse_attention_kwargs = user_config.pop('sparse_attention_config')
             if "mode" in sparse_attention_kwargs.keys():
                 sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, sparse_attention_kwargs["mode"])
-
-            scheduler_config.use_sparse_attention = True
-            scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
-            log.info("Sparse Attention mode ON")
+            if user_config.pop('use_sparse_attention', True):
+                scheduler_config.use_sparse_attention = True
+                scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
+                log.info("Sparse Attention mode ON")
+            else:
+                raise RuntimeError("==Failure ==: sparse_attention_config value can't be used with use_sparse_attention=False")
Copilot AI (Oct 16, 2025):
Extra space in error message prefix: '==Failure ==' should be '==Failure=='.

Suggested change:
-                raise RuntimeError("==Failure ==: sparse_attention_config value can't be used with use_sparse_attention=False")
+                raise RuntimeError("==Failure==: sparse_attention_config value can't be used with use_sparse_attention=False")


         for param, value in user_config.items():
             setattr(scheduler_config, param, value)
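
Note: a minimal sketch of a CB config that would exercise the new branch above (assumes openvino_genai is installed and tools/llm_bench is on the path; the cache_size value and the TRISHAPE mode are illustrative assumptions, not taken from this PR):

    # Hypothetical llm_bench CB config; field values are assumptions for illustration.
    from llm_bench_utils.ov_utils import get_scheduler_config_genai

    cb_config = {
        "cache_size": 2,
        "use_sparse_attention": True,  # optional: the code above defaults it to True when omitted
        "sparse_attention_config": {"mode": "TRISHAPE"},  # resolved to openvino_genai.SparseAttentionMode.TRISHAPE
    }
    scheduler_config = get_scheduler_config_genai(cb_config)
    # Passing "use_sparse_attention": False together with "sparse_attention_config"
    # now raises the RuntimeError introduced in this change.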
3 changes: 2 additions & 1 deletion tools/llm_bench/task/text_generation.py
@@ -294,7 +294,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
"Enabled input prompt permutations. It means that generated results may vary on different steps. "
"If it is not expected, please specify --disable_prompt_permutation in your benchmarking command to disable this behavior"
)
from openvino_genai import TokenizedInputs, GenerationConfig
from openvino_genai import TokenizedInputs
import openvino as ov

input_ids = input_data.input_ids.data
@@ -315,6 +315,7 @@
     if args['infer_count'] is not None:
         out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
     log.info(out_str)
+    from openvino_genai import GenerationConfig
     gen_config = model.get_generation_config() if hasattr(model, 'get_generation_config') else GenerationConfig()
     gen_config.max_new_tokens = max_gen_tokens
     # llama-3-8b-instruct's generation_config.json has 4096 max_length.
24 changes: 23 additions & 1 deletion tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -42,14 +42,36 @@ def __getattr__(self, attr):
         return getattr(self.model, attr)
 
 
-def get_scheduler_config_genai(cb_config):
+def configure_sparse_attention(scheduler_params, scheduler_config):
+    """
+    Configures sparse attention settings based on scheduler parameters.
+    """
+    import openvino_genai
+    sparse_attention_kwargs = scheduler_params.pop('sparse_attention_config', None)
+
+    if sparse_attention_kwargs:
+        # Convert mode string to enum if present
+        mode = sparse_attention_kwargs.get("mode")
+        if mode:
+            sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, mode)
+
+        # Check if sparse attention is enabled
+        if scheduler_params.pop('use_sparse_attention', True):
+            scheduler_config.use_sparse_attention = True
+            scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
+            logger.info("Sparse Attention mode ON")
+        else:
+            raise RuntimeError("==Failure==: sparse_attention_config value can't be used with use_sparse_attention=False")
+
 
+def get_scheduler_config_genai(cb_config):
     import openvino_genai
     default_cb_config = {"cache_size": 1}
     scheduler_config = openvino_genai.SchedulerConfig()
     scheduler_params = cb_config or default_cb_config
     if scheduler_params:
         logger.info(f"Scheduler parameters for:\n{scheduler_params}")
+        configure_sparse_attention(scheduler_params, scheduler_config) 
Copilot AI (Oct 16, 2025):
Trailing whitespace at end of line should be removed.

Suggested change:
-        configure_sparse_attention(scheduler_params, scheduler_config) 
+        configure_sparse_attention(scheduler_params, scheduler_config)

         for param, value in scheduler_params.items():
             if param == "cache_eviction_config":
                 value = openvino_genai.CacheEvictionConfig(aggregation_mode=openvino_genai.AggregationMode.NORM_SUM, **value)
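
Note: a minimal sketch of how the new helper might be exercised on its own (assumes openvino_genai is installed and the whowhatbench package is importable; the TRISHAPE mode and the parameter values are illustrative assumptions, not taken from this PR):

    # Hypothetical usage of the refactored helper; values are assumptions for illustration.
    import openvino_genai
    from whowhatbench.model_loaders import configure_sparse_attention

    scheduler_config = openvino_genai.SchedulerConfig()
    params = {"cache_size": 1, "sparse_attention_config": {"mode": "TRISHAPE"}}
    configure_sparse_attention(params, scheduler_config)

    assert scheduler_config.use_sparse_attention
    assert "sparse_attention_config" not in params  # the helper pops the key before the generic setattr loop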