diff --git a/src/cpp/src/continuous_batching/cache_manager.hpp b/src/cpp/src/continuous_batching/cache_manager.hpp
index 8bb651792e..a77c844e0d 100644
--- a/src/cpp/src/continuous_batching/cache_manager.hpp
+++ b/src/cpp/src/continuous_batching/cache_manager.hpp
@@ -7,7 +7,7 @@
 #include

 #include "openvino/runtime/tensor.hpp"
-
+#include "utils.hpp"

 namespace ov::genai {
 class CacheManager {
@@ -45,8 +45,7 @@ class CacheManager {
                         "Continuous batching: execution device is expected to be single CPU / single GPU / multi GPUs");
         m_device = execution_devices[0];
         // set block_size depending on device
-        const size_t cpu_block_size = 32, gpu_block_size = 16;
-        m_block_size = all_gpu_device ? gpu_block_size : cpu_block_size;
+        const size_t cpu_block_size = 32, gpu_block_size = 16, gpu_block_size_xattn = 256;

         if (all_gpu_device) {
             m_context = m_request.get_compiled_model().get_context();
@@ -75,6 +74,19 @@ class CacheManager {
             }
         }

+        bool has_xattention = false;
+        if (all_gpu_device) {
+            if (m_value_shapes[0][2].get_length() == gpu_block_size_xattn) {
+                has_xattention = true;
+            }
+            if (utils::env_setup_for_print_debug_info()) {
+                if (has_xattention)
+                    std::cout << "[XAttention]: ENABLED on GPU device." << std::endl;
+                else
+                    std::cout << "[XAttention]: DISABLED on GPU device." << std::endl;
+            }
+        }
+        m_block_size = all_gpu_device ? ( has_xattention ? gpu_block_size_xattn : gpu_block_size ) : cpu_block_size;
         m_num_decoder_layers = m_value_precisions.size();
         OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in a LLM model");
     }
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index bdd810b5b7..8a73dbb270 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -190,10 +190,12 @@ def get_scheduler_config_genai(config_data, config_name="CB config"):
             sparse_attention_kwargs = user_config.pop('sparse_attention_config')
             if "mode" in sparse_attention_kwargs.keys():
                 sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, sparse_attention_kwargs["mode"])
-
-            scheduler_config.use_sparse_attention = True
-            scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
-            log.info("Sparse Attention mode ON")
+            if user_config.pop('use_sparse_attention', True):
+                scheduler_config.use_sparse_attention = True
+                scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
+                log.info("Sparse Attention mode ON")
+            else:
+                raise RuntimeError("==Failure==: sparse_attention_config value can't be used with use_sparse_attention=False")

         for param, value in user_config.items():
             setattr(scheduler_config, param, value)
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 5c1f6ab7c7..bba8fe24a1 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -294,7 +294,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
             "Enabled input prompt permutations. It means that generated results may vary on different steps. "
             "If it is not expected, please specify --disable_prompt_permutation in your benchmarking command to disable this behavior"
         )
-        from openvino_genai import TokenizedInputs, GenerationConfig
+        from openvino_genai import TokenizedInputs
         import openvino as ov

         input_ids = input_data.input_ids.data
@@ -315,6 +315,7 @@
     if args['infer_count'] is not None:
         out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
     log.info(out_str)
+    from openvino_genai import GenerationConfig
     gen_config = model.get_generation_config() if hasattr(model, 'get_generation_config') else GenerationConfig()
     gen_config.max_new_tokens = max_gen_tokens
     # llama-3-8b-instruct's generation_config.json has 4096 max_length.
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index dd78ab1c8e..d3afa54f9a 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -42,14 +42,36 @@ def __getattr__(self, attr):
         return getattr(self.model, attr)


-def get_scheduler_config_genai(cb_config):
+def configure_sparse_attention(scheduler_params, scheduler_config):
+    """
+    Configures sparse attention settings based on scheduler parameters.
+    """
     import openvino_genai

+    sparse_attention_kwargs = scheduler_params.pop('sparse_attention_config', None)
+
+    if sparse_attention_kwargs:
+        # Convert mode string to enum if present
+        mode = sparse_attention_kwargs.get("mode")
+        if mode:
+            sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, mode)
+
+        # Check if sparse attention is enabled
+        if scheduler_params.pop('use_sparse_attention', True):
+            scheduler_config.use_sparse_attention = True
+            scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs)
+            logger.info("Sparse Attention mode ON")
+        else:
+            raise RuntimeError("==Failure==: sparse_attention_config value can't be used with use_sparse_attention=False")
+
+def get_scheduler_config_genai(cb_config):
+    import openvino_genai
     default_cb_config = {"cache_size": 1}
     scheduler_config = openvino_genai.SchedulerConfig()
     scheduler_params = cb_config or default_cb_config
     if scheduler_params:
         logger.info(f"Scheduler parameters for:\n{scheduler_params}")
+        configure_sparse_attention(scheduler_params, scheduler_config)
         for param, value in scheduler_params.items():
             if param == "cache_eviction_config":
                 value = openvino_genai.CacheEvictionConfig(aggregation_mode=openvino_genai.AggregationMode.NORM_SUM, **value)