-
Notifications
You must be signed in to change notification settings - Fork 290
update gpu block size based on xattn #2764
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
352c7a0
51d0018
eded411
ac7a454
e6ba90a
4101008
9dcb2da
c5c67ba
9ccf083
79fd027
c96728c
9c27fc1
9ab91e1
526ba22
7bd2aaf
9c752b9
13053a9
8d481e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -7,7 +7,7 @@ | |||||||||||||||||||||
#include <list> | ||||||||||||||||||||||
|
||||||||||||||||||||||
#include "openvino/runtime/tensor.hpp" | ||||||||||||||||||||||
|
||||||||||||||||||||||
#include "utils.hpp" | ||||||||||||||||||||||
namespace ov::genai { | ||||||||||||||||||||||
|
||||||||||||||||||||||
class CacheManager { | ||||||||||||||||||||||
|
@@ -45,8 +45,7 @@ class CacheManager { | |||||||||||||||||||||
"Continuous batching: execution device is expected to be single CPU / single GPU / multi GPUs"); | ||||||||||||||||||||||
m_device = execution_devices[0]; | ||||||||||||||||||||||
// set block_size depending on device | ||||||||||||||||||||||
const size_t cpu_block_size = 32, gpu_block_size = 16; | ||||||||||||||||||||||
m_block_size = all_gpu_device ? gpu_block_size : cpu_block_size; | ||||||||||||||||||||||
const size_t cpu_block_size = 32, gpu_block_size = 16, gpu_block_size_xattn = 256; | ||||||||||||||||||||||
|
||||||||||||||||||||||
if (all_gpu_device) { | ||||||||||||||||||||||
m_context = m_request.get_compiled_model().get_context(); | ||||||||||||||||||||||
|
@@ -75,6 +74,19 @@ class CacheManager { | |||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
bool has_xattention = false; | ||||||||||||||||||||||
if (all_gpu_device) { | ||||||||||||||||||||||
if (m_value_shapes[0][2].get_length() == gpu_block_size_xattn) { | ||||||||||||||||||||||
has_xattention = true; | ||||||||||||||||||||||
} | ||||||||||||||||||||||
if (utils::env_setup_for_print_debug_info()) { | ||||||||||||||||||||||
if (has_xattention) | ||||||||||||||||||||||
std::cout << "[XAttention]: ENABLED on GPU device." << std::endl; | ||||||||||||||||||||||
else | ||||||||||||||||||||||
std::cout << "[XAttention]: DISABLED on GPU device." << std::endl; | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
m_block_size = all_gpu_device ? ( has_xattention ? gpu_block_size_xattn : gpu_block_size ) : cpu_block_size; | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [nitpick] The nested ternary operator makes this line difficult to read. Consider using an if-else statement for better clarity, especially since this is a critical configuration decision.
Suggested change
Copilot uses AI. Check for mistakes. Positive Feedback | Negative Feedback |
||||||||||||||||||||||
m_num_decoder_layers = m_value_precisions.size(); | ||||||||||||||||||||||
OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in a LLM model"); | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -190,10 +190,12 @@ def get_scheduler_config_genai(config_data, config_name="CB config"): | |||||
sparse_attention_kwargs = user_config.pop('sparse_attention_config') | ||||||
if "mode" in sparse_attention_kwargs.keys(): | ||||||
sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, sparse_attention_kwargs["mode"]) | ||||||
|
||||||
scheduler_config.use_sparse_attention = True | ||||||
scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs) | ||||||
log.info("Sparse Attention mode ON") | ||||||
if user_config.pop('use_sparse_attention', True): | ||||||
scheduler_config.use_sparse_attention = True | ||||||
scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs) | ||||||
log.info("Sparse Attention mode ON") | ||||||
Wovchena marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
else: | ||||||
raise RuntimeError("==Failure ==: sparse_attention_config value can't be used with use_sparse_attention=False") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Extra space in error message prefix: '==Failure ==' should be '==Failure=='.
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
|
||||||
for param, value in user_config.items(): | ||||||
setattr(scheduler_config, param, value) | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -42,14 +42,36 @@ def __getattr__(self, attr): | |||||
return getattr(self.model, attr) | ||||||
|
||||||
|
||||||
def get_scheduler_config_genai(cb_config): | ||||||
def configure_sparse_attention(scheduler_params, scheduler_config): | ||||||
""" | ||||||
Configures sparse attention settings based on scheduler parameters. | ||||||
""" | ||||||
import openvino_genai | ||||||
sparse_attention_kwargs = scheduler_params.pop('sparse_attention_config', None) | ||||||
|
||||||
if sparse_attention_kwargs: | ||||||
# Convert mode string to enum if present | ||||||
mode = sparse_attention_kwargs.get("mode") | ||||||
if mode: | ||||||
sparse_attention_kwargs["mode"] = getattr(openvino_genai.SparseAttentionMode, mode) | ||||||
|
||||||
# Check if sparse attention is enabled | ||||||
if scheduler_params.pop('use_sparse_attention', True): | ||||||
scheduler_config.use_sparse_attention = True | ||||||
scheduler_config.sparse_attention_config = openvino_genai.SparseAttentionConfig(**sparse_attention_kwargs) | ||||||
logger.info("Sparse Attention mode ON") | ||||||
else: | ||||||
raise RuntimeError("==Failure==: sparse_attention_config value can't be used with use_sparse_attention=False") | ||||||
|
||||||
|
||||||
def get_scheduler_config_genai(cb_config): | ||||||
import openvino_genai | ||||||
default_cb_config = {"cache_size": 1} | ||||||
scheduler_config = openvino_genai.SchedulerConfig() | ||||||
scheduler_params = cb_config or default_cb_config | ||||||
if scheduler_params: | ||||||
logger.info(f"Scheduler parameters for:\n{scheduler_params}") | ||||||
configure_sparse_attention(scheduler_params, scheduler_config) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Trailing whitespace at end of line should be removed.
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
for param, value in scheduler_params.items(): | ||||||
if param == "cache_eviction_config": | ||||||
value = openvino_genai.CacheEvictionConfig(aggregation_mode=openvino_genai.AggregationMode.NORM_SUM, **value) | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The variable `gpu_block_size_xattn` is defined early, but the `m_block_size` assignment was moved to line 89. This leaves `gpu_block_size_xattn` defined far from its only usage point. Consider moving this constant definition closer to where it's used (before line 89), or defining it as a class constant, for better maintainability.
Copilot uses AI. Check for mistakes.