
Commit ec6d459 (v1.3.1)

1 parent: d0841cc

File tree: 7 files changed (+17, -11 lines)

This patch release bumps the workspace version from 1.3.0 to 1.3.1 and fixes two model bugs: the rotary embedding length under sliding-window attention in the Mistral and Mixtral models (true_max_s), and an off-by-one when slicing the cached MoE offsets in the Mixtral topology code.

Cargo.lock

Lines changed: 4 additions & 4 deletions
Generated file; diff not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.3.0"
+version = "1.3.1"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.3.0"
+    "version": "1.3.1"
   },
   "paths": {
     "/": {

integration-tests/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.3.0"
+version = "1.3.1"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <[email protected]>"]
 

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.3.0"
+version = "1.3.1"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <[email protected]>"]
 

server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py

Lines changed: 4 additions & 1 deletion
@@ -391,14 +391,15 @@ def forward(
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        true_max_s: int,
         prefill_cache_indices: Optional[torch.Tensor],
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
 
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids, max_s, hidden_states.dtype
+            position_ids, true_max_s, hidden_states.dtype
         )
 
         residual = None
@@ -449,6 +450,7 @@ def forward(
         prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        true_max_s = max_s
         if prefill_cache_indices is not None:
             # Slots also need to be sliced as it has the same size as the whole kv tensor
             slots = slots[prefill_cache_indices]
@@ -467,6 +469,7 @@ def forward(
             slots,
             input_lengths,
             max_s,
+            true_max_s,
             prefill_cache_indices,
         )
         if lm_head_indices is not None:
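The true_max_s plumbing matters because of Mistral's sliding-window attention: elsewhere in the model (not visible in this diff), max_s is clamped to the window size so the kv-cache bookkeeping stays within the window, but the rotary cos/sin table must still cover the true absolute positions. The sketch below shows the failure mode; get_cos_sin is a deliberately simplified, hypothetical stand-in, not TGI's actual rotary implementation.

# Minimal sketch of the rotary-length bug; `get_cos_sin` is a simplified
# stand-in for a rotary embedding cache lookup.
import torch

def get_cos_sin(position_ids: torch.Tensor, max_s: int, dtype: torch.dtype, dim: int = 8):
    # Precompute a cos/sin table for positions [0, max_s), then gather the
    # rows for the requested absolute positions.
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    t = torch.arange(max_s, dtype=torch.float32)
    freqs = torch.outer(t, inv_freq)
    return freqs.cos().to(dtype)[position_ids], freqs.sin().to(dtype)[position_ids]

window = 4096                         # sliding-window size (hypothetical value)
true_max_s = 6000                     # longest sequence in the batch
max_s = min(window, true_max_s)       # clamped for the windowed kv cache
position_ids = torch.tensor([5999])   # absolute position beyond the window

cos, sin = get_cos_sin(position_ids, true_max_s, torch.float16)  # ok
# get_cos_sin(position_ids, max_s, torch.float16) would fail: position 5999
# indexes past a table built for only 4096 positions.

Saving true_max_s before any clamping and threading it down to the rotary lookup keeps positions past the window valid.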

server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py

Lines changed: 5 additions & 2 deletions
@@ -401,7 +401,7 @@ def topology(self, x: torch.Tensor, padded_bins: torch.Tensor):
             self.offsets_block_rows = block_rows
             offsets = self.offsets
         else:
-            offsets = self.offsets[:block_rows]
+            offsets = self.offsets[: block_rows + 1]
 
         # Indices for the sparse matrix. The indices for
         # the intermediate matrix are dynamic depending
@@ -632,14 +632,15 @@ def forward(
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        true_max_s: int,
         prefill_cache_indices: Optional[torch.Tensor],
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
 
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids, max_s, hidden_states.dtype
+            position_ids, true_max_s, hidden_states.dtype
         )
 
         residual = None
@@ -690,6 +691,7 @@ def forward(
         prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        true_max_s = max_s
         if prefill_cache_indices is not None:
             # Slots also need to be sliced as it has the same size as the whole kv tensor
             slots = slots[prefill_cache_indices]
@@ -708,6 +710,7 @@ def forward(
             slots,
             input_lengths,
             max_s,
+            true_max_s,
             prefill_cache_indices,
         )
         if lm_head_indices is not None:
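The true_max_s changes in this file mirror the flash_mistral_modeling.py fix above. The extra fix is the topology hunk, which slices the cached MoE offsets with block_rows + 1 instead of block_rows; that slice length suggests the offsets are CSR-style row boundaries (row i spans offsets[i] to offsets[i + 1]), so describing N block rows takes N + 1 entries. A toy illustration with made-up numbers, assuming exactly that CSR-style layout:

# Toy illustration of CSR-style row offsets: N rows need N + 1 boundary
# entries, so the cached tensor must be sliced with block_rows + 1.
import torch

offsets = torch.tensor([0, 2, 5, 9, 12], dtype=torch.int32)  # cached for 4 rows
block_rows = 2

wrong = offsets[:block_rows]       # tensor([0, 2]): the end of row 1 is lost
right = offsets[: block_rows + 1]  # tensor([0, 2, 5]): all row boundaries

# Row i spans [offsets[i], offsets[i + 1]), so `wrong` leaves the last row's
# extent undefined: the off-by-one this hunk fixes.
assert right.tolist() == [0, 2, 5]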
