@@ -228,50 +228,16 @@ def __init__(
         if torch.cuda.is_available():
             self.register_buffer("mask_cuda", mask.cuda(), persistent=False)
 
-
-    def forward(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+    def forward(self, tokens: Tensor, audio_features: Tensor, kv_cache: Optional[dict] = None):
         """
         Args:
             tokens: (n_batch, n_token)
             audio_features: (n_batch, n_audio_ctx, n_audio_state)
+            kv_cache: Optional cache for key/value tensors
 
         Returns:
             logits: (n_batch, n_token, n_vocab)
         """
-        n_batch, n_token = tokens.shape
-        n_audio_ctx, n_audio_state = audio_features.shape[1:]
-
-        x = self.token_embedding(tokens) + self.positional_embedding[:n_token]
-
-        # Optimisation: Move audio_features to GPU once here.
-        if torch.cuda.is_available():
-            audio_features = audio_features.cuda()
-
-
-        for block in self.blocks:
-            x = block(x, audio_features)
-
-        x = self.ln(x)
-        logits = x @ self.token_embedding.weight.T
-
-        # Optimisation: Apply the precomputed CUDA mask if available.
-        if torch.cuda.is_available():
-            mask = self.mask_cuda[:n_token, :n_token]
-        else:
-            mask = self.mask[:n_token, :n_token]
-
-        logits = logits + mask
-
-        return logits
-
-
-    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
-        """
-        Args:
-            tokens: (n_batch, n_token) or x tensor
-            audio_features: (n_batch, n_audio_ctx, n_audio_state) or xa tensor
-            kv_cache: Optional cache for key/value tensors
-        """
         if kv_cache is not None:
             # Handle the kv_cache case
             offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
@@ -313,7 +279,6 @@ def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
 
         return logits
 
-
 # The Whisper class has been moved outside of TextDecoder and is now a top-level class
 class Whisper(nn.Module):
     def __init__(self, dims: ModelDimensions):
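Note: the hunks above only show the head of the merged forward; the code between the offset computation and the final return is unchanged context that the diff omits. As a rough sketch (not necessarily the exact body in this change), a kv_cache-aware decoder forward in this style uses the cached sequence length as an offset into the positional embedding; attribute names (token_embedding, positional_embedding, blocks, ln, mask) follow the surrounding TextDecoder:

    # Illustrative sketch only; the merged forward in this diff may differ in detail.
    def forward(self, tokens: Tensor, audio_features: Tensor, kv_cache: Optional[dict] = None):
        # With an incremental cache, `offset` positions were already decoded, so the
        # new tokens pick up the positional embedding starting at that offset.
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = self.token_embedding(tokens) + self.positional_embedding[offset : offset + tokens.shape[-1]]
        x = x.to(audio_features.dtype)

        # Each block cross-attends to the audio features and reuses/extends the cache.
        for block in self.blocks:
            x = block(x, audio_features, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)
        # Tied embedding: project back onto the vocabulary with the embedding matrix.
        logits = (x @ self.token_embedding.weight.to(x.dtype).T).float()
        return logits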