
Commit dfd2ec6

rohan-varma authored and facebook-github-bot committed
Batch matmul fast path in MHAWithCache (#449)
Summary:
Pull Request resolved: #449

When doing self attention, an optimization is to combine the Q, K, V input projection matrices and do a single matmul, instead of 3. Adding this optimization in MHAWithCache.

Differential Revision: D48418780

fbshipit-source-id: e8001eb870e827b05146221bb66f82939deae0c6
1 parent: 0dc3c21
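The equivalence behind this fast path can be checked in isolation: stacking the Q, K, and V projection weights into one weight matrix and doing a single matmul, then splitting the result, matches three separate projections. A minimal sketch (the names dim, fused_proj, and x are illustrative and not part of this change):

import torch
from torch import nn

dim = 8
q_proj = nn.Linear(dim, dim)
k_proj = nn.Linear(dim, dim)
v_proj = nn.Linear(dim, dim)

# One Linear producing 3 * dim outputs, with the three weight matrices stacked.
fused_proj = nn.Linear(dim, 3 * dim)
with torch.no_grad():
    fused_proj.weight.copy_(torch.cat([q_proj.weight, k_proj.weight, v_proj.weight], dim=0))
    fused_proj.bias.copy_(torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0))

x = torch.randn(2, 5, dim)  # bsz x seq_len x dim
q, k, v = fused_proj(x).chunk(3, dim=-1)  # single matmul, then split

assert torch.allclose(q, q_proj(x), atol=1e-6)
assert torch.allclose(k, k_proj(x), atol=1e-6)
assert torch.allclose(v, v_proj(x), atol=1e-6)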

File tree: 1 file changed (+26, -9 lines)


torchmultimodal/modules/layers/multi_head_attention.py

Lines changed: 26 additions & 9 deletions
@@ -7,7 +7,6 @@
 from typing import NamedTuple, Optional, Tuple, Union
 
 import torch
-
 import torch.nn.functional as F
 from torch import nn, Tensor

@@ -17,6 +16,14 @@ class MHAWithCacheOutput(NamedTuple):
     past_key_value: Tuple[Tensor, Tensor]
 
 
+def _batched_input_proj(
+    query: Tensor, input_proj: nn.Module
+) -> Tuple[Tensor, Tensor, Tensor]:
+    projected_query = input_proj(query)
+    query, key, value = projected_query.chunk(3, dim=-1)
+    return query, key, value
+
+
 class MultiHeadSelfAttention(nn.Module):
     """
     Multihead self attention.
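The helper added above just applies the fused projection and chunks the output into three equal pieces along the last dimension. A quick shape check, assuming the private helper is imported from this module (the tensor sizes are arbitrary):

import torch
from torch import nn
# _batched_input_proj is a private helper introduced by this diff.
from torchmultimodal.modules.layers.multi_head_attention import _batched_input_proj

embed_dim = 16
input_proj = nn.Linear(embed_dim, 3 * embed_dim)
query = torch.randn(2, 4, embed_dim)  # bsz x seq_len x embed_dim

q, k, v = _batched_input_proj(query=query, input_proj=input_proj)
assert q.shape == k.shape == v.shape == (2, 4, embed_dim)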
@@ -59,8 +66,7 @@ def forward(
 
         bsz = query.size(0)
        embed_dim = query.size(-1)
-        projected_query = self.input_proj(query)
-        query, key, value = projected_query.chunk(3, dim=-1)
+        query, key, value = _batched_input_proj(query=query, input_proj=self.input_proj)
 
         head_dim = embed_dim // self.num_heads
         # bsz x seq len x embed_dim => bsz x num_heads x seq len x head_dim
@@ -105,9 +111,15 @@ def __init__(
     ) -> None:
         super().__init__()
         self.num_heads = num_heads
-        self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
-        self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
-        self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+        if dim_kv == dim_q:
+            # Module is being used for self-attention, so batch the matmuls
+            self.input_proj = nn.Linear(dim_q, 3 * dim_q, bias=add_bias)
+            self.is_self_attn = True
+        else:
+            self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
+            self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+            self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+            self.is_self_attn = False
         self.output_proj = nn.Linear(dim_q, dim_q)
         self.dropout = dropout

@@ -144,9 +156,14 @@ def forward(
         bsz = query.size(0)
         embed_dim = query.size(-1)
         head_dim = embed_dim // self.num_heads
-        query = self.q_proj(query)
-        key = self.k_proj(key)
-        value = self.v_proj(value)
+        if self.is_self_attn:
+            query, key, value = _batched_input_proj(
+                query=query, input_proj=self.input_proj
+            )
+        else:
+            query = self.q_proj(query)
+            key = self.k_proj(key)
+            value = self.v_proj(value)
 
         # bsz x seq_len x embed_dim => bsz x num_heads x seq_len x head_dim
         query = query.view(bsz, -1, self.num_heads, head_dim).transpose(1, 2)
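With the constructor change above, the fused projection is only built when dim_kv == dim_q, i.e. when the module is used for self attention; cross attention keeps the three separate projections. An instantiation sketch, assuming the class exported by this module is MultiHeadAttentionWithCache and that its constructor accepts dim_q, dim_kv, and num_heads as shown in the diff (other arguments left at their defaults):

from torchmultimodal.modules.layers.multi_head_attention import MultiHeadAttentionWithCache

# dim_kv == dim_q: self attention, so a single fused input_proj is created.
self_attn = MultiHeadAttentionWithCache(dim_q=512, dim_kv=512, num_heads=8)
assert self_attn.is_self_attn and hasattr(self_attn, "input_proj")

# dim_kv != dim_q: cross attention, so separate q/k/v projections are kept.
cross_attn = MultiHeadAttentionWithCache(dim_q=512, dim_kv=256, num_heads=8)
assert not cross_attn.is_self_attn and hasattr(cross_attn, "q_proj")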
