From 6916f3766cd88922054f745a2dbb13420e7e8bdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:19 +0200
Subject: [PATCH 1/3] models/TST: Docstring fixup on MHA layers

Fix the documented default values of d_k and d_v. If they are not
provided, and d_model and n_heads are kept at their default values, they
are set to 128//16, which is 8. Also, change the stated usual value
range to 8-64, which roughly corresponds to the usual d_model/n_heads
range.
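For clarity, a minimal sketch of how the defaults resolve (assuming
fastcore's `ifnone`, which the layer already uses; the numbers are the
TST defaults):

```python
from fastcore.basics import ifnone  # ifnone(a, b) returns b when a is None, else a

d_model, n_heads = 128, 16  # TST defaults
d_k = d_v = None            # not provided by the caller

d_k = ifnone(d_k, d_model // n_heads)  # 128 // 16 = 8
d_v = ifnone(d_v, d_model // n_heads)  # 128 // 16 = 8
assert (d_k, d_v) == (8, 8)            # hence "Default: ... = 8" in the docstring
```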
---
 nbs/049_models.TST.ipynb | 8 ++++----
 tsai/models/TST.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index c8b877509..73ab32eac 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -71,8 +71,8 @@
     "* max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues. Default. None.\n",
     "* d_model: total dimension of the model (number of features created by the model). Usual values: 128-1024. Default: 128.\n",
     "* n_heads: parallel attention heads. Usual values: 8-16. Default: 16.\n",
-    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "* d_ff: the dimension of the feedforward network model. Usual values: 256-4096. Default: 256.\n",
     "* dropout: amount of residual dropout applied in the encoder. Usual values: 0.-0.3. Default: 0.1.\n",
     "* activation: the activation function of intermediate layer, relu or gelu. Default: 'gelu'.\n",
@@ -320,8 +320,8 @@
     "    max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.\n",
     "    d_model: total dimension of the model (number of features created by the model)\n",
     "    n_heads: parallel attention heads.\n",
-    "    d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "    d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "    d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "    d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "    d_ff: the dimension of the feedforward network model.\n",
     "    dropout: amount of residual dropout applied in the encoder.\n",
     "    act: the activation function of intermediate layer, relu or gelu.\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index c8f51f361..431f9091e 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -142,8 +142,8 @@ def __init__(self, c_in:int, c_out:int, seq_len:int, max_seq_len:Optional[int]=N
         max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.
         d_model: total dimension of the model (number of features created by the model)
         n_heads: parallel attention heads.
-        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
-        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
+        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
+        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
         d_ff: the dimension of the feedforward network model.
         dropout: amount of residual dropout applied in the encoder.
         act: the activation function of intermediate layer, relu or gelu.

From 52926076cbdc26fbb2d6fff10cd34c041799343b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:32 +0200
Subject: [PATCH 2/3] models/TST: Fix d_model divisible assertion

Follow the 050_models.TSTPlus.ipynb implementation and make sure we
actually assert what the error message says we assert. The old
expression `d_model // n_heads` is truthy whenever d_model >= n_heads,
so the assertion never fired for non-divisible combinations; `not
d_model%n_heads` actually checks divisibility. This commit breaks the
inline test where d_model is 128 and n_heads is 3; see the traceback
below.

```
AssertionError in /home/anders/dev/ml/tsai/nbs/049_models.TST.ipynb:
===========================================================================
While Executing Cell #13:
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[1], line 2
      1 t = torch.rand(16, 50, 128)
----> 2 output = _TSTEncoderLayer(q_len=50, d_model=128, n_heads=3, d_k=None, d_v=None, d_ff=512, dropout=0.1, activation='gelu')(t)
      3 output.shape

File ~/anaconda3/envs/tsai_dev/lib/python3.9/site-packages/fastcore/meta.py:40, in PrePostInitMeta.__call__(cls, *args, **kwargs)
     38 if type(res)==cls:
     39     if hasattr(res,'__pre_init__'): res.__pre_init__(*args,**kwargs)
---> 40     res.__init__(*args,**kwargs)
     41     if hasattr(res,'__post_init__'): res.__post_init__(*args,**kwargs)
     42 return res

Cell In[1], line 11, in _TSTEncoderLayer.__init__(self, q_len, d_model, n_heads, d_k, d_v, d_ff, dropout, activation)
      8 def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
      9              activation:str="gelu"):
---> 11 assert not d_model%n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
     12 d_k = ifnone(d_k, d_model // n_heads)
     13 d_v = ifnone(d_v, d_model // n_heads)

AssertionError: d_model (128) must be divisible by n_heads (3)
```
---
 nbs/049_models.TST.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index 73ab32eac..e21195ab1 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -218,7 +218,7 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert d_model // n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
+    "        assert not d_model%n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",

From a65b5ffd053e81441f671758527980d779a8d1af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:34 +0200
Subject: [PATCH 3/3] models/TST: Remove d_model % n_heads assertion

I believe this assertion is unnecessary because the dimensions actually
work out even if d_model is not divisible by n_heads. See the model
printout for d_model=128, n_heads=3, d_k=11, d_v=9 below. I'm not saying
a parameter change like this is a good idea, but it happily runs and
learns on a sample dataset I'm using. The in_features and out_features
of the entire MHA block are d_model in any case.

Comparing with torch.nn.MultiheadAttention [1] (which is used in the
original paper implementation [2]), I think our `d_k*n_heads`
corresponds to the optional `kdim` parameter and, similarly, our
`d_v*n_heads` corresponds to the `vdim` parameter.

[1] https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
[2] https://github.com/gzerveas/mvts_transformer/blob/3f2e378bc77d02e82a44671f20cf15bc7761671a/src/models/ts_transformer.py#L152
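To make the shape argument concrete, here is a minimal standalone sketch
of the projection chain (plain PyTorch; it mirrors the modules in the
printout below but is not the tsai implementation, and the dims are just
the example values):

```python
import torch
import torch.nn as nn

# Sketch of the MHA projection shapes from the printout below (not the
# tsai code itself). Nothing here requires d_model % n_heads == 0.
bs, q_len, d_model, n_heads, d_k, d_v = 16, 50, 128, 3, 11, 9

W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)  # 128 -> 33
W_K = nn.Linear(d_model, d_k * n_heads, bias=False)  # 128 -> 33
W_V = nn.Linear(d_model, d_v * n_heads, bias=False)  # 128 -> 27
W_O = nn.Linear(d_v * n_heads, d_model, bias=False)  # 27 -> 128

x = torch.rand(bs, q_len, d_model)
q = W_Q(x).view(bs, q_len, n_heads, d_k).transpose(1, 2)      # (bs, heads, q_len, d_k)
k = W_K(x).view(bs, q_len, n_heads, d_k).permute(0, 2, 3, 1)  # (bs, heads, d_k, q_len)
v = W_V(x).view(bs, q_len, n_heads, d_v).transpose(1, 2)      # (bs, heads, q_len, d_v)

attn = torch.softmax((q @ k) / d_k ** 0.5, dim=-1)            # (bs, heads, q_len, q_len)
out = (attn @ v).transpose(1, 2).reshape(bs, q_len, n_heads * d_v)
out = W_O(out)                                                # back to (bs, q_len, d_model)
assert out.shape == (bs, q_len, d_model)
```

The projections only need to be consistent with each other; d_model only
has to match at the input and output of the block.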
```
In [2]: clf.model
Out[2]:
TST(
  (W_P): Linear(in_features=600, out_features=128, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder): _TSTEncoder(
    (layers): ModuleList(
      (0-2): 3 x _TSTEncoderLayer(
        (self_attn): _MultiHeadAttention(
          (W_Q): Linear(in_features=128, out_features=33, bias=False)
          (W_K): Linear(in_features=128, out_features=33, bias=False)
          (W_V): Linear(in_features=128, out_features=27, bias=False)
          (W_O): Linear(in_features=27, out_features=128, bias=False)
        )
        (dropout_attn): Dropout(p=0.1, inplace=False)
        (batchnorm_attn): Sequential(
          (0): Transpose(1, 2)
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Transpose(1, 2)
        )
        (ff): Sequential(
          (0): Linear(in_features=128, out_features=256, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=256, out_features=128, bias=True)
        )
        (dropout_ffn): Dropout(p=0.1, inplace=False)
        (batchnorm_ffn): Sequential(
          (0): Transpose(1, 2)
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Transpose(1, 2)
        )
      )
    )
  )
  (flatten): fastai.layers.Flatten(full=False)
  (head): Sequential(
    (0): GELU(approximate='none')
    (1): fastai.layers.Flatten(full=False)
    (2): Linear(in_features=2304, out_features=2, bias=True)
  )
)
```
---
 nbs/049_models.TST.ipynb | 1 -
 tsai/models/TST.py       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index e21195ab1..21842f865 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -218,7 +218,6 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert not d_model%n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index 431f9091e..abb03769d 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -76,7 +76,6 @@ class _TSTEncoderLayer(Module):
     def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
                  activation:str="gelu"):

-        assert d_model // n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
         d_k = ifnone(d_k, d_model // n_heads)
         d_v = ifnone(d_v, d_model // n_heads)