From 6916f3766cd88922054f745a2dbb13420e7e8bdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:19 +0200
Subject: [PATCH 1/3] models/TST: Docstring fixup on MHA layers

Fix the documented default values of d_k and d_v. If they are not
provided, and d_model and n_heads are kept at their default values, they
are set to 128//16, which is 8. Also, change the stated usual value
range to 8-64, which roughly corresponds to the usual d_model/n_heads
range.
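For clarity, a minimal sketch of how the defaults resolve (assuming
fastcore's `ifnone`, which the layer already uses; the numbers are the
TST defaults):

```python
from fastcore.basics import ifnone  # ifnone(a, b) returns b when a is None, else a

d_model, n_heads = 128, 16  # TST defaults
d_k = d_v = None            # not provided by the caller

d_k = ifnone(d_k, d_model // n_heads)  # 128 // 16 = 8
d_v = ifnone(d_v, d_model // n_heads)  # 128 // 16 = 8
assert (d_k, d_v) == (8, 8)            # hence "Default: ... = 8" in the docstring
```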
---
 nbs/049_models.TST.ipynb | 8 ++++----
 tsai/models/TST.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index c8b877509..73ab32eac 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -71,8 +71,8 @@
     "* max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues. Default. None.\n",
     "* d_model: total dimension of the model (number of features created by the model). Usual values: 128-1024. Default: 128.\n",
     "* n_heads: parallel attention heads. Usual values: 8-16. Default: 16.\n",
-    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "* d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "* d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "* d_ff: the dimension of the feedforward network model. Usual values: 256-4096. Default: 256.\n",
     "* dropout: amount of residual dropout applied in the encoder. Usual values: 0.-0.3. Default: 0.1.\n",
     "* activation: the activation function of intermediate layer, relu or gelu. Default: 'gelu'.\n",
@@ -320,8 +320,8 @@
     "    max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.\n",
     "    d_model: total dimension of the model (number of features created by the model)\n",
     "    n_heads: parallel attention heads.\n",
-    "    d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
-    "    d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.\n",
+    "    d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
+    "    d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.\n",
     "    d_ff: the dimension of the feedforward network model.\n",
     "    dropout: amount of residual dropout applied in the encoder.\n",
     "    act: the activation function of intermediate layer, relu or gelu.\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index c8f51f361..431f9091e 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -142,8 +142,8 @@ def __init__(self, c_in:int, c_out:int, seq_len:int, max_seq_len:Optional[int]=N
         max_seq_len: useful to control the temporal resolution in long time series to avoid memory issues.
         d_model: total dimension of the model (number of features created by the model)
         n_heads: parallel attention heads.
-        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
-        d_v: size of the learned linear projection of values in the MHA. Usual values: 16-512. Default: None -> (d_model/n_heads) = 32.
+        d_k: size of the learned linear projection of queries and keys in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
+        d_v: size of the learned linear projection of values in the MHA. Usual values: 8-64. Default: None -> (d_model/n_heads) = 8.
         d_ff: the dimension of the feedforward network model.
         dropout: amount of residual dropout applied in the encoder.
         act: the activation function of intermediate layer, relu or gelu.

From 52926076cbdc26fbb2d6fff10cd34c041799343b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:32 +0200
Subject: [PATCH 2/3] models/TST: Fix d_model divisible assertion

Follow the 050_models.TSTPlus.ipynb implementation and make sure we
actually assert what the error message says we assert. The old
expression `d_model // n_heads` is truthy whenever d_model >= n_heads,
so the assertion never fired for non-divisible combinations; `not
d_model%n_heads` actually checks divisibility. This commit breaks the
inline test where d_model is 128 and n_heads is 3; see the traceback
below.

```
AssertionError in /home/anders/dev/ml/tsai/nbs/049_models.TST.ipynb:
===========================================================================
While Executing Cell #13:
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[1], line 2
      1 t = torch.rand(16, 50, 128)
----> 2 output = _TSTEncoderLayer(q_len=50, d_model=128, n_heads=3, d_k=None, d_v=None, d_ff=512, dropout=0.1, activation='gelu')(t)
      3 output.shape

File ~/anaconda3/envs/tsai_dev/lib/python3.9/site-packages/fastcore/meta.py:40, in PrePostInitMeta.__call__(cls, *args, **kwargs)
     38 if type(res)==cls:
     39     if hasattr(res,'__pre_init__'): res.__pre_init__(*args,**kwargs)
---> 40     res.__init__(*args,**kwargs)
     41     if hasattr(res,'__post_init__'): res.__post_init__(*args,**kwargs)
     42 return res

Cell In[1], line 11, in _TSTEncoderLayer.__init__(self, q_len, d_model, n_heads, d_k, d_v, d_ff, dropout, activation)
      8 def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
      9              activation:str="gelu"):
---> 11 assert not d_model%n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
     12 d_k = ifnone(d_k, d_model // n_heads)
     13 d_v = ifnone(d_v, d_model // n_heads)

AssertionError: d_model (128) must be divisible by n_heads (3)
```
---
 nbs/049_models.TST.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index 73ab32eac..e21195ab1 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -218,7 +218,7 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert d_model // n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
+    "        assert not d_model%n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",

From a65b5ffd053e81441f671758527980d779a8d1af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Granskogen=20Bj=C3=B8rnstad?=
Date: Tue, 23 Apr 2024 13:59:34 +0200
Subject: [PATCH 3/3] models/TST: Remove d_model % n_heads assertion

I believe this assertion is unnecessary because the dimensions actually
work out even if d_model is not divisible by n_heads. See the model
printout for d_model=128, n_heads=3, d_k=11, d_v=9 below. I'm not saying
a parameter change like this is a good idea, but it happily runs and
learns on a sample dataset I'm using. The in_features and out_features
of the entire MHA block are d_model in any case.

Comparing with torch.nn.MultiheadAttention [1] (which is used in the
original paper implementation [2]), I think our `d_k*n_heads`
corresponds to the optional `kdim` parameter and, similarly, our
`d_v*n_heads` corresponds to the `vdim` parameter.

[1] https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
[2] https://github.com/gzerveas/mvts_transformer/blob/3f2e378bc77d02e82a44671f20cf15bc7761671a/src/models/ts_transformer.py#L152
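To make the shape argument concrete, here is a minimal standalone sketch
of the projection chain (plain PyTorch; it mirrors the modules in the
printout below but is not the tsai implementation, and the dims are just
the example values):

```python
import torch
import torch.nn as nn

# Sketch of the MHA projection shapes from the printout below (not the
# tsai code itself). Nothing here requires d_model % n_heads == 0.
bs, q_len, d_model, n_heads, d_k, d_v = 16, 50, 128, 3, 11, 9

W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)  # 128 -> 33
W_K = nn.Linear(d_model, d_k * n_heads, bias=False)  # 128 -> 33
W_V = nn.Linear(d_model, d_v * n_heads, bias=False)  # 128 -> 27
W_O = nn.Linear(d_v * n_heads, d_model, bias=False)  # 27 -> 128

x = torch.rand(bs, q_len, d_model)
q = W_Q(x).view(bs, q_len, n_heads, d_k).transpose(1, 2)      # (bs, heads, q_len, d_k)
k = W_K(x).view(bs, q_len, n_heads, d_k).permute(0, 2, 3, 1)  # (bs, heads, d_k, q_len)
v = W_V(x).view(bs, q_len, n_heads, d_v).transpose(1, 2)      # (bs, heads, q_len, d_v)

attn = torch.softmax((q @ k) / d_k ** 0.5, dim=-1)            # (bs, heads, q_len, q_len)
out = (attn @ v).transpose(1, 2).reshape(bs, q_len, n_heads * d_v)
out = W_O(out)                                                # back to (bs, q_len, d_model)
assert out.shape == (bs, q_len, d_model)
```

The projections only need to be consistent with each other; d_model only
has to match at the input and output of the block.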
```
In [2]: clf.model
Out[2]:
TST(
  (W_P): Linear(in_features=600, out_features=128, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder): _TSTEncoder(
    (layers): ModuleList(
      (0-2): 3 x _TSTEncoderLayer(
        (self_attn): _MultiHeadAttention(
          (W_Q): Linear(in_features=128, out_features=33, bias=False)
          (W_K): Linear(in_features=128, out_features=33, bias=False)
          (W_V): Linear(in_features=128, out_features=27, bias=False)
          (W_O): Linear(in_features=27, out_features=128, bias=False)
        )
        (dropout_attn): Dropout(p=0.1, inplace=False)
        (batchnorm_attn): Sequential(
          (0): Transpose(1, 2)
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Transpose(1, 2)
        )
        (ff): Sequential(
          (0): Linear(in_features=128, out_features=256, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=256, out_features=128, bias=True)
        )
        (dropout_ffn): Dropout(p=0.1, inplace=False)
        (batchnorm_ffn): Sequential(
          (0): Transpose(1, 2)
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Transpose(1, 2)
        )
      )
    )
  )
  (flatten): fastai.layers.Flatten(full=False)
  (head): Sequential(
    (0): GELU(approximate='none')
    (1): fastai.layers.Flatten(full=False)
    (2): Linear(in_features=2304, out_features=2, bias=True)
  )
)
```
---
 nbs/049_models.TST.ipynb | 1 -
 tsai/models/TST.py       | 1 -
 2 files changed, 2 deletions(-)

diff --git a/nbs/049_models.TST.ipynb b/nbs/049_models.TST.ipynb
index e21195ab1..21842f865 100644
--- a/nbs/049_models.TST.ipynb
+++ b/nbs/049_models.TST.ipynb
@@ -218,7 +218,6 @@
     "    def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1, \n",
     "                 activation:str=\"gelu\"):\n",
     "\n",
-    "        assert not d_model%n_heads, f\"d_model ({d_model}) must be divisible by n_heads ({n_heads})\"\n",
     "        d_k = ifnone(d_k, d_model // n_heads)\n",
     "        d_v = ifnone(d_v, d_model // n_heads)\n",
     "\n",
diff --git a/tsai/models/TST.py b/tsai/models/TST.py
index 431f9091e..abb03769d 100644
--- a/tsai/models/TST.py
+++ b/tsai/models/TST.py
@@ -76,7 +76,6 @@ class _TSTEncoderLayer(Module):
     def __init__(self, q_len:int, d_model:int, n_heads:int, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, dropout:float=0.1,
                  activation:str="gelu"):

-        assert d_model // n_heads, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
         d_k = ifnone(d_k, d_model // n_heads)
         d_v = ifnone(d_v, d_model // n_heads)