Commit 5e7c808

Merge pull request #43 from jdb78/test/hyperparameter_optimization
Test and docs for hyperparameter optimization
2 parents a3ee1e6 + ee2826e · commit 5e7c808

File tree

8 files changed (+280, -169 lines)

README.md

Lines changed: 2 additions & 1 deletion

@@ -10,6 +10,7 @@ Pytorch Forecasting aims to ease timeseries forecasting with neural networks for
   for real-world deployment and come with in-built interpretation capabilities
 - Multi-horizon timeseries metrics
 - Ranger optimizer for faster model training
+- Hyperparameter tuning with [optuna](https://optuna.readthedocs.io/)

 The package is built on [pytorch-lightning])(https://pytorch-lightning.readthedocs.io/) to allow training on CPUs, single and multiple GPUs out-of-the-box.

@@ -28,7 +29,7 @@ Visit the documentation at [https://pytorch-forecasting.readthedocs.io](https://
 # Available models

 - [Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting](https://arxiv.org/pdf/1912.09363.pdf)
-- [N-Beats](http://arxiv.org/abs/1905.10437)
+- [N-BEATS: Neural basis expansion analysis for interpretable time series forecasting](http://arxiv.org/abs/1905.10437)

 # Usage

docs/source/index.rst

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ research alike. Specifically, the package provides
   for real-world deployment and come with in-built interpretation capabilities
 * Multi-horizon timeseries metrics
 * Ranger optimizer for faster model training
+* Hyperparameter tuning with `optuna <https://optuna.readthedocs.io/>`_

 The package is built on `PyTorch Lightning <https://pytorch-lightning.readthedocs.io/>`_ to allow
 training on CPUs, single and multiple GPUs out-of-the-box.

docs/source/models.rst

Lines changed: 3 additions & 0 deletions

@@ -11,6 +11,9 @@ Pytorch Forecasting provides a ``.from_dataset()`` method for each model that
 takes a :py:class:`~data.timeseries.TimeSeriesDataSet` and additional parameters
 that cannot directy derived from the dataset such as, e.g. ``learning_rate`` or ``hidden_size``.

+To tune models, `optuna <https://optuna.readthedocs.io/>`_ can be used. For example, tuning of the :py:class:`~models.temporal_fusion_transformer.TemporalFusionTransformer`
+is implemented by :py:func:`~models.temporal_fusion_transformer.tuning.optimize_hyperparameters`
+
 Details
 --------
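The new models.rst note only names the entry point, so a minimal usage sketch may help. It assumes train_dataloader and val_dataloader were created from a TimeSeriesDataSet (the function asserts this), the "optuna_checkpoints" folder name is hypothetical, and keyword names follow the tuning.py diff further down in this commit.

from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# Minimal sketch: the dataloaders are assumed to come from TimeSeriesDataSet.to_dataloader();
# all search ranges keep their documented defaults.
study = optimize_hyperparameters(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    model_path="optuna_checkpoints",  # hypothetical folder for model checkpoints
    n_trials=100,
    max_epochs=20,
)
print(study.best_trial.params)  # best hyperparameters found by the study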

poetry.lock

Lines changed: 209 additions & 164 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ homepage = "https://pytorch-forecasting.readthedocs.io"
 [tool.poetry.dependencies]
 python= "^3.6.1"

-torch = "^1.6"
+torch = "^1.4"
 pytorch-lightning = "^0.9.0"
 optuna = "^2.0.0"
 scipy = "*"

pytorch_forecasting/models/nbeats/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -37,6 +37,10 @@ def __init__(
         """
         Initialize NBeats Model - use its :py:meth:`~from_dataset` method if possible.

+        Based on the article
+        `N-BEATS: Neural basis expansion analysis for interpretable time series
+        forecasting <http://arxiv.org/abs/1905.10437>`_.
+
         Args:
             stack_types: One of the following values: “generic”, “seasonality" or “trend". A list of strings
                 of length 1 or ‘num_stacks’. Default and recommended value
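Because the docstring points users to from_dataset, a hedged construction sketch follows; "training" is an assumed TimeSeriesDataSet with a single target and no extra covariates, and the keyword value is illustrative rather than taken from this commit.

from pytorch_forecasting import NBeats

# Sketch only: `training` is an assumed TimeSeriesDataSet prepared for N-BEATS
# (univariate target, no additional covariates); the learning rate is an example value.
net = NBeats.from_dataset(
    training,
    learning_rate=3e-2,
)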

pytorch_forecasting/models/temporal_fusion_transformer/__init__.py

Lines changed: 19 additions & 0 deletions

@@ -60,6 +60,25 @@ def __init__(
         """
         Temporal Fusion Transformer for forecasting timeseries - use its :py:meth:`~from_dataset` method if possible.

+        Implementation of the article
+        `Temporal Fusion Transformers for Interpretable Multi-horizon Time Series
+        Forecasting <https://arxiv.org/pdf/1912.09363.pdf>`_.
+
+        Enhancements compared to the original implementation (apart from capabilities added through base model
+        such as monotone constraints):
+
+        * static variables can be continuous
+        * multiple categorical variables can be summarized with an EmbeddingBag
+        * variable encoder and decoder length by sample
+        * categorical embeddings are not transformed by variable selection network (because it is a redundant operation)
+        * variable dimension in variable selection network are scaled up via linear interpolation to reduce
+          number of parameters
+        * non-linear variable processing in variable selection network can be shared among decoder and encoder
+          (not shared by default)
+
+        Tune its hyperparameters with
+        :py:func:`~pytorch_forecasting.models.temporal_fusion_transformer.tuning.optimize_hyperparameters`.
+
         Args:

             hidden_size: hidden size of network which is its main hyperparameter and can range from 8 to 512
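To show where the hyperparameters that the tuner searches enter the model, here is a hedged construction sketch; "training" is an assumed TimeSeriesDataSet and the keyword values are illustrative starting points, mirroring the tuner's default ranges rather than recommendations from this commit.

from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss

# Sketch only: `training` is an assumed TimeSeriesDataSet; values are illustrative.
tft = TemporalFusionTransformer.from_dataset(
    training,
    hidden_size=16,             # main hyperparameter, 8 to 512 per the docstring
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    learning_rate=0.03,
    loss=QuantileLoss(),        # assumed choice of loss, not prescribed by this commit
)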

pytorch_forecasting/models/temporal_fusion_transformer/tuning.py

Lines changed: 41 additions & 3 deletions

@@ -42,10 +42,46 @@ def optimize_hyperparameters(
     hidden_continuous_size_range: Tuple[int, int] = (8, 64),
     attention_head_size_range: Tuple[int, int] = (1, 4),
     dropout_range: Tuple[float, float] = (0.1, 0.3),
+    learning_rate_range: Tuple[float, float] = (1e-5, 1.0),
+    use_learning_rate_finder: bool = True,
     trainer_kwargs: Dict[str, Any] = {},
     log_dir: str = "lightning_logs",
     **kwargs,
 ) -> optuna.Study:
+    """
+    Optimize Temporal Fusion Transformer hyperparameters.
+
+    Run hyperparameter optimization. Learning rate for is determined with
+    the PyTorch Lightning learning rate finder.
+
+    Args:
+        train_dataloader (DataLoader): dataloader for training model
+        val_dataloader (DataLoader): dataloader for validating model
+        model_path (str): folder to which model checkpoints are saved
+        max_epochs (int, optional): Maximum number of epochs to run training. Defaults to 20.
+        n_trials (int, optional): Number of hyperparameter trials to run. Defaults to 100.
+        timeout (float, optional): Time in seconds after which training is stopped regardless of number of epochs
+            or validation metric. Defaults to 3600*8.0.
+        hidden_size_range (Tuple[int, int], optional): Minimum and maximum of ``hidden_size`` hyperparameter. Defaults
+            to (16, 265).
+        hidden_continuous_size_range (Tuple[int, int], optional): Minimum and maximum of ``hidden_continuous_size``
+            hyperparameter. Defaults to (8, 64).
+        attention_head_size_range (Tuple[int, int], optional): Minimum and maximum of ``attention_head_size``
+            hyperparameter. Defaults to (1, 4).
+        dropout_range (Tuple[float, float], optional): Minimum and maximum of ``dropout`` hyperparameter. Defaults to
+            (0.1, 0.3).
+        learning_rate_range (Tuple[float, float], optional): Learning rate range. Defaults to (1e-5, 1.0).
+        use_learning_rate_finder (bool): If to use learning rate finder or optimize as part of hyperparameters.
+            Defaults to True.
+        trainer_kwargs (Dict[str, Any], optional): Additional arguments to the
+            `PyTorch Lightning trainer <https://pytorch-lightning.readthedocs.io/en/latest/trainer.html>`_ such
+            as ``limit_train_batches``. Defaults to {}.
+        log_dir (str, optional): Folder into which to log results for tensorboard. Defaults to "lightning_logs".
+        **kwargs: Additional arguments for the :py:class:`~TemporalFusionTransformer`.
+
+    Returns:
+        optuna.Study: optuna study results
+    """
     assert isinstance(train_dataloader.dataset, TimeSeriesDataSet) and isinstance(
         val_dataloader.dataset, TimeSeriesDataSet
     ), "dataloaders must be built from timeseriesdataset"

@@ -92,7 +128,7 @@ def objective(trial: optuna.Trial) -> float:
             **kwargs,
         )
         # find good learning rate
-        if "learning_rate" not in kwargs or isinstance(kwargs["learning_rate"], (tuple, list)):
+        if use_learning_rate_finder:
             lr_trainer = pl.Trainer(
                 gradient_clip_val=gradient_clip_val,
                 gpus=[0] if torch.cuda.is_available() else None,

@@ -103,9 +139,9 @@ def objective(trial: optuna.Trial) -> float:
                 train_dataloader=train_dataloader,
                 val_dataloaders=val_dataloader,
                 early_stop_threshold=10000.0,
-                min_lr=kwargs.get("learning_rate", [1e-5, 1.0])[0],
+                min_lr=learning_rate_range[0],
                 num_training=100,
-                max_lr=kwargs.get("learning_rate", [1e-5, 1.0])[1],
+                max_lr=learning_rate_range[1],
             )

             loss_finite = np.isfinite(res.results["loss"])

@@ -118,6 +154,8 @@ def objective(trial: optuna.Trial) -> float:
             optimal_lr = lr_smoothed[optimal_idx]
             print(f"Using learning rate of {optimal_lr:.3g}")
             model.hparams.learning_rate = optimal_lr
+        else:
+            model.hparams.learning_rate = trial.suggest_loguniform("learning_rate_range", *learning_rate_range)

         # fit
         trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)
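The new use_learning_rate_finder flag switches between the PyTorch Lightning learning rate finder and letting optuna sample the learning rate from learning_rate_range. The hedged sketch below shows the second path; the dataloaders are assumed to exist as before and the folder name is hypothetical.

from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# Sketch of the non-finder path added in this commit: optuna samples the learning rate
# via trial.suggest_loguniform("learning_rate_range", ...) inside each trial.
study = optimize_hyperparameters(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    model_path="optuna_checkpoints",   # hypothetical folder for model checkpoints
    use_learning_rate_finder=False,    # let optuna search the learning rate instead
    learning_rate_range=(1e-4, 0.1),   # narrower than the (1e-5, 1.0) default
    n_trials=30,
    timeout=3600.0,
)
print(study.best_trial.params)  # includes the sampled "learning_rate_range" value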
