From 92afd95347b12792556b2d751f331bdaefb91653 Mon Sep 17 00:00:00 2001
From: Abdallah Ahmed <abdallah.wallyallah97@gmail.com>
Date: Thu, 10 Dec 2020 18:30:52 +0200
Subject: [PATCH 1/5] Add model checkpoint

This commit adds the model_checkpoint.py file, which allows:
1.	Saving the weights with the lowest val_loss, enabled by the save_best_weights flag in config.yaml
2.	Saving weights periodically, every save_frequency epochs (an int set in config.yaml), when save_best_weights is false
---
 config.yaml                     |  3 ++
 main.py                         |  5 ++++
 pytorch_ner/model_checkpoint.py | 51 +++++++++++++++++++++++++++++++++
 pytorch_ner/save.py             |  8 ++++--
 pytorch_ner/train.py            | 27 +++++++++++++++++
 5 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100644 pytorch_ner/model_checkpoint.py

diff --git a/config.yaml b/config.yaml
index f770aa6..c1ba0a1 100644
--- a/config.yaml
+++ b/config.yaml
@@ -52,4 +52,7 @@ train:
 
 save:
   path_to_folder: 'models/test_main/'
+  model_checkpoint:
+    save_frequency: 2
+    save_best_weights: true
   export_onnx: true
diff --git a/main.py b/main.py
index 289d551..4f35865 100644
--- a/main.py
+++ b/main.py
@@ -8,6 +8,7 @@
 from torch.utils.data import DataLoader
 
 from pytorch_ner.dataset import NERCollator, NERDataset
+from pytorch_ner.model_checkpoint import model_checkpoint
 from pytorch_ner.nn_modules.architecture import BiLSTM
 from pytorch_ner.nn_modules.embedding import Embedding
 from pytorch_ner.nn_modules.linear import LinearHead
@@ -201,6 +202,10 @@ def main(path_to_config: str):
         optimizer=optimizer,
         device=device,
         n_epoch=config["train"]["n_epoch"],
+        export_onnx=config["save"]["export_onnx"],
+        path_to_folder=config["save"]["path_to_folder"],
+        save_frequency=config["save"]["model_checkpoint"]["save_frequency"],
+        save_best_weights=config["save"]["model_checkpoint"]["save_best_weights"],
         verbose=config["train"]["verbose"],
     )
 
diff --git a/pytorch_ner/model_checkpoint.py b/pytorch_ner/model_checkpoint.py
new file mode 100644
index 0000000..91cdf89
--- /dev/null
+++ b/pytorch_ner/model_checkpoint.py
@@ -0,0 +1,51 @@
+import json
+import os
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import yaml
+
+from pytorch_ner.onnx import onnx_export_and_check
+from pytorch_ner.utils import mkdir, rmdir
+import numpy as np
+
+def model_checkpoint(
+                model: nn.Module,
+                epoch: int,
+                save_best_weights: bool, 
+                val_metrics,
+                val_losses,
+                path_to_folder: str,
+                export_onnx: bool,
+                save_frequency: int,
+                ):
+    
+    '''
+    This function creates check point based on either one of the two scenarios:
+        1. Save best weights regarding the val_loss
+        2. Save weights frequently with save_frequency int
+
+    '''
+    if save_best_weights:
+        if np.mean(val_metrics['loss']) < min(val_losses):
+            # This iteration has lower val_loss, let's save it
+            val_losses.append(np.mean(val_metrics['loss']))
+            pth_file_name = "best_model.pth"
+            onnx_file_name = "best_model.onnx"
+        else:
+            # No need to save weights
+            return
+    else:
+        if epoch % save_frequency == 0:
+            # We're at multiple of save_frequency, let's save weights
+            pth_file_name = "model_epoch_" + str(epoch) + ".pth"
+            onnx_file_name = "model_epoch_" + str(epoch) + ".onnx"
+        else:
+            # No need to save weights
+            return
+    
+
+    torch.save(model.state_dict(),os.path.join(path_to_folder, pth_file_name))
+    if export_onnx:
+        onnx_export_and_check(model=model, path_to_save=os.path.join(path_to_folder, onnx_file_name))
\ No newline at end of file
diff --git a/pytorch_ner/save.py b/pytorch_ner/save.py
index fa0fd7f..c2bc05b 100644
--- a/pytorch_ner/save.py
+++ b/pytorch_ner/save.py
@@ -18,9 +18,11 @@ def save_model(
     config: Dict,
     export_onnx: bool = False,
 ):
-    # make empty dir
-    rmdir(path_to_folder)
-    mkdir(path_to_folder)
+    
+    # if os.path.exists(path_to_folder):
+    #     # make empty dir
+    #     rmdir(path_to_folder)
+    # mkdir(path_to_folder)
 
     model.cpu()
     model.eval()
diff --git a/pytorch_ner/train.py b/pytorch_ner/train.py
index 60e1d52..f353303 100644
--- a/pytorch_ner/train.py
+++ b/pytorch_ner/train.py
@@ -2,6 +2,7 @@
 from typing import Callable, DefaultDict, List, Optional
 
 import numpy as np
+import os
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -11,6 +12,9 @@
 from pytorch_ner.metrics import calculate_metrics
 from pytorch_ner.utils import to_numpy
 
+from pytorch_ner.model_checkpoint import model_checkpoint
+from pytorch_ner.utils import mkdir, rmdir
+from pytorch_ner.onnx import onnx_export_and_check
 
 def masking(lengths: torch.Tensor) -> torch.Tensor:
     """
@@ -144,12 +148,23 @@ def train(
     optimizer: optim.Optimizer,
     device: torch.device,
     n_epoch: int,
+    export_onnx: bool,
+    path_to_folder: str,
+    save_frequency: int,
+    save_best_weights: bool,
     testloader: Optional[DataLoader] = None,
     verbose: bool = True,
 ):
     """
     Training / validation loop for n_epoch with final testing.
     """
+    if os.path.exists(path_to_folder):
+        # delete any previous versions of models
+        rmdir(path_to_folder)
+    mkdir(path_to_folder)
+
+    # List that tracks val_loss over training to save best weights
+    val_losses = [np.inf]
 
     for epoch in range(n_epoch):
 
@@ -183,6 +198,18 @@ def train(
                 print(f"val {metric_name}: {np.mean(metric_list)}")
             print()
 
+        # Model Checkpoint
+        model_checkpoint(
+            model=model,
+            epoch=epoch,
+            save_best_weights=save_best_weights, 
+            val_metrics=val_metrics,
+            val_losses=val_losses,
+            path_to_folder=path_to_folder,
+            export_onnx=export_onnx,
+            save_frequency=save_frequency,
+            )
+
     if testloader is not None:
 
         test_metrics = validate_loop(

From bffaaa224876df1e1eb26a1bd917dec417408e0b Mon Sep 17 00:00:00 2001
From: Abdallah Ahmed <abdallah.wallyallah97@gmail.com>
Date: Thu, 10 Dec 2020 18:34:20 +0200
Subject: [PATCH 2/5] Fix Black Formatting

---
 pytorch_ner/model_checkpoint.py | 36 +++++++++++++++++----------------
 pytorch_ner/save.py             |  2 +-
 pytorch_ner/train.py            |  5 +++--
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/pytorch_ner/model_checkpoint.py b/pytorch_ner/model_checkpoint.py
index 91cdf89..3e5952b 100644
--- a/pytorch_ner/model_checkpoint.py
+++ b/pytorch_ner/model_checkpoint.py
@@ -10,27 +10,28 @@
 from pytorch_ner.utils import mkdir, rmdir
 import numpy as np
 
+
 def model_checkpoint(
-                model: nn.Module,
-                epoch: int,
-                save_best_weights: bool, 
-                val_metrics,
-                val_losses,
-                path_to_folder: str,
-                export_onnx: bool,
-                save_frequency: int,
-                ):
-    
-    '''
+    model: nn.Module,
+    epoch: int,
+    save_best_weights: bool,
+    val_metrics,
+    val_losses,
+    path_to_folder: str,
+    export_onnx: bool,
+    save_frequency: int,
+):
+
+    """
     This function creates check point based on either one of the two scenarios:
         1. Save best weights regarding the val_loss
         2. Save weights frequently with save_frequency int
 
-    '''
+    """
     if save_best_weights:
-        if np.mean(val_metrics['loss']) < min(val_losses):
+        if np.mean(val_metrics["loss"]) < min(val_losses):
             # This iteration has lower val_loss, let's save it
-            val_losses.append(np.mean(val_metrics['loss']))
+            val_losses.append(np.mean(val_metrics["loss"]))
             pth_file_name = "best_model.pth"
             onnx_file_name = "best_model.onnx"
         else:
@@ -44,8 +45,9 @@ def model_checkpoint(
         else:
             # No need to save weights
             return
-    
 
-    torch.save(model.state_dict(),os.path.join(path_to_folder, pth_file_name))
+    torch.save(model.state_dict(), os.path.join(path_to_folder, pth_file_name))
     if export_onnx:
-        onnx_export_and_check(model=model, path_to_save=os.path.join(path_to_folder, onnx_file_name))
\ No newline at end of file
+        onnx_export_and_check(
+            model=model, path_to_save=os.path.join(path_to_folder, onnx_file_name)
+        )
diff --git a/pytorch_ner/save.py b/pytorch_ner/save.py
index c2bc05b..f6ead13 100644
--- a/pytorch_ner/save.py
+++ b/pytorch_ner/save.py
@@ -18,7 +18,7 @@ def save_model(
     config: Dict,
     export_onnx: bool = False,
 ):
-    
+
     # if os.path.exists(path_to_folder):
     #     # make empty dir
     #     rmdir(path_to_folder)
diff --git a/pytorch_ner/train.py b/pytorch_ner/train.py
index f353303..1a97f49 100644
--- a/pytorch_ner/train.py
+++ b/pytorch_ner/train.py
@@ -16,6 +16,7 @@
 from pytorch_ner.utils import mkdir, rmdir
 from pytorch_ner.onnx import onnx_export_and_check
 
+
 def masking(lengths: torch.Tensor) -> torch.Tensor:
     """
     Convert lengths tensor to binary mask
@@ -202,13 +203,13 @@ def train(
         model_checkpoint(
             model=model,
             epoch=epoch,
-            save_best_weights=save_best_weights, 
+            save_best_weights=save_best_weights,
             val_metrics=val_metrics,
             val_losses=val_losses,
             path_to_folder=path_to_folder,
             export_onnx=export_onnx,
             save_frequency=save_frequency,
-            )
+        )
 
     if testloader is not None:
 

From 33d8f0ca2f987c97a31c87e1e2c86b9540a03c94 Mon Sep 17 00:00:00 2001
From: Abdallah Ahmed <abdallah.wallyallah97@gmail.com>
Date: Thu, 10 Dec 2020 18:37:07 +0200
Subject: [PATCH 3/5] Solves Black and isort formatting

---
 pytorch_ner/model_checkpoint.py | 2 +-
 pytorch_ner/train.py            | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pytorch_ner/model_checkpoint.py b/pytorch_ner/model_checkpoint.py
index 3e5952b..1c07cbb 100644
--- a/pytorch_ner/model_checkpoint.py
+++ b/pytorch_ner/model_checkpoint.py
@@ -2,13 +2,13 @@
 import os
 from typing import Dict
 
+import numpy as np
 import torch
 import torch.nn as nn
 import yaml
 
 from pytorch_ner.onnx import onnx_export_and_check
 from pytorch_ner.utils import mkdir, rmdir
-import numpy as np
 
 
 def model_checkpoint(
diff --git a/pytorch_ner/train.py b/pytorch_ner/train.py
index 1a97f49..a0a9432 100644
--- a/pytorch_ner/train.py
+++ b/pytorch_ner/train.py
@@ -1,8 +1,8 @@
+import os
 from collections import defaultdict
 from typing import Callable, DefaultDict, List, Optional
 
 import numpy as np
-import os
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -10,11 +10,9 @@
 from tqdm import tqdm
 
 from pytorch_ner.metrics import calculate_metrics
-from pytorch_ner.utils import to_numpy
-
 from pytorch_ner.model_checkpoint import model_checkpoint
-from pytorch_ner.utils import mkdir, rmdir
 from pytorch_ner.onnx import onnx_export_and_check
+from pytorch_ner.utils import mkdir, rmdir, to_numpy
 
 
 def masking(lengths: torch.Tensor) -> torch.Tensor:

From 3c55d36dc39aaaf988aa327deeacac45bb7ec27a Mon Sep 17 00:00:00 2001
From: Abdallah Ahmed <abdallah.wallyallah97@gmail.com>
Date: Thu, 10 Dec 2020 18:48:05 +0200
Subject: [PATCH 4/5] Change train function in test_train.py

Since the train() function was changed in a previous commit to include checkpointing,
the train() call in test_train.py needed to be updated as well.
---
 tests/test_train.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_train.py b/tests/test_train.py
index 4158166..3efab0e 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -75,6 +75,10 @@
     optimizer=optimizer,
     device=device,
     n_epoch=5,
+    export_onnx=True,
+    path_to_folder="models/test_main/",
+    save_frequency=1,
+    save_best_weights=True,
     verbose=False,
 )
 

From 8c95794ef3b736abb247b58b0c6d166d3ed65135 Mon Sep 17 00:00:00 2001
From: Abdallah Ahmed <abdallah.wallyallah97@gmail.com>
Date: Thu, 10 Dec 2020 18:58:24 +0200
Subject: [PATCH 5/5] Edit in save function to solve FileNotFoundError

This error occurred because I moved the creation of the new folder to the train function
to cope with the checkpoint addition. However, test_save works independently,
so the directory (folder) is now created in the save function if it does not already exist.
---
 pytorch_ner/save.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pytorch_ner/save.py b/pytorch_ner/save.py
index f6ead13..50a5b61 100644
--- a/pytorch_ner/save.py
+++ b/pytorch_ner/save.py
@@ -19,10 +19,9 @@ def save_model(
     export_onnx: bool = False,
 ):
 
-    # if os.path.exists(path_to_folder):
-    #     # make empty dir
-    #     rmdir(path_to_folder)
-    # mkdir(path_to_folder)
+    if not os.path.exists(path_to_folder):
+        # make empty dir
+        mkdir(path_to_folder)
 
     model.cpu()
     model.eval()